In [1]:
import os

os.environ["HF_HOME"] = "/Data/shash/.cache"
os.environ["HF_HUB_CACHE"] = "/Data/shash/.cache/hub"

print(os.environ["HF_HOME"])

from dotenv import load_dotenv
import wandb

from huggingface_hub import login

load_dotenv()

hf_token = os.getenv("HF_TOKEN")
wandb_token = os.getenv("WANDB_TOKEN")

if hf_token:
    login(hf_token)
    print("Logged in to Hugging Face successfully!")
else:
    print("HF_TOKEN not found in .env file.")

if wandb_token:
    wandb.login(key=wandb_token)
    print("Logged in to Weights & Biases successfully!")
else:
    print("WANDB_TOKEN not found in .env file.")


/Data/shash/.cache


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in to Hugging Face successfully!


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /users/eleves-b/2024/shashwat.sharma/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mzshashwatz[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Logged in to Weights & Biases successfully!


In [2]:
# Phi-4 Fine-tuning Debug Notebook
# ===========================
# This notebook will help diagnose issues with the fine-tuned model for multilingual summarization

import os
import torch
import json
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from rouge import Rouge
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define paths and parameters
BASE_MODEL = "microsoft/Phi-4-mini-instruct"
FINETUNED_MODEL = "/Data/shash/mul/finetuned_models/checkpoint-1497"
DATASET_PATH = "/Data/shash/mul/hf_dataset"
LANGUAGE = "fr"  # French
MAX_INPUT_LENGTH = 3072
MAX_OUTPUT_LENGTH = 512


In [3]:

print(f"Base model: {BASE_MODEL}")
print(f"Fine-tuned model: {FINETUNED_MODEL}")
print(f"Dataset path: {DATASET_PATH}/{LANGUAGE}")

# 1. Load the dataset
print("\n=== Loading Dataset ===")
dataset = load_from_disk(os.path.join(DATASET_PATH, LANGUAGE))
test_dataset = dataset["test"]
print(f"Loaded {len(test_dataset)} test examples")

# Display a sample
sample_idx = 0
sample = test_dataset[sample_idx]
print(f"\nSample example (index {sample_idx}):")
print(f"Title: {sample['title']}")
print(f"Article length: {len(sample['text'])} chars")
print(f"Reference summary length: {len(sample['summary'])} chars")
print(f"\nFirst 300 chars of article text:")
print(sample['text'][:300] + "...")
print(f"\nReference summary:")
print(sample['summary'])

Base model: microsoft/Phi-4-mini-instruct
Fine-tuned model: /Data/shash/mul/finetuned_models/checkpoint-1497
Dataset path: /Data/shash/mul/hf_dataset/fr

=== Loading Dataset ===
Loaded 500 test examples

Sample example (index 0):
Title: Nihonium
Article length: 7829 chars
Reference summary length: 749 chars

First 300 chars of article text:
{{confusion|Technétium#À la recherche de l'élément chimique 43{{!}}Nipponium}}
{{Infobox Élément/Nihonium}}
Le '''nihonium''' ([[Symbole chimique|symbole]] '''Nh''') est l'[[élément chimique]] de [[numéro atomique]] 113. Il correspond à l'ununtrium (Uut) de la [[dénomination systématique]] de l'[[Un...

Reference summary:
Le nihonium (symbole Nh) est un élément chimique de numéro atomique 113, initialement appelé ununtrium. Il a été synthétisé pour la première fois en juillet 2004 au RIKEN près de Tokyo, et son identification a été validée par l'IUPAC en décembre 2015. Le nihonium est un transactinide radioactif, avec l'isotope le plus stable, le 28

In [4]:

# 2. Load base model and tokenizer
print("\n=== Loading Base Model ===")
base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Ensure padding token is set correctly
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token
    print("Set pad_token to eos_token")

print(f"Tokenizer vocab size: {len(base_tokenizer)}")
print(f"Special tokens: {base_tokenizer.special_tokens_map}")

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)
print(f"Base model loaded: {type(base_model).__name__}")



=== Loading Base Model ===
Tokenizer vocab size: 200029
Special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded: Phi3ForCausalLM


In [5]:

# 3. Load fine-tuned model
print("\n=== Loading Fine-tuned Model ===")

# Check if adapter config exists (PEFT/LoRA)
adapter_config_path = os.path.join(FINETUNED_MODEL, "adapter_config.json")
is_peft_model = os.path.exists(adapter_config_path)

if is_peft_model:
    print(f"Found adapter_config.json - loading as PEFT/LoRA model")
    try:
        # Load PEFT configuration
        peft_config = PeftConfig.from_pretrained(FINETUNED_MODEL)
        print(f"PEFT config: {peft_config}")
        
        # Load as PeftModel (adapter on top of base model)
        finetuned_model = PeftModel.from_pretrained(
            base_model, 
            FINETUNED_MODEL
        )
        print("Successfully loaded as PeftModel")
    except Exception as e:
        print(f"Error loading as PEFT model: {e}")
        is_peft_model = False
        
if not is_peft_model:
    print("Loading as standard model (not PEFT/LoRA)")
    try:
        finetuned_model = AutoModelForCausalLM.from_pretrained(
            FINETUNED_MODEL,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        print("Successfully loaded as standard model")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise e


=== Loading Fine-tuned Model ===
Found adapter_config.json - loading as PEFT/LoRA model
PEFT config: LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='microsoft/Phi-4-mini-instruct', revision=None, inference_mode=True, r=16, target_modules={'Wqkv', 'down_proj', 'up_proj', 'out_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)
Successfully loaded as PeftModel


In [6]:


# 4. Helper functions for prompt creation and generation
def get_system_message(language="fr"):
    """Get the appropriate system message based on language"""
    if language == "fr":
        return """Vous êtes un expert en résumé. Votre tâche est de créer des résumés concis et complets 
qui capturent tous les points clés et les arguments principaux tout en évitant les détails inutiles. 
Ne copiez pas simplement les premières phrases de l'article. Créez un résumé qui se tient 
par lui-même et couvre tout le contenu important de l'article."""
    else:
        return f"""You are an expert summarizer. Your task is to create concise, comprehensive summaries 
in {language} that capture all key points and main arguments while avoiding unnecessary details. 
Do not simply copy the first few sentences of the article. Create a summary that stands 
on its own and covers the entire article's important content."""

def create_phi_prompt(text, language="fr"):
    """Create a language-specific prompt for summarization using Phi-4 chat format"""
    system_message = get_system_message(language)
    user_message = f"Please summarize the following article: {text}"
    prompt = f"<|system|>{system_message}<|end|><|user|>{user_message}<|end|><|assistant|>"
    return prompt

def generate_summary(model, tokenizer, text, language="fr", max_length=3072, max_new_tokens=512, debug=False):
    """Generate a summary with debugging info"""
    # Create prompt
    prompt = create_phi_prompt(text, language)
    
    if debug:
        print(f"\nPrompt begins with: {prompt[:100]}...")
        print(f"Prompt ends with: ...{prompt[-50:]}")
    
    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(model.device)
    
    if debug:
        print(f"Input length in tokens: {len(inputs.input_ids[0])}")
    
    # Generate summary
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
        )
    
    # Decode and extract only the summary part
    input_length = inputs.input_ids[0].size(0)
    new_tokens = outputs[0][input_length:]
    summary = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    if debug:
        print(f"Generated {len(new_tokens)} new tokens")
        print(f"Raw output begins with: {summary[:100]}...")
    
    return summary.strip()

In [7]:


# 5. Generate summaries for analysis
print("\n=== Generating Summaries ===")

print("Generating with base model...")
base_summary = generate_summary(base_model, base_tokenizer, sample['text'], LANGUAGE, debug=True)

print("\nGenerating with fine-tuned model...")
finetuned_summary = generate_summary(finetuned_model, base_tokenizer, sample['text'], LANGUAGE, debug=True)

# 6. Analyze results
print("\n=== Comparing Summaries ===")
print("\nReference summary:")
print(sample['summary'])
print("\nBase model summary:")
print(base_summary)
print("\nFine-tuned model summary:")
print(finetuned_summary)

# Calculate ROUGE scores
try:
    rouge = Rouge()
    base_scores = rouge.get_scores(base_summary, sample['summary'])[0]
    ft_scores = rouge.get_scores(finetuned_summary, sample['summary'])[0]
    
    print("\n=== ROUGE Scores ===")
    print("Base model:")
    print(f"ROUGE-1: {base_scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {base_scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {base_scores['rouge-l']['f']:.4f}")
    
    print("\nFine-tuned model:")
    print(f"ROUGE-1: {ft_scores['rouge-1']['f']:.4f}")
    print(f"ROUGE-2: {ft_scores['rouge-2']['f']:.4f}")
    print(f"ROUGE-L: {ft_scores['rouge-l']['f']:.4f}")
except Exception as e:
    print(f"Error calculating ROUGE scores: {e}")


=== Generating Summaries ===
Generating with base model...

Prompt begins with: <|system|>Vous êtes un expert en résumé. Votre tâche est de créer des résumés concis et complets 
qu...
Prompt ends with: ...]]
[[Catégorie:Transactinide]]<|end|><|assistant|>
Input length in tokens: 2524
Generated 512 new tokens
Raw output begins with: Le nihonium (Nh), élément de numéro atomique 113, a été synthétisé pour la première fois en juillet ...

Generating with fine-tuned model...

Prompt begins with: <|system|>Vous êtes un expert en résumé. Votre tâche est de créer des résumés concis et complets 
qu...
Prompt ends with: ...]]
[[Catégorie:Transactinide]]<|end|><|assistant|>
Input length in tokens: 2524
Generated 512 new tokens
Raw output begins with: Le nihonium (Nh) est l'élément chimique de numéro atomique 113, synthétisé en juillet 2004 par une é...

=== Comparing Summaries ===

Reference summary:
Le nihonium (symbole Nh) est un élément chimique de numéro atomique 113, initialement appelé unu

In [8]:


# 7. Check training artifacts
print("\n=== Examining Training Artifacts ===")

# Check for training args
training_args_path = os.path.join(FINETUNED_MODEL, "training_args.json")
if os.path.exists(training_args_path):
    try:
        with open(training_args_path, "r") as f:
            training_args = json.load(f)
        print("Found training_args.json with keys:", list(training_args.keys()))
        
        # Extract important training parameters
        if training_args:
            print(f"Learning rate: {training_args.get('learning_rate')}")
            print(f"Batch size: {training_args.get('per_device_train_batch_size')}")
            print(f"Gradient accumulation: {training_args.get('gradient_accumulation_steps')}")
            print(f"Training epochs: {training_args.get('num_train_epochs')}")
    except Exception as e:
        print(f"Error reading training args: {e}")
else:
    print("No training_args.json found")


=== Examining Training Artifacts ===
No training_args.json found


In [9]:


# 8. Verify dataset preprocessing
print("\n=== Testing Dataset Preprocessing ===")

def create_prompt_as_in_training(text, summary, language="fr"):
    """Recreate the prompt format used during training"""
    system_message = get_system_message(language)
    user_message = f"Please summarize the following article: {text}"
    full_prompt = f"<|system|>{system_message}<|end|><|user|>{user_message}<|end|><|assistant|>{summary}"
    return full_prompt

# Get a training example
train_sample = dataset["train"][0]
training_prompt = create_prompt_as_in_training(train_sample['text'], train_sample['summary'], LANGUAGE)
print(f"Training prompt format begins with: {training_prompt[:100]}...")
print(f"Training prompt format ends with: ...{training_prompt[-100:]}")


=== Testing Dataset Preprocessing ===
Training prompt format begins with: <|system|>Vous êtes un expert en résumé. Votre tâche est de créer des résumés concis et complets 
qu...
Training prompt format ends with: ...otiques à large spectre. Les complications peuvent inclure un choc septique, une anurie, et un coma.


In [10]:


# 9. Issues and recommendations
print("\n=== Potential Issues ===")
print("1. Model loading: Successfully loaded model as", "PEFT/LoRA" if is_peft_model else "standard model")
print("2. Prompt format: Using Phi-4 chat format with system + user message structure")
print("3. Generation parameters: Using temperature=0.7, top_p=0.9, max_new_tokens=512")

# Check output quality
if "ession" in base_summary[:50] or "ession" in finetuned_summary[:50]:
    print("4. Output quality issue: Both models may be continuing from middle of article text")
    print("   - This suggests the model might be receiving text from the middle of the article")
    print("   - Or the model is not properly trained to summarize from the beginning")

# Generate recommendations
print("\n=== Recommendations ===")
print("1. Verify the article text format - ensure it starts from the beginning not middle of content")
print("2. Check training data quality - ensure summaries are proper summaries not article sections")
print("3. Try different checkpoint from training or retrain with more focused data")
print("4. Add explicit instructions in the prompt like 'Create a brief summary about this person:'")
print("5. Try truncating article text to just first few paragraphs for clearer summarization")


=== Potential Issues ===
1. Model loading: Successfully loaded model as PEFT/LoRA
2. Prompt format: Using Phi-4 chat format with system + user message structure
3. Generation parameters: Using temperature=0.7, top_p=0.9, max_new_tokens=512

=== Recommendations ===
1. Verify the article text format - ensure it starts from the beginning not middle of content
2. Check training data quality - ensure summaries are proper summaries not article sections
3. Try different checkpoint from training or retrain with more focused data
4. Add explicit instructions in the prompt like 'Create a brief summary about this person:'
5. Try truncating article text to just first few paragraphs for clearer summarization
