In [None]:
!pip install bitsandbytes

In [None]:
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PeftModel
import pandas as pd
import re
from tqdm import tqdm
import gc
import os
import json
import logging
import numpy as np
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
# Configure the root logger at INFO level and create the module-level logger
# that the class and functions below use for progress and error reporting.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ImprovedQwenFineTuner:
    def __init__(self, model_id="Qwen/Qwen2.5-7B-Instruct", token=None):
        self.model_id = model_id
        self.token = token
        self.model = None
        self.tokenizer = None
        self.peft_model = None

    def setup_model_and_tokenizer(self):
        """Load the 4-bit quantized base model and its tokenizer.

        Populates ``self.model`` (after passing it through
        ``prepare_model_for_kbit_training``) and ``self.tokenizer``
        (right-padded, with the EOS token as pad token when none is defined).
        """
        logger.info(f"Loading model: {self.model_id}")

        # Options shared by the model loader and the tokenizer loader.
        hub_kwargs = {"trust_remote_code": True, "token": self.token}

        # 4-bit NF4 weights with double quantization and bfloat16 compute.
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=quantization,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            **hub_kwargs,
        )

        tok = AutoTokenizer.from_pretrained(self.model_id, **hub_kwargs)
        self.tokenizer = tok

        # Batching needs a pad token; reuse EOS when the tokenizer has none.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token
        tok.padding_side = "right"

        self.model = prepare_model_for_kbit_training(self.model)

        logger.info("Model and tokenizer loaded successfully!")

    def setup_improved_lora_config(self, r=32, lora_alpha=64, lora_dropout=0.05):
        """Wrap the base model with LoRA adapters and return the config used.

        LoRA adapters are attached to the attention and MLP projections.
        ``embed_tokens`` and ``lm_head`` are listed only in
        ``modules_to_save`` (trained and saved as full copies); they are
        deliberately not repeated in ``target_modules``, because asking for
        the same module to be both LoRA-adapted and fully trained is
        contradictory.

        Args:
            r: LoRA rank.
            lora_alpha: LoRA scaling factor.
            lora_dropout: Dropout applied inside the LoRA layers.

        Returns:
            The ``LoraConfig`` that was applied; the wrapped model is stored
            in ``self.peft_model``.

        Raises:
            ValueError: If the base model has not been loaded yet.
        """
        if self.model is None:
            raise ValueError("No model loaded.")

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=[
                # Core attention modules
                "q_proj", "k_proj", "v_proj", "o_proj",
                # MLP modules
                "gate_proj", "up_proj", "down_proj",
            ],
            bias="none",
            # Trained in full (no low-rank adapter) and stored with the adapter.
            modules_to_save=["embed_tokens", "lm_head"],
        )

        self.peft_model = get_peft_model(self.model, lora_config)
        self.peft_model.print_trainable_parameters()

        return lora_config

    def load_and_prepare_data(self, task="subtask2", validation_split=0.2):
        """Load the training split and an evaluation split for a subtask.

        The cached copies of the chosen dataset are removed first (best
        effort), then the ``train`` split is downloaded. If a ``dev`` split
        can be loaded it is used for evaluation; otherwise a validation set
        is carved out of the training data.

        Args:
            task: ``"subtask1"`` selects the culture dataset; any other value
                selects the Islamic dataset.
            validation_split: Fraction of the training data held out when no
                ``dev`` split is available.

        Returns:
            A ``(train_data, eval_data)`` tuple.
        """
        import shutil
        from datasets import config

        logger.info(f"Loading datasets for {task}")

        if task == "subtask1":
            dataset_name = "UBC-NLP/palmx_2025_subtask1_culture"
        else:
            dataset_name = "UBC-NLP/palmx_2025_subtask2_islamic"

        def load_split(split):
            """Load one split, preferring a forced re-download over the cache."""
            try:
                return load_dataset(dataset_name, split=split, download_mode="force_redownload")
            except Exception as e:
                logger.warning(f"Forced re-download of '{split}' failed ({e}); retrying with defaults")
                return load_dataset(dataset_name, split=split)

        # Best effort: a failure to clear the cache must not abort loading.
        try:
            cache_dir = config.HF_DATASETS_CACHE
            if os.path.exists(cache_dir):
                logger.info("Clearing dataset cache...")
                cache_key = dataset_name.replace("/", "___")
                for item in os.listdir(cache_dir):
                    if cache_key not in item:
                        continue
                    item_path = os.path.join(cache_dir, item)
                    if os.path.isdir(item_path):
                        shutil.rmtree(item_path)
                    else:
                        os.remove(item_path)
        except Exception as e:
            logger.warning(f"Could not clear cache: {e}")

        train_data = load_split("train")
        logger.info(f"Loaded training data: {len(train_data)} samples")

        try:
            eval_data = load_split("dev")
        except Exception as e:
            logger.info(f"Could not load dev split ({e})")
            logger.info(f"No dev split found, creating {validation_split*100:.0f}% validation split from training data")
            split_kwargs = {"test_size": validation_split, "seed": 42, "shuffle": True}
            try:
                train_eval_split = train_data.train_test_split(
                    stratify_by_column="answer", **split_kwargs
                )
            except ValueError as stratify_error:
                # Stratification is rejected when "answer" is not a ClassLabel
                # column; a plain shuffled split is still better than failing.
                logger.warning(f"Stratified split unavailable ({stratify_error}); using a random split")
                train_eval_split = train_data.train_test_split(**split_kwargs)

            train_data = train_eval_split['train']
            eval_data = train_eval_split['test']

            logger.info(f"Split created - Train: {len(train_data)}, Validation: {len(eval_data)}")
        else:
            logger.info(f"Using existing dev split: {len(eval_data)} samples")

        return train_data, eval_data

    def format_training_prompt_improved(self, example):
        """Improved prompt formatting with better structure and consistency"""
        question = example['question']
        options = {
            'A': example['A'],
            'B': example['B'],
            'C': example['C'],
            'D': example['D']
        }
        answer = example['answer']

        # Use Arabic format similar to baseline for consistency
        prompt = f"""<|im_start|>system
أنت خبير في الثقافة الإسلامية. أجب على السؤال متعدد الخيارات بتقديم حرف الإجابة الصحيحة فقط (A، B، C، أو D).<|im_end|>
<|im_start|>user
السؤال: {question}

A. {options['A']}
B. {options['B']}
C. {options['C']}
D. {options['D']}

الجواب:<|im_end|>
<|im_start|>assistant
{answer}<|im_end|>"""

        return prompt

    def prepare_dataset_improved(self, dataset, max_length=1024):
        """Tokenize every example into an unpadded training record.

        Each example is formatted with ``format_training_prompt_improved``,
        tokenized with truncation at ``max_length``, and given a ``labels``
        field that is a copy of its ``input_ids``. Examples that raise while
        being processed are logged and left out.

        Args:
            dataset: Iterable of raw examples.
            max_length: Truncation length passed to the tokenizer.

        Returns:
            List of tokenizer outputs, one per successfully processed example.
        """
        logger.info("Preparing dataset for training...")
        encoded_rows = []

        for index, row in enumerate(tqdm(dataset, desc="Processing examples")):
            try:
                text = self.format_training_prompt_improved(row)
                encoding = self.tokenizer(
                    text,
                    max_length=max_length,
                    truncation=True,
                    padding=False,  # padding happens per batch in the collator
                    return_tensors=None,
                )
                # The whole sequence is used as the language-modelling target.
                encoding["labels"] = encoding["input_ids"].copy()
            except Exception as err:
                logger.error(f"Error processing example {index}: {err}")
            else:
                encoded_rows.append(encoding)

        return encoded_rows

    def create_improved_data_collator(self):
        """Improved data collator with label smoothing"""
        def data_collator(features):
            max_length = max(len(f["input_ids"]) for f in features)
            batch = {}

            for key in ["input_ids", "attention_mask", "labels"]:
                batch[key] = []
                for feature in features:
                    seq = feature[key]
                    if key == "input_ids":
                        padded_seq = seq + [self.tokenizer.pad_token_id] * (max_length - len(seq))
                    elif key == "attention_mask":
                        padded_seq = seq + [0] * (max_length - len(seq))
                    elif key == "labels":
                        padded_seq = seq + [-100] * (max_length - len(seq))
                    batch[key].append(padded_seq)

            for key in batch:
                batch[key] = torch.tensor(batch[key], dtype=torch.long)

            return batch

        return data_collator

    def compute_metrics(self, eval_pred):
        """Compute detailed metrics during training"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=-1)

        # Filter out ignored labels (-100)
        mask = labels != -100
        filtered_predictions = predictions[mask]
        filtered_labels = labels[mask]

        accuracy = (filtered_predictions == filtered_labels).mean()

        return {
            "accuracy": accuracy,
            "eval_samples": len(filtered_labels)
        }

    def fine_tune_improved(self, train_data, eval_data, output_dir="./qwen_improved_finetuned"):
        """Fine-tune the LoRA-wrapped model and save the result.

        Both datasets are tokenized with ``prepare_dataset_improved``, then a
        ``Trainer`` is run with periodic evaluation, best-checkpoint
        restoration on ``eval_accuracy`` and early stopping. The final model
        and the tokenizer are written to ``output_dir``.

        Args:
            train_data: Raw training examples.
            eval_data: Raw evaluation examples.
            output_dir: Directory for checkpoints and the final model.

        Returns:
            The ``Trainer`` instance after training.

        Raises:
            ValueError: If the PEFT model has not been created, or if no
                training example could be prepared.
        """
        if self.peft_model is None:
            raise ValueError("No model loaded.")

        logger.info("Preparing training data...")
        train_dataset = self.prepare_dataset_improved(train_data)
        eval_dataset = self.prepare_dataset_improved(eval_data)

        if not train_dataset:
            raise ValueError("No training examples could be prepared.")

        logger.info(f"Training samples: {len(train_dataset)}")
        logger.info(f"Evaluation samples: {len(eval_dataset)}")

        # The base model is loaded with bfloat16 compute, so train in bf16
        # whenever the hardware supports it; fall back to fp16 otherwise.
        use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=4,
            gradient_accumulation_steps=8,  # effective train batch of 16 per device
            # Only warmup_steps is given: when both are set it takes
            # precedence over warmup_ratio, which made the ratio a dead setting.
            warmup_steps=200,
            learning_rate=1e-4,
            weight_decay=0.01,
            bf16=use_bf16,
            fp16=not use_bf16,
            logging_steps=25,
            save_steps=100,
            eval_steps=100,
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_accuracy",
            greater_is_better=True,
            remove_unused_columns=False,
            dataloader_pin_memory=False,
            gradient_checkpointing=True,
            # The string "none" disables reporting integrations; None does not.
            report_to="none",
            save_total_limit=3,
            lr_scheduler_type="cosine",
            max_grad_norm=1.0,  # gradient clipping
        )

        # Create trainer with early stopping
        trainer = Trainer(
            model=self.peft_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=self.create_improved_data_collator(),
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        logger.info("Starting improved fine-tuning...")
        trainer.train()

        # Save the final model
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        logger.info(f"Fine-tuning completed! Model saved to {output_dir}")

        return trainer

    def evaluate_with_baseline_format(self, eval_data, max_length=1024):
        """Evaluate using the same format as baseline for fair comparison"""
        if self.peft_model is None:
            raise ValueError("No model loaded.")

        self.peft_model.eval()
        correct_predictions = 0
        total_questions = len(eval_data)
        submission_data = []

        logger.info(f"Evaluating on {total_questions} validation samples...")

        for i, example in enumerate(tqdm(eval_data, desc="Evaluating")):
            try:
                question = example['question']
                options = [example['A'], example['B'], example['C'], example['D']]

                # Use the same format as baseline
                prompt = f"""{question}

A. {options[0]}
B. {options[1]}
C. {options[2]}
D. {options[3]}

الجواب:"""

                # Calculate log-likelihood for each choice like baseline
                choice_scores = []
                for choice_idx, choice_label in enumerate(['A', 'B', 'C', 'D']):
                    full_text = prompt + f" {choice_label}"

                    inputs = self.tokenizer(
                        full_text,
                        return_tensors="pt",
                        truncation=True,
                        max_length=max_length
                    ).to(self.peft_model.device)

                    prompt_inputs = self.tokenizer(
                        prompt,
                        return_tensors="pt",
                        truncation=True,
                        max_length=max_length
                    ).to(self.peft_model.device)

                    with torch.no_grad():
                        outputs = self.peft_model(**inputs)
                        logits = outputs.logits

                        # Calculate log probability of the choice token
                        choice_token = self.tokenizer.encode(f" {choice_label}", add_special_tokens=False)[0]
                        choice_logit = logits[0, prompt_inputs.input_ids.shape[1]-1, choice_token]
                        choice_scores.append(choice_logit.item())

                # Select best choice
                predicted_idx = np.argmax(choice_scores)
                predicted_answer = ['A', 'B', 'C', 'D'][predicted_idx]

                submission_data.append({
                    "id": example.get("id", f"sample_{i}"),
                    "prediction": predicted_answer,
                    "correct_answer": example["answer"],
                    "is_correct": predicted_answer == example["answer"],
                    "scores": choice_scores
                })

                if predicted_answer == example["answer"]:
                    correct_predictions += 1

                # Debug first few examples
                if i < 5:
                    logger.info(f"\nExample {i}:")
                    logger.info(f"Question: {question[:100]}...")
                    logger.info(f"Scores: {choice_scores}")
                    logger.info(f"Predicted: {predicted_answer}")
                    logger.info(f"Correct: {example['answer']}")
                    logger.info(f"Match: {'✓' if predicted_answer == example['answer'] else '✗'}")

            except Exception as e:
                logger.error(f"Error processing example {i}: {e}")
                submission_data.append({
                    "id": example.get("id", f"sample_{i}"),
                    "prediction": "A",
                    "correct_answer": example["answer"],
                    "is_correct": False,
                    "scores": [0, 0, 0, 0]
                })

            if i % 50 == 0:
                torch.cuda.empty_cache()

        accuracy = (correct_predictions / total_questions) * 100

        logger.info(f"\n=== Evaluation Results ===")
        logger.info(f"Total Questions: {total_questions}")
        logger.info(f"Correct Predictions: {correct_predictions}")
        logger.info(f"Accuracy: {accuracy:.2f}%")

        return submission_data, accuracy

    def analyze_errors(self, results, eval_data):
        """Analyze errors to understand model weaknesses"""
        error_analysis = {
            'by_answer': {'A': 0, 'B': 0, 'C': 0, 'D': 0},
            'confusion_matrix': {},
            'difficult_questions': []
        }

        for i, result in enumerate(results):
            correct_answer = result['correct_answer']
            predicted_answer = result['prediction']

            if not result['is_correct']:
                error_analysis['by_answer'][correct_answer] += 1

                if correct_answer not in error_analysis['confusion_matrix']:
                    error_analysis['confusion_matrix'][correct_answer] = {}
                if predicted_answer not in error_analysis['confusion_matrix'][correct_answer]:
                    error_analysis['confusion_matrix'][correct_answer][predicted_answer] = 0
                error_analysis['confusion_matrix'][correct_answer][predicted_answer] += 1

                # Check if this was a difficult question (low confidence)
                scores = result['scores']
                max_score = max(scores)
                second_max = sorted(scores)[-2]
                confidence = max_score - second_max

                if confidence < 1.0:  # Low confidence threshold
                    error_analysis['difficult_questions'].append({
                        'id': result['id'],
                        'question': eval_data[i]['question'][:200] + "...",
                        'confidence': confidence,
                        'predicted': predicted_answer,
                        'correct': correct_answer
                    })

        return error_analysis

def main_improved_finetune():
    """Run the full pipeline: load, fine-tune, evaluate, analyze and report.

    The Hugging Face token is read from the ``HF_TOKEN`` environment variable
    (``None`` when unset) instead of being written into the source. Results
    are saved to a CSV file and the error analysis to a UTF-8 JSON file, both
    named after the achieved accuracy. Any exception is logged with its
    traceback rather than propagated, and GPU memory is released at the end
    in every case.
    """
    TOKEN = os.environ.get("HF_TOKEN") or None
    MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
    TASK = "subtask2"
    OUTPUT_DIR = "./qwen_improved_finetuned"

    # Reference accuracies (percent) used in the final comparison.
    BASELINE_ACCURACY = 69.5
    PREVIOUS_ACCURACY = 65.9

    try:
        # Initialize improved fine-tuner
        ft = ImprovedQwenFineTuner(model_id=MODEL_ID, token=TOKEN)

        # Setup model with improvements
        logger.info("Setting up improved model and tokenizer...")
        ft.setup_model_and_tokenizer()
        ft.setup_improved_lora_config(r=32, lora_alpha=64, lora_dropout=0.05)

        # Load data
        logger.info("Loading and preparing data...")
        train_data, eval_data = ft.load_and_prepare_data(TASK, validation_split=0.2)

        # Print data summary
        logger.info("\n=== Data Summary ===")
        logger.info(f"Training samples: {len(train_data)}")
        logger.info(f"Validation samples: {len(eval_data)}")

        # Fine-tune with improvements
        logger.info("\n=== Starting Improved Fine-tuning ===")
        ft.fine_tune_improved(train_data, eval_data, OUTPUT_DIR)

        # Evaluate with baseline format
        logger.info("\n=== Evaluating with Baseline Format ===")
        results, accuracy = ft.evaluate_with_baseline_format(eval_data)

        # Error analysis
        logger.info("\n=== Analyzing Errors ===")
        error_analysis = ft.analyze_errors(results, eval_data)

        logger.info(f"Errors by correct answer: {error_analysis['by_answer']}")
        logger.info(f"Number of difficult questions: {len(error_analysis['difficult_questions'])}")

        # Save results
        results_df = pd.DataFrame(results)
        results_file = f"improved_results_{accuracy:.1f}acc.csv"
        results_df.to_csv(results_file, index=False)

        # Save error analysis; the questions are non-ASCII and are written
        # unescaped, so the file encoding must be explicit.
        with open(f"error_analysis_{accuracy:.1f}acc.json", "w", encoding="utf-8") as f:
            json.dump(error_analysis, f, indent=2, ensure_ascii=False)

        logger.info("\n=== Final Results ===")
        logger.info(f"Baseline (NileChat-3B): {BASELINE_ACCURACY}%")
        logger.info(f"Previous Fine-tuned: {PREVIOUS_ACCURACY}%")
        logger.info(f"Improved Fine-tuned: {accuracy:.2f}%")
        logger.info(f"Improvement over previous: {accuracy - PREVIOUS_ACCURACY:+.2f}%")

        if accuracy > BASELINE_ACCURACY:
            logger.info("🎉 Successfully beat the baseline!")
        else:
            logger.info(f"Need {BASELINE_ACCURACY - accuracy:.1f}% more to beat baseline")

    except Exception:
        # Top-level boundary: report the failure with its traceback and fall
        # through to the cleanup below.
        logger.exception("Error during improved fine-tuning")

    finally:
        torch.cuda.empty_cache()
        gc.collect()

# Run the full fine-tuning pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main_improved_finetune()

In [None]:
from huggingface_hub import login
# NOTE(review): "hf_token" looks like a placeholder, not a real credential -
# replace it at run time and avoid committing a real token to the notebook.
login(token="hf_token")

In [None]:
from huggingface_hub import notebook_login

# Interactive alternative to passing a token in code: shows a login widget
# in the notebook output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import shutil
from datasets import config
import os

# Clear the entire datasets cache: this deletes the whole directory that
# config.HF_DATASETS_CACHE points to, i.e. the cached copy of every dataset,
# not just the ones used in this notebook.
cache_dir = config.HF_DATASETS_CACHE
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)
    print("Dataset cache cleared!")

Dataset cache cleared!
