<a href="https://colab.research.google.com/github/suleiman-odeh/NLP_Project_Team16/blob/main/fine_tuning/fine_tuning_indirect_Qwen2_5_7B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install scikit-learn pandas

In [1]:
import os
import torch
import pandas as pd
from datasets import Dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Configuration
BASE_PATH = "/content/drive/MyDrive/NLP_Project_QEvasion"
DATA_FILE = os.path.join(BASE_PATH, "QEvasion_cleaned.jsonl")

# Model Definitions
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
NAME_STANDARD = "Qwen2.5-7B-Indirect-Standard"
NAME_OVERSAMPLED = "Qwen2.5-7B-Indirect-Oversampled"

# Paths for saving
OUTPUT_DIR_STANDARD = os.path.join(BASE_PATH, "models", NAME_STANDARD)
OUTPUT_DIR_OVERSAMPLED = os.path.join(BASE_PATH, "models", NAME_OVERSAMPLED)

# Hyperparams
MAX_SEQ_LENGTH = 2048
DTYPE = None
LOAD_IN_4BIT = True

# Verify GPU
gpu_stats = torch.cuda.get_device_properties(0)
print(f"GPU = {gpu_stats.name}. Max Memory = {round(gpu_stats.total_memory / 1024**3, 1)} GB.")

GPU = NVIDIA A100-SXM4-40GB. Max Memory = 39.6 GB.


In [3]:
def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} not found. Please upload it to Drive.")

    try:
        df = pd.read_json(file_path, lines=True)
    except ValueError:
        df = pd.read_json(file_path)

    df = df.dropna(how='all')
    train_df = df[df['split_type'] == 'train'].copy()
    test_df = df[df['split_type'] == 'test'].copy()

    if len(test_df) == 0:
        print("Note: 'test' split empty. Using 'dev' split as test.")
        test_df = df[df['split_type'] == 'dev'].copy()

    return train_df, test_df

# Load Initial Data
train_df, test_df = load_data(DATA_FILE)

# Labels
evasion_label_map = {
    0: "Explicit", 1: "Implicit", 2: "General", 3: "Partial",
    4: "Dodging", 5: "Deflection", 6: "Declining to answer",
    7: "Claims ignorance", 8: "Clarification"
}

def get_evasion_text(row):
    return evasion_label_map.get(row.get('evasion_id'), "Unknown")

train_df['evasion_text'] = train_df.apply(get_evasion_text, axis=1)

print(f"Initial Data Loaded. Total Train Rows: {len(train_df)} | Hold-out Test Rows: {len(test_df)}")

Initial Data Loaded. Total Train Rows: 3448 | Hold-out Test Rows: 308


In [None]:
# Format Function
def format_instruction(sample):
    instruction = (
        "Based on a part of the interview where the interviewer asks a set of questions, "
        "classify the type of answer the interviewee provided for the following question "
        "into one of these evasion categories:\n"
        "1. Explicit\n2. Implicit\n3. General\n4. Partial\n"
        "5. Dodging\n6. Deflection\n7. Declining to answer\n"
        "8. Claims ignorance\n9. Clarification"
    )

    prompt = f"{instruction}\n\n### Part of the interview ###\n{sample['cleaned_answer']}\n\n### Question ###\n{sample['question']}\n\nLabel: "
    return { "text": f"{prompt}{sample['evasion_text']}<|endoftext|>" }

# Split Logic
print("Splitting training data...")
full_train_dataset = Dataset.from_pandas(train_df)

# 750 / 3448 â‰ˆ 0.2175
dataset_split = full_train_dataset.train_test_split(test_size=0.2175, seed=42)
hf_train_standard = dataset_split['train']
hf_eval = dataset_split['test']

print(f"Standard Train Size: {len(hf_train_standard)}")
print(f"Validation Size:     {len(hf_eval)}")

# Oversampling Logic (Applied only to the 'train' split)
print("Preparing Oversampled Dataset...")
# Convert the training split back to pandas for manipulation
train_split_df = hf_train_standard.to_pandas()

# Calculate max count in this training split
counts = train_split_df['evasion_id'].value_counts()
max_count = counts.max()

dfs_to_concat = []
for label_id, group in train_split_df.groupby('evasion_id'):
    # Resample each group to match max_count
    upsampled_group = group.sample(max_count, replace=True, random_state=42)
    dfs_to_concat.append(upsampled_group)

# Shuffle final dataframe
train_df_oversampled = pd.concat(dfs_to_concat).sample(frac=1, random_state=42).reset_index(drop=True)
hf_train_oversampled = Dataset.from_pandas(train_df_oversampled)

print(f"Oversampled Train Size: {len(hf_train_oversampled)}")

# Apply Formatting to All
print("Formatting datasets...")
hf_train_standard = hf_train_standard.map(format_instruction)
hf_train_oversampled = hf_train_oversampled.map(format_instruction)
hf_eval = hf_eval.map(format_instruction)
print("Data Preparation Complete.")

Splitting training data...
Standard Train Size: 2698
Validation Size:     750
Preparing Oversampled Dataset...
Oversampled Train Size: 7569
Formatting datasets...


Map:   0%|          | 0/2698 [00:00<?, ? examples/s]

Map:   0%|          | 0/7569 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Data Preparation Complete.


In [None]:
def train_model(train_dataset, eval_dataset, output_name, output_dir, epochs=1):
    print(f"\n{'='*40}")
    print(f"STARTING TRAINING: {output_name}")
    print(f"{'='*40}")

    # Load Fresh Model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = MODEL_ID,
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = DTYPE,
        load_in_4bit = LOAD_IN_4BIT,
    )

    # Attach LoRA
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 32,
        lora_dropout = 0.05,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
    )

    # Trainer
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_dataset,
        eval_dataset = eval_dataset,
        dataset_text_field = "text",
        max_seq_length = MAX_SEQ_LENGTH,
        dataset_num_proc = 2,
        packing = False,
        args = TrainingArguments(
            per_device_train_batch_size = 4,
            gradient_accumulation_steps = 4,
            warmup_steps = 10,
            num_train_epochs = epochs,
            learning_rate = 2e-4,
            fp16 = not torch.cuda.is_bf16_supported(),
            bf16 = torch.cuda.is_bf16_supported(),
            logging_steps = 10,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = os.path.join(output_dir, "checkpoints"),

            # Save final model manually, but eval during training
            save_strategy = "no",
            eval_strategy = "steps",
            eval_steps = 50,
            report_to = "none"
        ),
    )

    trainer.train()

    # Save to Drive
    print(f"Saving to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Free memory
    del model
    del trainer
    torch.cuda.empty_cache()
    print("Training Complete & Memory Cleared.")

# EXECUTE TRAINING 

# Run 1: Standard (3 Epochs)
train_model(hf_train_standard, hf_eval, NAME_STANDARD, OUTPUT_DIR_STANDARD, epochs=3)

# Run 2: Oversampled (1 Epoch)
train_model(hf_train_oversampled, hf_eval, NAME_OVERSAMPLED, OUTPUT_DIR_OVERSAMPLED, epochs=1)


STARTING TRAINING: Qwen2.5-7B-Indirect-Oversampled
==((====))==  Unsloth 2026.1.4: Fast Qwen2 patching. Transformers: 4.57.6.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.4 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/7569 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/750 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,569 | Num Epochs = 1 | Total steps = 474
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 40,370,176 of 7,655,986,688 (0.53% trained)


Step,Training Loss,Validation Loss
50,1.7715,1.850859
100,1.6371,1.813723
150,1.4306,1.76006
200,1.3127,1.692796
250,1.2318,1.657414
300,1.1909,1.606908
350,1.0148,1.57635
400,0.7999,1.546137
450,0.8649,1.538183


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Saving to /content/drive/MyDrive/NLP_Project_QEvasion/models/Qwen2.5-7B-Indirect-Oversampled...
Training Complete & Memory Cleared.


In [None]:
def evaluate_model(adapter_path, dataset_df):
    print(f"\n{'='*60}")
    print(f"LOADING ADAPTER: {adapter_path}")
    print(f"{'='*60}")

    # Load Model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = adapter_path,
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = DTYPE,
        load_in_4bit = LOAD_IN_4BIT,
    )
    FastLanguageModel.for_inference(model)

    # Mappings
    text_to_evasion_id = {v: k for k, v in evasion_label_map.items()}
    evasion_to_clarity_id = {0:0, 1:1, 2:1, 3:1, 4:1, 5:1, 6:2, 7:2, 8:2}

    # Instruction
    test_instruction = (
        "Based on a part of the interview where the interviewer asks a set of questions, "
        "classify the type of answer the interviewee provided for the following question "
        "into one of these evasion categories:\n"
        "1. Explicit\n2. Implicit\n3. General\n4. Partial\n"
        "5. Dodging\n6. Deflection\n7. Declining to answer\n"
        "8. Claims ignorance\n9. Clarification"
    )

    y_true_evasion = []
    y_pred_evasion = []
    y_true_clarity = []
    y_pred_clarity = []

    print(f"Evaluating {len(dataset_df)} samples...")

    for index, row in tqdm(dataset_df.iterrows(), total=len(dataset_df)):

        # INPUTS
        q_text = row['question']
        a_text = row['cleaned_answer']
        prompt = f"{test_instruction}\n\n### Part of the interview ###\n{a_text}\n\n### Question ###\n{q_text}\n\nLabel: "

        inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

        outputs = model.generate(
            **inputs,
            max_new_tokens = 10,
            use_cache = True,
            pad_token_id = tokenizer.eos_token_id
        )

        decoded_output = tokenizer.batch_decode(outputs)[0]

        # PARSING
        if "Label: " in decoded_output:
            raw_prediction = decoded_output.split("Label: ")[-1]
        else:
            raw_prediction = decoded_output

        # Strip EOS tokens explicitly
        # We replace the tokenizer's specific EOS, but also the string literal if it appears
        prediction_text = raw_prediction.replace(tokenizer.eos_token, "")
        prediction_text = prediction_text.replace("<|endoftext|>", "") 
        prediction_text = prediction_text.split('\n')[0].strip()

        # Normalize punctuation
        clean_pred = prediction_text.strip().rstrip('.').rstrip(',').strip()

        # Map Prediction to ID
        pred_ev_id = -1

        # Case A: Text Match ("Dodging")
        if clean_pred in text_to_evasion_id:
            pred_ev_id = text_to_evasion_id[clean_pred]
        else:
            # Case B: Case-insensitive Match ("dodging")
            for key_text, key_id in text_to_evasion_id.items():
                if key_text.lower() == clean_pred.lower():
                    pred_ev_id = key_id
                    break

            # Case C: Number Match ("6")
            if pred_ev_id == -1 and clean_pred.isdigit():
                try:
                    val_int = int(clean_pred)
                    # Check if map aligns (1->0, 2->1)
                    # Map: 1. Explicit -> ID 0
                    derived_id = val_int - 1
                    if 0 <= derived_id <= 8:
                        pred_ev_id = derived_id
                except:
                    pass

        # GROUND TRUTH LOGIC
        annotator_votes = []
        for col in ['annotator1_id', 'annotator2_id', 'annotator3_id']:
            if col in row and pd.notna(row[col]):
                try:
                    val = int(float(row[col]))
                    if 0 <= val <= 8: annotator_votes.append(val)
                except: pass

        if not annotator_votes: continue

        gold_ev_id = Counter(annotator_votes).most_common(1)[0][0]
        gold_clarity_id = evasion_to_clarity_id[gold_ev_id]

        # STORE RESULTS
        if pred_ev_id != -1:
            pred_clarity_id = evasion_to_clarity_id[pred_ev_id]

            y_true_evasion.append(gold_ev_id)
            y_pred_evasion.append(pred_ev_id)
            y_true_clarity.append(gold_clarity_id)
            y_pred_clarity.append(pred_clarity_id)

    # Print Reports
    evasion_names = [evasion_label_map[i] for i in range(9)]
    clarity_names = ["Clear Reply (0)", "Ambivalent (1)", "Clear Non-Reply (2)"]

    print(f"\nRESULTS FOR: {adapter_path}")
    print("="*60)
    print("REPORT 1: EVASION CLASSIFICATION")
    print(classification_report(y_true_evasion, y_pred_evasion, labels=range(9), target_names=evasion_names, zero_division=0))

    print("\nREPORT 2: CLARITY CLASSIFICATION")
    print(classification_report(y_true_clarity, y_pred_clarity, labels=range(3), target_names=clarity_names, zero_division=0))

    del model
    torch.cuda.empty_cache()

# RUN EVALUATION
evaluate_model(OUTPUT_DIR_STANDARD, test_df)
evaluate_model(OUTPUT_DIR_OVERSAMPLED, test_df)


LOADING ADAPTER: /content/drive/MyDrive/NLP_Project_QEvasion/models/Qwen2.5-7B-Indirect-Standard
==((====))==  Unsloth 2026.1.4: Fast Qwen2 patching. Transformers: 4.57.6.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating 308 samples...


  0%|          | 0/308 [00:00<?, ?it/s]


RESULTS FOR: /content/drive/MyDrive/NLP_Project_QEvasion/models/Qwen2.5-7B-Indirect-Standard
REPORT 1: EVASION CLASSIFICATION
                     precision    recall  f1-score   support

           Explicit       0.00      0.00      0.00         7
           Implicit       0.00      0.00      0.00         4
            General       0.00      0.00      0.00        11
            Partial       0.07      0.10      0.08        21
            Dodging       0.20      0.38      0.26        53
         Deflection       0.32      0.48      0.39        91
Declining to answer       0.07      0.02      0.03        52
   Claims ignorance       0.00      0.00      0.00        60
      Clarification       0.00      0.00      0.00         6

           accuracy                           0.22       305
          macro avg       0.07      0.11      0.08       305
       weighted avg       0.15      0.22      0.17       305


REPORT 2: CLARITY CLASSIFICATION
                     precision    recall  f

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating 308 samples...


  0%|          | 0/308 [00:00<?, ?it/s]


RESULTS FOR: /content/drive/MyDrive/NLP_Project_QEvasion/models/Qwen2.5-7B-Indirect-Oversampled
REPORT 1: EVASION CLASSIFICATION
                     precision    recall  f1-score   support

           Explicit       0.40      0.25      0.31         8
           Implicit       0.50      1.00      0.67         3
            General       0.00      0.00      0.00        12
            Partial       0.13      0.29      0.18        21
            Dodging       0.26      0.61      0.36        54
         Deflection       0.38      0.33      0.35        87
Declining to answer       0.29      0.19      0.23        52
   Claims ignorance       0.00      0.00      0.00        60
      Clarification       0.00      0.00      0.00         6

           accuracy                           0.27       303
          macro avg       0.22      0.30      0.23       303
       weighted avg       0.23      0.27      0.23       303


REPORT 2: CLARITY CLASSIFICATION
                     precision    recall