<a href="https://colab.research.google.com/github/suleiman-odeh/NLP_Project_Team16/blob/main/fine_tuning/fine_tuning_direct_Qwen2_5_7B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers peft trl bitsandbytes accelerate datasets

In [2]:
"""
Setup
"""

import torch
import gc
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

print("Ready to go.")

Ready to go.


In [None]:
# Base model checkpoint (Hugging Face Hub id)
model_name = "Qwen/Qwen2.5-7B"

# 4-bit NF4 quantization with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model.
# NOTE: `trust_remote_code=True` is deliberately not passed. The Qwen2 architecture
# is implemented inside transformers itself, so there is no need to allow execution
# of arbitrary Python code downloaded from the Hub.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Load the matching tokenizer (same reasoning: no remote code required).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use EOS as the padding token and pad on the right for training.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prepare the quantized model for k-bit (LoRA) training.
model = prepare_model_for_kbit_training(model)

# LoRA config according to the paper: adapters on every attention and MLP projection.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",  # text generation model
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

print(f"Loaded BASE model: {model_name}")

In [8]:
import pandas as pd
from datasets import Dataset

# Load dataset
filename = "QEvasion_cleaned.jsonl"

# Fraction of the training pool held out for validation (chosen to match the paper)
# and the seed that makes the split reproducible.
VALIDATION_FRACTION = 0.2175
SPLIT_SEED = 42

try:
    df = pd.read_json(filename, lines=True)
    print(f"Loaded {len(df)} rows using Pandas.")
except ValueError:
    # Fallback if it's a standard JSON list (not lines)
    df = pd.read_json(filename)
    print(f"Loaded {len(df)} rows using Pandas (Standard JSON).")

# Fail early, with a readable message, if a column used later in the notebook is absent.
required_columns = {"split_type", "cleaned_answer", "question", "clarity_label"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"{filename} is missing required columns: {sorted(missing_columns)}")

# Filter train and test sets
train_df = df[df['split_type'] == 'train']
# We keep the test set safe for later
test_df = df[df['split_type'] == 'test']

print(f"   - Training pool: {len(train_df)}")
print(f"   - Test pool: {len(test_df)}")

# Convert to a Hugging Face dataset. The boolean filter above leaves a non-default
# pandas index; preserve_index=False keeps it from leaking into the dataset as an
# extra `__index_level_0__` column.
full_train_dataset = Dataset.from_pandas(train_df, preserve_index=False)

# SPLIT (TRAIN vs VALIDATION)
# so it can match the paper
dataset_split = full_train_dataset.train_test_split(
    test_size=VALIDATION_FRACTION, seed=SPLIT_SEED
)

train_dataset = dataset_split['train']
eval_dataset = dataset_split['test']

print("Final Setup:")
print(f"   - Training Samples: {len(train_dataset)}")
print(f"   - Validation Samples: {len(eval_dataset)}")

# ---------------------------------------------------------
# prompt based on the paper
# ---------------------------------------------------------

def formatting_prompts_func(examples, eos_token=None):
    """Render dataset rows into full training prompts (instruction + context + label).

    Works for both calling conventions a trainer may use:

    * batched  - ``examples`` maps each column name to a list of values;
      a list with one prompt string per row is returned.
    * single   - ``examples`` maps each column name to a plain value
      (``question`` is a ``str``); a single prompt string is returned.

    Args:
        examples: Mapping with the keys ``cleaned_answer``, ``question`` and
            ``clarity_label``.
        eos_token: End-of-sequence marker appended to every prompt. Defaults to
            the ``eos_token`` of the notebook-level ``tokenizer``.

    Returns:
        ``list[str]`` for batched input, ``str`` for a single example.
    """
    instruction = (
        "Based on a part of the interview where the interviewer asks a set of questions, "
        "classify the type of answer the interviewee provided for the following question "
        "into one of these categories:\n"
        "1. Clear Reply - The information requested is explicitly stated (in the requested form)\n"
        "2. Clear Non-Reply - The information requested is not given at all due to ignorance, need for clarification or declining to answer\n"
        "3. Ambivalent - The information requested is given in an incomplete way e.g. the answer is too general, partial, implicit, dodging or deflection."
    )

    if eos_token is None:
        # Resolved lazily so the function can also be used with an explicit marker.
        eos_token = tokenizer.eos_token

    def build_prompt(answer, question, label):
        # The label is part of the text: the model learns to continue "Label: ".
        return (
            f"{instruction}\n\n### Part of the interview ###\n{answer}"
            f"\n\n### Question ###\n{question}\n\nLabel: {label}{eos_token}"
        )

    # Define columns
    answers = examples['cleaned_answer']
    questions = examples['question']
    labels = examples['clarity_label']

    # Single (non-batched) example: the fields are plain values, not lists.
    # Looping over them would walk the question character by character.
    if isinstance(questions, str):
        return build_prompt(answers, questions, labels)

    return [
        build_prompt(answer, question, label)
        for answer, question, label in zip(answers, questions, labels)
    ]

# Sanity check: render the first training row and show the exact text the model will see.
print("\n--- Sample Training Input ---")
sample_prompts = formatting_prompts_func(train_dataset[:1])
print(sample_prompts[0])

Loaded 3756 rows using Pandas.
   - Training pool: 3448
   - Test pool: 308
Final Setup:
   - Training Samples: 2698
   - Validation Samples: 750

--- Sample Training Input ---
Based on a part of the interview where the interviewer asks a set of questions, classify the type of answer the interviewee provided for the following question into one of these categories:
1. Clear Reply - The information requested is explicitly stated (in the requested form)
2. Clear Non-Reply - The information requested is not given at all due to ignorance, need for clarification or declining to answer
3. Ambivalent - The information requested is given in an incomplete way e.g. the answer is too general, partial, implicit, dodging or deflection.

### Part of the interview ###
Am I going to meet with the Iranians directly, is that the question?

### Question ###
Is it necessary to deal with Iranian directly?

Label: Clear Non-Reply<|endoftext|>


In [None]:
"""
Training Loop
"""

from trl import SFTConfig, SFTTrainer

# 1. DEFINE TRAINING ARGUMENTS
# SFTConfig extends TrainingArguments with the SFT-specific options (sequence
# length, packing). Current TRL releases no longer accept those as SFTTrainer
# keyword arguments, and `evaluation_strategy` has been renamed `eval_strategy`.
training_args = SFTConfig(
    output_dir="./qwen-finetuned-evasion",  # Where to save results
    per_device_train_batch_size=2,          # Low batch size to prevent crashing
    gradient_accumulation_steps=8,          # 2 x 8 -> effective batch size of 16
    gradient_checkpointing=True,            # Trades compute for lower memory use
    learning_rate=2e-4,                     # Standard QLoRA learning rate
    lr_scheduler_type="cosine",
    max_steps=200,                          # 200 steps is enough for a strong demo
    logging_steps=10,                       # Log stats every 10 steps
    eval_strategy="steps",                  # Evaluate on the validation set...
    eval_steps=50,                          # ...every 50 steps
    save_strategy="no",                     # Don't save intermediate checkpoints (saves disk)
    fp16=True,                              # Use 16-bit precision for speed
    optim="paged_adamw_8bit",               # 8-bit paged optimizer (for small GPUs such as a T4)
    report_to="none",                       # Turn off WandB (keeps output clean)
    max_length=1024,                        # Cuts off really long interviews
    packing=False,                          # Keep samples separate
)

# 2. INITIALIZE TRAINER
# The tokenizer is passed explicitly; otherwise the trainer loads a fresh one and
# the pad-token / padding-side settings applied to `tokenizer` are not used.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,                # The validation set we created
    processing_class=tokenizer,
    peft_config=peft_config,
    formatting_func=formatting_prompts_func,  # Prompt-building function
)

# 3. START TRAINING
print("🚀 Starting Training...")
trainer.train()

# 4. SAVE THE ADAPTER
trainer.model.save_pretrained("./final_adapter_qwen")
print("✅ Training Complete. Adapter saved to './final_adapter_qwen'")

In [None]:
"""
Inference: Check the testset
"""

In [None]:
"""
Evaluation
"""