In [None]:
!pip install transformers datasets optuna peft torch psutil wandb matplotlib rouge-score -q
!pip install evaluate
!pip install rouge-score bert-score -q

In [None]:
# Load the summarization CSV and split it into train / validation / test sets.
import pandas as pd  # imported here so this cell runs on a fresh kernel (it calls pd.read_csv)
from sklearn.model_selection import train_test_split
from datasets import Dataset

csv_path = "arabic_summaries_5000_v2.csv"
df = pd.read_csv(csv_path)

# Hold out 20% of all rows as the test set.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Hold out 10% of the remaining rows as the validation set.
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Convert each pandas frame to a Hugging Face `Dataset` for the fine-tuning helpers.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}, Test size: {len(test_dataset)}")

# Experiments: Normal fine-tuning

In [None]:
# Import the project-local modules and refresh them so edits to the .py files
# are picked up without restarting the kernel.
import importlib

import pandas as pd

import normal_finetuning,  qlora_finetuning, utils, prefix_finetuning, evaluation

# Reload BEFORE binding the `from ... import` aliases below: `importlib.reload`
# re-executes the module but does not rebind names that were already imported
# from it, so aliases created earlier would keep pointing at the old functions.
# `utils` goes first on the assumption that the other modules import from it
# (TODO confirm); if they do not, the order is harmless.
importlib.reload(utils)
importlib.reload(normal_finetuning)
importlib.reload(qlora_finetuning)
importlib.reload(prefix_finetuning)
importlib.reload(evaluation)

from normal_finetuning import finetune_model as normal_finetune
from qlora_finetuning import finetune_model as qlora_finetune
from prefix_finetuning import finetune_model as prefix_finetune
from evaluation import run_evluation

In [None]:
# Full-parameter fine-tuning run on the train/validation split.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
OUTPUT_DIR = "./"
template = "{text} {summary}"
arabic_prompt = None  # no instruction prefix; alternative: "لخص النص التالي:"

trainer, stats = normal_finetune(
    model_name,
    train_dataset,
    val_dataset,
    learning_rate=2e-5,
    batch_size=2,
    grad_acc_step=16,  # presumably gradient-accumulation steps -- confirm in normal_finetuning
    num_epochs=2,
    use_wandb=True,
    prompt=arabic_prompt,
    template=template,
    OUTPUT_DIR=OUTPUT_DIR,
)

print(f"Normal Fine-Tuning Stats: {stats}")

# Experiments: LoRA/QLoRA

In [None]:
#!pip install -U bitsandbytes

Collecting bitsandbytes
  Using cached bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Using cached bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
Installing collected packages: bitsandbytes
  Attempting uninstall: bitsandbytes
    Found existing installation: bitsandbytes 0.39.0
    Uninstalling bitsandbytes-0.39.0:
      Successfully uninstalled bitsandbytes-0.39.0
Successfully installed bitsandbytes-0.45.3


In [None]:
# LoRA / QLoRA fine-tuning run on the train/validation split.
arabic_prompt = None  # Or "لخص النص التالي:"
template = "النص:{text}الملخص:{summary}"
# NOTE(review): this is a Google Drive path; it looks like Drive must be mounted first -- confirm.
OUTPUT_DIR = "/content/drive/MyDrive/NLPProject"
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# The first positional argument selects the method: "lora" or "qlora".
trainer, stats = qlora_finetune("lora",model_name, train_dataset, val_dataset,
                                 learning_rate=2e-5, batch_size=2, num_epochs=3,
                                 use_wandb=True, prompt=arabic_prompt,
                                 template = template, one_shot_text = None,
                                 one_shot_summary=None,
                                 lora_r = 64,
                                 grad_acc_step=16, OUTPUT_DIR=OUTPUT_DIR)

# Label corrected: this cell reports the LoRA/QLoRA run, not the normal fine-tuning run.
print(f"LoRA/QLoRA Fine-Tuning Stats: {stats}")

# Experiments: Prefix tuning

In [None]:
# Prefix-tuning run on the train/validation split.
# The variable was previously misspelled `wlarabic_prompt`, so the prompt below
# was never passed: `prompt=arabic_prompt` used the value left over from an
# earlier cell, or raised NameError on a fresh kernel.
arabic_prompt = "لخص:"  # Or "لخص النص التالي:"
template = "النص:{text}الملخص:{summary}"
OUTPUT_DIR = "./"
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

trainer, stats = prefix_finetune(model_name, train_dataset, val_dataset,
                                 learning_rate=2e-4, batch_size=2, num_epochs=3,
                                 use_wandb=True, prompt=arabic_prompt,
                                 template = template, one_shot_text = None,
                                 one_shot_summary=None,
                                 num_virtual_tokens = 16,
                                 grad_acc_step=16, prefix_projection=True,OUTPUT_DIR=OUTPUT_DIR)

# Label corrected: this cell reports the prefix-tuning run.
print(f"Prefix-Tuning Stats: {stats}")

# Experiments: One-shot with prefix tuning

In [None]:
# Rebuild the splits with one row reserved as the one-shot example.
# Re-read the CSV so the cell starts from the complete table.
df = pd.read_csv(csv_path)

# Take the row labelled `shot_index` as the one-shot example, then drop it from
# the table so it is absent from every split built below.
shot_index = 1
one_shot_text = df.loc[shot_index, 'text']
one_shot_summary = df.loc[shot_index, 'summary']
df = df.drop(index=shot_index)

# 80/20 split into (train + validation) and test, then 90/10 into train and validation.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Replace the earlier datasets with ones built from the reduced table.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}, Test size: {len(test_dataset)}")

Train size: 3599, Val size: 400, Test size: 1000


In [None]:
# Prefix-tuning run that also passes the reserved one-shot example.
# The variable was previously misspelled `wlarabic_prompt`, so the prompt below
# was never passed: `prompt=arabic_prompt` used the value left over from an
# earlier cell, or raised NameError on a fresh kernel.
arabic_prompt = "لخص:"  # Or "لخص النص التالي:"
template = "النص:{text}الملخص:{summary}"
OUTPUT_DIR = "./"
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

trainer, stats = prefix_finetune(model_name, train_dataset, val_dataset,
                                 learning_rate=2e-4, batch_size=2, num_epochs=3,
                                 use_wandb=True, prompt=arabic_prompt,
                                 template = template, one_shot_text = one_shot_text,
                                 one_shot_summary=one_shot_summary,
                                 num_virtual_tokens = 16,
                                 grad_acc_step=16, prefix_projection=True,OUTPUT_DIR=OUTPUT_DIR)

# Label corrected: this cell reports the one-shot prefix-tuning run.
print(f"One-Shot Prefix-Tuning Stats: {stats}")

# Evaluation

In [None]:
# Evaluate a fine-tuned checkpoint on the held-out test set.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# NOTE(review): the training cells write to OUTPUT_DIR ("./" or the Drive folder),
# while this reads from "../" -- confirm this is where the checkpoint lives.
checkpoint_path = "../"
template = "النص:{text}الملخص:{summary}"
# Marker that precedes the summary in `template`; presumably used to cut the
# summary out of the generated text -- confirm in evaluation.run_evluation.
output_split = "الملخص:"
batch_size = 10
# The trailing ':' (left over from a `def` signature) was a SyntaxError and is removed.
run_evluation(test_dataset, model_name, checkpoint_path, template, output_split, batch_size, prompt=None)