In [1]:
%%capture
# Environment setup: install Unsloth and the libraries it needs.
# The %%capture cell magic on the first line keeps the pip output out of the notebook.
import os
# Colab is detected by looking for "COLAB_" inside the concatenated names of all
# environment variables; every other environment takes the plain install path.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last commands use --no-deps, so pip does not pull in (or replace)
    # the dependencies of those packages; the middle command installs normally.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the three competition splits from the Kaggle input directory.
DATA_DIR = '/kaggle/input/mahed-task-2'
train_df, test_df, val_df = (
    pd.read_csv(f'{DATA_DIR}/{split_name}.csv')
    for split_name in ('train', 'test', 'validation')
)

In [4]:
train_df

Unnamed: 0,id,text,Emotion,Offensive,Hate
0,2537,أحد التجار الشباب العمانيين يقول للاسف لما يكو...,neutral,no,
1,5579,@JALHARBISKY مجموعه القدرة الجنسيه👍<LF> <LF>بد...,optimism,no,
2,6092,@rwn4o حبيبييي والله اكثثثرر يارب امين🥺♥️♥️,love,no,
3,2540,#وصال_دوت_FM<LF>مع سميرة الفطيسية @Samira_Alfu...,neutral,no,
4,3159,من ينتزع ارواح اطفالنا من أجسادها بكل وحشية عل...,anticipation,no,
...,...,...,...,...,...
5955,5734,في لبنان.. حاشية وزلم وحراس وخواريف تُصفق له و...,anger,yes,not_hate
5956,5191,#فايق_وزاهر_اعلام_عاهر <LF>امبارح سيف زاهر الل...,anger,yes,not_hate
5957,5390,@baladtv بيحمي نفسه وامثاله اللي ناس<LF>ي مواق...,anger,yes,not_hate
5958,860,اليوم احلى يوووم وربي امفي و قوينق تحديثات 😭😭😭...,joy,no,


In [5]:
# Build one "<Emotion>+<Offensive>" key per training row so the joint distribution
# of the two labels can be tabulated in the next cell.
# The `Hate` column is not part of the key: the previous row-by-row loop filled in a
# placeholder for missing `Hate` values but never used it, so that dead code is gone.
# Column-wise string concatenation replaces the slow `iterrows()` loop and yields the
# same list, in row order.
unique_val = (train_df['Emotion'] + '+' + train_df['Offensive']).tolist()

In [6]:
pd.Series(unique_val).value_counts().sort_index()


anger+no            614
anger+yes           937
anticipation+no     446
anticipation+yes     45
confidence+no       197
confidence+yes       13
disgust+no          158
disgust+yes         619
fear+no              53
joy+no              527
joy+yes               6
love+no             584
love+yes              9
neutral+no          603
neutral+yes          58
optimism+no         419
pessimism+no        152
pessimism+yes        42
sadness+no          333
sadness+yes           2
surprise+no         130
surprise+yes         13
Name: count, dtype: int64

In [7]:
train_df['Emotion'].value_counts()

Emotion
anger           1551
disgust          777
neutral          661
love             593
joy              533
anticipation     491
optimism         419
sadness          335
confidence       210
pessimism        194
surprise         143
fear              53
Name: count, dtype: int64

In [8]:
train_df['Offensive'].value_counts()

Offensive
no     4216
yes    1744
Name: count, dtype: int64

In [9]:
train_df['Hate'].value_counts()

Hate
not_hate    1441
hate         303
Name: count, dtype: int64

In [10]:
train_df.iloc[5934]['text']

'@AJArabic والله لو يعزمني الشبخ محمد بن زائد حفظه الله لروح معه سر يا قائد الأمجاد وحنا معك بالروح بالنفس نفديك وخل كلاب قطر ينعقون هم وعيال الشيطان الاخوان الكذابين'

In [11]:
train_df

Unnamed: 0,id,text,Emotion,Offensive,Hate
0,2537,أحد التجار الشباب العمانيين يقول للاسف لما يكو...,neutral,no,
1,5579,@JALHARBISKY مجموعه القدرة الجنسيه👍<LF> <LF>بد...,optimism,no,
2,6092,@rwn4o حبيبييي والله اكثثثرر يارب امين🥺♥️♥️,love,no,
3,2540,#وصال_دوت_FM<LF>مع سميرة الفطيسية @Samira_Alfu...,neutral,no,
4,3159,من ينتزع ارواح اطفالنا من أجسادها بكل وحشية عل...,anticipation,no,
...,...,...,...,...,...
5955,5734,في لبنان.. حاشية وزلم وحراس وخواريف تُصفق له و...,anger,yes,not_hate
5956,5191,#فايق_وزاهر_اعلام_عاهر <LF>امبارح سيف زاهر الل...,anger,yes,not_hate
5957,5390,@baladtv بيحمي نفسه وامثاله اللي ناس<LF>ي مواق...,anger,yes,not_hate
5958,860,اليوم احلى يوووم وربي امفي و قوينق تحديثات 😭😭😭...,joy,no,


In [12]:
from unsloth import FastLanguageModel
import torch
# Loader options.
max_seq_length = 2048  # Any value can be chosen; RoPE scaling is handled internally.
dtype = None           # None = auto detection (Float16 on Tesla T4/V100, Bfloat16 on Ampere+).
load_in_4bit = True    # 4-bit quantization to reduce memory usage; can be False.

# Pre-quantized 4-bit checkpoints (faster download, fewer OOMs). This list is kept
# for reference only; nothing in this cell reads it.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",           # Llama-3.1, 15 trillion tokens
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",         # 405b is available in 4bit as well
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",      # Mistral 12b
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",             # Mistral v3
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",                # Phi-3.5
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",                 # Gemma 2
]

# Collect the loader arguments first, then load the base model and its tokenizer.
pretrained_kwargs = dict(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # needed only for gated models such as meta-llama/Llama-2-7b-hf
)
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-24 12:12:50.830853: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753359171.179762      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753359171.280347      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.8: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [13]:
# Projection layers that receive LoRA adapters (attention q/k/v/o plus the MLP
# gate/up/down projections).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded base model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank: any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, 0 is the optimized setting
    bias="none",                           # any value is supported, "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # True or "unsloth" (less VRAM, for very long context)
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is available but not used
    loftq_config=None,                     # LoftQ is available but not used
)

Unsloth 2025.7.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [14]:
# The twelve emotion classes, in the order they are numbered inside the prompt.
EMOTION_LABELS = [
    "anger", "disgust", "neutral", "love", "joy", "anticipation",
    "optimism", "sadness", "confidence", "pessimism", "surprise", "fear",
]

# "1. anger\n2. disgust\n..." - one numbered line per label.
_numbered_labels = "\n".join(
    f"{rank}. {label}" for rank, label in enumerate(EMOTION_LABELS, start=1)
)

# Prompt template with two positional `{}` slots: the text to classify, then the
# response (the gold label for training examples, or "" when asking for a prediction).
emotion_prompt = (
    "The following text is an Arabic text. Your task is to classify the emotion "
    "expressed in the text into one of the following categories:\n"
    + _numbered_labels
    + "\n\n### Text:\n{}\n\n### Response:\n{}"
)


In [15]:
# End-of-sequence marker appended to every training example; the tokenizer must define one.
EOS_TOKEN = tokenizer.eos_token


def format_emotion_prompts(examples):
    """Render a batch of rows into full training prompts.

    Args:
        examples: A batch mapping with parallel "text" and "Emotion" sequences
            (the form passed by `Dataset.map(..., batched=True)`).

    Returns:
        A dict with a single "text" key holding one string per row:
        `emotion_prompt` filled with the row's text and emotion label,
        followed by `EOS_TOKEN`.
    """
    rendered = [
        emotion_prompt.format(tweet, label) + EOS_TOKEN
        for tweet, label in zip(examples["text"], examples["Emotion"])
    ]
    return {"text": rendered}


In [16]:
from datasets import Dataset

# Convert the training frame to a Hugging Face dataset and replace its "text"
# column with the fully formatted prompt (text + gold label + EOS).
dataset = Dataset.from_pandas(train_df).map(
    format_emotion_prompts,
    batched=True,
)


Map:   0%|          | 0/5960 [00:00<?, ? examples/s]

In [17]:
from trl import SFTConfig, SFTTrainer

# Optimisation hyper-parameters, kept in one named object so they are easy to
# inspect or tweak without touching the trainer construction below.
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=1000,  # step budget; use num_train_epochs instead to train by epochs
    learning_rate=2e-4,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="llama3-emotion-finetune",
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # the formatted prompts live in the "text" column
    # Reuse the sequence length the model was loaded with instead of repeating the
    # literal 2048, so the loader and the trainer cannot drift apart.
    max_seq_length=max_seq_length,
    packing=False,
    args=training_args,
)

# Last expression of the cell: runs the fine-tuning and displays the TrainOutput summary.
trainer.train()


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/5960 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,960 | Num Epochs = 3 | Total steps = 1,000
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.4129
20,1.5646
30,1.5259
40,1.6294
50,1.5464
60,1.5561
70,1.6068
80,1.5579
90,1.5311
100,1.5224


TrainOutput(global_step=1000, training_loss=1.3825845079421997, metrics={'train_runtime': 12306.6769, 'train_samples_per_second': 1.3, 'train_steps_per_second': 0.081, 'total_flos': 1.4405219588898816e+17, 'train_loss': 1.3825845079421997})

In [18]:
test_df = pd.read_csv('/kaggle/input/mahed-task-2/test.csv')

In [19]:
import pandas as pd
from tqdm import tqdm
import torch

# Put the fine-tuned model into Unsloth's inference mode and switch off
# training-only behaviour before generating.
FastLanguageModel.for_inference(model)
model.eval()

# Prompt template: inference has to present the text in exactly the format the
# model was fine-tuned on. `emotion_prompt` (the "### Text:" / "### Response:"
# template defined before training) is therefore reused as-is and NOT redefined
# here. Overwriting it with a differently worded template, as this cell used to do,
# makes the model see prompts it was never trained on and degrades the predictions.

# Inference function for a single text
def generate_prediction(text):
    """Return the emotion label the model generates for a single text.

    The text is inserted into `emotion_prompt` with an empty response slot, the
    model greedily generates up to 32 new tokens, and the first line of the
    generated continuation is returned.

    Args:
        text: The text to classify.

    Returns:
        The first line of the model's completion, stripped of surrounding
        whitespace (an empty string if nothing but whitespace was generated).
    """
    prompt = emotion_prompt.format(text, "")
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=32,
            # Greedy decoding. `temperature` is a sampling-only parameter, so it is
            # not passed (combining it with do_sample=False only triggers a warning).
            do_sample=False,
        )

    # `generate` returns the prompt tokens followed by the new tokens; decode only
    # the new ones so the answer never has to be recovered by splitting the echoed
    # prompt on a marker string.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return completion.strip().split("\n")[0].strip()

# Register tqdm's pandas integration so `.progress_apply` shows a progress bar,
# then classify every test text one at a time.
tqdm.pandas()
predicted_labels = test_df["text"].progress_apply(generate_prediction)
test_df["Predicted_Emotion"] = predicted_labels

# Write the test frame, including the new prediction column, to the working directory.
test_df.to_csv("test_predictions.csv", index=False)
print("✅ Inference complete. Results saved to test_predictions.csv")


100%|██████████| 1278/1278 [11:09<00:00,  1.91it/s]

✅ Inference complete. Results saved to test_predictions.csv



