In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv("sData/df_train.csv")
df_train['Label'] = df_train['Label'].replace({0.0 : 'Negative', 1.0 : 'Positive', 2.0 : 'Neutral'})
df_train.head()

Unnamed: 0,Comment,Label
0,"সিনেমাটি একবার দেখার মতো, দ্বিতীয়বার দেখার ইচ্...",Neutral
1,"তুফান কেন মানুষ না, পশু না, রাক্ষস, সেই গল্প ন...",Positive
2,"পরিচালক যদি আরও উন্নত গল্প নিয়ে আসতেন, তবে আর...",Negative
3,"চঞ্চল চৌধুরী ও জয়া আহসান ভালো অভিনয় করেছেন, কি...",Neutral
4,"আমি মনে করি, আরও দশ-পনের মিনিট ছোট করা যেত।",Negative


In [3]:
df_val = pd.read_csv("sData/df_valid.csv")
df_val['Label'] = df_val['Label'].replace({0.0 : 'Negative', 1.0 : 'Positive', 2.0 : 'Neutral'})
df_val.head()

Unnamed: 0,Comment,Label
0,মুজিবের ঘন গোঁফের সঠিক রূপ ফুটিয়ে তুলতে পারেনি...,Negative
1,নতুন জুটি হিসেবে সিয়াম আর পূজার অভিনয় মনোমুগ...,Positive
2,আর অভিনেতা হিসাবে চঞ্চাল চৌধুরী একদম ১০০ তে ১০...,Positive
3,সিনেমাটি খুবই মসৃণ ছিল না। শেষ পর্যন্ত আশা ছিল...,Negative
4,"ভালো চেষ্টার পরেও নির্মাণটা জোরালো হয়নি, কয়ে...",Negative


In [4]:
df_test = pd.read_csv("sData/df_test.csv")
df_test['Label'] = df_test['Label'].replace({0.0 : 'Negative', 1.0 : 'Positive', 2.0 : 'Neutral'})
df_test.head()

Unnamed: 0,Comment,Label
0,"চরিত্রের মধ্যে গভীরতা কম ছিল, আরো ইমোশনাল কন্ট...",Negative
1,"গল্পটি যে সমাজের জন্য কতটা নেতিবাচক, সেটা ভাবল...",Negative
2,"ট্রেলার দেখে মনে হচ্ছে, পোস্ট প্রোডাকশন এখনও ব...",Positive
3,"খুব ভালো লাগেনি, তবে পুরোপুরি খারাপও ছিল না।",Neutral
4,"আয়নাবাজি বেশ ভালো ছিল, তবে কাহিনী কিছুটা পূর্...",Neutral


In [5]:
print(f"Training Data size : {df_train.shape}")
print(f"Validation Data size : {df_val.shape}")
print(f"Testing Data size : {df_test.shape}")

Training Data size : (7200, 2)
Validation Data size : (900, 2)
Testing Data size : (900, 2)


In [6]:
X_train = df_train
X_eval = df_val
X_test = df_val

def generate_prompt(data_point):
    return f"""
           Classify the text into Neutral, Positive, and Negative, and return the answer as the corresponding text label.
text: {data_point["Comment"]}
label: {data_point["Label"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
            Classify the text into Neutral, Positive, and Negative, and return the answer as the corresponding text label.
text: {data_point["Comment"]}
label: """.strip()

X_train.loc[:,'Comment'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'Comment'] = X_eval.apply(generate_prompt, axis=1)

y_true = X_test.loc[:,'Label']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["Comment"])

In [7]:
X_train.Label.value_counts()

Label
Positive    2427
Negative    2387
Neutral     2386
Name: count, dtype: int64

In [8]:
y_true.value_counts()

Label
Negative    314
Neutral     309
Positive    277
Name: count, dtype: int64

In [9]:
from datasets import Dataset

train_data = Dataset.from_pandas(X_train[["Comment"]])
eval_data = Dataset.from_pandas(X_eval[["Comment"]])

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

In [11]:
model_name = "meta-llama/Llama-3.1-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.36s/it]


In [12]:
from transformers import AutoTokenizer

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [14]:
from transformers import pipeline
from tqdm import tqdm

In [15]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["Neutral", "Positive", "Negative"]
    
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["Comment"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=2, 
                        temperature=0.1)
        
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("label:")[-1].strip()
        
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred

In [16]:
y_pred = predict(X_test, model, tokenizer)

  0%|          | 0/900 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/900 [00:00<07:13,  2.07it/s]Device set to use cuda:0
  0%|          | 2/900 [00:00<04:42,  3.18it/s]Device set to use cuda:0
  0%|          | 3/900 [00:00<03:47,  3.95it/s]Device set to use cuda:0
  0%|          | 4/900 [00:01<03:25,  4.35it/s]Device set to use cuda:0
  1%|          | 5/900 [00:01<03:20,  4.47it/s]Device set to use cuda:0
  1%|          | 6/900 [00:01<03:06,  4.78it/s]Device set to use cuda:0
  1%|          | 7/900 [00:01<03:00,  4.96it/s]Device set to use cuda:0
  1%|          | 8/900 [00:01<03:00,  4.95it/s]Device set to use cuda:0
  1%|          | 9/900 [00:02<02:55,  5.08it/s]Device set to use cuda:0
  1%|          | 10/900 [00:02<02:53,  5.12it/s]Device set to use cuda:0
  1%|          | 11/900 [00:02<02:53,  5.14it/s]Device set to use cuda:0
  1%|▏         | 12/900 [00:02<02:47,  5.30it/s]Device set to use cuda:0
  1%|▏         | 13/900 [00:02<02:44,  5.38it/s]Device set to use cud

In [17]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
def evaluate(y_true, y_pred):
    labels = ["Neutral", "Positive", "Negative"]
    mapping = {label: idx for idx, label in enumerate(labels)}
    
    def map_func(x):
        return mapping.get(x, -1)
    
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)
    
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')
    
    unique_labels = set(y_true_mapped)
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')
        
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)
    
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)
    # plt.figure(figsize=(8, 6))
    # sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=y_true_mapped, yticklabels=y_pred_mapped)
    # plt.title('Confusion Matrix')
    # plt.xlabel('Predicted Labels')
    # plt.ylabel('True Labels')
    # plt.show()

In [19]:
evaluate(y_true, y_pred)

Accuracy: 0.346
Accuracy for label Neutral: 0.165
Accuracy for label Positive: 0.567
Accuracy for label Negative: 0.328

Classification Report:
              precision    recall  f1-score   support

     Neutral       0.85      0.17      0.28       309
    Positive       0.58      0.57      0.57       277
    Negative       0.92      0.33      0.48       314

   micro avg       0.70      0.35      0.46       900
   macro avg       0.78      0.35      0.44       900
weighted avg       0.79      0.35      0.44       900


Confusion Matrix:
[[ 51 102   4]
 [  1 157   5]
 [  8  12 103]]


In [20]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [21]:
modules = find_all_linear_names(model)
modules

['o_proj', 'q_proj', 'gate_proj', 'v_proj', 'down_proj', 'k_proj', 'up_proj']

In [33]:
print(train_data.column_names)
print(eval_data.column_names)

['text']
['text']


In [None]:
train_data = train_data.rename_column("Comment", "text")
eval_data = eval_data.rename_column("Comment", "text")

In [22]:
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer

In [25]:
output_dir="FineTuned-Llama-3.1-8B"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,            
    num_train_epochs=1,             
    per_device_train_batch_size=1,        
    gradient_accumulation_steps=8,    
    gradient_checkpointing=True,         
    optim="paged_adamw_32bit",
    logging_steps=1,                         
    learning_rate=2e-4,        
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,          
    max_steps=-1,
    warmup_ratio=0.03,             
    group_by_length=False,
    lr_scheduler_type="cosine",       
    report_to="wandb",            
    eval_strategy="steps",              
    eval_steps = 0.2
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    # dataset_text_field="text",
    # dataset_text_field="Comment",
    tokenizer=tokenizer,
    # max_seq_length=512,
    # packing=False,
    # dataset_kwargs={
    # "add_special_tokens": False,
    # "append_concat_token": False,
    # }
)

  trainer = SFTTrainer(
Map: 100%|██████████| 7200/7200 [00:00<00:00, 25263.68 examples/s]
Map: 100%|██████████| 900/900 [00:00<00:00, 25883.49 examples/s]


In [28]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mpramanik-souvik[0m ([33mpramanik-souvik-north-south-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
180,0.6576,0.512654
360,0.4278,0.488463
540,0.4334,0.466724
720,0.4018,0.453596
900,0.3819,0.450291


  return fn(*args, **kwargs)


TrainOutput(global_step=900, training_loss=0.49254389425118766, metrics={'train_runtime': 117107.7628, 'train_samples_per_second': 0.061, 'train_steps_per_second': 0.008, 'total_flos': 3.5552913928863744e+16, 'train_loss': 0.49254389425118766, 'epoch': 1.0})

In [29]:
import wandb

In [30]:
wandb.finish()
model.config.use_cache = True

0,1
eval/loss,█▅▃▁▁
eval/runtime,▅▅█▆▁
eval/samples_per_second,▅▅▁▅█
eval/steps_per_second,▁▁▁▁█
train/epoch,▁▁▁▁▁▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇██
train/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▇▇▇█▆▃▄█▇▅▇▅▄▆▇▅▅▄▃▆▄▅▅▂▃▄▃▃▃▅▃▁▄▃▂▃▅▃▁▄
train/learning_rate,▄████▇▇▇▇▇▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁
train/loss,▄▃▄▄▂▃▄▃▄▄▄▃▅▃▃█▃▂▁▄▂▁▄▂▃▂▂▂▃▃▃▂▁▂▃▁▁▄▂▂

0,1
eval/loss,0.45029
eval/runtime,3175.087
eval/samples_per_second,0.283
eval/steps_per_second,0.036
total_flos,3.5552913928863744e+16
train/epoch,1.0
train/global_step,900.0
train/grad_norm,0.1794
train/learning_rate,0.0
train/loss,0.3819


In [31]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('FineTuned-Llama-3.1-8B\\tokenizer_config.json',
 'FineTuned-Llama-3.1-8B\\special_tokens_map.json',
 'FineTuned-Llama-3.1-8B\\tokenizer.json')

In [32]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

  0%|          | 0/900 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/900 [00:08<2:10:16,  8.69s/it]Device set to use cuda:0
  0%|          | 2/900 [00:14<1:46:54,  7.14s/it]Device set to use cuda:0
  0%|          | 3/900 [00:19<1:28:04,  5.89s/it]Device set to use cuda:0
  0%|          | 4/900 [00:24<1:23:45,  5.61s/it]Device set to use cuda:0
  1%|          | 5/900 [00:29<1:23:21,  5.59s/it]Device set to use cuda:0
  1%|          | 6/900 [00:33<1:14:58,  5.03s/it]Device set to use cuda:0
  1%|          | 7/900 [00:38<1:10:55,  4.77s/it]Device set to use cuda:0
  1%|          | 8/900 [00:43<1:12:16,  4.86s/it]Device set to use cuda:0
  1%|          | 9/900 [00:48<1:13:44,  4.97s/it]Device set to use cuda:0
  1%|          | 10/900 [00:53<1:13:53,  4.98s/it]Device set to use cuda:0
  1%|          | 11/900 [00:58<1:13:57,  4.99s/it]Device set to use cuda:0
  1%|▏         | 12/900 [01:02<1:09:22,  4.69s/it]Device set to use cuda:0
  1%|▏         | 13/900 [01:06<1:05:50,  4.45

Accuracy: 0.566
Accuracy for label Neutral: 0.269
Accuracy for label Positive: 0.704
Accuracy for label Negative: 0.736

Classification Report:
              precision    recall  f1-score   support

     Neutral       0.40      0.27      0.32       309
    Positive       0.62      0.70      0.66       277
    Negative       0.60      0.74      0.66       314

    accuracy                           0.57       900
   macro avg       0.54      0.57      0.55       900
weighted avg       0.54      0.57      0.55       900


Confusion Matrix:
[[ 83 104 122]
 [ 53 195  29]
 [ 69  14 231]]



