In [1]:
!pip install git+https://github.com/huggingface/accelerate.git

Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-req-build-t27_eyjj
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-req-build-t27_eyjj
  Resolved https://github.com/huggingface/accelerate.git to commit f0b030554cbcd01c5541c449e92066715f21a99e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
!pip install transformers datasets pandas torch openpyxl bitsandbytes trl



In [3]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForCausalLM, BitsAndBytesConfig
import accelerate
from datasets import Dataset
import numpy as np
import wandb
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

In [4]:
# Load the dataset
df = pd.read_excel("dataset_rv.xlsx")

# Rename the column to text
df.rename(columns={'Headline':'text', 'Sentiment':'labels'}, inplace=True)

In [5]:
def english_to_bangla_number_text(number):
    """Converts an English number to Bangla textual representation."""
    bangla_numbers = {
        0: "শূন্য", 1: "এক", 2: "দুই", 3: "তিন", 4: "চার",
        5: "পাঁচ", 6: "ছয়", 7: "সাত", 8: "আট", 9: "নয়",
        10: "দশ", 11: "এগারো", 12: "বারো", 13: "তেরো", 14: "চৌদ্দ",
        15: "পনেরো", 16: "ষোলো", 17: "সতেরো", 18: "আঠারো", 19: "উনিশ",
        20: "বিশ", 21: "একুশ", 22: "বাইশ", 23: "তেইশ", 24: "চব্বিশ",
        25: "পঁচিশ", 26: "ছাব্বিশ", 27: "সাতাশ", 28: "আটাশ", 29: "ঊনত্রিশ",
        30: "ত্রিশ", 31: "একত্রিশ", 32: "বত্রিশ", 33: "তেত্রিশ", 34: "চৌত্রিশ",
        35: "পঁত্রিশ", 36: "ছত্রিশ", 37: "সাঁইত্রিশ", 38: "আটত্রিশ", 39: "ঊনচল্লিশ",
        40: "চল্লিশ", 41: "একচল্লিশ", 42: "বিয়াল্লিশ", 43: "তেতাল্লিশ", 44: "চুয়াল্লিশ",
        45: "পঁইয়াল্লিশ", 46: "ছিয়াল্লিশ", 47: "সাতচল্লিশ", 48: "আটচল্লিশ", 49: "ঊনপঞ্চাশ",
        50: "পঞ্চাশ", 51: "একান্ন", 52: "বাহান্ন", 53: "তিপ্পান্ন", 54: "চুয়ান্ন",
        55: "পঞ্চান্ন", 56: "ছাপ্পান্ন", 57: "সাতান্ন", 58: "আটান্ন", 59: "ঊনষাট",
        60: "ষাট", 61: "একষট্টি", 62: "বাষট্টি", 63: "তেষট্টি", 64: "চৌষট্টি",
        65: "পঁষট্টি", 66: "ছেষট্টি", 67: "সাতষট্টি", 68: "আটষট্টি", 69: "ঊনসত্তর",
        70: "সত্তর", 71: "একাত্তর", 72: "বাহাত্তর", 73: "তিয়াত্তর", 74: "চুয়াত্তর",
        75: "পঁচাত্তর", 76: "ছিয়াত্তর", 77: "সাতাত্তর", 78: "আটাত্তর", 79: "ঊনআশি",
        80: "আশি", 81: "একাশি", 82: "বিরাশি", 83: "তিরাশি", 84: "চুরাশি",
        85: "পঁচাশি", 86: "ছিয়াশি", 87: "সাতাশি", 88: "আটাশি", 89: "ঊননব্বই",
        90: "নব্বই", 91: "একানব্বই", 92: "বিরানব্বই", 93: "তিরানব্বই", 94: "চুরানব্বই",
        95: "পঁচানব্বই", 96: "ছিয়ানব্বই", 97: "সাতানব্বই", 98: "আটানব্বই", 99: "নিরানব্বই",
    }

    def convert_two_digit_number(n):
        """Handles numbers from 0 to 99 based on direct mapping."""
        if n in bangla_numbers:
            return bangla_numbers[n]
        tens = (n // 10) * 10
        units = n % 10
        return f"{bangla_numbers[tens]} {bangla_numbers[units]}" if units > 0 else bangla_numbers[tens]

    if number < 100:
        return convert_two_digit_number(number)
    elif number < 1000:
        hundreds = number // 100
        remainder = number % 100
        if remainder == 0:
            return f"{bangla_numbers[hundreds]} শত"
        return f"{bangla_numbers[hundreds]} শত {convert_two_digit_number(remainder)}"
    elif number < 100000: #Handle numbers from 1000 to 99,999
        parts = []
        if number >= 1000:
            parts.append(f"{english_to_bangla_number_text(number // 1000)} হাজার")
            number %= 1000
        if number > 0:
            if number >= 100:
              parts.append(english_to_bangla_number_text(number))
            else:
              parts.append(convert_two_digit_number(number))  #handles cases less than 100
        return " ".join(parts)
    elif number < 10000000: #handles numbers from 100,000 to 9,999,999
      parts = []
      if number >= 100000:
          parts.append(f"{english_to_bangla_number_text(number // 100000)} লক্ষ")
          number %= 100000
      if number > 0:
          if number >= 1000:
            parts.append(english_to_bangla_number_text(number))
          elif number > 0:
            if number >= 100:
                parts.append(english_to_bangla_number_text(number))
            else:
              parts.append(convert_two_digit_number(number)) #handle numbers less than 100
      return " ".join(parts)
    else: # Handle numbers >= 10,000,000 (Crore)
        parts = []
        if number >= 10000000:
            parts.append(f"{english_to_bangla_number_text(number // 10000000)} কোটি")
            number %= 10000000
        if number > 0:
            parts.append(english_to_bangla_number_text(number))
        return " ".join(parts)

In [6]:
import re

def text_to_word_list(text):
    text = text.split()
    return text

def replace_strings(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\u2600-\u26FF"          # miscellaneous symbols
                               u"\u2700-\u27BF"          # dingbats
                               u"\u2000-\u206F"          # general punctuations
                               "]+", flags=re.UNICODE)
    english_pattern = re.compile('[a-zA-Z]+', flags=re.I)

    text = emoji_pattern.sub(r'', text)
    text = english_pattern.sub(r'', text)

    return text

def remove_punctuations(my_str):
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\’,<>./?@#$%^&*_~‘—॥”‰🤣⚽️✌�￰৷￰'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct += char

    return no_punct

def convert_numbers_to_bangla(text):
    words = text.split()
    converted_words = []
    for word in words:
        if word.isdigit():  # Check if the word is an integer
            bangla_number = english_to_bangla_number_text(int(word))
            converted_words.append(bangla_number)
        else:
            converted_words.append(word)
    return ' '.join(converted_words)

def preprocessing(text):
    text = replace_strings(text)
    text = convert_numbers_to_bangla(text)  # Convert numbers to Bangla
    text = remove_punctuations(text)
    return text

In [7]:
# Load the model and tokenizer
model_name = "BanglaLLM/bangla-llama-13b-base-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    # Explicitly enable CPU offloading within the quantization config
    llm_int8_enable_fp32_cpu_offload=True
)

# Initialize the Accelerator
accelerator = accelerate.Accelerator()

# Load the model with a custom device map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    # Remove device_map="auto" to avoid conflict with CPU offloading
    # device_map="auto",
    offload_folder="offload",  # Optional: specify offload folder for larger models
)

# Prepare the model for the current device (managed by Accelerator)
model = accelerator.prepare(model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
df['text'] =df['text'].apply(lambda x: preprocessing(str(x)))
df.head()

Unnamed: 0,text,labels
0,ঢাকা বিশ্ববিদ্যালয় কোটাবিরোধী আন্দোলনের নেতাকে...,outrage
1,কোটাবিরোধী আন্দোলনে ঢাকা বিশ্ববিদ্যালয়ের বিএনপ...,hope
2,কোটাবিরোধী আন্দোলন আজও জিরো পয়েন্ট অবরোধ করে ...,outrage
3,সর্বজনীন পেনশন সরকার অনড় আন্দোলন চালিয়ে যাবেন ...,outrage
4,আজ শনিবার সকাল সাড়ে টার দিকে মহাসড়কের শহর বাইপ...,outrage


In [9]:
# Create the formatted text column
df["formatted_text"] = df.apply(lambda row: f"human: {row['text']} \n bot: {row['labels']}", axis=1)

# Save to CSV
df[["formatted_text"]].to_csv("formatted_dataset.csv", index=False)
df["formatted_text"].head()


Unnamed: 0,formatted_text
0,human: ঢাকা বিশ্ববিদ্যালয় কোটাবিরোধী আন্দোলনের...
1,human: কোটাবিরোধী আন্দোলনে ঢাকা বিশ্ববিদ্যালয়ে...
2,human: কোটাবিরোধী আন্দোলন আজও জিরো পয়েন্ট অবর...
3,human: সর্বজনীন পেনশন সরকার অনড় আন্দোলন চালিয়ে...
4,human: আজ শনিবার সকাল সাড়ে টার দিকে মহাসড়কের শ...


In [31]:
# Split the data into train and test sets (80% train, 20% test)
train_df, test_df = train_test_split(df[["formatted_text"]], train_size=0.8, test_size=0.2, random_state=42)

# Randomly select 500 samples for train_small_df
train_df_small = train_df.sample(n=300, random_state=42)

# Randomly select 20 samples for test_df_small
test_df_small = test_df.sample(n=30, random_state=42)

# Create Datasets
train_dataset = Dataset.from_pandas(train_df_small)
test_dataset = Dataset.from_pandas(test_df_small)

In [32]:
# Set pad_token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['formatted_text'], padding="max_length", truncation=True, max_length=128)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [33]:
# LoRA Configuration
lora_config = LoraConfig(
    r=8,  # Rank of the LoRA matrices
    lora_alpha=32, #Scaling factor for LoRA weights
    lora_dropout=0.05, #Dropout rate
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"], #The target modules on which adapters will be added.
)

In [34]:
# Get the model with adapters
model = get_peft_model(model, lora_config)

In [35]:
training_args = TrainingArguments(
    output_dir="./bangla-llama-13b-base_sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [36]:
# Remove dataset_text_field argument
trainer = SFTTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train_dataset,
   eval_dataset=tokenized_test_dataset,
   tokenizer=tokenizer,
)

  trainer = SFTTrainer(


In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,4.160639


TrainOutput(global_step=100, training_loss=4.711818237304687, metrics={'train_runtime': 1026.3272, 'train_samples_per_second': 0.292, 'train_steps_per_second': 0.097, 'total_flos': 2984365522944000.0, 'train_loss': 4.711818237304687, 'epoch': 1.0})

In [38]:
# --- Prediction/Evaluation ---

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
       outputs = model.generate(**inputs, max_new_tokens=10)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [39]:
# Get random samples from test_dataset
num_samples = 10
random_samples = test_dataset.select(range(num_samples))

In [40]:
# Make predictions and display results
print("\n--- Inference Results ---")
for sample in random_samples:
    formatted_text = sample["formatted_text"]
    parts = formatted_text.split("\n bot: ")
    text = parts[0].replace("human: ", "").strip()
    actual_label = parts[1].strip()
    predicted_label = predict_sentiment(f"human: {text} \n bot:")
    print(f"Text: {text}")
    print(f"Actual Sentiment: {actual_label}")
    print(f"Predicted Sentiment: {predicted_label}")
    print("-" * 50)


--- Inference Results ---
Text: জামায়াতে ইসলামীর নামব্যানার ব্যবহার করে কেউ দুর্বৃত্তপনা করলে সুনির্দিষ্ট তথ্য দিয়ে সহযোগিতা করতে হিন্দু সম্প্রদায়ের নেতাদের অনুরোধ করেছেন দলটির আমির শফিকুর রহমান
Actual Sentiment: hope
Predicted Sentiment: human: জামায়াতে ইসলামীর নামব্যানার ব্যবহার করে কেউ দুর্বৃত্তপনা করলে সুনির্দিষ্ট তথ্য দিয়ে সহযোগিতা করতে হিন্দু সম্প্রদায়ের নেতাদের অনুরোধ করেছেন দলটির আমির শফিকুর রহমান 
 bot: human: জামায়াতে ইসলামীর নাম
--------------------------------------------------
Text: গাছের ডাব থেকে লেপতোষক গণভবনে কিছুই অবশিষ্ট নেই
Actual Sentiment: outrage
Predicted Sentiment: human: গাছের ডাব থেকে লেপতোষক গণভবনে কিছুই অবশিষ্ট নেই 
 bot: Fruit fly is not found in the flower bud
--------------------------------------------------
Text: চট্টগ্রামে শিক্ষামন্ত্রীর বাসায় হামলা সংসদ সদস্যের কার্যালয়ে আগুন
Actual Sentiment: outrage
Predicted Sentiment: human: চট্টগ্রামে শিক্ষামন্ত্রীর বাসায় হামলা সংসদ সদস্যের কার্যালয়ে আগুন 
 bot: @-2022152030203020
-------------------------------

In [None]:
trainer.save_model("./fine_tuned_bangla-llama-13b-base")
tokenizer.save_pretrained("./fine_tuned_bangla-llama-13b-base")

In [None]:
!zip -r /content/fine_tuned_banglabert_sentiment.zip /content/fine_tuned_banglabert_sentiment