# [Training] Gemma 7b Bangla News Summarization

In [1]:
!nvidia-smi

Sun Mar 10 11:41:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               On  | 00000000:08:00.0 Off |                  Off |
| 55%   81C    P2             299W / 300W |  32774MiB / 49140MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install -qqq kaggle

In [7]:
!sudo apt-get install unzip

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  zip
The following NEW packages will be installed:
  unzip
0 upgraded, 1 newly installed, 0 to remove and 9 not upgraded.
Need to get 175 kB of archives.
After this operation, 386 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 unzip amd64 6.0-26ubuntu3.2 [175 kB]
Fetched 175 kB in 0s (626 kB/s)
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package unzip.
(Reading database ... 16755 files and directories currently installed.)
Preparing to unpack .../unzip_6.0-26ubuntu3.2_amd64.deb ...
Unpacking unzip (6.0-26ubuntu3.2) ...
Setting up unzip (6.0-26ubuntu3.2) ...


In [3]:
!mkdir ~/.kaggle
!cp /home/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [8]:
!kaggle datasets download -d prithwirajsust/bengali-news-summarization-dataset
!unzip /home/bengali-news-summarization-dataset.zip

bengali-news-summarization-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  /home/bengali-news-summarization-dataset.zip
  inflating: Bengali-News-Summarization-Dataset/article.txt  
  inflating: Bengali-News-Summarization-Dataset/summary.txt  


In [11]:
import numpy as np
import random
import pandas as pd
import os

def split_dataset(path):
    dataset = open(path, encoding='utf-8').read().split('\n')

    dataset_len = len(dataset)
    print("There are %s summary-article pairs" % dataset_len)

    # Random shuffle data
    random.seed(11)
    dataset = np.array(random.sample(dataset, len(dataset)))

    # Split dataset to 70% training, 20% validation, and 10% testing.
    train_size = int(dataset_len * 0.7)
    eval_size = int(dataset_len * 0.2)
    train, val, test = dataset[:train_size], dataset[train_size:train_size + eval_size], dataset[train_size + eval_size:]
    return train, val, test

def write_csv(filename, enc_data, dec_data):
    df = pd.DataFrame({'Text': enc_data, 'Summary': dec_data})
    df.to_csv(filename, index=False)

def main():
    summary_datapath = '/home/Bengali-News-Summarization-Dataset/summary.txt'
    article_datapath = '/home/Bengali-News-Summarization-Dataset/article.txt'
    
    dec_train, dec_val, dec_test = split_dataset(summary_datapath)
    enc_train, enc_val, enc_test = split_dataset(article_datapath)
    
    # Create the directory if it doesn't exist
    directory = './datasetFull'
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    write_csv(os.path.join(directory, 'train.csv'), enc_train, dec_train)
    write_csv(os.path.join(directory, 'val.csv'), enc_val, dec_val)
    write_csv(os.path.join(directory, 'test.csv'), enc_test, dec_test)
    
    print("Finished splitting and saving dataset!")

if __name__ == "__main__":
    main()


There are 19097 summary-article pairs
There are 19097 summary-article pairs
Finished splitting and saving dataset!


In [10]:
!pip install -qqq torch==2.0.1 loralib==0.1.1 einops==0.6.1 pandas
!pip install -q -U bitsandbytes transformers peft accelerate datasets scipy

[0m

In [14]:
!pip install -q --upgrade huggingface_hub
from huggingface_hub import login
login()

[0m

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import pandas as pd
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login

from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
model = "google/gemma-7b-it"
new_model = "gemma7b-it_banglaNewsSum"

MODEL_NAME = model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
model = prepare_model_for_kbit_training(model)

In [5]:
import re
def get_num_layers(model):
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []
    
    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and not "encoder" in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names

In [6]:
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=get_last_layer_linears(model),
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [7]:
get_last_layer_linears(model)

['base_model.model.model.layers.27.self_attn.q_proj.base_layer',
 'base_model.model.model.layers.27.self_attn.q_proj.lora_A.default',
 'base_model.model.model.layers.27.self_attn.q_proj.lora_B.default',
 'base_model.model.model.layers.27.self_attn.k_proj.base_layer',
 'base_model.model.model.layers.27.self_attn.k_proj.lora_A.default',
 'base_model.model.model.layers.27.self_attn.k_proj.lora_B.default',
 'base_model.model.model.layers.27.self_attn.v_proj.base_layer',
 'base_model.model.model.layers.27.self_attn.v_proj.lora_A.default',
 'base_model.model.model.layers.27.self_attn.v_proj.lora_B.default',
 'base_model.model.model.layers.27.self_attn.o_proj.base_layer',
 'base_model.model.model.layers.27.self_attn.o_proj.lora_A.default',
 'base_model.model.model.layers.27.self_attn.o_proj.lora_B.default',
 'base_model.model.model.layers.27.mlp.gate_proj.base_layer',
 'base_model.model.model.layers.27.mlp.gate_proj.lora_A.default',
 'base_model.model.model.layers.27.mlp.gate_proj.lora_B.defa

In [8]:
train_df = pd.read_csv("/home/datasetFull/train.csv", encoding = 'utf-8')
train_df.columns = [str(q) for q in train_df.columns]

val_df = pd.read_csv("/home/datasetFull/val.csv", encoding = 'utf-8')
val_df.columns = [str(q) for q in val_df.columns]

test_df = pd.read_csv("/home/datasetFull/test.csv", encoding = 'utf-8')
test_df.columns = [str(q) for q in test_df.columns]

train_data = Dataset.from_pandas(train_df)
val_data = Dataset.from_pandas(val_df)
test_data = Dataset.from_pandas(test_df)

In [9]:
train_df["Text"].values[0:5]

array(['ঢাকার আশুলিয়ার এক বাড়িতে আগুন লেগে পাঁচটি ঘর পুড়ে গেছে।',
       'পায়ুপথে লুকিয়ে সোনার বিস্কুট পাচারের সময় চট্টগ্রামের শাহ আমানত বিমানবন্দর থেকে এক যাত্রীকে গ্রেপ্তার করেছেন শুল্ক গোয়েন্দারা।',
       'সিরাজগঞ্জের চৌহালী উপজেলার এনায়েতপুরে যমুনা নদীর স্পার বাঁধ এলাকা থেকে এক শিশু ও দুই নারীর বস্তাবন্দি লাশ উদ্ধার করেছে পুলিশ।',
       'পুরান ঢাকার নাজিমউদ্দিন রোডের পরিত্যাক্ত ঢাকা কেন্দ্রীয় কারাগারে প্রধানমন্ত্রী প্রতিশ্রুত প্রকল্প দ্রুত বাস্তবায়নের দাবি জানিয়েছে ঢাকা মহানগরী সমিতি। সেখানে যেন অন্য কোনো স্থাপনা তৈরি না করা হয়, সেই অনুরোধও জানিয়েছেন সংগঠনের নেতারা।',
       'নড়াইলে ইসলামী ছাত্রশিবিরের এক নেতাকে হাতবোমা, ও উগ্র মতবাদের বইসহ গ্রেপ্তার করেছে পুলিশ।'],
      dtype=object)

In [10]:
train_df["Summary"].values[0:5]

array(['আশুলিয়ায় পুড়ল ৫ ঘর', 'পায়ুপথে লুকিয়ে ১২ সোনার বিস্কুট',
       'সিরাজগঞ্জে এক শিশু ও দুই নারীর বস্তাবন্দি লাশ',
       'ঢাকা কারাগারে প্রতিশ্রুত প্রকল্প বাস্তবায়নের দাবি ঢাকা সমিতির',
       'নড়াইলে হাতবোমাসহ শিবির নেতা গ্রেপ্তার'], dtype=object)

# Check

In [11]:
prompt = f"""
<start_of_turn>
Provide a concise Bengali summary of the following news article, focusing on the most important information. 

Note:
Use only Bengali for the summary.
Stay objective and factual in your summary.

####

Article: {train_df["Text"].values[0]}

####
<end_of_turn>
"""
prompt

'\n<start_of_turn>\nProvide a concise Bengali summary of the following news article, focusing on the most important information. \n\nNote:\nUse only Bengali for the summary.\nStay objective and factual in your summary.\n\n####\n\nArticle: ঢাকার আশুলিয়ার এক বাড়িতে আগুন লেগে পাঁচটি ঘর পুড়ে গেছে।\n\n####\n<end_of_turn>\n'

In [12]:
generation_config = model.generation_config
generation_config.max_new_tokens = 50
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [13]:
%%time
device = "cuda"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model.generate(
        input_ids = encoding.input_ids,
        attention_mask = encoding.attention_mask,
        generation_config = generation_config
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print('Actual Summary:', train_df["Summary"].values[0])





Provide a concise Bengali summary of the following news article, focusing on the most important information. 

Note:
Use only Bengali for the summary.
Stay objective and factual in your summary.

####

Article: ঢাকার আশুলিয়ার এক বাড়িতে আগুন লেগে পাঁচটি ঘর পুড়ে গেছে।

####

**Summary:**

ঢাকার আশুলিয়ার এক বাড়িতে আগুন লেগে পাঁচটি ঘর পুড়ে গেছে। আগুনটি পুড়ে প্রথমে
Actual Summary: আশুলিয়ায় পুড়ল ৫ ঘর
CPU times: user 4.82 s, sys: 84.1 ms, total: 4.9 s
Wall time: 4.89 s


# Train

In [14]:
def generate_prompt(data_point):
    return f"""
<start_of_turn>
Provide a concise Bengali summary of the following news article, focusing on the most important information. 

Note:
Use only Bengali for the summary.
Stay objective and factual in your summary.

####

Article: {data_point["Text"]}

####
<end_of_turn>

<start_of_turn>
####

Summary: {data_point["Summary"]} 

####
<end_of_turn>
""".strip()

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    return tokenized_full_prompt

train_data = train_data.shuffle().map(generate_and_tokenize_prompt)
val_data = val_data.shuffle().map(generate_and_tokenize_prompt)
test_data = test_data.shuffle().map(generate_and_tokenize_prompt)

Map:   0%|          | 0/13367 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3819 [00:00<?, ? examples/s]

Map:   0%|          | 0/1911 [00:00<?, ? examples/s]

In [15]:
training_args = transformers.TrainingArguments(
    output_dir=f"./{new_model}",
    save_strategy="steps",
    evaluation_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=16,
    num_train_epochs=5,
    logging_steps=20,
    eval_steps=20,
    save_steps=20,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    report_to="none",
    save_total_limit=3,
    load_best_model_at_end=True
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
20,10.5057,10.128615
40,9.3784,8.253196
60,6.626,4.804269
80,3.205,2.168969
100,1.9319,1.64388
120,1.5579,1.494015
140,1.4582,1.430442
160,1.4017,1.370473
180,1.355,1.335944
200,1.3151,1.310295


TrainOutput(global_step=260, training_loss=3.2770693999070386, metrics={'train_runtime': 16852.8414, 'train_samples_per_second': 3.966, 'train_steps_per_second': 0.015, 'total_flos': 7.048804415292273e+17, 'train_loss': 3.2770693999070386, 'epoch': 4.98})

In [None]:
training_args = transformers.TrainingArguments(
    output_dir=f"./{new_model}",
    save_strategy="steps",
    evaluation_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=16,
    num_train_epochs=5,
    logging_steps=20,
    eval_steps=20,
    save_steps=20,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    report_to="none",
    save_total_limit=3,
    load_best_model_at_end=True
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False
trainer.train()

# Inference

In [16]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
model.save_pretrained(new_model)

In [18]:
PEFT_MODEL = f"samanjoy2/{new_model}"

model.push_to_hub(
    PEFT_MODEL, use_auth_token=True
)



adapter_model.safetensors:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/samanjoy2/gemma7b-it_banglaNewsSum/commit/66dd1222146dce9e0989eb90c969602651f092bd', commit_message='Upload model', commit_description='', oid='66dd1222146dce9e0989eb90c969602651f092bd', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
# model.save_pretrained("trained-model")

PEFT_MODEL = new_model

config = PeftConfig.from_pretrained(PEFT_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, PEFT_MODEL)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [43]:
generation_config = model.generation_config
generation_config.max_new_tokens = 50
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [44]:
import numpy as np

In [45]:
test_df

Unnamed: 0,Text,Summary
0,নাশকতা ও রাষ্ট্রদ্রোহের দশ মামলায় আদালতে হাজির...,১০ মামলায় খালেদার জামিন
1,"ভুল চুলের ছাঁট, রং বা স্টাইলের কারণে বয়স বেশি ...",কেশশৈলীর ৫ ভুল: দেখতে লাগবে বুড়োটে
2,ঢাকা সিটি করপোরেশন নির্বাচনে নাগরিক সমস্যার সম...,সমস্যা সমাধানে প্রাধান্য উত্তরের ভোটারদের
3,লন্ডন থেকে বাংলাদেশ দলকে বিদায় জানিয়েছে বৃষ্টি...,নিউ জিল্যান্ড বলেই আশায় সাকিব
4,নিজেদের পণ্য ও সেবার সাইবার নিরাপত্তা দূর্বলতা...,ত্রুটি খুঁজতে পারিশ্রমিক দেবে গুগল
...,...,...
1906,কাজাখস্তানের রাজধানী আসতানায় অনুষ্ঠিত সিরিয়া ব...,সিরিয়ায় নিরাপদ অঞ্চল প্রতিষ্ঠার চুক্তি কার্যকর...
1907,যুক্তরাজ্যের প্রভাবশালী দ্য গার্ডিয়ান পত্রিকার...,গার্ডিয়ানএ প্রথম নারী সম্পাদক
1908,গত বছরের শুরুতে ওয়ানডেতে অভিষিক্ত হওয়া আরাফাত ...,বিশ্বকাপ দলে ডাক পেয়ে রোমাঞ্চিত সানি
1909,"সেক্টর কমান্ডার্স ফোরাম, যুক্তরাষ্ট্র শাখার উদ...",যুক্তরাষ্ট্রে সেক্টর কমান্ডার্স ফোরামের মতবিনিময়


In [46]:
test_df["Text"].values[0]

'নাশকতা ও রাষ্ট্রদ্রোহের দশ মামলায় আদালতে হাজির হয়ে জামিন পেয়েছেন বিএনপি চেয়ারপারসন খালেদা জিয়া।'

In [50]:
%%time

prompt = f"""
<start_of_turn>
Provide a concise Bengali summary of the following news article, focusing on the most important information. 

Note:
Use only Bengali for the summary.
Stay objective and factual in your summary.

####

Article: {train_df["Text"].values[7]}

####
<end_of_turn>
"""

device = "cuda"
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split('[/INST]')[1].split('\n')
# print(generated_text)
print('Models Summary', tokenizer.decode(outputs[0], skip_special_tokens=True).split('Summary:')[1].split('\n')[0].strip())
print('Actual Summary:', train_df["Summary"].values[7])

Models Summary অর্থমন্ত্রী আবুল মাল আবদুল মুহিত
Actual Summary: দেশে ২০২৪ সালের পর কেউ গরীব থাকবে না: অর্থমন্ত্রী
CPU times: user 4.78 s, sys: 7.72 ms, total: 4.78 s
Wall time: 4.78 s
