In [1]:
import os
os.environ["WANDB_PROJECT"]="mistral_trans_finetuning"

from enum import Enum
from functools import partial

import numpy as np
import pandas as pd
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset, DatasetDict, load_dataset
from peft import get_peft_model, PeftModel, LoraConfig, TaskType
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging,
                          set_seed)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

seed = 42
set_seed(seed)

In [2]:
# Initialize lists to hold the English and Telugu sentences
english_sentences = []
telugu_sentences = []
instruction= "You are a helpful assistant who translates English statements, words, or lines into Telugu. Use your language skills to provide accurate and meaningful translations."

# Open the file and read line by line
with open('english_telugu_data.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Split the line based on the delimiter '++++$++++'
        parts = line.split('++++$++++')
        if len(parts) == 2:  # Ensure that there are exactly two parts
            english_sentences.append(parts[0].strip())
            telugu_sentences.append(parts[1].strip())

# Create a DataFrame with the English and Telugu sentences
df = pd.DataFrame({
    'Input': english_sentences,
    'Output': telugu_sentences,
    'Instruction': instruction
})

# Print the first few rows of the DataFrame
print(df.head())

                                               Input  \
0                                 His legs are long.   
1                Who taught Tom how to speak French?   
2                       I swim in the sea every day.   
3  Tom popped into the supermarket on his way hom...   
4                             Smoke filled the room.   

                                              Output  \
0                       అతని కాళ్ళు పొడవుగా ఉన్నాయి.   
1            టామ్ ఫ్రెంచ్ మాట్లాడటం ఎలా నేర్పించారు?   
2              నేను ప్రతి రోజు సముద్రంలో ఈత కొడతాను.   
3  టామ్ కొంచెం పాలు కొనడానికి ఇంటికి వెళ్ళేటప్పుడ...   
4                                పొగ గదిని నింపింది.   

                                         Instruction  
0  You are a helpful assistant who translates Eng...  
1  You are a helpful assistant who translates Eng...  
2  You are a helpful assistant who translates Eng...  
3  You are a helpful assistant who translates Eng...  
4  You are a helpful assistant who translates Eng..

In [3]:
print(len(df))

155798


In [4]:
# Function to create the 'Conversation' content
def create_conversation(row):
    return [
        {"content": row['Input'], "role": "user"},
        {"content": row['Output'], "role": "assistant"}
    ]

# Apply the function to each row in the DataFrame
df['Conversation'] = df.apply(create_conversation, axis=1)

# Display the DataFrame with the new 'Conversation' column
print(df)

                                                    Input  \
0                                      His legs are long.   
1                     Who taught Tom how to speak French?   
2                            I swim in the sea every day.   
3       Tom popped into the supermarket on his way hom...   
4                                  Smoke filled the room.   
...                                                   ...   
155793                       I didn't do it deliberately.   
155794                       That book isn't interesting.   
155795             Tom has never seen the Atlantic Ocean.   
155796     Tom didn't even discuss the problem with Mary.   
155797                          I've lost my best friend.   

                                                   Output  \
0                            అతని కాళ్ళు పొడవుగా ఉన్నాయి.   
1                 టామ్ ఫ్రెంచ్ మాట్లాడటం ఎలా నేర్పించారు?   
2                   నేను ప్రతి రోజు సముద్రంలో ఈత కొడతాను.   
3       టామ్ కొంచెం పాల

In [5]:
# Adjust display options to avoid truncation
pd.set_option('display.max_colwidth', None)

In [6]:
print(df.head())

                                                               Input  \
0                                                 His legs are long.   
1                                Who taught Tom how to speak French?   
2                                       I swim in the sea every day.   
3  Tom popped into the supermarket on his way home to buy some milk.   
4                                             Smoke filled the room.   

                                                                             Output  \
0                                                      అతని కాళ్ళు పొడవుగా ఉన్నాయి.   
1                                           టామ్ ఫ్రెంచ్ మాట్లాడటం ఎలా నేర్పించారు?   
2                                             నేను ప్రతి రోజు సముద్రంలో ఈత కొడతాను.   
3  టామ్ కొంచెం పాలు కొనడానికి ఇంటికి వెళ్ళేటప్పుడు సూపర్ మార్కెట్లోకి ప్రవేశించాడు.   
4                                                               పొగ గదిని నింపింది.   

                                            

In [7]:
# Convert pandas DataFrame to a DatasetDict
dataset_dict = DatasetDict({
    'train': Dataset.from_pandas(df)
})

# Print the dataset_dict structure
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['Input', 'Output', 'Instruction', 'Conversation'],
        num_rows: 155798
    })
})


In [8]:
dataset=dataset_dict

In [9]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
template = """{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"""
tokenizer.chat_template = template

def preprocess(samples):
    batch = []
    for system_prompt, conversation in zip(samples["Instruction"], samples["Conversation"]):
        system_message = {"role": "system", "content": f"{system_prompt}"}
        conversation.insert(0, system_message)
        batch.append(tokenizer.apply_chat_template(conversation, tokenize=False))
    return {"content": batch}

dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)
dataset = dataset["train"].train_test_split(0.1)
print(dataset)
print(dataset["train"][0])



Map:   0%|          | 0/155798 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 140218
    })
    test: Dataset({
        features: ['content'],
        num_rows: 15580
    })
})
{'content': '<|im_start|>system\nYou are a helpful assistant who translates English statements, words, or lines into Telugu. Use your language skills to provide accurate and meaningful translations.<|im_end|>\n<|im_start|>user\nWe saw a mountain in the distance.<|im_end|>\n<|im_start|>assistant\nదూరం లో ఒక పర్వతం చూశాము.<|im_end|>\n'}


In [10]:
print(dataset["train"][6]["content"])

<|im_start|>system
You are a helpful assistant who translates English statements, words, or lines into Telugu. Use your language skills to provide accurate and meaningful translations.<|im_end|>
<|im_start|>user
I saw a mouse.<|im_end|>
<|im_start|>assistant
నేను ఎలుకను చూశాను.<|im_end|>



In [11]:
print(dataset["test"][0]["content"])

<|im_start|>system
You are a helpful assistant who translates English statements, words, or lines into Telugu. Use your language skills to provide accurate and meaningful translations.<|im_end|>
<|im_start|>user
We'll succeed.<|im_end|>
<|im_start|>assistant
మేము విజయం సాధిస్తాము.<|im_end|>



In [12]:
peft_config = LoraConfig(r=8,
                         lora_alpha=16,
                         lora_dropout=0.1,
                         target_modules=["gate_proj","q_proj","lm_head","o_proj","k_proj","embed_tokens","down_proj","up_proj","v_proj"],
                         task_type=TaskType.CAUSAL_LM)

In [13]:
bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

In [14]:
class ChatmlSpecialTokens(str, Enum):
    user = "<|im_start|>user"
    assistant = "<|im_start|>assistant"
    system = "<|im_start|>system"
    function_call = "<|im_start|>function-call"
    function_response = "<|im_start|>function-response"
    eos_token = "<|im_end|>"
    bos_token = "<s>"
    pad_token = "<pad>"

    @classmethod
    def list(cls):
        return [c.value for c in cls]

tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        pad_token=ChatmlSpecialTokens.pad_token.value,
        bos_token=ChatmlSpecialTokens.bos_token.value,
        eos_token=ChatmlSpecialTokens.eos_token.value,
        additional_special_tokens=ChatmlSpecialTokens.list(),
        trust_remote_code=True
    )
tokenizer.chat_template = template

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             attn_implementation="flash_attention_2")
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Embedding(32008, 4096)

In [15]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32008, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
   

In [16]:
output_dir = "Translator_Eng_Tel_instruct"
per_device_train_batch_size = 2
per_device_eval_batch_size = 2
gradient_accumulation_steps = 4
logging_steps = 5
learning_rate = 5e-4
max_grad_norm = 1.0
num_train_epochs=2
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_seq_length = 2048

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="no",
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    bf16=True,
    report_to=["tensorboard", "wandb"],
    hub_private_repo=True,
    push_to_hub=True,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

In [17]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    packing=True,
    dataset_text_field="content",
    max_seq_length=max_seq_length,
    peft_config=peft_config,
    dataset_kwargs={
        "append_concat_token": False,
        "add_special_tokens": False,
    },
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Currently logged in as: [33mrohithreddy24242[0m ([33mmrr24[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Epoch,Training Loss,Validation Loss
0,0.2549,0.255433
1,0.2138,0.212012




events.out.tfevents.1719070142.23ec79e11277.2261.0:   0%|          | 0.00/5.64k [00:00<?, ?B/s]

events.out.tfevents.1719070531.23ec79e11277.2799.0:   0%|          | 0.00/86.6k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

In [19]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import torch

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

peft_model_id = "Translator_Eng_Tel_instruct"
device = "cuda"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
model = PeftModel.from_pretrained(model, peft_model_id)
# model.to(torch.bfloat16)
# model.cuda()
model.eval()



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): lora.Embedding(
          (base_layer): Embedding(32008, 4096)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 8x32008 (cuda:0)])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 4096x8 (cuda:0)])
        )
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralFlashAttention2(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
         

In [None]:
system_prompt = """You are a helpful assistant who translates English statements, words, or lines into Telugu. Use your language skills to provide accurate and meaningful translations."""

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": "what are you doing"},
]
text = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
inputs = {k: v.to("cuda") for k,v in inputs.items()}
with torch.autocast(dtype=torch.bfloat16, device_type="cuda"):
    outputs = model.generate(**inputs, 
                             max_new_tokens=128, 
                             do_sample=True, 
                             top_p=0.95, 
                             temperature=0.2, 
                             repetition_penalty=1.0, 
                             eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
