In [30]:
import os
# Name the Weights & Biases project via the WANDB_PROJECT environment variable
# (presumably read by the W&B logging integration during training — confirm).
os.environ["WANDB_PROJECT"]="mistral_lang_finetuning"

from enum import Enum
from functools import partial

import numpy as np
import pandas as pd
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset, DatasetDict, load_dataset
from peft import get_peft_model, PeftModel, LoraConfig, TaskType
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging,
                          set_seed)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

# Single seed value for the notebook, handed to transformers' set_seed so that
# repeated runs start from the same random state.
seed = 42
set_seed(seed)

In [31]:
# Hugging Face Hub identifier of the base checkpoint used in this notebook
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [32]:
# Instruction text attached to every sentence pair (same value on each row)
instruction = "You are a helpful assistant who translates English statements, words, or lines into Telugu. Use your language skills to provide accurate and meaningful translations."

# Read the parallel corpus: each usable line is "<english>++++$++++<telugu>".
# Lines that do not split into exactly two fields are skipped.
sentence_pairs = []
with open('english_telugu_data.txt', 'r', encoding='utf-8') as corpus:
    for raw_line in corpus:
        fields = raw_line.split('++++$++++')
        if len(fields) != 2:
            continue
        english_text, telugu_text = fields
        sentence_pairs.append((english_text.strip(), telugu_text.strip()))

# Separate the pairs back into the two parallel lists
english_sentences = [pair[0] for pair in sentence_pairs]
telugu_sentences = [pair[1] for pair in sentence_pairs]

# Assemble the training table; the scalar instruction is broadcast to all rows
df = pd.DataFrame({
    'Input': english_sentences,
    'Output': telugu_sentences,
    'Instruction': instruction
})

# Preview the first rows
print(df.head())

                                               Input  \
0                                 His legs are long.   
1                Who taught Tom how to speak French?   
2                       I swim in the sea every day.   
3  Tom popped into the supermarket on his way hom...   
4                             Smoke filled the room.   

                                              Output  \
0                       అతని కాళ్ళు పొడవుగా ఉన్నాయి.   
1            టామ్ ఫ్రెంచ్ మాట్లాడటం ఎలా నేర్పించారు?   
2              నేను ప్రతి రోజు సముద్రంలో ఈత కొడతాను.   
3  టామ్ కొంచెం పాలు కొనడానికి ఇంటికి వెళ్ళేటప్పుడ...   
4                                పొగ గదిని నింపింది.   

                                         Instruction  
0  You are a helpful assistant who translates Eng...  
1  You are a helpful assistant who translates Eng...  
2  You are a helpful assistant who translates Eng...  
3  You are a helpful assistant who translates Eng...  
4  You are a helpful assistant who translates Eng..

In [33]:
# Report the number of rows (sentence pairs) held in the DataFrame
print(len(df))

155798


In [34]:
# Turn the DataFrame into a Dataset and register it as the only split, 'train'
train_split = Dataset.from_pandas(df)
dataset_dict = DatasetDict({'train': train_split})

# Show the splits together with their features and row counts
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['Input', 'Output', 'Instruction'],
        num_rows: 155798
    })
})


In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Override the tokenizer's chat template with a ChatML-style layout: every
# message is rendered as "<|im_start|>{role}\n{content}<|im_end|>\n".
# NOTE(review): the <|im_start|>/<|im_end|> markers are not registered as
# special tokens in this cell — confirm that is intended.
template = """{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"""
tokenizer.chat_template = template


def preprocess(samples):
    """Render a batch of translation rows as chat-formatted training text.

    Each row becomes a three-message conversation: the row's instruction as
    the system message, the English sentence as the user message and the
    Telugu sentence as the assistant message. The conversation is then
    formatted (not tokenized) with the tokenizer's chat template.

    Args:
        samples: Batched mapping with the columns "Instruction", "Input" and
            "Output" (the columns of `dataset_dict`), each a list of strings.

    Returns:
        dict: {"content": [formatted conversation string, ...]}, one entry per
        input row, in the same order.
    """
    batch = []
    for instruction_text, english_text, telugu_text in zip(
        samples["Instruction"], samples["Input"], samples["Output"]
    ):
        # Build a fresh list per row so the input batch is never mutated.
        conversation = [
            {"role": "system", "content": instruction_text},
            {"role": "user", "content": english_text},
            {"role": "assistant", "content": telugu_text},
        ]
        batch.append(tokenizer.apply_chat_template(conversation, tokenize=False))
    return {"content": batch}


# Format every row and drop the raw columns, leaving only "content".
dataset = dataset_dict.map(
    preprocess,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
)
# Hold out 10% of the rows for evaluation; pass the notebook seed explicitly so
# the split is the same on every run.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=seed)
print(dataset)
print(dataset["train"][0])