In [None]:
!pip install -U datasets transformers trl accelerate peft bitsandbytes

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer
import torch

In [None]:
# English / Malayalam corpus files; they are paired up positionally below,
# so line i of one file is assumed to correspond to line i of the other.
file_eng_latn_mal_mlym = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-mal_Mlym/train.eng_Latn"
file_mal_mlym = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-mal_Mlym/train.mal_Mlym"

# Read both sides into memory (each line keeps its trailing newline).
with open(file_eng_latn_mal_mlym, mode='r', encoding='utf-8') as english_handle, \
        open(file_mal_mlym, mode='r', encoding='utf-8') as malayalam_handle:
    eng_latn_mal_mlym_data = list(english_handle)
    mal_mlym_data = list(malayalam_handle)

# Pair the two sides by position; zip stops at the shorter file.
eng_latn_mal_mlym_pairs = [
    (english_line, malayalam_line)
    for english_line, malayalam_line in zip(eng_latn_mal_mlym_data, mal_mlym_data)
]


In [None]:
# NOTE: despite the `tam_taml` variable names, both paths point at the
# eng_Latn-hin_Deva corpus, i.e. this cell loads English / Hindi data.
file_eng_latn_tam_taml = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-hin_Deva/train.eng_Latn"
file_tam_taml = "/kaggle/input/wiki-datatrans/wiki/eng_Latn-hin_Deva/train.hin_Deva"

# Read both sides into memory (each line keeps its trailing newline).
with open(file_eng_latn_tam_taml, mode='r', encoding='utf-8') as english_handle, \
        open(file_tam_taml, mode='r', encoding='utf-8') as hindi_handle:
    eng_latn_tam_taml_data = list(english_handle)
    tam_taml_data = list(hindi_handle)

# Pair the two sides by position; zip stops at the shorter file.
eng_latn_tam_taml_pairs = [
    (english_line, hindi_line)
    for english_line, hindi_line in zip(eng_latn_tam_taml_data, tam_taml_data)
]


In [None]:
# English -> Malayalam examples, one string each: "<english> #ml#> <malayalam>"
formatted_eng_mal_dataset = []
for english_line, malayalam_line in eng_latn_mal_mlym_pairs:
    formatted_eng_mal_dataset.append(
        "{} #ml#> {}".format(english_line.strip(), malayalam_line.strip())
    )

# English -> Hindi examples, one string each: "<english> #hi#> <hindi>"
formatted_eng_hi_dataset = []
for english_line, hindi_line in eng_latn_tam_taml_pairs:
    formatted_eng_hi_dataset.append(
        "{} #hi#> {}".format(english_line.strip(), hindi_line.strip())
    )


In [None]:
import random

# One flat list: every Malayalam example first, followed by every Hindi example.
combined_dataset = [*formatted_eng_mal_dataset, *formatted_eng_hi_dataset]

In [None]:
import random

# Total number of examples per split; each split is shared equally between
# the two language pairs.
train_size = 30000
test_size = 2000
validation_size = 2000

# Language markers, tested in this order for every example (first match wins).
language_markers = ('#ml#>', '#hi#>')

# Destination lists for the three splits.
train_dataset = []
test_dataset = []
validation_dataset = []

# (destination, per-language quota) in fill order: an example goes to the
# first split whose quota for its language is not exhausted yet.
split_plan = [
    (train_dataset, train_size // 2),
    (test_dataset, test_size // 2),
    (validation_dataset, validation_size // 2),
]

# split_counts[marker][i] is how many examples carrying `marker` have been
# placed in split_plan[i] so far.
split_counts = {marker: [0] * len(split_plan) for marker in language_markers}

for pair in combined_dataset:
    marker = next((m for m in language_markers if m in pair), None)
    if marker is None:
        # Not one of the two language pairs: leave it out of every split.
        continue
    counts = split_counts[marker]
    for split_index, (bucket, quota) in enumerate(split_plan):
        if counts[split_index] < quota:
            bucket.append(pair)
            counts[split_index] += 1
            break
    # Once all three quotas are full, further examples of that language are dropped.

# Verify the sizes of each dataset
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")
print(f"Validation Dataset Size: {len(validation_dataset)}")

In [None]:
from datasets import Dataset, DatasetDict

# Build each split directly from its own list of example strings. The single
# "translations" column holds the formatted "<source> #xx#> <target>" text.
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"translations": train_dataset}),
    "validation": Dataset.from_dict({"translations": validation_dataset}),
    "test": Dataset.from_dict({"translations": test_dataset}),
})

# Print the structure and sizes of the DatasetDict
print(dataset_dict)

In [None]:
# Peek at the last example of the validation split.
validation_texts = dataset_dict['validation']['translations']
validation_texts[-1]

In [None]:
# Peek at the second example (index 1) of the validation split.
validation_texts = dataset_dict['validation']['translations']
validation_texts[1]

In [None]:
# Shuffle with a fixed seed so the resulting order is reproducible.
shuffle_seed = 42
dataset_dict_shuffled = dataset_dict.shuffle(seed=shuffle_seed)

print(dataset_dict_shuffled)

In [None]:
from datasets import Dataset, DatasetDict

# Assume dataset_dict_shuffled is already defined and shuffled

# Predicate that keeps only the English to Hindi pairs (#hi#>)
def filter_hi(example):
    """Return True when the example's 'translations' text contains '#hi#>'."""
    text = example['translations']
    is_hindi_pair = '#hi#>' in text
    return is_hindi_pair

# Keep only the Hindi rows of the shuffled 'test' split.
shuffled_test_split = dataset_dict_shuffled['test']
test_hi_pairs = shuffled_test_split.filter(filter_hi)

# Wrap the filtered rows in their own single-split DatasetDict.
dataset_dict_hi_test = DatasetDict({"test_hi": test_hi_pairs})

# Show the structure and size of the new DatasetDict.
print(dataset_dict_hi_test)


In [None]:
# Peek at the fifth example (index 4) of the Hindi test set.
test_hi_texts = dataset_dict_hi_test['test_hi']['translations']
test_hi_texts[4]

In [None]:
from datasets import Dataset, DatasetDict

# Assume dataset_dict_shuffled is already defined and shuffled

# Predicate that keeps only the English to Malayalam pairs (#ml#>)
def filter_ml(example):
    """Return True when the example's 'translations' text contains '#ml#>'."""
    text = example['translations']
    is_malayalam_pair = '#ml#>' in text
    return is_malayalam_pair

# Keep only the Malayalam rows of the shuffled 'test' split.
shuffled_test_split = dataset_dict_shuffled['test']
test_ml_pairs = shuffled_test_split.filter(filter_ml)

# Wrap the filtered rows in their own single-split DatasetDict.
dataset_dict_ml_test = DatasetDict({"test_ml": test_ml_pairs})

# Show the structure and size of the new DatasetDict.
print(dataset_dict_ml_test)

# Accessing information about the new DatasetDict



In [None]:
# Peek at the fourth-from-last example of the Malayalam test set.
test_ml_texts = dataset_dict_ml_test['test_ml']['translations']
test_ml_texts[-4]

In [None]:
from huggingface_hub import login

# Log in to Hugging Face Hub without embedding the token in the notebook:
# read it from the HF_TOKEN environment variable, or prompt for it (input hidden).
import os
from getpass import getpass

api_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(api_token)

In [None]:
!pip install evaluate sacrebleu

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

# Hub id of the checkpoint to evaluate.
model_name = "ABHIiiii1/FineTuned-Trans-oneTomany-llama-2-7b"

# bitsandbytes settings: 4-bit NF4 weights with double quantisation,
# using float16 as the compute dtype.
compute_dtype = torch.float16
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

# Tokenizer and quantised model come from the same checkpoint;
# device_map="auto" leaves device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
def translator(text, language_code):
    """Translate ``text`` using the notebook-level ``model`` and ``tokenizer``.

    The prompt is ``text``, a single space, then ``language_code``
    (e.g. ``"#hi#>"``), which matches the ``"<source> #xx#> <target>"`` layout
    of the examples built earlier in this notebook.

    Args:
        text: Source sentence.
        language_code: Target-language marker appended to the prompt.

    Returns:
        The decoded text that follows the first occurrence of
        ``language_code`` (up to the next occurrence, if the model emits the
        marker again), stripped of surrounding whitespace. An empty string is
        returned when the marker cannot be found in the decoded output, so a
        long evaluation loop is not aborted by a single odd generation.
    """
    prompt = text + " " + language_code

    # Send the inputs to whichever device the model is on instead of assuming
    # CUDA; unpacking the encoding below also passes the attention mask.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Only the best beam is used, so request plain sequences rather than the
    # dict output with per-step scores.
    generated = model.generate(
        **encoded,
        num_beams=6,
        max_new_tokens=130,
    )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # The decoded sequence starts with the prompt, so the translation is the
    # piece right after the first marker.
    pieces = decoded.split(language_code)
    if len(pieces) < 2:
        return ""
    return pieces[1].strip()


In [None]:
# Smoke test: translate one English sentence with the Malayalam marker.
text, language_code = "hello, what is your name?", "#ml#>"
translated_text = translator(text, language_code)
print(translated_text)

In [None]:
# Smoke test: translate one English sentence with the Hindi marker.
text, language_code = "hello, what is your name?", "#hi#>"
translated_text = translator(text, language_code)
print(translated_text)

In [None]:
def translate_texts(translator, dataset, language_code):
    """Translate the source side of every example and collect the references.

    Args:
        translator: Callable ``(source_text, language_code) -> str``.
        dataset: Object whose ``'translations'`` entry is an iterable of
            strings shaped like ``"<source> <language_code> <target>"``.
        language_code: Marker that separates source from target.

    Returns:
        ``(tgt_texts, trans_texts)``: the stripped reference targets and the
        translator outputs, in dataset order.

    Raises:
        ValueError: If an example does not contain ``language_code``.
    """
    tgt_texts, trans_texts = [], []

    for translation in dataset['translations']:
        # Split on the first marker only, so a target that happens to repeat
        # the marker is kept whole instead of breaking the unpacking.
        src_text, marker, tgt_text = translation.partition(language_code)
        if not marker:
            raise ValueError(
                "language code %r not found in example: %r" % (language_code, translation)
            )
        translated_text = translator(src_text.strip(), language_code)
        tgt_texts.append(tgt_text.strip())
        trans_texts.append(translated_text)

    return tgt_texts, trans_texts

In [None]:
# Translate the Hindi test set; keeps the references and the model outputs.
tgt_texts, trans_texts = translate_texts(
    translator=translator,
    dataset=dataset_dict_hi_test['test_hi'],
    language_code="#hi#>",
)

In [None]:
# Translate the Malayalam test set; keeps the references and the model outputs.
tgt_textsML, trans_textsML = translate_texts(
    translator=translator,
    dataset=dataset_dict_ml_test['test_ml'],
    language_code="#ml#>",
)

In [None]:
# Specify the file name
file_name = "MT5_Bi_en_hi_pred.txt"

# Write one prediction per line. `trans_texts` holds the Hindi predictions
# returned by translate_texts; the previously referenced `trans_texts2` is
# never defined in this notebook. UTF-8 is set explicitly because the
# predictions are non-ASCII text and the platform default encoding may not
# be able to represent them.
with open(file_name, "w", encoding="utf-8") as file:
    for item in trans_texts:
        file.write("%s\n" % item)

In [2]:
!pip uninstall attrs

^C


In [9]:
pip install --upgrade attrs

Note: you may need to restart the kernel to use updated packages.


In [7]:
!pip install --upgrade evaluate sacrebleu

^C


In [3]:
import evaluate

# Metrics used to score MT5_Bi_hi_en_pred.txt against MT5_Bi_hi_en_tgt.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Predictions and references, one sentence per line (newlines are kept).
with open("D:\\IASNLP2\\mT5\\Results\\MT5_Bi_hi_en_pred.txt", "r", encoding="utf-8") as pred_file:
    pred = list(pred_file)

with open("D:\\IASNLP2\\mT5\\Results\\MT5_Bi_hi_en_tgt.txt", "r", encoding="utf-8") as ref_file:
    ref = list(ref_file)

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

14.144445924670746
33.827890772364086
74.7157615282507


In [4]:
import evaluate

# Metrics used to score MT5_Bi_en_hi_pred.txt against MT5_Bi_en_hi_tgt.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Raw string literals: the Windows paths contain backslashes, and sequences
# such as \I or \m are not valid escapes (Python only tolerates them with a
# warning). The resulting path is unchanged.
with open(r"D:\IASNLP2\mT5\Results\MT5_Bi_en_hi_pred.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open(r"D:\IASNLP2\mT5\Results\MT5_Bi_en_hi_tgt.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

11.710726570086187
31.063934840100497
74.16267942583733


In [5]:
import evaluate

# Metrics used to score en-hi-pred-mt5.txt against en-hi-tgt-mt5.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Raw string literals: the Windows paths contain backslashes, and sequences
# such as \I or \e are not valid escapes (Python only tolerates them with a
# warning). The resulting path is unchanged.
with open(r"D:\IASNLP2\mT5\Results\en-hi-pred-mt5.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open(r"D:\IASNLP2\mT5\Results\en-hi-tgt-mt5.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

3.480294983932118
19.61848203050378
84.78219444338579


In [6]:
import evaluate

# Metrics used to score en-bgl-pred-mt5.txt against en-bgl-tgt-mt5.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Raw string literals: the Windows paths contain backslashes, and sequences
# such as \I or \e are not valid escapes (Python only tolerates them with a
# warning). The resulting path is unchanged.
with open(r"D:\IASNLP2\mT5\Results\en-bgl-pred-mt5.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open(r"D:\IASNLP2\mT5\Results\en-bgl-tgt-mt5.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

1.0885475872120058
16.23827656125403
91.93984371160367


In [7]:
import evaluate

# Metrics used to score hi-bgl-pred-mt5.txt against hi-bgl-tgt-mt5.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Raw string literals: the Windows paths contain backslashes, and sequences
# such as \I or \h are not valid escapes (Python only tolerates them with a
# warning). The resulting path is unchanged.
with open(r"D:\IASNLP2\mT5\Results\hi-bgl-pred-mt5.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open(r"D:\IASNLP2\mT5\Results\hi-bgl-tgt-mt5.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

0.7545549128295486
15.699038778529001
92.93261905932079


In [8]:
import evaluate

# Metrics used to score hi-en-pred-mt5.txt against hi-en-tgt-mt5.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Raw string literals: the Windows paths contain backslashes, and sequences
# such as \I or \h are not valid escapes (Python only tolerates them with a
# warning). The resulting path is unchanged.
with open(r"D:\IASNLP2\mT5\Results\hi-en-pred-mt5.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open(r"D:\IASNLP2\mT5\Results\hi-en-tgt-mt5.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

5.223732432350877
23.225896622542557
84.66852067820938


In [10]:
import evaluate

# Metrics used to score bgl-en-pred-mt5.txt against bgl-en-tgt-mt5.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Predictions and references, one sentence per line (newlines are kept).
with open("D:/IASNLP2/mT5/Results/bgl-en-pred-mt5.txt", "r", encoding="utf-8") as pred_file:
    pred = list(pred_file)

with open("D:/IASNLP2/mT5/Results/bgl-en-tgt-mt5.txt", "r", encoding="utf-8") as ref_file:
    ref = list(ref_file)

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

3.9469035214522026
21.58559951457612
86.72288508118775


In [11]:
import evaluate

# Metrics used to score bgl-hi-pred-mt5.txt against bgl-hi-tgt-mt5.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Predictions and references, one sentence per line (newlines are kept).
with open("D:/IASNLP2/mT5/Results/bgl-hi-pred-mt5.txt", "r", encoding="utf-8") as pred_file:
    pred = list(pred_file)

with open("D:/IASNLP2/mT5/Results/bgl-hi-tgt-mt5.txt", "r", encoding="utf-8") as ref_file:
    ref = list(ref_file)

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

2.145890057066503
16.923500821126947
88.20839208811311


In [12]:
import evaluate

# Metrics used to score llama_1_M_en_hi_pred.txt against llama_1_M_en_hi_tgt.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Raw string literals: the Windows paths contain backslashes, and sequences
# such as \I or \l are not valid escapes (Python only tolerates them with a
# warning). The resulting path is unchanged.
with open(r"D:\IASNLP2\mT5\Results\llama_1_M_en_hi_pred.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open(r"D:\IASNLP2\mT5\Results\llama_1_M_en_hi_tgt.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

0.02659026043067876
7.121786236387546
94.09500609013398


In [1]:
import evaluate

# Metrics used to score llama_one2many_en_ml_pred.txt against llama_one2many_en_ml_tgt.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Raw string literals: the Windows paths contain backslashes, and sequences
# such as \I or \l are not valid escapes (Python only tolerates them with a
# warning). The resulting path is unchanged.
with open(r"D:\IASNLP2\mT5\Results\llama_one2many_en_ml_pred.txt", "r", encoding="utf-8") as f:
    pred = f.readlines()

with open(r"D:\IASNLP2\mT5\Results\llama_one2many_en_ml_tgt.txt", "r", encoding="utf-8") as f:
    ref = f.readlines()

# Each prediction is paired with a one-element list of references.
new_ref = [[sent] for sent in ref]

# Print the corpus-level score of each metric, in the order BLEU, chrF, TER.
for metric in (sacrebleu, chrf, ter):
    print(metric.compute(predictions=pred, references=new_ref)["score"])

  from .autonotebook import tqdm as notebook_tqdm


0.04095977898080804
6.853082539094396
96.4312546957175


In [2]:
import evaluate

# Metrics used to score MT5_Bi_hi_en_pred.txt against MT5_Bi_hi_en_tgt.txt.
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
ter = evaluate.load("ter")

# Predictions and references are read as UTF-8, one sentence per line.
with open("D:\\IASNLP2\\mT5\\Results\\MT5_Bi_hi_en_pred.txt", "r", encoding="utf-8") as pred_file:
    pred = list(pred_file)

with open("D:\\IASNLP2\\mT5\\Results\\MT5_Bi_hi_en_tgt.txt", "r", encoding="utf-8") as ref_file:
    ref = list(ref_file)

# One-element reference list per prediction. Only the references are stripped
# here; the predictions keep their trailing newline.
new_ref = [[sent.strip()] for sent in ref]

# Compute the full result dict of every metric on the same inputs.
metric_inputs = dict(predictions=pred, references=new_ref)
sacrebleu_result = sacrebleu.compute(**metric_inputs)
chrf_result = chrf.compute(**metric_inputs)
ter_result = ter.compute(**metric_inputs)

print("SacreBLEU:", sacrebleu_result)
print("chrF:", chrf_result)
print("TER:", ter_result)


SacreBLEU: {'score': 14.144445924670746, 'counts': [6595, 3326, 1977, 1245], 'totals': [12614, 11614, 10614, 9614], 'precisions': [52.283177421912164, 28.63785086964009, 18.62634256642171, 12.949864780528396], 'bp': 0.5769818284614675, 'sys_len': 12614, 'ref_len': 19551}
chrF: {'score': 33.827890772364086, 'char_order': 6, 'word_order': 0, 'beta': 2}
TER: {'score': 74.7157615282507, 'num_edits': 12946, 'ref_length': 17327.0}
