# Data Preparation


In [None]:
# System prompt for the data-generation requests: it asks the model to emit
# 100 rows of "Hindi sentence, English transliteration" with no header and no
# extra text, and shows three example rows inside a fenced block.
prompt = """
You are a csv generator. Do not talk anything outside generate csv. keep silence and generate csv.
Generate a CSV file with two columns: "Hindi Sentence" and "English Transliteration." No csv heading is needed. Each row should contain a unique and creative Hindi sentence and its corresponding transliteration in English. The CSV should contain exactly 100 rows. Do not include any other text except the Hindi sentence and its transliteration. The format should be as follows:

```
अरे वाह! तुमने तो कमाल कर दिया!, Are waah! Tumne to kamaal kar diya!
समय का पहिया कभी किसी के लिए नहीं रुकता।, Samay ka pahiya kabhi kisi ke liye nahin rukta.
"जैसा बोओगे, वैसा काटोगे।", "Jaisa booge, waisa kaatoge."
...
(continue until there are 100 rows)
```
Generate unique sentences for each row.
"""

In [None]:
%%capture
!pip install openai

In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Never store a real key in the notebook: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Constructed without arguments, so the key is taken from the environment
# variable set above.
client = OpenAI()

In [None]:
import multiprocessing
from tqdm import tqdm  # For progress tracking
import openai  

# Define the worker function
def fetch_and_write_response(message_history):
    """Request one batch of CSV rows from the chat model and append it to data.csv.

    Args:
        message_history: List of chat message dicts, passed unchanged as
            ``messages`` to the chat-completions call.

    Returns:
        None. The cleaned response is appended to ``data.csv``; nothing is
        written when the cleaned response is empty.

    Relies on two module-level names: ``client`` (the OpenAI client) and
    ``lock`` (serialises the append so concurrent workers do not interleave
    their rows).
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=message_history,
        temperature=1.0,
    )
    # The message content can be missing; treat that as an empty reply
    # instead of crashing on None.replace(...).
    response = completion.choices[0].message.content or ""
    # Remove the markdown code fences the model tends to wrap the CSV in.
    response = response.replace("```", "").strip()
    # Drop the first three lines of every reply.
    # NOTE(review): presumably these are the three example rows from the
    # prompt being echoed back - confirm against real responses.
    response = "\n".join(response.splitlines()[3:])

    if not response:
        # Nothing usable came back; avoid appending a blank line.
        return

    # Hold the lock for the whole append so rows from different workers
    # never interleave.
    with lock:
        # Explicit UTF-8: the rows contain Devanagari text and the platform
        # default encoding is not guaranteed to handle it.
        with open("data.csv", 'a', encoding="utf-8") as f:
            f.write(response + "\n")

if __name__ == '__main__':
    # Local import so this cell stays self-contained.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Guards the append to data.csv inside fetch_and_write_response.
    # A multiprocessing.Lock also blocks between threads of one process.
    lock = multiprocessing.Lock()

    # The same single-message history (system prompt only) is sent on every
    # request.
    message_history = [
        {"role": "system", "content": prompt}
    ]

    # Total number of requests to issue.
    total_iterations = 1000
    # Upper bound on requests in flight. The work is network-bound, so a small
    # thread pool replaces one OS process per request (1000 simultaneous
    # forks and API calls in the original).
    max_workers = 16

    failed = 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(fetch_and_write_response, message_history)
            for _ in range(total_iterations)
        ]
        # Advance the bar as requests finish, not as they are submitted.
        for future in tqdm(as_completed(futures), total=total_iterations, desc="Processing"):
            try:
                future.result()
            except Exception as exc:
                # One failed request must not abort the remaining ones.
                failed += 1
                tqdm.write(f"Request failed: {exc!r}")

    print("All processes completed.")
    if failed:
        print(f"{failed} of {total_iterations} requests failed.")

In [None]:
!wc -l /content/data.csv

In [None]:
import pandas as pd

# Input files holding the cleaned sentence pairs.
file1, file2 = 'cleaned_data.csv', 'cleaned_data_2.csv'

# Parse both files, first file first.
df1, df2 = (pd.read_csv(path) for path in (file1, file2))

# Give both frames the same two column labels so they stack cleanly.
for frame in (df1, df2):
    frame.columns = ['Hindi Sentence', 'English Translation']

# Stack the frames, then keep a single copy of every repeated row.
merged_df = pd.concat([df1, df2])
final_df = merged_df.drop_duplicates()

# Write the combined data without the index column.
output_file = 'data.csv'
final_df.to_csv(output_file, index=False)


# Training

In [None]:
%%capture
!pip install tqdm transformers accelerate datasets sacrebleu evaluate sentencepiece sacremoses
import datasets
print(datasets.__version__)

In [None]:
import csv
import evaluate
from transformers import MarianConfig, MarianMTModel, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# One checkpoint name shared by the tokenizer and the model.
checkpoint = "Helsinki-NLP/opus-mt-en-hi"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = MarianMTModel.from_pretrained(checkpoint)

# Copy the tokenizer's PAD/EOS ids onto the model config so the two agree
# (kept as a sanity measure; may be redundant for this checkpoint).
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

# Load the whole CSV; it is read back below through the 'train' split.
dataset = load_dataset("csv", data_files = "/content/data.csv")

# Hold out 10% of the rows for evaluation. The fixed seed makes the split
# identical on every run, so metrics stay comparable between runs.
train_test_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def split_translation(example):
    """Re-expose the two CSV columns of one row under the keys 'en' and 'hi'."""
    source_text = example['English Translation']
    target_text = example['Hindi Sentence']
    return dict(en=source_text, hi=target_text)

# Original CSV column names, dropped once 'en'/'hi' have been added.
raw_columns = ['English Translation', 'Hindi Sentence']

# Add the 'en'/'hi' fields to both splits.
train_mapped_dataset = train_dataset.map(split_translation)
val_mapped_dataset = test_dataset.map(split_translation)

# Keep only the new fields.
tr_mapd_dt = train_mapped_dataset.remove_columns(raw_columns)
val_mapd_dt = val_mapped_dataset.remove_columns(raw_columns)

print(tr_mapd_dt, val_mapd_dt)

Map:   0%|          | 0/75375 [00:00<?, ? examples/s]

Map:   0%|          | 0/8376 [00:00<?, ? examples/s]

Dataset({
    features: ['en', 'hi'],
    num_rows: 75375
}) Dataset({
    features: ['en', 'hi'],
    num_rows: 8376
})


In [None]:
# Bare expression: the notebook shows the tokenizer's repr as the cell output.
tokenizer

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-hi', vocab_size=61950, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	61949: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
def preprocess_function(batch):
    """Tokenize one batch of source/target sentence pairs for seq2seq training.

    Args:
        batch: Mapping with two parallel lists, ``'en'`` (source text) and
            ``'hi'`` (target text). Individual entries may be ``None``.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` for the source
        side and ``'labels'`` for the target side. Padding positions in
        ``'labels'`` are set to -100.

    Uses the module-level ``tokenizer``.
    """
    def _clean(sentences):
        # Missing values become empty strings; everything else is trimmed.
        return [s.strip() if s is not None else "" for s in sentences]

    cleaned_en = _clean(batch['en'])
    cleaned_hi = _clean(batch['hi'])

    # Source side: regular tokenizer call.
    inputs = tokenizer(cleaned_en, max_length=64, truncation=True, padding="max_length")
    # Target side: text_target switches the tokenizer to its target mode, so
    # the target sentences are not encoded with the source-side settings.
    targets = tokenizer(text_target=cleaned_hi, max_length=64, truncation=True, padding="max_length")

    # Replace padding ids in the labels with -100 so padded positions can be
    # told apart from real tokens (compute_metrics in this notebook maps -100
    # back to the pad id before decoding).
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]

    return {
        "input_ids": inputs["input_ids"],
        # Tells the model which input positions are padding.
        "attention_mask": inputs["attention_mask"],
        "labels": labels,
    }

# Tokenize both splits in batches of 1000 rows with the same settings.
map_options = dict(batched=True, batch_size=1000)
train_data_with_token = tr_mapd_dt.map(preprocess_function, **map_options)
val_data_with_token = val_mapd_dt.map(preprocess_function, **map_options)


Map:   0%|          | 0/75375 [00:00<?, ? examples/s]

Map:   0%|          | 0/8376 [00:00<?, ? examples/s]

In [None]:
# Show the dataset schema, then the last tokenized row as a spot check.
# The index is derived from the dataset length instead of being hardcoded,
# so the cell keeps working when the number of rows changes.
print(train_data_with_token)
print(train_data_with_token[len(train_data_with_token) - 1])

Dataset({
    features: ['en', 'hi', 'input_ids', 'labels'],
    num_rows: 75375
})
{'en': ' Yahaan par kyon aana chaahoge?', 'hi': 'यहाँ पर क्यूँ आना चाहोगे?', 'input_ids': [31615, 292, 980, 21630, 6984, 667, 1442, 19, 12104, 26980, 292, 6716, 6390, 22, 0, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949], 'labels': [44, 1801, 260, 5401, 44, 2703, 428, 44, 716, 1185, 15976, 44, 3605, 314, 260, 44, 4228, 260, 2451, 917, 3355, 174, 22, 0, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949, 61949,

In [None]:
# Bundle BLEU and chrF into one metric object, used by compute_metrics below.
# With force_prefix=True the result keys carry a per-metric prefix
# (e.g. 'bleu_bleu', 'chr_f_score' in the demo output further down).
mt_metrics = evaluate.combine(
    ["bleu", "chrf"], force_prefix=True
)

def compute_metrics(pred):
    """Decode predictions and labels and score them with ``mt_metrics``.

    Args:
        pred: Object exposing ``predictions`` (token ids, or a tuple whose
            first element holds the token ids) and ``label_ids``.

    Returns:
        Whatever ``mt_metrics.compute`` returns for the decoded texts.

    Uses the module-level ``tokenizer`` and ``mt_metrics``. Neither
    ``pred.predictions`` nor ``pred.label_ids`` is modified.
    """
    import numpy as np  # local import: numpy is not imported elsewhere in the notebook

    pad_id = tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Keep only the token ids when extra outputs come along.
        pred_ids = pred_ids[0]

    # -100 is a masking sentinel, not a vocabulary id, and cannot be decoded.
    # np.where builds new arrays, so the caller's arrays stay untouched.
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    predictions = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    references = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    return mt_metrics.compute(predictions=predictions,
                              references=references)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [None]:
# Toy hypothesis/reference sentences for trying out the metrics by hand.
hyp = [
    'The dog bit the man.',
    "It wasn't surprising.",
    'The man had just bitten him.',
]
ref = [
    'The dog bit the man.',
    'It was not unexpected.',
    'The man bit him first.',
]

# chrF on its own.
chrf = evaluate.load('chrf', force_prefix=True)
chrf_result = chrf.compute(predictions=hyp, references=ref)
print(chrf_result)

print("------------------------------------------------")

# BLEU and chrF combined into one result dict.
mt_metrics = evaluate.combine(["bleu", "chrf"], force_prefix=True)
combined_result = mt_metrics.compute(predictions=hyp, references=ref)
print(combined_result)

{'score': 50.043063606582294, 'char_order': 6, 'word_order': 0, 'beta': 2}
------------------------------------------------
{'bleu_bleu': 0.45067506321061157, 'bleu_precisions': [0.7058823529411765, 0.42857142857142855, 0.36363636363636365, 0.375], 'bleu_brevity_penalty': 1.0, 'bleu_length_ratio': 1.0, 'bleu_translation_length': 17, 'bleu_reference_length': 17, 'chr_f_score': 50.043063606582294, 'chr_f_char_order': 6, 'chr_f_word_order': 0, 'chr_f_beta': 2}


In [None]:
%%capture
!pip install wandb

In [None]:
import wandb
# Start a Weights & Biases run in the "Transliteration-en-hi" project; the
# training arguments later in this notebook report to wandb.
wandb.init(project="Transliteration-en-hi")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Fine-tuning configuration: evaluate and checkpoint every 10 steps for 800
# steps in total, log every step to Weights & Biases, and reload the
# checkpoint with the best chrF score at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir='Transliteration-en-hi',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=1,
    save_steps=10,
    eval_steps=10,
    max_steps=800,
    evaluation_strategy="steps",
    predict_with_generate=True,  # evaluate on generated sequences
    report_to=["wandb"],
    metric_for_best_model="chr_f_score",
    load_best_model_at_end=True,
    save_total_limit=3,
    learning_rate=5e-5,  # same as the library default; spelled out for clarity
    push_to_hub=True,
)

# Notes (plain comments, so the cell no longer echoes a stray string):
# - Multiple GPUs: `n_gpu` is not an argument that can be passed here.
#   NOTE(review): the Trainer is expected to use all visible GPUs on its own;
#   confirm for the installed transformers version.
# - Hugging Face upload: enabled by push_to_hub=True above; the repo will be
#   named after the output directory.



'push_to_hub=True   # Repo will be same as output directory'

In [None]:
# Ask both tokenized splits for torch-formatted output before handing them
# to the trainer.
train_split = train_data_with_token.with_format("torch")
eval_split = val_data_with_token.with_format("torch")

# Wire together the model, the run configuration, the data and the metric hook.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning loop configured by training_args.
trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss,Validation Loss
