In [None]:
### Install Dependencies
# Shell escape that installs the notebook's packages with pip; no versions are pinned.
# NOTE(review): none of the visible cells import unsloth, bitsandbytes or peft —
# confirm they are still needed before keeping them in this list.
!pip install torch transformers datasets bitsandbytes unsloth peft

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metad

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset

# Switch off Weights & Biases reporting through its environment flag
os.environ.update({"WANDB_DISABLED": "true"})

### Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

### Load the pretrained seq2seq checkpoint, move it to the device, then load its tokenizer
MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


### Manually Created Hindi Colloquial Dataset
# Each record pairs an English sentence ("input") with a colloquial Hindi rendering
# written in Latin script ("output"). Every "input" appears exactly once, so a random
# train/test split cannot place the same sentence pair on both sides of the split.
hindi_data = [
    {"input": "How are you?", "output": "Tu kaisa hai?"},
    {"input": "Where are you?", "output": "Tu kahan hai?"},
    {"input": "What’s up?", "output": "Kya chal raha hai?"},
    {"input": "Let’s go!", "output": "Chal nikal!"},
    {"input": "Don’t stress.", "output": "Tension mat le."},
    {"input": "Are you coming?", "output": "Tu aa raha hai?"},
    {"input": "What’s the plan?", "output": "Plan kya hai?"},
    {"input": "Can’t believe it!", "output": "Yeh nahi ho sakta!"},
    {"input": "It’s too late.", "output": "Bahut der ho gayi hai."},
    {"input": "I’ll just check.", "output": "Main dekh leta hoon."},
    {"input": "I’m broke.", "output": "Mere paas paise nahi hain."},
    {"input": "This is crazy.", "output": "Yeh bilkul pagalpan hai."},
    {"input": "Stop messing with me.", "output": "Mujhse khelna band kar."},
    {"input": "You’re annoying.", "output": "Tu tang kar raha hai."},
    {"input": "Mind your business.", "output": "Apna kaam dekh."},
    {"input": "I was just joking.", "output": "Main bas mazaak kar raha tha."},
    {"input": "What’s wrong with you?", "output": "Tere saath kya ho raha hai?"},
    {"input": "Leave me alone.", "output": "Mujhe akela chhod de."},
    {"input": "Don’t make excuses.", "output": "Bahane mat bana."},
    {"input": "What’s the point?", "output": "Kya faayda?"},
    {"input": "I don’t care.", "output": "Mujhe farak nahi padta."},
    {"input": "No way!", "output": "Bilkul nahi!"},
    {"input": "Let it be.", "output": "Jaane de."},
    {"input": "Hurry up!", "output": "Jaldi kar!"},
    {"input": "What’s going on?", "output": "Kya ho raha hai?"},
    {"input": "I’m hungry.", "output": "Mujhe bhookh lagi hai."},
    {"input": "I’m tired.", "output": "Main thak gaya hoon."},
    {"input": "It’s not my problem.", "output": "Yeh mera problem nahi hai."},
    {"input": "Don’t worry.", "output": "Fikar mat kar."},
    {"input": "Let’s chill.", "output": "Aaram se baithte hain."},
    {"input": "He’s acting smart.", "output": "Woh bada tez ban raha hai."},
    {"input": "Forget it.", "output": "Bhool ja."},
    {"input": "You deserve it.", "output": "Tujhe yeh milna hi chahiye."},
    {"input": "Keep it up!", "output": "Aise hi karte raho!"},
    {"input": "That’s awesome!", "output": "Wah! Zabardast!"},
    {"input": "It’s so boring.", "output": "Yeh bohot bore kar raha hai."},
    {"input": "You did well.", "output": "Tune accha kiya."},
    {"input": "It’s too expensive.", "output": "Yeh bohot mehenga hai."},
    {"input": "Don’t be shy.", "output": "Sharma mat."},
    {"input": "You never listen.", "output": "Tu kabhi nahi sunta."},
    {"input": "Come here!", "output": "Idhar aa!"},
    {"input": "Don’t touch it.", "output": "Usse mat chho."},
    {"input": "It’s your turn.", "output": "Ab teri baari hai."},
    {"input": "Take care.", "output": "Apna khayal rakh."},
    {"input": "I’m feeling sleepy.", "output": "Mujhe neend aa rahi hai."},
    {"input": "Stop shouting.", "output": "Chillane band kar."},
    {"input": "I’m just kidding.", "output": "Main mazaak kar raha hoon."},
    {"input": "Listen to me.", "output": "Meri baat sun."},
    {"input": "Tell me the truth.", "output": "Sach bata."},
    {"input": "This is ridiculous!", "output": "Yeh to hadd ho gayi!"},
    {"input": "Everything will be fine.", "output": "Sab theek ho jayega."},
    {"input": "It’s not a big deal.", "output": "Koi badi baat nahi hai."},
    {"input": "I don’t get it.", "output": "Mujhe samajh nahi aaya."},
    {"input": "Don’t overthink.", "output": "Zyada mat soch."},
    {"input": "It’s all good.", "output": "Sab theek hai."},
    {"input": "I knew it!", "output": "Mujhe pehle se pata tha!"},
    {"input": "What nonsense!", "output": "Kya bakwaas hai!"},
    {"input": "I’ll call you later.", "output": "Baad mein call karunga."},
    {"input": "Come fast!", "output": "Jaldi aa!"},
    {"input": "It’s not that hard.", "output": "Yeh itna mushkil nahi hai."},
    {"input": "That’s not my fault.", "output": "Yeh meri galti nahi hai."},
    {"input": "I’m getting late.", "output": "Main late ho raha hoon."},
]

### Tabulate the sentence pairs, one row per pair
df = pd.DataFrame(data=hindi_data)

### Persist the table as CSV, leaving out the index column
df.to_csv(path_or_buf="hindi_colloquial_dataset.csv", index=False)

### Wrap the table as a Hugging Face Dataset
dataset = Dataset.from_pandas(df=df)

def format_data(example, max_length=64):
    """Tokenize one source/target sentence pair into fixed-length model features.

    Args:
        example: Mapping with an "input" string (source sentence) and an
            "output" string (target sentence).
        max_length: Number of tokens every sequence is truncated/padded to.
            Using one fixed length keeps all rows the same size, so they can
            be batched without a padding collator, while avoiding padding
            every short sentence up to the model's maximum length.

    Returns:
        dict with "input_ids" and "attention_mask" for the source sentence and
        "labels" for the target sentence; each value is a list of
        ``max_length`` ints.
    """
    # Source side: the attention mask is kept so padded positions are ignored
    # by the encoder instead of being treated as real tokens.
    source = tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Target side: `text_target` tells the tokenizer to encode the text as a
    # target sequence rather than running it through the source-side path.
    target = tokenizer(
        text_target=example["output"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Padding positions in the labels are replaced by -100 so the loss skips
    # them; otherwise the model is mostly trained (and scored) on padding.
    pad_id = tokenizer.pad_token_id
    labels = [-100 if token_id == pad_id else token_id for token_id in target["input_ids"]]

    return {
        "input_ids": source["input_ids"],
        "attention_mask": source["attention_mask"],
        "labels": labels,
    }

# Tokenize every sentence pair; the columns returned by format_data are added to each row
dataset = dataset.map(format_data)

# Hold out 20% of the pairs for evaluation. The fixed seed keeps the split (and
# therefore the reported metrics) reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

### Fine-tune Model
training_args = TrainingArguments(
    output_dir="fine_tuned_hindi_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    # The string "none" disables all logging integrations. Passing the Python
    # value None does not: it falls back to the default of every installed integration.
    report_to="none",
)

# The Trainer owns both splits: "train" drives optimisation, "test" is what
# trainer.evaluate() scores later in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss




TrainOutput(global_step=39, training_loss=0.5978007194323417, metrics={'train_runtime': 1059.8952, 'train_samples_per_second': 0.142, 'train_steps_per_second': 0.037, 'total_flos': 20339018956800.0, 'train_loss': 0.5978007194323417, 'epoch': 3.0})

In [None]:
def translate_text(input_text):
    """Translate ``input_text`` with the notebook's model and return the decoded text.

    Args:
        input_text: Sentence (or list of sentences) to feed to the tokenizer.
            Only the first generated sequence is decoded and returned.

    Returns:
        str: The first generated sequence, decoded with special tokens removed.
    """
    # Training leaves the model in training mode, where dropout is active and
    # makes generation noisy; switch to eval mode before generating.
    model.eval()

    # Tokenize the input text and move the tensors to the model's device
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Generate translation
    translated = model.generate(**inputs)

    # Decode the generated tokens
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [None]:
# A handful of sentences for a quick look at the model's translations
test_sentences = ["How are you?", "Where are you?", "What’s up?", "Let’s go!"]

# One report per sentence: the input, its translation, then a divider line
for sentence in test_sentences:
    translated_sentence = translate_text(sentence)
    print(
        f"Input: {sentence}",
        f"Translated: {translated_sentence}",
        "-" * 50,
        sep="\n",
    )

Input: How are you?
Translated: आप कैसे हैं?
--------------------------------------------------
Input: Where are you?
Translated: तुम कहाँ हो?
--------------------------------------------------
Input: What’s up?
Translated: क्या हो रहा है?
--------------------------------------------------
Input: Let’s go!
Translated: चलो चलते हैं!
--------------------------------------------------


In [None]:
# Evaluate on test dataset
# Runs the Trainer's evaluation pass over the `eval_dataset` it was constructed
# with and keeps the returned metrics in `results`.
results = trainer.evaluate()

# Print the metrics
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.10795964300632477, 'eval_runtime': 24.8352, 'eval_samples_per_second': 0.523, 'eval_steps_per_second': 0.161, 'epoch': 3.0}


In [None]:
# Sentence pairs for a qualitative check: each has an English "input" and the
# colloquial rendering we would like to see ("expected_output")
unseen_data = [
    {"input": "I am feeling sick.", "expected_output": "Main bimaar hoon."},
    {"input": "Where did you go?", "expected_output": "Tu kahan gaya tha?"},
    {"input": "It’s so hot today!", "expected_output": "Aaj kitni garmi hai!"},
    {"input": "Call me when you reach.", "expected_output": "Jab tu pahuch ja, mujhe call karna."},
    {"input": "I am busy right now.", "expected_output": "Main abhi vyast hoon."},
    {"input": "What are you doing?", "expected_output": "Tu kya kar raha hai?"},
    {"input": "This place is beautiful.", "expected_output": "Yeh jagah bohot khoobsurat hai."},
    {"input": "I need some water.", "expected_output": "Mujhe thoda paani chahiye."},
    {"input": "Let’s meet tomorrow.", "expected_output": "Chalo kal milte hain."},
    {"input": "I am really tired.", "expected_output": "Main bohot thak gaya hoon."},
    {"input": "Why are you so late?", "expected_output": "Tu itni der se kyun aaya?"},
    {"input": "This is not my fault.", "expected_output": "Yeh meri galti nahi hai."},
    {"input": "Can you help me?", "expected_output": "Kya tu meri madad kar sakta hai?"},
    {"input": "I need to go home.", "expected_output": "Mujhe ghar jaana hai."},
]

# Translate each sentence with beam search and print it next to the expected text
for example in unseen_data:
    input_text, expected_output = example["input"], example["expected_output"]

    # Encode the sentence and move the tensors to the model's device
    encoded = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)

    # Beam search with 5 beams, capped at 50 tokens
    generated = model.generate(**encoded, max_length=50, num_beams=5, early_stopping=True)

    # Turn the first generated sequence back into text
    decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)

    print(
        f"Input: {input_text}",
        f"Expected Output: {expected_output}",
        f"Model Output: {decoded_output}",
        "-" * 50,
        sep="\n",
    )

Input: I am feeling sick.
Expected Output: Main bimaar hoon.
Model Output: मैं बीमार हो रहा है.
--------------------------------------------------
Input: Where did you go?
Expected Output: Tu kahan gaya tha?
Model Output: तुम कहाँ गए?
--------------------------------------------------
Input: It’s so hot today!
Expected Output: Aaj kitni garmi hai!
Model Output: यह आज इतना गर्म है!
--------------------------------------------------
Input: Call me when you reach.
Expected Output: Jab tu pahuch ja, mujhe call karna.
Model Output: तुम तक पहुंच जब मुझे फोन.
--------------------------------------------------
Input: I am busy right now.
Expected Output: Main abhi vyast hoon.
Model Output: मैं अभी व्यस्त हूँ.
--------------------------------------------------
Input: What are you doing?
Expected Output: Tu kya kar raha hai?
Model Output: तुम क्या कर रहे हो?
--------------------------------------------------
Input: This place is beautiful.
Expected Output: Yeh jagah bohot khoobsurat hai.
Model O

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook; this shows
# an interactive login widget, so the cell needs a person present to complete it.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the model and then the tokenizer to the same Hub repository.
# The tokenizer call is the cell's last expression, so its return value is
# what the notebook displays as the cell output.
model.push_to_hub("Saryu10/fine-tuned-hindi-model")
tokenizer.push_to_hub("Saryu10/fine-tuned-hindi-model")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/304M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Saryu10/fine-tuned-hindi-model/commit/2fcaadfad991f1e34389588b84727bc14b930f74', commit_message='Upload tokenizer', commit_description='', oid='2fcaadfad991f1e34389588b84727bc14b930f74', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Saryu10/fine-tuned-hindi-model', endpoint='https://huggingface.co', repo_type='model', repo_id='Saryu10/fine-tuned-hindi-model'), pr_revision=None, pr_num=None)

In [None]:
# `Dataset` is already imported in the setup cell; this re-import is redundant
# and the name is not used in this cell.
from datasets import Dataset
# Upload `dataset` to the Hub. By this point the variable holds the tokenized
# train/test split created before training, not the raw sentence pairs.
dataset.push_to_hub("Saryu10/colloquial-hindi-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/479 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Saryu10/colloquial-hindi-dataset/commit/f64547692ca2840d6e339d97ee741e363457a2e7', commit_message='Upload dataset', commit_description='', oid='f64547692ca2840d6e339d97ee741e363457a2e7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Saryu10/colloquial-hindi-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Saryu10/colloquial-hindi-dataset'), pr_revision=None, pr_num=None)