<a href="https://colab.research.google.com/github/riyans98/Riyans98/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kagglehub transformers datasets accelerate evaluate sacrebleu trl bitsandbytes
!pip install -U bitsandbytes
# Please restart the Colab runtime after running this cell to ensure the latest bitsandbytes version is used.

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m


In [2]:
import os
from google.colab import files

# Upload kaggle.json
# Opens Colab's file-upload widget; the recorded output shows the file being
# saved as `kaggle.json` in the current working directory.
files.upload()

# Move kaggle.json to the correct directory
# NOTE(review): the shell steps below assume the upload was stored under the
# exact name `kaggle.json` -- verify if the widget renamed it.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Mode 600 leaves the credentials file readable and writable by its owner only.
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


In [3]:
import kagglehub

# Fetch the latest version of the Kaggle dataset; `path` is the local directory
# that holds its files and is reused by the training cell.
path = kagglehub.dataset_download(
    "akshita1122334455/chhattisgarhi-english",
)

print("Path to dataset files:", path)


Using Colab cache for faster access to the 'chhattisgarhi-english' dataset.
Path to dataset files: /kaggle/input/chhattisgarhi-english


In [None]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import torch

# Expose only CUDA device 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Build the 4-bit (NF4, double-quantized) settings when bitsandbytes is usable.
# Any exception raised inside the block -- a failed import included -- is
# reported and switches the notebook to the non-quantized loading path.
try:
    import bitsandbytes as bnb
    print("bitsandbytes version:", bnb.__version__)
    # NOTE: an ImportError here right after running the install cell usually
    # means the Colab runtime still has to be restarted.
    quantization_config = BitsAndBytesConfig(
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )
except Exception as err:
    print(f"bitsandbytes error: {err}. Falling back to full precision.")
    quantization_config = None

# Model and dataset configuration
# Hugging Face Hub id of the checkpoint that is fine-tuned below.
model_name = "app90/ChhattishgarhiAI_Model"

# Load the parallel corpus from CG-Eng.csv inside the dataset directory that
# kagglehub.dataset_download() returned in the previous cell (`path`).
dataset = load_dataset("csv", data_files=os.path.join(path, "CG-Eng.csv"))

# Print the CSV headers: formatting_func indexes the batch by these exact names.
print("Loaded dataset columns:", dataset["train"].column_names)

# Load the tokenizer. EOS is borrowed as the padding token only when the
# tokenizer ships without one: DataCollatorForLanguageModeling replaces the
# labels of every pad-token id with -100, so aliasing pad to EOS
# unconditionally would also remove each genuine EOS from the loss.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preprocess dataset
# This function is updated to correctly use the column names from your CSV
def formatting_func(examples):
    """Render a batch of CSV rows as chat-formatted training strings.

    Args:
        examples: Batched mapping of column name -> list of cell values. It
            must contain the columns 'Chhattisgarhi' and
            'English Translated(Manual)' (the keys have to match the CSV
            headers exactly).

    Returns:
        A dict with a single key "text" holding one rendered conversation per
        usable row. Rows in which either cell is None are dropped, so the list
        can be shorter than the incoming batch.
    """
    instruction_text = "Translate the following text from Chhattisgarhi to English."
    # The tokenizer does not ship a chat template, so an explicit one is passed
    # to apply_chat_template (otherwise it raises a ValueError). It is built
    # once per batch instead of once per row.
    chat_template = (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
        "{% endfor %}"
    )

    texts = []
    row_pairs = zip(examples['Chhattisgarhi'], examples['English Translated(Manual)'])
    for source_text, target_text in row_pairs:
        # Skip incomplete rows (empty CSV cells are loaded as None).
        if source_text is None or target_text is None:
            continue
        messages = [
            {"role": "user", "content": f"{instruction_text}\n{source_text}"},
            {"role": "assistant", "content": target_text}
        ]
        texts.append(
            tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
                chat_template=chat_template
            )
        )
    return {"text": texts}

# Render every split in batches; all original CSV columns are dropped so only
# the "text" column produced by formatting_func remains.
formatted_dataset = dataset.map(
    function=formatting_func,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of rendered conversations for causal-LM training.

    Each entry of ``examples["text"]`` is truncated or padded to exactly 512
    tokens and returned as PyTorch tensors. ``labels`` is added as an
    independent copy of ``input_ids``.
    """
    batch_encoding = tokenizer(
        examples["text"],
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    batch_encoding["labels"] = batch_encoding["input_ids"].clone()
    return batch_encoding

# Tokenize in batches and drop the raw "text" column afterwards.
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Load model with or without quantization
# NOTE(review): the recorded run logs MBartForCausalLM with newly initialised
# `lm_head.weight` / `model.decoder.embed_tokens.weight` -- confirm that a
# causal-LM head is the intended way to load this checkpoint.
if not quantization_config:
    # No quantization settings available: plain float16 load.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        quantization_config=quantization_config,
    )
    # Prepare the quantized model for k-bit training before adapters are attached.
    model = prepare_model_for_kbit_training(model)

# Configure LoRA
# Adapters are attached to the modules named q_proj / k_proj / v_proj / o_proj.
# NOTE(review): verify these names against the loaded architecture; an MBart
# model may not contain a module called `o_proj`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, peft_config)

# Training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="output",
    logging_dir="output",
    report_to="none",
    # Schedule: 2 epochs; 2 sequences per device x 4 accumulation steps
    # = 8 sequences per optimizer update on each device.
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimisation.
    learning_rate=2e-4,
    warmup_steps=10,
    optim="paged_adamw_32bit",
    fp16=True,
    # Bookkeeping cadence, in optimizer steps.
    logging_steps=10,
    save_steps=50,
)

# Initialize data collator (mlm=False selects the causal-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize trainer
# `model` is already the LoRA-wrapped model returned by get_peft_model(), so
# `peft_config` is deliberately NOT passed a second time: handing the trainer
# an adapter-wrapped model together with a peft_config is redundant and may be
# rejected by some TRL versions.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)

# Train and save model
trainer.train()
# Save next to the checkpoints (resolves to "output").
trainer.save_model(training_args.output_dir)

bitsandbytes version: 0.47.0


Generating train split: 0 examples [00:00, ? examples/s]

Loaded dataset columns: ['Chhattisgarhi', 'English Translated', 'English Translated(Manual)']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/25317 [00:00<?, ? examples/s]

Map:   0%|          | 0/24162 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Some weights of MBartForCausalLM were not initialized from the model checkpoint at app90/ChhattishgarhiAI_Model and are newly initialized: ['lm_head.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]



Truncating train dataset:   0%|          | 0/24162 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)


Step,Training Loss
10,12.3018
20,10.8753
30,7.13
40,2.9388
50,0.7507
60,0.376
70,0.3594
80,0.3523
90,0.351
100,0.4343


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

import os
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import torch

# Expose only CUDA device 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Build the 4-bit (NF4, double-quantized) settings when bitsandbytes is usable.
# Any exception raised inside the block -- a failed import included -- is
# reported and switches the script to the non-quantized loading path.
try:
    import bitsandbytes as bnb
    print("bitsandbytes version:", bnb.__version__)
    quantization_config = BitsAndBytesConfig(
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )
except Exception as err:
    print(f"bitsandbytes error: {err}. Falling back to full precision.")
    quantization_config = None

# Model and dataset configuration
# Hugging Face Hub id of the checkpoint that is fine-tuned below.
model_name = "app90/ChhattishgarhiAI_Model"

# Directory that contains CG-Eng.csv. Set the CG_ENG_DATA_DIR environment
# variable instead of editing this file; the literal below is only the
# placeholder fallback.
path = os.environ.get("CG_ENG_DATA_DIR", "/home/ubuntu/your_directory")
csv_file_path = os.path.join(path, "CG-Eng.csv")
if not os.path.isfile(csv_file_path):
    # Fail early with an actionable message rather than deep inside load_dataset.
    raise FileNotFoundError(
        f"CG-Eng.csv not found in {path!r}; set CG_ENG_DATA_DIR to the directory that contains it."
    )

# Load the dataset from the CSV file using the specified path.
dataset = load_dataset("csv", data_files=csv_file_path)

# Print the CSV headers: formatting_func indexes the batch by these exact names.
print("Loaded dataset columns:", dataset["train"].column_names)

# Load the tokenizer. EOS is borrowed as the padding token only when the
# tokenizer ships without one: DataCollatorForLanguageModeling replaces the
# labels of every pad-token id with -100, so aliasing pad to EOS
# unconditionally would also remove each genuine EOS from the loss.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preprocess dataset
# This function is updated to correctly use the column names from your CSV
def formatting_func(examples):
    """Render a batch of CSV rows as chat-formatted training strings.

    Args:
        examples: Batched mapping of column name -> list of cell values. It
            must contain the columns 'Chhattisgarhi' and
            'English Translated(Manual)'. The keys have to match the CSV
            headers exactly; the misspelling 'Chhattishgarhi' raises KeyError.

    Returns:
        A dict with a single key "text" holding one rendered conversation per
        usable row. Rows in which either cell is None are dropped, so the list
        can be shorter than the incoming batch.
    """
    instruction_text = "Translate the following text from Chhattisgarhi to English."
    # The tokenizer does not ship a chat template, so an explicit one is passed
    # to apply_chat_template (otherwise it raises a ValueError).
    chat_template = (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
        "{% endfor %}"
    )

    texts = []
    row_pairs = zip(examples['Chhattisgarhi'], examples['English Translated(Manual)'])
    for source_text, target_text in row_pairs:
        # Skip incomplete rows (empty CSV cells are loaded as None).
        if source_text is None or target_text is None:
            continue
        messages = [
            {"role": "user", "content": f"{instruction_text}\n{source_text}"},
            {"role": "assistant", "content": target_text}
        ]
        texts.append(
            tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False,
                chat_template=chat_template
            )
        )
    return {"text": texts}

# Render every split in batches; the column list is read from the "train"
# split and all of those columns are dropped, leaving only "text".
formatted_dataset = dataset.map(
    function=formatting_func,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of rendered conversations for causal-LM training.

    Each entry of ``examples["text"]`` is truncated or padded to exactly 512
    tokens and returned as PyTorch tensors. ``labels`` is added as an
    independent copy of ``input_ids``.
    """
    batch_encoding = tokenizer(
        examples["text"],
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    batch_encoding["labels"] = batch_encoding["input_ids"].clone()
    return batch_encoding

# Tokenize in batches and drop the raw "text" column afterwards.
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Load model with or without quantization
if not quantization_config:
    # No quantization settings available: fall back to a float16 load.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        quantization_config=quantization_config,
    )
    # Prepare the quantized model for k-bit training before adapters are attached.
    model = prepare_model_for_kbit_training(model)

# Configure LoRA
# Adapters are attached to the modules named q_proj / k_proj / v_proj / o_proj.
# NOTE(review): verify these module names against the loaded architecture.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, peft_config)

# Training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="/home/ubuntu/output",
    logging_dir="/home/ubuntu/logs",
    report_to="none",
    # Schedule: 2 epochs; 2 sequences per device x 4 accumulation steps
    # = 8 sequences per optimizer update on each device.
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimisation.
    learning_rate=2e-4,
    warmup_steps=10,
    optim="paged_adamw_32bit",
    fp16=True,
    # Bookkeeping cadence, in optimizer steps.
    logging_steps=10,
    save_steps=50,
)

# Initialize data collator (mlm=False selects the causal-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize trainer
# `model` is already the LoRA-wrapped model returned by get_peft_model(), so
# `peft_config` is deliberately NOT passed a second time: handing the trainer
# an adapter-wrapped model together with a peft_config is redundant and may be
# rejected by some TRL versions.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)

# Train and save model
trainer.train()
# Save next to the checkpoints (resolves to "/home/ubuntu/output").
trainer.save_model(training_args.output_dir)



# New Section

In [None]:
import os
import shutil

from google.colab import files

# The training cell saves the model into the `output` directory; nothing in the
# notebook creates `output.zip`, so build the archive first unless one is
# already present.
if not os.path.exists("output.zip"):
    shutil.make_archive("output", "zip", root_dir="output")

files.download("output.zip")

In [None]:
# Evaluate the model
# The trainer is constructed without an eval_dataset or a compute_metrics
# function, so evaluate() cannot run as-is and no 'eval_bleu' entry would be
# produced. Only evaluate when a dataset is attached, and report BLEU only if
# that metric is actually present.
if getattr(trainer, "eval_dataset", None) is not None:
    eval_results = trainer.evaluate()
    if "eval_bleu" in eval_results:
        print(f"BLEU score: {eval_results['eval_bleu']:.2f}")
    else:
        print("Evaluation metrics:", eval_results)
else:
    print("Skipping evaluation: the trainer has no eval_dataset.")

# Example inference
# NOTE(review): training examples are rendered with a chat-style prompt
# (instruction + <|im_start|>/<|im_end|> markers) while this cell feeds the raw
# sentence -- confirm which input format is intended at inference time.
text_to_translate = "मोर नाव राजू आय।"
# NOTE(review): confirm "ch_IN" is a language code this tokenizer knows.
tokenizer.src_lang = "ch_IN"
# Inputs must live on the same device as the model (it is loaded with device_map="auto").
encoded_ch = tokenizer(text_to_translate, return_tensors="pt").to(model.device)
generated_tokens = model.generate(
    **encoded_ch,
    # Look the language-code token up through the vocabulary; this does not
    # rely on the tokenizer exposing a `lang_code_to_id` mapping.
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("en_XX")
)
translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
print(f"Chhattisgarhi: {text_to_translate}")
print(f"English Translation: {translated_text}")
