In [None]:
pip install pandas transformers torch sentencepiece


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm

# Helsinki-NLP OPUS-MT checkpoint used for Marathi → English translation
model_name = "Helsinki-NLP/opus-mt-mr-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() hands back the module itself, so loading and switching to
# evaluation mode can be written as one chained expression.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).eval()

# Input table whose headers and cells get translated below
df = pd.read_csv("cleaned_sample.csv")

def is_marathi(text):
    """Tell whether ``text`` is a string holding Devanagari (Marathi) script.

    Returns True only when ``text`` is a ``str`` and at least one of its
    characters lies in the range U+0900..U+097F; any non-string value
    (numbers, None, NaN, ...) yields False.
    """
    if not isinstance(text, str):
        return False
    for char in text:
        if '\u0900' <= char <= '\u097F':
            return True
    return False

# Translate function
# Memo of successful translations so a value that repeats across cells is
# sent through the model only once.
# NOTE(review): assumes generation is deterministic (no sampling) - confirm.
_translation_cache = {}


def translate_mr(text):
    """Translate a Marathi string to English; pass everything else through.

    Args:
        text: Any cell or header value. Only values for which ``is_marathi``
            returns True are translated.

    Returns:
        The decoded English translation for Marathi strings. Non-Marathi
        values, and Marathi strings whose translation failed, are returned
        unchanged (best-effort: one bad cell must not abort the whole run).
    """
    if not is_marathi(text):
        return text
    if text in _translation_cache:
        return _translation_cache[text]
    try:
        # Keep the encoded inputs on the same device as the model weights.
        inputs = tokenizer(text, return_tensors="pt", padding=True).to(model.device)
        with torch.no_grad():
            tokens = model.generate(**inputs, max_length=256)
        translated = tokenizer.decode(tokens[0], skip_special_tokens=True)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit can still stop a long run, and report
        # the failure instead of hiding it.
        import warnings

        warnings.warn(
            f"translate_mr failed ({exc!r}); keeping original text: {text[:50]!r}"
        )
        return text
    # Only successful translations are cached; failures are retried next time.
    _translation_cache[text] = translated
    return translated

# Translate all cells first, while the column labels are still the ones
# loaded from the CSV. If the headers were renamed first and two of them
# translated to the same English label, df[col] would return a DataFrame,
# apply() would pass whole Series to translate_mr, and those columns would
# silently stay untranslated.
# NOTE(review): relies on read_csv producing unique column labels - confirm.
for col in tqdm(df.columns, desc="Translating columns"):
    df[col] = df[col].apply(translate_mr)

# Translate column names (done last so duplicate English labels are harmless)
df.columns = [translate_mr(col) for col in df.columns]

# Save the output
df.to_csv("translated_sample.csv", index=False)
print("✅ Done! Saved to 'translated_sample.csv'")


Translating columns: 100%|██████████| 39/39 [21:16<00:00, 32.74s/it]

✅ Done! Saved to 'translated_sample.csv'



