In [1]:
from datasets import load_dataset
import pandas as pd
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke-test the loader on a single file before processing every language
path = r"D:\devegiri_task\data\hin\hin_train.json"

# lines=True tells pandas to read the file as one JSON object per line
df = pd.read_json(path, lines=True)

# Shape first, then the column names, each on its own line
print(df.shape, df.columns.tolist(), sep="\n")
df.head(3)

(1299155, 5)
['unique_identifier', 'native word', 'english word', 'source', 'score']


Unnamed: 0,unique_identifier,native word,english word,source,score
0,hin1,जन्मदिवस,janamdivas,Dakshina,
1,hin2,रक्खा,rakha,Dakshina,
2,hin3,मिलीजुली,milijuli,Dakshina,


In [3]:

# Root folder; the loop below reads <lang>/<lang>_train.json and <lang>/<lang>_valid.json
BASE = Path(r"D:\devegiri_task\data")
langs = ["hin", "tam", "tel"]

# Language code -> DataFrame, one mapping per split
dfs_train, dfs_val = {}, {}

for lang in langs:
    train_path = BASE.joinpath(lang, f"{lang}_train.json")
    val_path = BASE.joinpath(lang, f"{lang}_valid.json")   # or _val.json, dev.json, ...

    # lines=True: one JSON object per line
    dfs_train[lang] = pd.read_json(train_path, lines=True)
    dfs_val[lang] = pd.read_json(val_path, lines=True)

    # Report the number of training rows that were just loaded
    print(f"{lang.upper():6} train: {len(dfs_train[lang]):,} rows")

HIN    train: 1,299,155 rows
TAM    train: 3,230,902 rows
TEL    train: 2,429,562 rows


In [4]:
# Tag every training frame with its language code (the frames are modified in place)
for lang in dfs_train:
    df = dfs_train[lang]
    df["lang"] = lang

In [5]:
# Tag every validation frame with its language code (the frames are modified in place)
for lang in dfs_val:
    df = dfs_val[lang]
    df["lang"] = lang

In [6]:
# Confirm that the Hindi training frame now carries the added "lang" column
dfs_train["hin"].columns

Index(['unique_identifier', 'native word', 'english word', 'source', 'score',
       'lang'],
      dtype='str')

In [7]:
# Stack the three training frames (hin, tam, tel order) under a fresh 0..n-1 index,
# keep only the columns used downstream and give the word columns shorter names.
df_train_all = (
    pd.concat(
        [dfs_train[code] for code in ("hin", "tam", "tel")],
        ignore_index=True,
    )
    .loc[:, ["english word", "native word", "lang"]]
    .rename(columns={"english word": "roman", "native word": "native"})
)

In [8]:
# Stack the three validation frames (hin, tam, tel order) under a fresh 0..n-1 index,
# keep only the columns used downstream and give the word columns shorter names.
df_val_all = (
    pd.concat(
        [dfs_val[code] for code in ("hin", "tam", "tel")],
        ignore_index=True,
    )
    .loc[:, ["english word", "native word", "lang"]]
    .rename(columns={"english word": "roman", "native word": "native"})
)

In [9]:
# First five rows of the combined training frame
df_train_all.head(n=5)

Unnamed: 0,roman,native,lang
0,janamdivas,जन्मदिवस,hin
1,rakha,रक्खा,hin
2,milijuli,मिलीजुली,hin
3,jaanchon,जांचों,hin
4,chamkata,चमकता,hin


In [10]:
# First five rows of the combined validation frame
df_val_all.head(n=5)

Unnamed: 0,roman,native,lang
0,spike,स्पाइक,hin
1,trilok,त्रिलोक,hin
2,chanda,चंदा,hin
3,meeta,मीता,hin
4,jack,जैक,hin


In [11]:
# Peek at the first rows (plain-text rendering)
print(df_train_all.head(3))

# Per-column count of missing values
print(df_train_all.isna().sum())

# Length distribution of the romanized and native strings (helps spot anomalies)
print(
    df_train_all["roman"].str.len().describe(),
    df_train_all["native"].str.len().describe(),
    sep="\n",
)

        roman    native lang
0  janamdivas  जन्मदिवस  hin
1       rakha     रक्खा  hin
2    milijuli  मिलीजुली  hin
roman     0
native    0
lang      0
dtype: int64
count    6.959619e+06
mean     1.240213e+01
std      4.401593e+00
min      1.000000e+00
25%      9.000000e+00
50%      1.200000e+01
75%      1.600000e+01
max      8.900000e+01
Name: roman, dtype: float64
count    6.959619e+06
mean     1.078422e+01
std      3.592192e+00
min      1.000000e+00
25%      8.000000e+00
50%      1.100000e+01
75%      1.300000e+01
max      1.320000e+02
Name: native, dtype: float64


In [12]:


# Build the model input by prefixing each romanized word with its language tag,
# e.g. "<hin> janamdivas". Both frames are modified in place.
for df in [df_train_all, df_val_all]:
    df['input'] = '<' + df['lang'] + '> ' + df['roman']

# Quick look — very important check!
# A fixed random_state keeps the printed sample identical across re-runs
# (same seed value as the shuffle used for the evaluation subset).
print(df_train_all[['lang', 'roman', 'input', 'native']].sample(8, random_state=42))

# Examples should look like:
# lang  roman         input                   native
# hin   janamdivas    <hin> janamdivas     जन्मदिवस
# tam   vanakkam      <tam> vanakkam       வணக்கம்
# tel   bangaram      <tel> bangaram       బంగారం

        lang             roman                   input         native
659513   hin              mmes              <hin> mmes        एमएमईएस
3591133  tam       elizabethan       <tam> elizabethan    எலிசபெத்தன்
1890873  tam    nallaatchiyena    <tam> nallaatchiyena   நல்லாட்சியென
4262937  tam   niraivaettruvar   <tam> niraivaettruvar  நிறைவேற்றுவர்
5886910  tel       baanalingam       <tel> baanalingam       బాణలింగం
1165657  hin            mobira            <hin> mobira         मोबिरा
6584412  tel            janger            <tel> janger        జాంగెర్
2596932  tam  agarathirumaalam  <tam> agarathirumaalam   அகரதிருமாளம்


In [13]:
# Same spot check rendered as a rich table; no random_state is passed,
# so the selected rows are not pinned.
df_train_all.loc[:, ["lang", "roman", "input", "native"]].sample(n=8)

Unnamed: 0,lang,roman,input,native
4925825,tel,shettiar,<tel> shettiar,శెట్టియార్
4721396,tel,bidanagar,<tel> bidanagar,బిదానగర్
5054515,tel,lakshanamnu,<tel> lakshanamnu,లక్షణంను
955940,hin,tattav,<hin> tattav,तत्तव
2707313,tam,uuttukkaarar,<tam> uuttukkaarar,ஊட்டுக்காரர்
870342,hin,adhikri,<hin> adhikri,अधिकृ
6524225,tel,osmaaniyaalloonuu,<tel> osmaaniyaalloonuu,ఉస్మానియాల్లోనూ
5314583,tel,narasimhalayaanni,<tel> narasimhalayaanni,నరసింహాలయాన్ని


In [14]:
# A. Basic stats
print("Train examples total:", len(df_train_all))
print(df_train_all['lang'].value_counts(normalize=True))  # share of rows per language

# B. Empty or too short entries
print("Empty roman:", (df_train_all['roman'].str.strip() == '').sum())
print("Empty native:", (df_train_all['native'].str.strip() == '').sum())

short_rom = df_train_all[df_train_all['roman'].str.len() < 2]
print("Very short roman inputs:", len(short_rom))
if len(short_rom) > 0:
    # Seeded so the same rows are shown on every run
    print(short_rom.sample(min(5, len(short_rom)), random_state=42))

# C. Random samples from each language (very important!)
for lang in ['hin', 'tam', 'tel']:
    print(f"\n--- {lang.upper()} random samples ---")
    lang_rows = df_train_all.loc[df_train_all['lang'] == lang, ['input', 'native']]
    # min() avoids a ValueError from sample() when a language has fewer than 6 rows;
    # the fixed random_state makes the displayed rows reproducible.
    print(lang_rows.sample(min(6, len(lang_rows)), random_state=42))

# D. Duplicate check (optional but useful)
print("Duplicate input→native pairs:", df_train_all.duplicated(['input', 'native']).sum())

Train examples total: 6959619
lang
tam    0.464235
tel    0.349094
hin    0.186670
Name: proportion, dtype: float64
Empty roman: 0
Empty native: 0
Very short roman inputs: 52
        roman native lang    input
4595969     g     జీ  tel  <tel> g
130114      p     पी  hin  <hin> p
1361899     l   ஹால்  tam  <tam> l
4604807     z     జి  tel  <tel> z
4598008     l      ల  tel  <tel> l

--- HIN random samples ---
                    input       native
228773      <hin> vedanti     वेदान्ती
984984         <hin> liaz        लियाज
640390     <hin> nikayama      निकायमा
753817  <hin> parathyroid  पैराथायरायड
362368      <hin> cicinda      सिसिंडा
9052        <hin> bhalaai         भलाई

--- TAM random samples ---
                             input             native
1325684        <tam> munnilaiyilum     முன்னிலையிலும்
1401230  <tam> mananilaiyiliruntha    மனநிலையிலிருந்த
1987819   <tam> pathividuvathendru     பதிவிடுவதென்று
1948109         <tam> srideviyidam      ஸ்ரீதேவியிடம்
2711701      <ta

In [15]:
# Persist the prepared frames. to_csv does not create missing parent
# directories, so make sure the output folder exists before writing.
out_dir = Path('modified_dataset')
out_dir.mkdir(parents=True, exist_ok=True)

# NOTE: with the default index=True the row index is written as an
# unnamed first column of each CSV.
df_train_all.to_csv(out_dir / 'clean_train_dataset.csv')
df_val_all.to_csv(out_dir / 'clean_val_dataset.csv')

### Train

In [16]:
from datasets import Dataset

# Reduce each frame to the model input and its target, renamed to text / label
df_train = df_train_all.loc[:, ["input", "native"]].rename(
    columns=dict(input="text", native="label")
)
df_val = df_val_all.loc[:, ["input", "native"]].rename(
    columns=dict(input="text", native="label")
)

# Wrap the pandas frames as Hugging Face datasets
train_ds = Dataset.from_pandas(df_train)
val_ds = Dataset.from_pandas(df_val)

In [18]:
# Inspect one concrete record from each split
print("\nTrain example:", train_ds[0], sep="\n")
print("\nVal example:", val_ds[1234], sep="\n")   # fixed index 1234

# Column names and their feature types
print("\nFeatures:", train_ds.features)


Train example:
{'text': '<hin> janamdivas', 'label': 'जन्मदिवस'}

Val example:
{'text': '<hin> sudipta', 'label': 'सुदिप्ता'}

Features: {'text': Value('large_string'), 'label': Value('large_string')}


In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import numpy as np
from tqdm.auto import tqdm

# ──────────────────────────────────────────────────────────────
# Load mT5-small (the checkpoint named in model_name below)
# ──────────────────────────────────────────────────────────────

model_name = "google/mt5-small"

# Tokenizer and seq2seq model are both loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Model loaded on: {device}")

Loading weights: 100%|██████████| 192/192 [00:00<00:00, 362.63it/s, Materializing param=shared.weight]                                                       


Model loaded on: cpu


In [20]:
# ──────────────────────────────────────────────────────────────
# Prepare generation function
# ──────────────────────────────────────────────────────────────

def generate_transliteration(batch, max_length=64, num_beams=4):
    """Transliterate a batch of language-tagged romanized strings.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: list of strings like "<hin> kshatriya", "<tam> vanakkam", ...
        max_length: token limit used both to truncate the tokenized inputs
            and to cap the generated sequences (default 64, the previously
            hard-coded value).
        num_beams: beam width for beam search (default 4, the previously
            hard-coded value).

    Returns:
        list[str]: decoded predictions with special tokens skipped.
        An empty batch yields an empty list.
    """
    # Nothing to tokenize or generate for an empty batch: return early rather
    # than handing an empty list to the tokenizer and the model.
    if not isinstance(batch, str) and len(batch) == 0:
        return []

    inputs = tokenizer(
        batch,
        padding="longest",
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            length_penalty=1.0,           # was 0.6 → try 1.0–2.0
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,       # ← prevents repeating 3-grams
            do_sample=False,              # deterministic beam search, no sampling
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [21]:
# ──────────────────────────────────────────────────────────────
# Evaluate on validation set
# ──────────────────────────────────────────────────────────────

# If your validation dataset is large → take a subset for baseline.
# min() keeps select() in range when the validation set has fewer than 300 rows.
eval_dataset = val_ds.shuffle(seed=42).select(range(min(300, len(val_ds))))

# Load the metrics before the slow generation loop, so a failure to load them
# surfaces immediately instead of after all predictions have been generated.
cer_metric = evaluate.load("cer")
wer_metric = evaluate.load("wer")

references = []
predictions = []

batch_size = 32

for i in tqdm(range(0, len(eval_dataset), batch_size)):
    batch = eval_dataset[i:i+batch_size]
    inputs = batch["text"]          # "<hin> romanized text" etc.
    targets = batch["label"]        # native script

    preds = generate_transliteration(inputs)

    predictions.extend(preds)
    references.extend(targets)

# ──────────────────────────────────────────────────────────────
# Compute metrics
# ──────────────────────────────────────────────────────────────

cer_score = cer_metric.compute(predictions=predictions, references=references)
wer_score = wer_metric.compute(predictions=predictions, references=references)

# Fraction of predictions that equal their reference string exactly
exact_match = np.mean([p == r for p, r in zip(predictions, references)])

print(f"CER: {cer_score:.4f}")
print(f"WER: {wer_score:.4f}")
print(f"Exact match accuracy: {exact_match:.4f}")

# Optional: show some examples
print("\nExamples:")
for i in range(min(8, len(predictions))):
    print(f"Input : {eval_dataset[i]['text']}")
    print(f"Pred  : {predictions[i]}")
    print(f"Target: {references[i]}")
    print()

100%|██████████| 10/10 [00:59<00:00,  5.93s/it]
Downloading builder script: 5.13kB [00:00, ?B/s]

CER: 1.5102
WER: 1.0067
Exact match accuracy: 0.0000

Examples:
Input : <hin> mehmud
Pred  : <extra_id_0>.
Target: मेहमूद

Input : <tel> shstrachikitsa
Pred  : <extra_id_0>
Target: శస్త్రచికిత్స

Input : <tel> maargamlo
Pred  : <extra_id_0>
Target: మార్గంలో

Input : <tel> nruthyamu
Pred  : <extra_id_0>
Target: నృత్యము

Input : <tam> nathiyaaga
Pred  : <extra_id_0>.
Target: நதியாக

Input : <tel> bahishkarinchindi
Pred  : <extra_id_0>
Target: బహిష్కరించింది

Input : <hin> jatwada
Pred  : <extra_id_0>
Target: जटवाड़ा

Input : <hin> padegi
Pred  : <extra_id_0>.
Target: पड़ेगी




