#### Deep learning-based text normalisation model for Indic languages using Sarvam1-2B model

In [None]:
!pip install indic-nlp-library
!pip install datasets

[31mERROR: Operation cancelled by user[0m[31m
[0m

##### 1. Loading the datasets from Hugging Face – Tamil and Malayalam

In [None]:
from datasets import load_dataset

# One corpus, two language configs: Tamil ("ta") is bound to `ds`,
# Malayalam ("ml") to `ds_mal`. The generator is consumed in order, so the
# Tamil download still happens first.
ds, ds_mal = (
    load_dataset("ai4bharat/IndicSentenceSummarization", config)
    for config in ("ta", "ml")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Show the splits, columns and row counts of the Malayalam dataset.
ds_mal

DatasetDict({
    train: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 9099
    })
    test: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 4560
    })
    validation: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 4728
    })
})

In [None]:
# Show the splits, columns and row counts of the Tamil dataset.
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 51630
    })
    test: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 6566
    })
    validation: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 6386
    })
})

#### 2. Merging the Tamil and Malayalam datasets

In [None]:
from datasets import concatenate_datasets

# For every split, stack the Tamil rows on top of the Malayalam rows and
# shuffle with a fixed seed so the two languages are interleaved reproducibly.
merged_dataset = {
    split: concatenate_datasets([ds[split], ds_mal[split]]).shuffle(seed=42)
    for split in ("train", "validation", "test")
}

# Individual handles used by the later cells.
merged_train = merged_dataset["train"]
merged_validation = merged_dataset["validation"]
merged_test = merged_dataset["test"]

print(merged_dataset)

{'train': Dataset({
    features: ['id', 'input', 'target', 'url'],
    num_rows: 60729
}), 'validation': Dataset({
    features: ['id', 'input', 'target', 'url'],
    num_rows: 11114
}), 'test': Dataset({
    features: ['id', 'input', 'target', 'url'],
    num_rows: 11126
})}


#### 3. Handling entities

In [None]:
!pip install num2words tqdm regex
!pip install indic-numtowords



In [None]:
import re
from datasets import DatasetDict
from tqdm import tqdm
from indic_numtowords import num2words

# Unit symbol -> spoken form per language code ("ta" = Tamil, "ml" = Malayalam).
# Keys are matched verbatim (case-sensitive) against whitespace-separated tokens.
# NOTE(review): some compound-unit renderings (the Malayalam "പൂർവ…" forms and
# the Tamil "கூப்பியம்") look non-standard — confirm with a native speaker.
SI_UNITS = {
    "kg": {"ta": "கிலோகிராம்", "ml": "കിലോഗ്രാം"},
    "km": {"ta": "கிலோமீட்டர்", "ml": "കിലോമീറ്റർ"},
    "cm": {"ta": "சென்டிமீட்டர்", "ml": "സെന്റിമീറ്റർ"},
    "m": {"ta": "மீட்டர்", "ml": "മീറ്റർ"},
    "g": {"ta": "கிராம்", "ml": "ഗ്രാം"},
    "l": {"ta": "லிட்டர்", "ml": "ലിറ്റർ"},
    "ml": {"ta": "மில்லிலிட்டர்", "ml": "മില്ലിലിറ്റർ"},
    "s": {"ta": "வினாடி", "ml": "സെക്കൻഡ്"},
    "K": {"ta": "கெல்வின்", "ml": "കൽവിൻ"},
    "m/s": {"ta": "மீட்டர் விநாடிக்கு", "ml": "മീറ്റർ പൂർവസെക്കൻഡ്"},
    "m²": {"ta": "சதுர மீட்டர்", "ml": "ചതുരശ്ര മീറ്റർ"},
    "m³": {"ta": "கூப்பியம் மீட்டர்", "ml": "ഘന മീറ്റർ"},
    "kg/m³": {"ta": "கிலோகிராம் மீட்டர் மூன்றுக்கு", "ml": "കിലോഗ്രാം പൂർവഘന മീറ്റർ"},
    "N·m": {"ta": "நியூட்டன் மீட்டர்", "ml": "ന്യൂട്ടൺ മീറ്റർ"},
    "J": {"ta": "ஜூல்", "ml": "ജൗൾ"},
    "W": {"ta": "வாட்", "ml": "വാട്ട്"},
    # The Malayalam form previously contained Tamil code points (mixed script);
    # it is now written entirely in Malayalam script.
    "Pa": {"ta": "பாஸ்கல்", "ml": "പാസ്കൽ"},
    "V": {"ta": "வோல்ட்", "ml": "വോൾട്ട്"},
    "Ω": {"ta": "ஓம்", "ml": "ഓം"},
    "Hz": {"ta": "ஹெர்ட்ஸ்", "ml": "ഹെർട്സ്"},
    "C": {"ta": "கூலோம்ப்", "ml": "കൂളോമ്പ്"},
    "km/h": {"ta": "கிலோமீட்டர் மணிக்கு", "ml": "കിലോമീറ്റർ പൂർവമണിക്കൂർ"},
}

# Currency marker -> spoken currency name per language code
# ("ta" = Tamil, "ml" = Malayalam). Keys are looked up verbatim; the rupee
# abbreviation is only listed in its dotted form "Rs.".
CURRENCY_UNITS = {
    "₹": {"ta": "ரூபாய்", "ml": "രൂപ"},
    "INR": {"ta": "ரூபாய்", "ml": "രൂപ"},
    "Rs.": {"ta": "ரூபாய்", "ml": "രൂപ"},
    "$": {"ta": "டாலர்", "ml": "ഡോളർ"},
    "€": {"ta": "யூரோ", "ml": "യൂറോ"},
    "£": {"ta": "பவுண்ட்", "ml": "പൗണ്ട്"},
    "¥": {"ta": "யென்", "ml": "യെൻ"},
    "₽": {"ta": "ரூபிள்", "ml": "റൂബിൾ"},
    "₩": {"ta": "வான்", "ml": "വോൺ"},
    "₺": {"ta": "லீரா", "ml": "ലിറ"},
    "฿": {"ta": "பாத்", "ml": "ബാത്ത്"},
    "₦": {"ta": "நைரா", "ml": "നൈറ"}
}

def convert_to_words(text, lang):
    """Spell out measurements, dates and currency amounts found in ``text``.

    Three passes run in this order:

    1. ``<number> <unit>`` token pairs, where the unit is a key of
       ``SI_UNITS``: the number is spelled with ``num2words`` and the unit is
       replaced by its entry for ``lang``. Sentence punctuation glued to the
       unit ("12 cm.") is tolerated and kept after the translated unit.
    2. Dates written as ``d-m-y`` or ``d/m/y``: each numeric part is spelled
       out separately.
    3. Amounts *followed by* a currency marker ("300 $", "50 Rs."): the amount
       is spelled out and the marker replaced by its ``CURRENCY_UNITS`` entry.
       A marker written before the amount ("₹500") is not handled.

    Numbers that match none of these patterns stay as digits. Because pass 1
    splits and re-joins the text, runs of whitespace collapse to one space.

    Args:
        text: The sentence to normalise.
        lang: Language code handed to ``num2words`` and used to pick the
            translation in the unit tables (the notebook uses "ta" and "ml").

    Returns:
        The normalised text.
    """
    def spell(number):
        return num2words(number, lang=lang)

    def replace_dates(match):
        # day, month and year are read out as three independent numbers
        return " ".join(spell(part) for part in match.groups())

    def replace_currency(match):
        amount, currency = match.groups()
        symbol = currency.strip()
        names = CURRENCY_UNITS.get(symbol)
        if names is None and not symbol.endswith("."):
            # The pattern also accepts a bare "Rs", while the table only
            # lists the dotted "Rs."; retry with the dot appended.
            names = CURRENCY_UNITS.get(symbol + ".")
        currency_translation = (names or {}).get(lang, currency)
        return f"{spell(amount)} {currency_translation}"

    words = text.split()
    for i in range(len(words) - 1):
        if not re.match(r"^\d+(\.\d+)?$", words[i]):
            continue
        unit, trailing = words[i + 1], ""
        if unit not in SI_UNITS:
            # Allow sentence punctuation attached to the unit, e.g. "cm." / "kg,".
            stripped = unit.rstrip(".,;:!?)")
            unit, trailing = stripped, unit[len(stripped):]
        if unit in SI_UNITS:
            words[i] = spell(words[i])
            words[i + 1] = SI_UNITS[unit].get(lang, unit) + trailing
    text = " ".join(words)
    text = re.sub(r"(\d{1,2})[-/](\d{1,2})[-/](\d{2,4})", replace_dates, text)
    text = re.sub(r"(\d+(?:\.\d+)?)\s*(₹|INR|Rs\.?|\$|€|£|¥|₽|₩|₺|฿|₦)", replace_currency, text)

    return text


#### 4. Testing the code with sample data


In [None]:
# Two Tamil and one Malayalam sentence covering a unit, a currency amount and a date.
sample_texts = [
    "நீங்கள் 25 l பரிசு வென்றுள்ளீர்கள்.",
    "ഒരു 12 cm നീളമുള്ള ലോഹ കമ്പി ആവശ്യമാണ്.",
    "குறைந்தபட்சம் 300 $ செலவு செய்ய வேண்டும்.",
    "22-01-2024 அன்று பரீட்சை நடக்கிறது.",
]

# Every sample is pushed through both language settings, regardless of the
# language the sentence itself is written in.
for text in sample_texts:
    print("Original:", text)
    for caption, code in (("Processed Tamil:", "ta"), ("Processed Malayalam:", "ml")):
        print(caption, convert_to_words(text, code))
    print("------")

Original: நீங்கள் 25 l பரிசு வென்றுள்ளீர்கள்.
Processed Tamil: நீங்கள் இருபத்து ஐந்து லிட்டர் பரிசு வென்றுள்ளீர்கள்.
Processed Malayalam: நீங்கள் ഇരുപത്തിയഞ്ച് ലിറ്റർ பரிசு வென்றுள்ளீர்கள்.
------
Original: ഒരു 12 cm നീളമുള്ള ലോഹ കമ്പി ആവശ്യമാണ്.
Processed Tamil: ഒരു பன்னிரண்டு சென்டிமீட்டர் നീളമുള്ള ലോഹ കമ്പി ആവശ്യമാണ്.
Processed Malayalam: ഒരു പന്ത്രണ്ട് സെന്റിമീറ്റർ നീളമുള്ള ലോഹ കമ്പി ആവശ്യമാണ്.
------
Original: குறைந்தபட்சம் 300 $ செலவு செய்ய வேண்டும்.
Processed Tamil: குறைந்தபட்சம் முந்நூறு டாலர் செலவு செய்ய வேண்டும்.
Processed Malayalam: குறைந்தபட்சம் മുന്നൂറ് ഡോളർ செலவு செய்ய வேண்டும்.
------
Original: 22-01-2024 அன்று பரீட்சை நடக்கிறது.
Processed Tamil: இருபத்து இரண்டு ஒன்று இரண்டாயிரத்து இருபத்து நான்கு அன்று பரீட்சை நடக்கிறது.
Processed Malayalam: ഇരുപത്തിരണ്ട് ഒന്ന് രണ്ടായിരത്തി ഇരുപത്തിനാല് அன்று பரீட்சை நடக்கிறது.
------


#### 5. Pushing the dataset to the Hugging Face Hub

In [None]:
import os

# Read the Hub token from the environment so it is never hard-coded in the notebook.
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")
# The original (misspelt) name is kept as an alias in case other cells use it.
HUGGING_FACK_TOKEN = HUGGING_FACE_TOKEN

In [None]:
import os

from huggingface_hub import login

# Use the token from the environment when it is set, so "Run All" does not
# stop at the login widget; with token=None, login() prompts as before.
login(token=os.environ.get("HUGGING_FACE_TOKEN") or None)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

# Bundle the three merged splits into a single DatasetDict for upload.
hf_dataset = DatasetDict(
    train=merged_train,
    validation=merged_validation,
    test=merged_test,
)
print(hf_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 60729
    })
    validation: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 11114
    })
    test: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 11126
    })
})


In [None]:
from huggingface_hub import HfApi

repo_name = "ml_ta_text_normalization"
username = "Saikrishna2403"

api = HfApi()
# exist_ok=True keeps the cell re-runnable: without it a second run fails
# because the dataset repository already exists.
api.create_repo(repo_id=f"{username}/{repo_name}", repo_type="dataset", exist_ok=True)

RepoUrl('https://huggingface.co/datasets/Saikrishna2403/ml_ta_text_normalization', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Saikrishna2403/ml_ta_text_normalization')

In [None]:
# Upload all splits of hf_dataset to the dataset repository "<username>/<repo_name>".
hf_dataset.push_to_hub(f"{username}/{repo_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Saikrishna2403/ml_ta_text_normalization/commit/5131a4acf655c70cac69863332c84868cae11c31', commit_message='Upload dataset', commit_description='', oid='5131a4acf655c70cac69863332c84868cae11c31', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Saikrishna2403/ml_ta_text_normalization', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Saikrishna2403/ml_ta_text_normalization'), pr_revision=None, pr_num=None)

In [None]:
# Load the uploaded dataset back from the Hub. Despite its name, `df` is a
# DatasetDict (train/validation/test), not a pandas DataFrame.
df = load_dataset("Saikrishna2403/ml_ta_text_normalization")

Generating train split:   0%|          | 0/60729 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11114 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11126 [00:00<?, ? examples/s]

In [None]:
# Show the splits, columns and row counts of the reloaded dataset.
df

DatasetDict({
    train: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 60729
    })
    validation: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 11114
    })
    test: Dataset({
        features: ['id', 'input', 'target', 'url'],
        num_rows: 11126
    })
})

In [None]:
# Keep only the text columns. Dropping just the columns that are still present
# makes the cell safe to re-run (a second remove_columns on the already
# trimmed dataset would otherwise raise).
columns_to_drop = [col for col in ("id", "url") if col in df["train"].column_names]
if columns_to_drop:
    df = df.remove_columns(columns_to_drop)
df

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 60729
    })
    validation: Dataset({
        features: ['input', 'target'],
        num_rows: 11114
    })
    test: Dataset({
        features: ['input', 'target'],
        num_rows: 11126
    })
})

In [None]:
# Print one training example (row index 3) to check the input/target fields.
print(df['train'][3])

{'input': 'കണ്ണൂര്: കണ്ണൂർ സർവകലാശാലയിൽ ചരിത്ര കോൺഗ്രസ് ഉദ്ഘാടനം ചെയ്യാനെത്തിയ കേരളാ ഗവർണർക്കെതിരെ വ്യാപക പ്രതിഷേധം.', 'target': 'കണ്ണൂരിൽ ഗവർണ്ണർനെതിരെ വ്യാപക പ്രതിഷേധം: കെഎസ് യു പ്രവർത്തകർ കരിങ്കൊടി കാണിച്ചു'}


In [None]:
!pip install langdetect



In [None]:
from langdetect import DetectorFactory, detect

# langdetect is randomised unless seeded; a fixed seed makes the detected
# language (and therefore the preprocessed dataset) reproducible across runs.
DetectorFactory.seed = 0


def preprocess_function(examples):
    """Normalise one batch of input/target pairs for ``Dataset.map(batched=True)``.

    The language of each pair is detected from its input text. Pairs detected
    as Tamil ("ta") or Malayalam ("ml") have both texts passed through
    ``convert_to_words``; every other pair, and any pair for which detection
    or conversion fails, is kept unchanged.

    Args:
        examples: Batch mapping with equally long "input" and "target" lists.

    Returns:
        Dict with "input" and "target" lists, the same length and order as
        the batch that was passed in.
    """
    processed_inputs = []
    processed_targets = []

    for input_text, target_text in zip(examples["input"], examples["target"]):
        new_input, new_target = input_text, target_text
        try:
            lang = detect(input_text)
            if lang in ("ta", "ml"):
                # Both conversions are evaluated before either result is kept,
                # so a failure on the target can no longer leave an extra
                # entry in the input list and shift every later pair.
                new_input, new_target = (
                    convert_to_words(input_text, lang=lang),
                    convert_to_words(target_text, lang=lang),
                )
        except Exception:
            # Best effort: keep the original pair. Exception (not a bare
            # except) lets KeyboardInterrupt/SystemExit propagate.
            new_input, new_target = input_text, target_text

        processed_inputs.append(new_input)
        processed_targets.append(new_target)

    return {"input": processed_inputs, "target": processed_targets}


##### 6. Normalise and tokenize the dataset

In [None]:
# Apply preprocess_function batch-wise to every split; `df` itself is left
# untouched and the normalised copy is bound to `final_ds`.
final_ds = df.map(preprocess_function, batched=True)

Map:   0%|          | 0/60729 [00:00<?, ? examples/s]

Map:   0%|          | 0/11114 [00:00<?, ? examples/s]

Map:   0%|          | 0/11126 [00:00<?, ? examples/s]

In [None]:
# Sanity check: both the raw and the normalised data should be DatasetDicts.
print(type(df))
print(type(final_ds))

<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.dataset_dict.DatasetDict'>


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Download (or read from cache) the tokenizer and causal-LM weights for the
# checkpoint named in model_name.
model_name = "sarvamai/sarvam-1"  # Sarvam AI model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Report which container type `df` is. DatasetDict (what load_dataset returns
# when no split is requested) gets its own branch; without it the check fell
# through to the misleading "neither" message.
if isinstance(df, pd.DataFrame):
    print("df is a Pandas DataFrame")
elif isinstance(df, DatasetDict):
    print("df is a Hugging Face DatasetDict")
elif isinstance(df, Dataset):
    print("df is a Hugging Face Dataset")
else:
    print("df is neither a Pandas DataFrame nor a Hugging Face Dataset")

df is neither a Pandas DataFrame nor a Hugging Face Dataset


In [None]:
from transformers import AutoTokenizer

# Token budget per example. Without an explicit max_length the tokenizer has
# nothing to pad or truncate to and silently skips both.
# NOTE(review): 256 is a starting value — check the length distribution of the data.
MAX_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained("sarvamai/sarvam-1")
if tokenizer.pad_token is None:
    # Padding needs a pad token; fall back to EOS if the checkpoint defines none.
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "input" column of a batch to exactly ``MAX_LENGTH`` tokens.

    Args:
        examples: Batch mapping that contains an "input" list of strings.

    Returns:
        The tokenizer output for the batch (as returned by ``tokenizer(...)``),
        padded and truncated to ``MAX_LENGTH``.
    """
    return tokenizer(
        examples["input"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


tokenized_ds = final_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/60729 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/11114 [00:00<?, ? examples/s]

Map:   0%|          | 0/11126 [00:00<?, ? examples/s]

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Reloaded here so that re-running this cell starts from the pretrained weights.
model = AutoModelForCausalLM.from_pretrained("sarvamai/sarvam-1")

if tokenizer.pad_token is None:
    # The collator pads each batch; fall back to EOS if no pad token is defined.
    tokenizer.pad_token = tokenizer.eos_token

# tokenized_ds carries no "labels" column, and a causal LM only returns a loss
# when labels are supplied. With mlm=False the collator copies input_ids into
# labels and masks the padding positions with -100.
# NOTE(review): only the tokenized "input" text is trained on; the "target"
# column is not used anywhere in this cell — confirm this is the intended objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): a per-device batch of 128 for a 2B-parameter model will likely
# not fit in GPU memory — consider a smaller batch with gradient accumulation.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # 1e-3 is far above the usual range for full fine-tuning of a pretrained
    # LM; 5e-5 is the TrainingArguments default.
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

#### 7. Evaluating the metrics

In [None]:
from datasets import load_metric
import numpy as np

# NOTE(review): load_metric has been deprecated in favour of the separate
# `evaluate` package — confirm it is still available in the installed
# `datasets` version.
bleu_metric = load_metric("bleu")
chrf_metric = load_metric("chrf")
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")


def compute_metrics(eval_pred):
    """Compute BLEU, chrF, WER and CER for one evaluation pass.

    To take effect this function has to be handed to the ``Trainer``
    (``compute_metrics=compute_metrics``) before evaluation runs.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits of shape (batch, seq, vocab) or token ids of shape
            (batch, seq); ``labels`` are token ids with -100 on ignored positions.

    Returns:
        Dict with the keys "bleu", "chrf", "wer" and "cer".
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs next to the logits.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.ndim == 3:
        # Logits -> token ids. In a causal LM the logits at position t predict
        # the token at t + 1, so shift both arrays to line them up.
        predictions = predictions.argmax(axis=-1)[:, :-1]
        labels = labels[:, 1:]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # -100 is not a valid token id and cannot be decoded: replace it by the
    # pad id, and blank the predictions wherever the label is ignored.
    if predictions.shape == labels.shape:
        predictions = np.where(labels != -100, predictions, pad_id)
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert tokens back to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Remove surrounding whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Each metric expects its own input layout:
    #   bleu    - token lists; one list of reference token lists per prediction
    #   chrf    - strings; one list of reference strings per prediction
    #   wer/cer - strings; exactly one reference string per prediction
    bleu_score = bleu_metric.compute(
        predictions=[pred.split() for pred in decoded_preds],
        references=[[label.split()] for label in decoded_labels],
    )["bleu"]
    chrf_score = chrf_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )["score"]
    wer_score = wer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    cer_score = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu_score,
        "chrf": chrf_score,
        "wer": wer_score,
        "cer": cer_score,
    }