In [None]:

"""
Experimented Models

GPT-based:
facebook/opt-6.7b
EleutherAI/gpt-neox-20b
facebook/opt-13b


T5-based:
google/flan-t5-xxl
google/ul2
google/byt5-xl


BART-based:
facebook/bart-large-xsum
facebook/bart-large-cnn
  facebook/bart-large

BLOOM:

bigscience/bloom-7b1
bigscience/bloom-3b

Advanced Transformers:
google/mt5-xxl
allenai/t5-11b-base-t2t

"""

## Importing the essential libraries over here

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments
)


In [24]:
from google.colab import drive
!ls

CozmoX.csv  dialect_conversion_model  logs  sample_data  wandb


In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Determine Device

In [3]:
!pip install datasets
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from transformers import BartForConditionalGeneration, BartTokenizer



In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Loading the model

In [26]:
data=pd.read_csv("CozmoX.csv")
data.head()

Unnamed: 0,input_text,target_text
0,I CoLoUr 🎨 the centre of my favourite book.,I color the center of my favorite book.
1,He is travelling ✈️ to the THEATRE.,He is traveling to the theater.
2,I have a flat near the lift.,I have an apartment near the elevator.
3,I have a flat near the lift.,I have an apartment near the elevator.
4,The PROGRAMME 🗓️ will start at 6 O'CLOCK.,The program will start at 6 o'clock.


In [27]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Create DataFrame

In [6]:
df = pd.DataFrame(data)

## Split Dataset


In [7]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

## Load Pre-trained Model and Tokenizer

In [8]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Tokenization Function

In [9]:
def preprocess_function(examples):
    inputs = [ex for ex in examples['input_text']]
    targets = [ex for ex in examples['target_text']]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

## Convert to HuggingFace Datasets


In [10]:
train_dataset = HFDataset.from_pandas(train_df)
val_dataset = HFDataset.from_pandas(val_df)

## Prepare Datasets


In [11]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/266 [00:00<?, ? examples/s]



Map:   0%|          | 0/67 [00:00<?, ? examples/s]

## Training Arguments


In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./dialect_conversion_model',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=100,
    save_total_limit=2,
    push_to_hub=True,
)



## Trainer

In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

## Train Model


In [20]:
def train_dialect_model():
    trainer.train()
    trainer.push_to_hub()

## Dialect Conversion Function


In [21]:
def convert_dialect(text):
    input_ids = tokenizer(text, return_tensors='pt', max_length=128, truncation=True).input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128, num_return_sequences=1)
    converted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return converted_text

## Model Evaluation


In [22]:
def evaluate_model(test_df):
    bleu_scores = []

    for _, row in test_df.iterrows():
        converted = convert_dialect(row['input_text'])
        reference = row['target_text']

        bleu = sentence_bleu([reference.split()], converted.split())
        bleu_scores.append(bleu)

    return np.mean(bleu_scores)

## Main Execution


In [23]:
def main():
    train_dialect_model()

    test_texts = [
        'I CoLoUr the centre of my favourite book.',
        'He is travelling to the THEATRE.',
        'I have a flat near the lift.',
        "hi how are you?",
        "aeropplane is flying",
        'I need to go to the chemist for some medication.',
        'She bought a jumper for the winter.',
        'We are planning a holiday in the countryside.',
        'The postman delivered my parcel this morning.',
        'I’m going to the shop to get some biscuits.',
        'She parked her car near the roundabout.'
    ]


    print("Dialect Conversion Examples:")
    for text in test_texts:
        converted = convert_dialect(text)
        print(f"Original: {text}\nConverted: {converted}\n")

    average_bleu = evaluate_model(val_df)
    print(f"Average BLEU Score: {average_bleu}")

if __name__ == "__main__":
    main()

    # 8a4db80b274e4315ae7a9e4616370d81fa343525

Step,Training Loss,Validation Loss
100,2.9964,2.560286
200,0.3588,0.222357
300,0.0304,0.047404


Dialect Conversion Examples:
Original: I CoLoUr the centre of my favourite book.
Converted: I color the center of my favorite book.

Original: He is travelling to the THEATRE.
Converted: He is going to the theater.

Original: I have a flat near the lift.
Converted: I have an apartment near the elevator.

Original: hi how are you?
Converted: hi how are you?

Original: aeropplane is flying
Converted: s.  

Original: I need to go to the chemist for some medication.
Converted: I need to some medication.  

Original: She bought a jumper for the winter.
Converted: She bought a sweater for the winter.  

Original: We are planning a holiday in the countryside.
Converted: We are planning a holiday in the countryside.  

Original: The postman delivered my parcel this morning.
Converted: The post arrived at 6:30.  

Original: I’m going to the shop to get some biscuits.
Converted: I’m going to the shop to get some biscuits.  

Original: She parked her car near the roundabout.
Converted: She parked

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.6805763297708781
