# Dialect Conversion Model

convert text between UK and US dialects.

# Data Preparation

In [1]:
! pip install transformers datasets



* Load Dataset

In [2]:
import pandas as pd
import re

In [3]:
file=r"D:\Data centr\Exel_data\int data\CozmoX Assignment Dataset.csv"
data=pd.read_csv(file)
data

Unnamed: 0,input_text,target_text
0,I CoLoUr 🎨 the centre of my favourite book.,I color the center of my favorite book.
1,He is travelling ✈️ to the THEATRE.,He is traveling to the theater.
2,I have a flat near the lift.,I have an apartment near the elevator.
3,I have a flat near the lift.,I have an apartment near the elevator.
4,The PROGRAMME 🗓️ will start at 6 O'CLOCK.,The program will start at 6 o'clock.
...,...,...
91,The theatre's performance was breathtaking.,The theater's performance was breathtaking.
92,Her behaviour has been commendable.,Her behavior has been commendable.
93,The cheque was never received.,The check was never received.
94,The aeroplane ✈️ took off on time.,The airplane took off on time.


In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   input_text   96 non-null     object
 1   target_text  96 non-null     object
dtypes: object(2)
memory usage: 1.6+ KB
None


In [5]:
data.isnull().sum()

input_text     0
target_text    0
dtype: int64

* Normalization

In [6]:
import unicodedata

In [7]:
def clean_text(text):
  text=unicodedata.normalize("NFKC", text) # Normalize the char
  text=text.lower()
  text=re.sub(r"[^\w\s]", "", text)  # remove special characters & Emojis
  text=re.sub(r"\s+", " ", text)  # remove extra spaces
  return text

Data Cleaning

In [8]:
data["input_text"] = data["input_text"].apply(clean_text)
data["target_text"] = data["target_text"].apply(clean_text)

In [9]:
print(data.head())

                                 input_text  \
0  i colour the centre of my favourite book   
1           he is travelling to the theatre   
2               i have a flat near the lift   
3              i have a flat near the lift    
4      the programme will start at 6 oclock   

                              target_text  
0  i color the center of my favorite book  
1          he is traveling to the theater  
2   i have an apartment near the elevator  
3   i have an apartment near the elevator  
4      the program will start at 6 oclock  


In [10]:
data.to_csv("processed_cozmoX.csv", index=False)
print("\n saved")


 saved


# Model Selection- T5
Pretrained on Text Transformation.

Best for small dataset with accuracy.

In [11]:
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFAutoModelForSeq2SeqLM

* Tokenization- breaking text into smaller units (tokens)

In [12]:
tokanizer=T5Tokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
model_t5=TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")




model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]




All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [15]:
def tokenize_data(sentence):
  input_text = "translate UK to US: " + sentence["input_text"]  # Task prefix
  target_text = sentence["target_text"]

  tokenized_inputs = tokanizer(input_text, padding="max_length", truncation=True, max_length=128, return_tensors="tf")
  tokenized_targets=tokanizer(target_text, padding="max_length", truncation=True, max_length=128, return_tensors="tf")

  return{
      "input_ids": tokenized_inputs["input_ids"][0],  # Extract tensors
      "attention_mask": tokenized_inputs["attention_mask"][0],
      "labels": tokenized_targets["input_ids"][0],
      }

token_dataset=data.apply(tokenize_data, axis=1)

Convert the tokenized dataset to a TensorFlow dataset

In [16]:
def to_tf(tokenized_data):
  input_ids=[]
  attention_masks=[]
  labels=[]

  for item in tokenized_data:
    input_ids.append(item["input_ids"])
    attention_masks.append(item["attention_mask"])
    labels.append(item["labels"])

  return tf.data.Dataset.from_tensor_slices(({
      "input_ids": input_ids,
      "attention_mask": attention_masks
  },
      tf.stack(labels)
  ))
  tf_dataset=to_tf(token_dataset)

# Model Training

In [17]:
lr_sechedule=tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    decay_rate=0.9,
    staircase=True
)

In [18]:
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_sechedule)
loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [19]:
epochs=3
tf_dataset=to_tf(token_dataset).batch(1)

for epoch in range(epochs):
  for batch in tf_dataset:
    inputs, labels = batch

    with tf.GradientTape() as tape:
      # Forward pass
      outputs = model_t5(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels)
      loss = outputs.loss

      # Backward pass
      gradients = tape.gradient(loss, model_t5.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model_t5.trainable_variables))

      print(f"Loss: {loss.numpy()}")

  print(f"Completed epoch {epoch + 1}")

Loss: [12.706328]
Loss: [2.244296]
Loss: [0.2505015]
Loss: [0.35311702]
Loss: [0.39860797]
Loss: [0.24281064]
Loss: [0.22058557]
Loss: [0.25443977]
Loss: [0.19203979]
Loss: [0.11452226]
Loss: [0.13167979]
Loss: [0.08652425]
Loss: [0.10950971]
Loss: [0.11259355]
Loss: [0.93104804]
Loss: [0.17465252]
Loss: [0.21873431]
Loss: [0.24966344]
Loss: [0.23980623]
Loss: [0.20394443]
Loss: [0.22640795]
Loss: [0.23582111]
Loss: [0.24417078]
Loss: [0.20024209]
Loss: [0.24859692]
Loss: [0.16590907]
Loss: [0.20332238]
Loss: [0.18894307]
Loss: [0.14160718]
Loss: [0.10031591]
Loss: [0.22112861]
Loss: [0.1478179]
Loss: [0.10265511]
Loss: [0.11954484]
Loss: [0.08252809]
Loss: [0.08896951]
Loss: [0.09104055]
Loss: [0.12273654]
Loss: [0.05601408]
Loss: [0.09164749]
Loss: [0.20163625]
Loss: [0.07711082]
Loss: [0.06035097]
Loss: [0.10470805]
Loss: [0.10274045]
Loss: [0.05791936]
Loss: [0.07011247]
Loss: [0.0524703]
Loss: [0.15679812]
Loss: [0.04060385]
Loss: [0.06277525]
Loss: [0.04426995]
Loss: [0.11738274]

# Model Evaluation

In [21]:
from evaluate import load

In [23]:
bleu_metric = load("bleu")
rouge_metric = load("rouge")

In [24]:
epochs = 3
batch_size = 1
tf_dataset = to_tf(token_dataset).batch(batch_size)

# Graph Visualization

# Model Deployment