# Dialect Conversion Model

convert text between UK and US dialects.

# Data Preparation

In [1]:
!pip install transformers datasets




[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


* Load Dataset

In [2]:
import pandas as pd
import re

In [3]:
file=r"D:\Data centr\Exel_data\int data\CozmoX Assignment Dataset.csv"
data=pd.read_csv(file)
data

Unnamed: 0,input_text,target_text
0,I CoLoUr 🎨 the centre of my favourite book.,I color the center of my favorite book.
1,He is travelling ✈️ to the THEATRE.,He is traveling to the theater.
2,I have a flat near the lift.,I have an apartment near the elevator.
3,I have a flat near the lift.,I have an apartment near the elevator.
4,The PROGRAMME 🗓️ will start at 6 O'CLOCK.,The program will start at 6 o'clock.
...,...,...
91,The theatre's performance was breathtaking.,The theater's performance was breathtaking.
92,Her behaviour has been commendable.,Her behavior has been commendable.
93,The cheque was never received.,The check was never received.
94,The aeroplane ✈️ took off on time.,The airplane took off on time.


In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   input_text   96 non-null     object
 1   target_text  96 non-null     object
dtypes: object(2)
memory usage: 1.6+ KB
None


In [5]:
data.isnull().sum()

input_text     0
target_text    0
dtype: int64

* Normalization

In [6]:
import unicodedata

In [7]:
def clean_text(text):
  text=unicodedata.normalize("NFKC", text) # Normalize the char
  text=text.lower()
  text=re.sub(r"[^\w\s]", "", text)  # remove special characters & Emojis
  text=re.sub(r"\s+", " ", text)  # remove extra spaces
  return text

Data Cleaning

In [8]:
data["input_text"] = data["input_text"].apply(clean_text)
data["target_text"] = data["target_text"].apply(clean_text)


In [9]:
print(data.head())

                                 input_text  \
0  i colour the centre of my favourite book   
1           he is travelling to the theatre   
2               i have a flat near the lift   
3              i have a flat near the lift    
4      the programme will start at 6 oclock   

                              target_text  
0  i color the center of my favorite book  
1          he is traveling to the theater  
2   i have an apartment near the elevator  
3   i have an apartment near the elevator  
4      the program will start at 6 oclock  


In [10]:
data.to_csv("processed_cozmoX.csv", index=False)
print("\n saved")


 saved


# Model Selection- T5
Pretrained on Text Transformation.

Best for small dataset with accuracy.

In [11]:
import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration, TFAutoModelForSeq2SeqLM

* Tokenization- breaking text into smaller units (tokens)

In [12]:
tokanizer=T5Tokenizer.from_pretrained("t5-small")

ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [18]:
model_t5=TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


RuntimeError: Failed to import transformers.models.t5.modeling_tf_t5 because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
def tokenize_data(sentence):
  input_text = "translate UK to US: " + sentence["input_text"]  # Task prefix
  target_text = sentence["target_text"]

  tokenized_inputs = tokanizer(input_text, padding="max_length", truncation=True, max_length=128, return_tensors="tf")
  tokenized_targets=tokanizer(target_text, padding="max_length", truncation=True, max_length=128, return_tensors="tf")

  return{
      "input_ids": tokenized_inputs["input_ids"][0],  # Extract tensors
      "attention_mask": tokenized_inputs["attention_mask"][0],
      "labels": tokenized_targets["input_ids"][0],
      }

token_dataset=data.apply(tokenize_data, axis=1)

Convert the tokenized dataset to a TensorFlow dataset

In [None]:
def to_tf(tokenized_data):
  input_ids=[]
  attention_masks=[]
  labels=[]

  for item in tokenized_data:
    input_ids.append(item["input_ids"])
    attention_masks.append(item["attention_mask"])
    labels.append(item["labels"])

  return tf.data.Dataset.from_tensor_slices(({
      "input_ids": input_ids,
      "attention_mask": attention_masks
  },
      tf.stack(labels)
  ))
  tf_dataset=to_tf(token_dataset)

# Model Training

In [None]:
lr_sechedule=tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    decay_rate=0.9,
    staircase=True
)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_sechedule)
loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [18]:
epochs=3
tf_dataset=to_tf(token_dataset).batch(1)

for epoch in range(epochs):
  for batch in tf_dataset:
    inputs, labels = batch

    with tf.GradientTape() as tape:
      # Forward pass
      outputs = model_t5(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels)
      loss = outputs.loss

      # Backward pass
      gradients = tape.gradient(loss, model_t5.trainable_variables)
      optimizer.apply_gradients(zip(gradients, model_t5.trainable_variables))

      print(f"Loss: {loss.numpy()}")

  print(f"Completed epoch {epoch + 1}")

Loss: [12.706329]
Loss: [2.2442951]
Loss: [0.2505015]
Loss: [0.3531171]
Loss: [0.3986079]
Loss: [0.24281038]
Loss: [0.22058555]
Loss: [0.25443977]
Loss: [0.19203994]
Loss: [0.1145226]
Loss: [0.1316802]
Loss: [0.08652504]
Loss: [0.10950559]
Loss: [0.112593]
Loss: [0.93117225]
Loss: [0.17465067]
Loss: [0.21873325]
Loss: [0.24966328]
Loss: [0.23980461]
Loss: [0.20394622]
Loss: [0.22640981]
Loss: [0.2358234]
Loss: [0.24417378]
Loss: [0.20024405]
Loss: [0.24859968]
Loss: [0.16591159]
Loss: [0.20332566]
Loss: [0.18894415]
Loss: [0.14160864]
Loss: [0.10031667]
Loss: [0.22113289]
Loss: [0.14782056]
Loss: [0.10265744]
Loss: [0.11954636]
Loss: [0.08252819]
Loss: [0.08897171]
Loss: [0.09104161]
Loss: [0.12273808]
Loss: [0.05601484]
Loss: [0.09164664]
Loss: [0.20164451]
Loss: [0.07710923]
Loss: [0.06035168]
Loss: [0.10468307]
Loss: [0.10275453]
Loss: [0.05793048]
Loss: [0.07010919]
Loss: [0.05244632]
Loss: [0.15660946]
Loss: [0.04061674]
Loss: [0.06268049]
Loss: [0.04417473]
Loss: [0.11729388]
Los

# Model Evaluation

In [None]:
from datasets import load_metric

ImportError: cannot import name 'load_metric' from 'datasets' (c:\Users\himan\AppData\Local\Programs\Python\Python312\Lib\site-packages\datasets\__init__.py)

In [18]:
bleu_matric=load_metric("bleu")
rouge_matric=load_metric("rouge")

In [19]:
epochs = 3
batch_size = 1
tf_dataset = to_tf(token_dataset).batch(batch_size)

# Graph Visualization

# Model Deployment