 # **Install & Import dependencies**

In [None]:
# Install tqdm package using pip
!pip install tqdm 

In [None]:
# Import necessary packages
from tqdm import tqdm # tqdm is used to show progress bar
import pandas as pd # pandas is used for data manipulation
import re # re is used for regular expressions
import os # os is used for operating system related functions
import torch # torch is used for building deep learning models
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW # Import T5 model and its related classes from transformers package
from transformers.optimization import AdamW, get_linear_schedule_with_warmup # Import AdamW optimizer and its related functions from transformers package

# Merge the training datasets into a single DataFrame

In [None]:
# Read the two dataset folds.
df1 = pd.read_csv("/kaggle/input/bengali-ged/DataSetFold1_u.csv")
df2 = pd.read_csv("/kaggle/input/bengali-ged/DataSetFold2.csv")

# Read the extra dataset: 30 additional samples we want the model to learn
# during training. Its 'original' column is renamed to 'sentence'
# (presumably so it lines up with the column name used by the folds).
df3 = pd.read_csv("/kaggle/input/extra-data/Bhashabhrom - Sheet1 (1).csv").rename(
    columns={"original": "sentence"}
)

# Stack the three frames on top of each other and renumber the rows 0..n-1
# in the same step.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Preprocessing

In [None]:
# Literal rewrites applied to every 'gt' string, in this order. Each entry is
# (text to find, replacement).
_GT_LITERAL_RULES = (
    (" $।$", "$ $।"),
    ("$ ।$", "$ $।"),
    (" ।", "$ $।"),
    ("  ", "$ $ "),  # two consecutive spaces; the replacement keeps one trailing space
    (" $,$", "$ $,"),
)

# A '।' or '!' immediately followed by at least three '$' characters.
_GT_COLLAPSE_TRIGGER = re.compile(r"[।!]\${3}")

# Any run of three or more '$' characters.
_GT_DOLLAR_RUN = re.compile(r"\${3,}")


def _normalize_gt(text):
    """Return one 'gt' string with its '$' markers normalised.

    The literal rules in ``_GT_LITERAL_RULES`` are applied one after another,
    each on the output of the previous rule. Afterwards, if a '।' or '!' is
    directly followed by three or more '$', every run of three or more '$' in
    the string (not only the run that triggered the check) is collapsed to a
    single '$'.

    Args:
        text: The raw 'gt' string.

    Returns:
        The normalised string.
    """
    for needle, replacement in _GT_LITERAL_RULES:
        # str.replace is a no-op when the needle is absent, so no membership
        # test is needed first.
        text = text.replace(needle, replacement)

    if _GT_COLLAPSE_TRIGGER.search(text):
        text = _GT_DOLLAR_RUN.sub("$", text)
    return text


# Normalise the whole 'gt' column. The rules are chained on a running value
# rather than re-read from the row snapshot handed out by iterrows(): that
# snapshot is not refreshed by df.at writes, so reading it for every rule makes
# a later rule overwrite what an earlier rule produced.
df["gt"] = df["gt"].map(_normalize_gt)
        

In [None]:
def append_if_not_match(string, patterns):
    """
    Return ``string`` with "$$" appended, unless one of the regular
    expressions in ``patterns`` is found in its last two characters.

    Every pattern is tried with ``re.search`` against ``string[-2:]``, so a
    hit anywhere inside that (at most two character) tail counts as a match.
    When any pattern hits, the string is returned unchanged.
    """
    tail = string[-2:]
    if any(re.search(regex, tail) for regex in patterns):
        return string
    return string + "$$"
# Regular expressions tried against the last two characters of each 'gt'
# string; a string whose tail matches none of them gets "$$" appended.
patterns2 = [
    "\\$\\$",
    "!\\$",
    "\\?\\$",
    "\\$\\$",
    "।\\$",
    "!",
    "\\?",
    "।",
]

# Run the check over the whole 'gt' column and store the results back.
df["gt"] = df["gt"].map(lambda text: append_if_not_match(text, patterns2))

# Training

In [None]:
# Switch off Weights & Biases reporting so that no training metrics are sent
# to the wandb servers.
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Directory of the pre-trained checkpoint; both the tokenizer and the model
# are loaded from it.
prey='/kaggle/input/csebuetnlp-1d1-2d12/bt5_on_3d1_2d1_final_prepro_xoxo'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the T5 tokenizer, then load the T5 text-to-text model and place it on
# the selected device straight away.
tokenizer = T5Tokenizer.from_pretrained(prey)
model = T5ForConditionalGeneration.from_pretrained(prey).to(device)

# Training hyper-parameters.
EPOCHS = 100  # number of passes over the training DataFrame
BATCH_SIZE = 16  # rows per optimizer step
LEARNING_RATE = 1e-4  # peak learning rate handed to the optimizer
WARMUP_STEPS = 100  # scheduler steps spent ramping the learning rate up

# The training loop walks the DataFrame in strides of BATCH_SIZE, so a trailing
# partial batch still costs one optimizer/scheduler step. Round the per-epoch
# count up: with floor division the linear schedule would reach a learning
# rate of zero before the last steps of training have run.
STEPS_PER_EPOCH = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE
TOTAL_STEPS = STEPS_PER_EPOCH * EPOCHS  # total number of scheduler steps

# AdamW over all model parameters, with a learning rate that warms up linearly
# for WARMUP_STEPS and then decays linearly over the remaining steps.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=TOTAL_STEPS,
)

# Training the model.
# Training mode is a persistent flag on the model and nothing in the loop
# switches it off, so it is set once up front rather than on every batch.
model.train()

for epoch in range(EPOCHS):
    # i is the row offset at which the current batch starts (0, BATCH_SIZE, ...).
    for i in tqdm(range(0, len(df), BATCH_SIZE)):
        batch = df.iloc[i:i+BATCH_SIZE]  # rows of the current batch
        sentences = list(batch["sentence"])  # model inputs
        gts = list(batch["gt"])  # target outputs

        # Tokenize the inputs, padding to the longest sentence in the batch.
        # The attention mask is kept so the encoder can ignore the padding.
        encoded_inputs = tokenizer.batch_encode_plus(sentences, padding=True, return_tensors="pt")
        input_ids = encoded_inputs["input_ids"].to(device)
        attention_mask = encoded_inputs["attention_mask"].to(device)

        # Tokenize the targets. Padded positions are set to -100, the label
        # value the loss ignores, so padding does not contribute to the loss.
        gt_ids = tokenizer.batch_encode_plus(gts, padding=True, return_tensors="pt")["input_ids"].to(device)
        gt_ids[gt_ids == tokenizer.pad_token_id] = -100

        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=gt_ids).loss
        loss.backward()  # back-propagate
        optimizer.step()  # update the model parameters
        scheduler.step()  # advance the learning-rate schedule

        # Report the loss every 100 batches; the number printed after "Batch"
        # is the starting row offset i, not a batch counter.
        if i % (100 * BATCH_SIZE) == 0:
            print(f"Epoch {epoch}, Batch {i}: Loss {loss.item()}")

# Persist the trained model and then its tokenizer, both into the same
# output directory.
joker="bt5_on_3d1_2d2_1d1d2_final_prepro_xoxo"
for artifact in (model, tokenizer):
    artifact.save_pretrained(joker)