 # **Installing & Importing dependencies**

In [1]:
# Install the tqdm package using pip
!pip install tqdm
import Levenshtein
# Import necessary packages
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm # Import tqdm to show progress bar during loops
import re # Import regular expressions

[0m

# Inference(Takes about 20 mins for 5000 samples)

In [2]:
"""
Just change the path of the csv in the dataframe test_data for running inference on hidden dataset. 
Please kindly make sure that the csv file has a column named "text" as it explicitly called in the 
inference and post processing cell. If the csv file has any other name then kindly change the column name
in the line number 25 of this cell and line number 83, 88, 112, 114, and 136.
"""
# Read the test dataset
test_data = pd.read_csv("/kaggle/input/bhashabhrom-evaluation-thirdset/thirdset.csv")
test_data['sentence'] = test_data['sentence'].apply(lambda x: x.replace('$',''))
test_data["Expected"] = test_data['sentence'] 
test_data["text"] = test_data['sentence'] 
# Create an empty dataframe with columns 'Id' and 'Expected'
df = pd.DataFrame(columns=['Id', 'Expected'])

In [3]:
# Initialize the T5 tokenizer and model using a pre-trained checkpoint
# Note: Version 5 of the dataset "csebuetnlp-1d1-2d12" contains the best model. We have pinned that version. Kindly do not change the version to the lastest one.
tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/csebuetnlp-1d1-2d12/bt5_on_3d1_2d1_final_prepro_xoxo")
model = T5ForConditionalGeneration.from_pretrained("/kaggle/input/csebuetnlp-1d1-2d12/bt5_on_3d1_2d1_final_prepro_xoxo")

# Set the device (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device
model.to(device)

# Loop through each sentence in the test dataset and generate the corresponding output
index=1
for sentence in tqdm(test_data['sentence']):
    
    # Format input with the sentence and desired prefix
    input_str = sentence
    
    # Tokenize the input
    input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)

    # Generate the output
    output_ids = model.generate(input_ids=input_ids, 
                                max_length=512, 
                                num_beams=4, 
                                early_stopping=True)

    # Decode the output and remove unwanted tokens
    output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Add the transcription to the dataframe 'df'
    data = {'Id': index,
            'Expected': output_str}
    df = df.append(data, ignore_index=True)
    index=index+1

100%|██████████| 9999/9999 [45:57<00:00,  3.63it/s]


# Post Processing(Takes only 5 sec)

In [4]:
def replace_half_dollar_signs(s):
    """
    This function takes a string s as input, splits it into words, 
    and replaces half of the dollar signs in each word with an asterisk. 
    The modified words are then joined back into a string and returned.
    """
    def contains_only_dollar_signs(s):
        for char in s:
            if char != '$':
                return False
        return True  
    
    s2 = s.split()
    for i in range(len(s2)):
        if contains_only_dollar_signs(s2[i]):
            dollar_count = s2[i].count('$')
            s2[i] = s2[i][:(dollar_count+1)//2] + '*' + s2[i][(dollar_count+1)//2:]
    return " ".join(s2)

In [5]:
def replace_double_spaces(str1, str2):
    """
    Replaces double spaces in the first string with a '$ $' separator and replaces certain characters in the second string.

    Args:
    - str1: A string containing double spaces that need to be replaced.
    - str2: A string containing characters that need to be replaced.

    Returns:
    - A new string with the specified modifications.

    """
    # split str1 by double spaces and str2 by single spaces
    str1_list = str1.split("  ")
    str2_list = str2.split()
    str2_list_copy = str2_list.copy()
    
    # Replace any occurrence of '$' in str2_list with an empty string
    str2_list = [i.replace('$', '') for i in str2_list] 
    
    # Split the first part of str1 by single spaces
    str1_list_part1 = str1_list[0].split()  
    
    # Iterate through the split string, checking if each element is equal to the corresponding element in str2_list
    for i in range(len(str1_list_part1)):
        if (str1_list_part1[i] == str2_list[i]):
            continue
            
    # Join the modified str2_list and add a '$ $' separator where the double spaces were in str1
    output = " ".join(str2_list_copy[:i+1]) + "$ $ " + " ".join(str2_list_copy[i+1:])
    return output


In [6]:
def align(first_sentence, second_sentence):
    """
    Aligns two given sentences by replacing words in the second sentence with words in the first sentence.

    Args:
    - first_sentence: A string representing the first sentence with dollar signs ($).
    - second_sentence: A string representing the second sentence (original).

    Returns:
    - A new string with the second sentence aligned to the first sentence.

    """
    try:
        first_sentence=replace_half_dollar_signs(first_sentence)
        fs_words = first_sentence.split()
        ss_words = second_sentence.split()
        for i, (f_word, s_word) in enumerate(zip(fs_words, ss_words)):
            if f_word == s_word:
                continue
            else:
                index=0
                count_head=0
                count_tail=0
                while f_word[index:index+1]=='$':
                    count_head+=1
                    index+=1
                index=-1
                if f_word[index]=='$':
                    while f_word[index]=='$':
                        count_tail+=1
                        index-=1
                    ss_words[i] = ('$'*count_head) + ss_words[i] + ('$'*count_tail)
                else:
                    things_to_ignore=["$,","$?","$!","$।","$'"]
                    found=False
                    for j in things_to_ignore:
                        if f_word == j:
                           found=True
                    if found == False:
                        index=-2
                        while f_word[index]=='$':
                            count_tail+=1
                            index-=1
                        ss_words[i] = ('$'*count_head) + ss_words[i][:-1] + ('$'*count_tail) + ss_words[i][-1]
                    else:
                        ss_words[i] = ('$'*count_head) + ss_words[i]
        joined_sentence = " ".join(ss_words)
        str_1=joined_sentence.replace("$", "")
        if len(str_1)!=len(second_sentence):
            if second_sentence.count("  ") == 1:
                first=second_sentence
                second=joined_sentence
                output = replace_double_spaces(first, second)
                joined_sentence=output
        return joined_sentence
    except Exception as e:
        return first_sentence

In [7]:
index=0
for i in tqdm(df["Expected"]):
    if i.find("$")!=-1:
        first_sentence = i
        second_sentence = test_data["text"][index]
        joined_sentence=align(first_sentence,second_sentence)
        df["Expected"][index]=joined_sentence
        index=index+1
    else:
        df["Expected"][index]=test_data["text"][index]
        index=index+1

100%|██████████| 9999/9999 [00:04<00:00, 2335.88it/s]


In [8]:
for index, row in df.iterrows():
    if " $।$" in row["Expected"]:
        df.at[index, "Expected"] = row["Expected"].replace(" $।$","$ $।")
    if "$ ।$" in row["Expected"]:
        df.at[index, "Expected"] = row["Expected"].replace("$ ।$","$ $।")
    if "  ।" in row["Expected"]:
        df.at[index, "Expected"]=row["Expected"].replace("  ।","$ $$ $।")
    if "  ?" in row["Expected"]:
        df.at[index, "Expected"]=row["Expected"].replace("  ।","$ $$ $।")
    if "  !" in row["Expected"]:
        df.at[index, "Expected"]=row["Expected"].replace("  ।","$ $$ $।")
    if " ।" in row["Expected"]:
        df.at[index, "Expected"] = row["Expected"].replace(" ।","$ $।")
    if " ?" in row["Expected"]:
        df.at[index, "Expected"] = row["Expected"].replace(" ?","$ $?")
    if " !" in row["Expected"]:
        df.at[index, "Expected"] = row["Expected"].replace(" !","$ $!")
        
for index, row in df.iterrows():
    if '$* ' in row["Expected"]:
        a=row["Expected"].replace("$* ","$")
        df["Expected"][index]=align(a,test_data["text"][index])
for index, row in df.iterrows():
    if test_data["text"][index][0]==" " and row["Expected"][0] != "$":
        df["Expected"][index]="$ $" + row["Expected"]
        
def append_if_not_match(string, patterns):
    """
    Appends "$$" to the end of a string if the last two characters do not match
    any of the specified patterns.
    """
    status = True
    for pattern in patterns:
        if re.search(pattern, string[-2:]):
            status = False
            break
    if status:
        string += "$$"
    return string

patterns2 = ["\\$\\$", "!\\$", "\\?\\$", "\\$\\$", "।\\$", "!", "\\?","।"]
for index, row in df.iterrows():
        string2=df["Expected"][index]
        df["Expected"][index] = append_if_not_match(string2, patterns2)
for index, row in test_data.iterrows():
    if row["text"][-1] == " ":
        df["Expected"][index]=df["Expected"][index]+" "
for index, row in df.iterrows():
    if '!$$' in row["Expected"]:
        df["Expected"][index]=row["Expected"].replace("!$$","!")
    if '?$$' in row["Expected"]:
        df["Expected"][index]=row["Expected"].replace("?$$","?")
    if '।$$' in row["Expected"]:
        df["Expected"][index]=row["Expected"].replace("।$$","।")
i=False
for ind, row in df.iterrows():
    sentence=row["Expected"].split()
    for j in range(len(sentence)):
        if "!$" in sentence[j]:
            index=sentence[j].index("!")
            if sentence[j][index-1] != "$":
                sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                i=True
                if sentence[j][0]=="$":
                    sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                    i=i+1
    df["Expected"][ind]=" ".join(sentence)
    if i==True:
        i=False
i=False
for ind, row in df.iterrows():
    sentence=row["Expected"].split()
    for j in range(len(sentence)):
        if "?$" in sentence[j]:
            index=sentence[j].index("?")
            if sentence[j][index-1] != "$":
                sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                i=True
                if sentence[j][0]=="$":
                    sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                    i=i+1
    df["Expected"][ind]=" ".join(sentence)
    if i==True:
        i=False
i=False
for ind, row in df.iterrows():
    sentence=row["Expected"].split()
    for j in range(len(sentence)):
        if "।$" in sentence[j]:
            index=sentence[j].index("।")
            if sentence[j][index-1] != "$":
                sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                i=True
                if sentence[j][0]=="$":
                    sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                    i=i+1
    df["Expected"][ind]=" ".join(sentence)
    if i==True:
        i=False
i=False
for ind, row in df.iterrows():
    if ",$ $" not in sentence:
        sentence=row["Expected"].split()
        for j in range(len(sentence)):
            if ",$" in sentence[j]:
                index=sentence[j].index(",")
                if sentence[j][index-1] != "$":
                    sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                    i=True
                    if sentence[j][0]=="$":
                        sentence[j]=sentence[j][:index]+"$"+sentence[j][index:]
                        i=i+1
        df["Expected"][ind]=" ".join(sentence)
        if i==True:
            i=False

# Converting Dataframe to CSV

In [9]:
distance = 0
for x,y in zip(test_data.ged, df.Expected):
    distance+=Levenshtein.distance(x, y)
print(distance/len(test_data))

0.7073707370737073


In [10]:
df.to_csv('./predicted_sentence.csv',index = False)