In [None]:
#import library
import csv
import os
import pickle
import random

import numpy as np
import pandas as pd
from happytransformer import HappyTextToText
from happytransformer import TTSettings
from happytransformer import TTTrainArgs

In [None]:
# Load the main dataset from main_dataset_v3.csv and preview its first 10 rows.
df = pd.read_csv("main_dataset_v3.csv")
df.head(10)

# Generate train dataset for spell correction

In [None]:
# Load the source sentences from the main dataset CSV.
input_df = pd.read_csv("main_dataset_v3.csv")

# Take the FIRST 20000 rows. This is a deterministic head slice, not a random
# sample: keeping the training rows contiguous lets a later split be cut from
# the rows that follow without overlapping the training data.
selected_rows = input_df.iloc[0:20000, :]

# Extract the Bangla sentences. Missing values are dropped because misspell()
# calls .split() on every sentence and would fail on NaN.
bangla_sentences = selected_rows['Sentence'].dropna().astype(str).tolist()

# Replacement pool used to corrupt words: the Bengali consonants plus a few
# independent vowels (অ, আ, ই, উ).
consonant_characters = [
    'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট',
    'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ',
    'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', 'ড়', 'ঢ়','অ','আ','ই','উ']


def misspell(sentence, probability=0.3, rng=None):
    """Return ``sentence`` with some of its words randomly misspelled.

    The sentence is split on whitespace. Each word is, with the given
    probability, corrupted by overwriting one randomly chosen character with a
    different character drawn from ``consonant_characters``. The words are then
    re-joined with single spaces.

    Args:
        sentence: Text to corrupt.
        probability: Per-word chance (0..1) that the word is corrupted.
        rng: Optional random source exposing ``random()``, ``randint()`` and
            ``choice()`` (for example ``random.Random(seed)``). Defaults to the
            module-level ``random`` generator, so ``random.seed()`` still
            controls the output.

    Returns:
        The corrupted sentence as a string. An empty sentence yields "".
    """
    rng = random if rng is None else rng
    words = sentence.split()
    for i, original_word in enumerate(words):
        if rng.random() < probability:
            chars = list(original_word)
            position = rng.randint(0, len(chars) - 1)
            # Exclude the character already at this position: picking it again
            # would leave the word unchanged, producing a "misspelled" training
            # pair whose input is identical to its target.
            candidates = [c for c in consonant_characters if c != chars[position]]
            chars[position] = rng.choice(candidates)
            words[i] = ''.join(chars)
    return ' '.join(words)

# Seed the global RNG that misspell() draws from, so that re-running the
# notebook regenerates exactly the same training set.
random.seed(42)

# Corrupt every training sentence.
misspelled_sentences = [misspell(sentence) for sentence in bangla_sentences]

# Pair each corrupted sentence ('sentence') with its original ('corrections').
df_new = pd.DataFrame({
    'sentence': misspelled_sentences,
    'corrections': bangla_sentences,
})

# Persist the pairs; the file is read back later as the training dataset.
df_new.to_csv('train_dataset.csv', index=False)


In [None]:
# Preview the first rows of the generated (misspelled, original) sentence pairs.
df_new.head()

# Generate eval dataset for spell correction

In [None]:
# Load the source sentences from the main dataset CSV.
input_df = pd.read_csv("main_dataset_v3.csv")

# Take the 1500 rows that immediately follow the 20000 training rows, so the
# two splits are disjoint and contiguous.
# NOTE: an earlier slice, iloc[20001:250001], skipped row 20000 and selected up
# to 230000 rows rather than the documented 1500.
eval_start = 20000
eval_size = 1500
selected_rows = input_df.iloc[eval_start:eval_start + eval_size, :]

# Extract the Bangla sentences. Missing values are dropped because misspell()
# calls .split() on every sentence and would fail on NaN.
bangla_sentences = selected_rows['Sentence'].dropna().astype(str).tolist()

# consonant_characters and misspell() are defined once, in the train-dataset
# cell above, and are reused here instead of being redefined.

# Seed the global RNG that misspell() draws from so the eval set is
# reproducible (a different seed than the one a training run would use keeps
# the corruption patterns independent).
random.seed(43)

# Corrupt every evaluation sentence.
misspelled_sentences = [misspell(sentence) for sentence in bangla_sentences]

# Pair each corrupted sentence ('sentence') with its original ('corrections').
df_new = pd.DataFrame({
    'sentence': misspelled_sentences,
    'corrections': bangla_sentences,
})

# Persist the pairs; the file is read back later as the evaluation dataset.
df_new.to_csv('eval_dataset.csv', index=False)

In [None]:
# Load the text-to-text model: a HappyTextToText wrapper of model type "T5"
# using the "csebuetnlp/banglat5" model name.
from happytransformer import HappyTextToText

happy_tt = HappyTextToText("T5", "csebuetnlp/banglat5")

In [None]:
# Load the generated training pairs (columns 'sentence' and 'corrections')
# from train_dataset.csv and preview the first 10 rows.
train_dataset = pd.read_csv("train_dataset.csv")
train_dataset.head(10)

In [None]:
# Load the generated evaluation pairs (columns 'sentence' and 'corrections')
# from eval_dataset.csv and preview the first 10 rows.
eval_dataset = pd.read_csv("eval_dataset.csv")
eval_dataset.head(10)

In [None]:
import csv

def generate_csv(csv_path, dataset):
    """Write ``dataset`` as a two-column ("input", "target") CSV file.

    Each row's "sentence" value is prefixed with "grammar: " and written as the
    input; its "corrections" value is written unchanged as the target. Rows in
    which either value is missing (NaN/None) or blank are skipped.

    Args:
        csv_path: Destination path of the CSV file (overwritten if it exists).
        dataset: pandas DataFrame with "sentence" and "corrections" columns.
    """
    # Explicit UTF-8: the sentences are Bengali text, and the platform default
    # encoding is not guaranteed to be able to represent them.
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["input", "target"])
        for sentence, correction in zip(dataset["sentence"], dataset["corrections"]):
            # Validate BEFORE adding the prefix: a prefixed string is always
            # truthy, and NaN is truthy too, so a plain truthiness check on the
            # final values can never filter anything out.
            if pd.isna(sentence) or pd.isna(correction):
                continue
            sentence = str(sentence)
            correction = str(correction)
            if not sentence.strip() or not correction.strip():
                continue
            writer.writerow(["grammar: " + sentence, correction])

# Export both splits. train_dataset and eval_dataset are the DataFrames loaded
# above; each provides the "sentence" and "corrections" columns generate_csv reads.
generate_csv("train.csv", train_dataset)
generate_csv("eval.csv", eval_dataset)


In [None]:
# Read back the exported train.csv ("input"/"target" columns) to inspect it.
train = pd.read_csv("train.csv")

In [None]:
# Preview the first rows of the exported training file.
train.head()

In [None]:
# Baseline: evaluate the model on eval.csv before the training cell below is
# run, and print the resulting loss.
before_result = happy_tt.eval("eval.csv")
print("Before loss:", before_result.loss)


In [None]:
from happytransformer import TTTrainArgs

# Fine-tune on the file produced by generate_csv("train.csv", ...) above.
# The path must match where that cell writes: the notebook creates train.csv
# (and eval.csv) in the working directory and never creates "dataset/".
args = TTTrainArgs(batch_size=1, num_train_epochs=6)
happy_tt.train("train.csv", args=args)

In [None]:
from happytransformer import TTSettings

# Generation settings for the first sanity check: a single beam, with output
# length bounded by min_length=2 and max_length=32.
# NOTE(review): the lengths are presumably counted in tokens — confirm against
# the TTSettings documentation; 32 may truncate long sentences.
beam_settings =  TTSettings(num_beams=1, min_length=2, max_length=32)

In [None]:
# Run the model on a hand-written misspelled sentence (with the same
# "grammar: " prefix used in the training data) and print the generated text.
example_1 = "grammar: আজ রোফবার দুপুরে রাজফনীর ইস্কাটনে ঢাকা ম্যাস ট্রানজিট কোম্পাকি লিমিডোডের (ডিএমটিসিএল) কার্যালয়ে আয়োজিএ এক সংবাজ সম্মেলনে এ কতা জানাদ সংস্থাটির ব্যবস্থাপকা পরিচাকক এম এ এন"
result_1 = happy_tt.generate_text(example_1, args=beam_settings)
print(result_1.text)

In [None]:
import pickle

# Serialize the HappyTextToText object to model/model.pkl.
# The target directory is created first because open() does not create
# missing parent directories, and the `with` block guarantees the file is
# flushed and closed before a later cell reads it back.
os.makedirs('model', exist_ok=True)
with open('model/model.pkl', 'wb') as model_file:
    pickle.dump(happy_tt, model_file)

In [None]:
# Load the pickled model back from disk; the `with` block closes the file.
# NOTE(security): pickle.load can execute arbitrary code — only load pickle
# files that you created yourself.
with open('model/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:

# Load the pickled model from disk; the `with` block closes the file handle.
# NOTE(security): pickle.load can execute arbitrary code — only load pickle
# files that you created yourself.
with open('model/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Generation settings used by this and the following example cells:
# 5 beams, with output length bounded by min_length=1 and max_length=100.
beam_settings =  TTSettings(num_beams=5, min_length=1, max_length=100)

# Correct a hand-written misspelled sentence and print the generated text.
example_1 = "grammar: আজ রোফবার দুপুরে রাজফনীর ইস্কাটনে ঢাকা ম্যাস ট্রানজিট কোম্পাকি লিমিডোডের (ডিএমটিসিএল) কার্যালয়ে আয়োজিএ এক সংবাজ সম্মেলনে এ কতা জানাদ সংস্থাটির ব্যবস্থাপকা পরিচাকক"
result_1 = model.generate_text(example_1, args=beam_settings)
print(result_1.text)

In [None]:
# Further check: run the reloaded model on another misspelled sentence and
# print the generated text.
example_1 = "grammar: মহাসড়কে ফিটহেস পরক্ষা করা পুবিশের গাড়িই ফিনেসহীন"
result_1 = model.generate_text(example_1, args=beam_settings)
print(result_1.text)

In [None]:
# Further check: a misspelled sentence that ends with punctuation and a
# trailing space; print the generated text.
example_1 = "grammar: সিলেট নগরর ফুতঁপাত ও সকক দখল করে পণ্যর পসরা সজিয়ে বসেছিকেন ভ্রাম্যমান ব্যবষয়ীরা। "
result_1 = model.generate_text(example_1, args=beam_settings)
print(result_1.text)

In [None]:
# Further check: a longer misspelled passage; print the generated text.
example_1 = "grammar: এদিকে পুনর্বানন কার্যক্রম গুরুর পর সোমবব থেকেই নগলের ফুটফাত ও সড়ক দখমুক্ত করতে অভিযনে নামে সিসিক মঙ্গলবারও নগলের কয়েকটি একাকায় এ অভিযান চালানো হয়। "
result_1 = model.generate_text(example_1, args=beam_settings)
print(result_1.text)