In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM,Bidirectional,Embedding,Input,Dense,TextVectorization

In [3]:
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu,True)

In [4]:
df = pd.read_csv("drive/MyDrive/Grammar_Correction/test_dataset_csv")

In [5]:
df.head()

Unnamed: 0,incorrect,correct
0,"Bitcoin is for $7,094 this morning, which Coin...","Bitcoin goes for $7,094 this morning, accordin..."
1,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
2,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
3,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
4,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...


In [6]:
df["length_incorr"] = [len(i) for i in df["incorrect"]]
df["length_corr"] = [len(i) for i in df["correct"]]

In [7]:
df.head()

Unnamed: 0,incorrect,correct,length_incorr,length_corr
0,"Bitcoin is for $7,094 this morning, which Coin...","Bitcoin goes for $7,094 this morning, accordin...",56,60
1,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ...",87,92
2,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...,334,355
3,Much many brands and sellers still in the market.,Many brands and sellers still in the market.,49,44
4,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...,54,55


In [8]:
df["length_incorr"].describe()

count    152175.000000
mean        132.274855
std         109.284164
min           2.000000
25%          62.000000
50%         106.000000
75%         172.000000
max       12705.000000
Name: length_incorr, dtype: float64

In [9]:
df["length_corr"].describe()

count    152175.000000
mean        132.002944
std          98.061967
min          12.000000
25%          62.000000
50%         106.000000
75%         173.000000
max        1160.000000
Name: length_corr, dtype: float64

In [10]:
def reduce_sentences(data_input, data_output):
    dataset_input, dataset_output = zip(*[(inp, out) for inp, out in zip(data_input, data_output) if len(inp) <= 500 and len(out) <= 500])
    return list(dataset_input), list(dataset_output)

In [11]:
incorrect_sent,correct_sentences = reduce_sentences(df["incorrect"],df["correct"])

In [12]:
len(incorrect_sent),len(correct_sentences)

(150660, 150660)

In [13]:
X_train,X_test,y_train,y_test = train_test_split(incorrect_sent,correct_sentences,test_size=0.1,random_state=42)

In [14]:
df_new = pd.DataFrame({
    "incorrect":X_train,
    "correct":y_train
})

In [15]:

df_test = pd.DataFrame({
    "incorrect":X_test,
    "correct":y_test
})

In [16]:
df_new.head()

Unnamed: 0,incorrect,correct
0,"All our technical staff works with ES ｃcats, s...","All our technical staff works with ESD coat, s..."
1,"28:20 And David said to Solomon his son, Be st...","28:20 And David said to Solomon his son, Be st..."
2,"Actually strap rockets to our back, though tha...","Actually strap rockets to our back, although, ..."
3,Chart of the are day: You are Better Off Than ...,Chart of the Day: Are You Better Off Than You ...
4,Olly Murs added a series of new shows to his U...,Olly Murs has added a series of new shows to h...


In [17]:
df_new.isna().sum()

incorrect    0
correct      0
dtype: int64

In [18]:
df_new.duplicated().sum()

0

In [19]:
def clean_text(sent):
  sent = sent.lower()
  sent = re.sub(r"([?.!,])",r" \1",sent)
  sent = re.sub(f"[' ']+"," ",sent)
  sent = sent.strip()
  return sent

In [20]:
df_new["incorrect"] = df_new["incorrect"].apply(clean_text)
df_new["correct"] = df_new["correct"].apply(clean_text)

In [21]:
df_test["incorrect"] = df_test["incorrect"].apply(clean_text)
df_test["correct"] = df_test["correct"].apply(clean_text)

In [22]:
df_new.head()

Unnamed: 0,incorrect,correct
0,"all our technical staff works with es ｃcats , ...","all our technical staff works with esd coat , ..."
1,"28:20 and david said to solomon his son , be s...","28:20 and david said to solomon his son , be s..."
2,"actually strap rockets to our back , though th...","actually strap rockets to our back , although ..."
3,chart of the are day: you are better off than ...,chart of the day: are you better off than you ...
4,olly murs added a series of new shows to his u...,olly murs has added a series of new shows to h...


In [23]:
df_new["incorrect"][0]

'all our technical staff works with es ｃcats , shoes and blowing .'

In [24]:
def preparing_decoder_input(sentence):
    return "<sos> " + sentence

def preparing_decoder_target(sentence):
  return sentence + " <eos>"

def preprocessing_decoder(sentence):
  return "<sos> " + sentence + " <eos>"

In [25]:
df_new["decoder_input"] = df_new["correct"].apply(preparing_decoder_input)
df_new["decoder_target"] = df_new["correct"].apply(preparing_decoder_target)

In [26]:
df_test["decoder_input"] = df_test["correct"].apply(preparing_decoder_input)
df_test["decoder_target"] = df_test["correct"].apply(preparing_decoder_target)

In [27]:
df_new["correct"] = df_new["correct"].apply(preprocessing_decoder)

In [28]:
df_new.head()

Unnamed: 0,incorrect,correct,decoder_input,decoder_target
0,"all our technical staff works with es ｃcats , ...",<sos> all our technical staff works with esd c...,<sos> all our technical staff works with esd c...,"all our technical staff works with esd coat , ..."
1,"28:20 and david said to solomon his son , be s...",<sos> 28:20 and david said to solomon his son ...,<sos> 28:20 and david said to solomon his son ...,"28:20 and david said to solomon his son , be s..."
2,"actually strap rockets to our back , though th...","<sos> actually strap rockets to our back , alt...","<sos> actually strap rockets to our back , alt...","actually strap rockets to our back , although ..."
3,chart of the are day: you are better off than ...,<sos> chart of the day: are you better off tha...,<sos> chart of the day: are you better off tha...,chart of the day: are you better off than you ...
4,olly murs added a series of new shows to his u...,<sos> olly murs has added a series of new show...,<sos> olly murs has added a series of new show...,olly murs has added a series of new shows to h...


In [29]:
y = df_new["correct"]

In [30]:
X = df_new["incorrect"]
y_input = df_new["decoder_input"]
y_target = df_new["decoder_target"]

In [31]:
X_test = df_test["incorrect"]
y_input_test = df_test["decoder_input"]
y_target_test = df_test["decoder_target"]

In [32]:
input_tokenizer = TextVectorization(max_tokens=5000,output_sequence_length = 500)
input_tokenizer.adapt(X)

target_tokenizer = TextVectorization(max_tokens=5000,output_sequence_length = 500)
target_tokenizer.adapt(y)

In [33]:
input_vocab_size = len(input_tokenizer.get_vocabulary()) + 1
target_vocab_size = len(target_tokenizer.get_vocabulary()) + 1

print(f"input vocab size :- {input_vocab_size} and target vocab size :- {target_vocab_size}")

input vocab size :- 5001 and target vocab size :- 5001


In [45]:
hidden_dim = 64
dropout = 0.2
batch_size=10
epochs = 20
embedding_dim = 50

In [49]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-04-18 12:52:46--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-04-18 12:52:46--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-04-18 12:52:47--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [50]:
!unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [51]:
embeddings_index = {}

embedding_file = open("glove.6B.50d.txt",encoding="utf-8")
for i in embedding_file:
  values = i.split()
  word = values[0]
  coefs = np.asarray(values[1:],dtype="float32")
  embeddings_index[word] = coefs
embedding_file.close()

print(len(embeddings_index))

400000


In [52]:
input_vocab = input_tokenizer.get_vocabulary()
input_word_index = {word: index for index, word in enumerate(input_vocab)}

target_vocab = target_tokenizer.get_vocabulary()
target_word_index = {word: index for index, word in enumerate(target_vocab)}

In [56]:
def data_generator(df, input_tokenizer, target_tokenizer, batch_size):
    num_samples = len(df)
    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            X = input_tokenizer(df["incorrect"][start:end])
            y_input = target_tokenizer(df["decoder_input"][start:end])
            y_target = target_tokenizer(df["decoder_target"][start:end])
            yield [X, y_input], y_target

train_generator = data_generator(df_new,input_tokenizer,target_tokenizer,batch_size)
test_generator = data_generator(df_test,input_tokenizer,target_tokenizer,64)

In [57]:
from scipy import sparse


embedding_matrix_ip = sparse.lil_matrix((input_vocab_size, embedding_dim), dtype=np.float32)
for word, i in input_word_index.items():
    embedding_vector_ip = embeddings_index.get(word)
    if i < input_vocab_size:
        if embedding_vector_ip is not None:
            embedding_matrix_ip[i] = embedding_vector_ip

In [58]:
embedding_matrix_op = sparse.lil_matrix((target_vocab_size, embedding_dim), dtype=np.float32)
for word, i in target_word_index.items():
    embedding_vector_op = embeddings_index.get(word)
    if i < target_vocab_size:
        if embedding_vector_op is not None:
            embedding_matrix_op[i] = embedding_vector_op

In [59]:
embedding_matrix_ip = embedding_matrix_ip.toarray()
embedding_matrix_op = embedding_matrix_op.toarray()

In [60]:
encoder_inputs = Input(shape=[None])
encoder_embedding_layer = Embedding(input_vocab_size,embedding_dim)
encoder_embedding_output = encoder_embedding_layer(encoder_inputs)

encoder_lstm_layer = LSTM(hidden_dim,return_state=True,dropout=dropout)
encoder_outputs, state_h, state_c = encoder_lstm_layer(encoder_embedding_output)
encoder_states = (state_h,state_c)

In [61]:
decoder_inputs = Input(shape=[None])
decoder_embedding_layer = Embedding(target_vocab_size,embedding_dim)
decoder_embedding_output = decoder_embedding_layer(decoder_inputs)

decoder_lstm_layer = LSTM(hidden_dim,return_sequences=True,return_state=True,dropout = dropout)
decoder_outputs, _,_ = decoder_lstm_layer(decoder_embedding_output,initial_state=encoder_states)
decoder_dense = Dense(target_vocab_size,activation="softmax")

y_prediction = decoder_dense(decoder_outputs)

In [62]:
model = Model([encoder_inputs,decoder_inputs],y_prediction)

In [65]:
loss_fxn = tf.keras.losses.SparseCategoricalCrossentropy()

In [66]:
def custom_loss_function(y_tar,y_pred):
    mask = tf.math.logical_not(tf.math.equal(y_tar,0))
    loss_ = loss_fxn(y_tar,y_pred)

    mask = tf.cast(mask,dtype=loss_.dtype)
    loss_ *=mask

    return tf.reduce_mean(loss_)

In [67]:
model.compile(optimizer="adam",loss=custom_loss_function,metrics=["sparse_categorical_accuracy"])

In [68]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 50)             250050    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 50)             250050    ['input_2[0][0]']             
                                                                                              

In [69]:
model.layers[2].set_weights([embedding_matrix_ip])
model.layers[2].trainable = False

In [70]:
model.layers[3].set_weights([embedding_matrix_op])
model.layers[3].trainable = False

In [71]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 50)             250050    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 50)             250050    ['input_2[0][0]']             
                                                                                              

In [72]:
train_history = model.fit(train_generator, epochs=2, steps_per_epoch=len(df_new) // batch_size)

Epoch 1/2
Epoch 2/2


In [76]:
test_generator = data_generator(df_test,input_tokenizer,target_tokenizer,256)

In [None]:
loss,acc = model.evaluate(test_generator,batch_size=256)

    139/Unknown - 72s 521ms/step - loss: 0.0099 - sparse_categorical_accuracy: 0.9652

In [None]:
word_to_target = {i:w for i,w in enumerate(target_word_index)}

In [None]:
word_to_target

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_input_length = 500
max_output_length = 500

def predict_sequence(input_text):

    input_sequence = input_tokenizer([input_text])


    input_sequence = pad_sequences(input_sequence, maxlen=max_input_length, padding='post')


    decoder_input = np.zeros((1, 1))
    decoder_input[0, 0] = target_word_index['sos']

    output_sequence = []


    for _ in range(max_output_length):

        predictions = model.predict([input_sequence, decoder_input])

        predicted_index = np.argmax(predictions[0, -1, :])


        if predicted_index == target_word_index['eos']:
            break


        output_sequence.append(predicted_index)

        decoder_input = np.zeros((1, 1))
        decoder_input[0, 0] = predicted_index


    output_text = [word_to_target[idx] for idx in output_sequence]

    return ' '.join(output_text)


your_text = "I am go to the mall."


generated_text = predict_sequence(your_text)

print("Generated Text:")
print(generated_text)

Generated Text:
[UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 

In [None]:
loss,acc

In [None]:
train_history.history

In [None]:
plt.plot(train_history.history["sparse_categorical_accuracy"])
plt.plot(train_history.history["val_sparse_categorical_accuracy"])

In [None]:
plt.plot(train_history.history["loss"])
plt.plot(train_history.history["val_loss"])