In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu,True)

In [3]:
df = pd.read_csv("drive/MyDrive/Grammar/test_dataset_csv")

In [4]:
df.head()

Unnamed: 0,incorrect,correct
0,"Bitcoin is for $7,094 this morning, which Coin...","Bitcoin goes for $7,094 this morning, accordin..."
1,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
2,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
3,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
4,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...


In [5]:
X = df["incorrect"]
y = df["correct"]

In [6]:
len(X),len(y)

(152175, 152175)

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [8]:
len(X_train),len(X_test)

(121740, 30435)

In [9]:
def clean_text(sent):
  sent = re.sub(r"([?.!,])",r" \1",sent)
  sent = re.sub(f"[' ']+"," ",sent)
  sent = sent.strip()
  return sent

In [10]:
X_train_clean = [clean_text(sentence) for sentence in X_train]
X_test_clean = [clean_text(sentence) for sentence in X_test]
y_train_clean = [clean_text(sentence) for sentence in y_train]
y_test_clean = [clean_text(sentence) for sentence in y_test]

In [11]:
X_train_clean[0],y_train_clean[0]

('Kids Crisis But knocked out by CHAMPS how did Your Child or Teen Sleep ?',
 'Kids Crisis Knocked Out By CHAMPS How Well Does Your Child or Teen Sleep ?')

In [12]:
len(X_train_clean),len(y_train_clean)

(121740, 121740)

In [13]:
rev_len = [len(i) for i in X_train_clean]
pd.Series(rev_len).describe()

count    121740.000000
mean        134.664531
std         112.253870
min           4.000000
25%          64.000000
50%         107.000000
75%         174.000000
max       12897.000000
dtype: float64

In [14]:
rev_len = [len(i) for i in y_train_clean]
pd.Series(rev_len).describe()

count    121740.000000
mean        134.736479
std          99.870262
min          12.000000
25%          64.000000
50%         108.000000
75%         176.000000
max        1244.000000
dtype: float64

In [15]:
rev_len = [len(i) for i in y_test_clean]
pd.Series(rev_len).describe()

count    30435.000000
mean       134.022868
std         98.575162
min         17.000000
25%         63.000000
50%        107.000000
75%        176.000000
max       1100.000000
dtype: float64

In [16]:
rev_len = [len(i) for i in X_test_clean]
pd.Series(rev_len).describe()

count    30435.000000
mean       133.827337
std        105.016771
min          3.000000
25%         63.000000
50%        107.000000
75%        174.000000
max       2904.000000
dtype: float64

In [17]:
def reduce_sentences(data_input,data_output):

  dataset_input = []
  dataset_output = []

  for i in range(len(data_input)):
    if(len(data_input[i])<=150 and len(data_output[i])<=150):
      dataset_input.append(data_input[i])
      dataset_output.append(data_output[i])

  return dataset_input,dataset_output

In [18]:
X_train_reduced,y_train_reduced = reduce_sentences(X_train_clean,y_train_clean)
X_test_reduced,y_test_reduced = reduce_sentences(X_test_clean,y_test_clean)

In [19]:
len(X_train_reduced), len(y_train_reduced), len(X_test_reduced), len(y_test_reduced)

(80347, 80347, 20121, 20121)

In [20]:
def prepare_sentence_target(sent):
  sentences = map(lambda s: (' ').join(["<sos>",s,"<eos>"]),sent)
  return list(sentences)

In [21]:
y_train_tagged = prepare_sentence_target(y_train_reduced)
y_test_tagged = prepare_sentence_target(y_test_reduced)

In [22]:
X_train_reduced[0],y_train_tagged[0], y_test_tagged[0]

('Kids Crisis But knocked out by CHAMPS how did Your Child or Teen Sleep ?',
 '<sos> Kids Crisis Knocked Out By CHAMPS How Well Does Your Child or Teen Sleep ? <eos>',
 '<sos> I want this movie to be soooooooo good . <eos>')

In [23]:
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>",filters='"\t\n')
input_tokenizer.fit_on_texts(X_train_reduced)

In [24]:
input_vocab_size = len(input_tokenizer.word_index)+1
input_vocab_size

113716

In [25]:
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<unk>",filters='"\t\n')
target_tokenizer.fit_on_texts(y_train_tagged)

In [26]:
target_vocab_size = len(target_tokenizer.word_index) + 1
target_vocab_size

92344

In [27]:
X_train_encoder = input_tokenizer.texts_to_sequences(X_train_reduced)
X_test_encoder = input_tokenizer.texts_to_sequences(X_test_reduced)

In [28]:
X_train_encoder[0],X_train_reduced[0],X_test_encoder[0],X_test_reduced[0]

([647, 3131, 49, 10234, 53, 26, 11308, 45, 161, 20, 588, 28, 3514, 1580, 13],
 'Kids Crisis But knocked out by CHAMPS how did Your Child or Teen Sleep ?',
 [17, 130, 16763, 1160, 5, 24, 437, 91, 2, 2],
 'I want tihs movie to be soon good . .')

In [29]:
def decoder_input_target(sent,tokenizer):
  sequence = tokenizer.texts_to_sequences(sent)
  decoder_inputs = [s[:-1] for s  in sequence]
  decoder_targets = [s[1:] for s in sequence]

  return decoder_inputs,decoder_targets

In [30]:
y_train_decoder_input,y_train_decoder_target = decoder_input_target(y_train_tagged,target_tokenizer)
y_test_decoder_input,y_test_decoder_target = decoder_input_target(y_test_tagged,target_tokenizer)

In [31]:
y_train_decoder_input[0],y_train_decoder_target[0],y_test_decoder_input[0],y_test_decoder_target[0]

([2,
  586,
  3185,
  11533,
  54,
  29,
  10498,
  44,
  119,
  113,
  20,
  571,
  27,
  3654,
  1698,
  15],
 [586,
  3185,
  11533,
  54,
  29,
  10498,
  44,
  119,
  113,
  20,
  571,
  27,
  3654,
  1698,
  15,
  3],
 [2, 19, 120, 21, 1270, 7, 26, 37477, 104, 4],
 [19, 120, 21, 1270, 7, 26, 37477, 104, 4, 3])

In [32]:
max_encoding_len = 150
max_decoding_len = 150

In [33]:
X_train_padded = pad_sequences(X_train_encoder,max_encoding_len,padding="post",truncating="post")
y_train_padded_input = pad_sequences(y_train_decoder_input,max_decoding_len,padding="post",truncating="post")
y_train_padded_targets = pad_sequences(y_train_decoder_target,max_decoding_len,padding="post",truncating="post")

X_val_padded = pad_sequences(X_test_encoder,max_encoding_len,padding="post",truncating="post")
y_val_padded_input = pad_sequences(y_test_decoder_input,max_decoding_len,padding="post",truncating="post")
y_val_padded_targets = pad_sequences(y_test_decoder_target,max_decoding_len,padding="post",truncating="post")

In [34]:
X_train_padded[0], X_val_padded[0]

(array([  647,  3131,    49, 10234,    53,    26, 11308,    45,   161,
           20,   588,    28,  3514,  1580,    13,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

GLOVE Embedding

!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-02-26 07:03:02--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-02-26 07:03:02--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-02-26 07:03:03--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

!unzip glove*.zip

In [None]:
!unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [37]:
def embedding_layer_glove(glove_path,word,embedding_dim):
  vocab_size = len(word) + 1
  embedding_matrix = np.zeros((vocab_size,embedding_dim))

  with open(glove_path,encoding="utf8") as file:
    for line in file:
      w,*vector = line.split()
      if w in word:
        idx = word[w]
        embedding_matrix[idx] = np.array(vector,dtype=np.float32)[:embedding_dim]

  return embedding_matrix

In [38]:
embedding_dim = 50
input_embedding_matrix_vocab = embedding_layer_glove("glove.6B.50d.txt",input_tokenizer.word_index,embedding_dim)
target_embedding_matrix_vocab = embedding_layer_glove("glove.6B.50d.txt",target_tokenizer.word_index,embedding_dim)

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.50d.txt'

In [39]:
len(input_embedding_matrix_vocab), len(target_embedding_matrix_vocab)

NameError: name 'input_embedding_matrix_vocab' is not defined

In [40]:
input_embedding_matrix_vocab

NameError: name 'input_embedding_matrix_vocab' is not defined

In [41]:
hidden_dim = 256
dropout = 0.2
batch_size=64
epochs = 20

In [42]:
# ENCODER

encoder_inputs = keras.Input(shape=[None])
encoder_embedding_layer = keras.layers.Embedding(len(input_embedding_matrix_vocab),embedding_dim,mask_zero=True,trainable=False)
encoder_embedding_output = encoder_embedding_layer(encoder_inputs)

encoder_lstm_layer = keras.layers.LSTM(hidden_dim,return_state=True,dropout=dropout)
encoder_outputs, state_h, state_c = encoder_lstm_layer(encoder_embedding_output)
encoder_states = (state_h,state_c)

NameError: name 'input_embedding_matrix_vocab' is not defined

In [43]:
# DECODER

decoder_inputs = keras.Input(shape=[None])
decoder_embedding_layer = keras.layers.Embedding(len(target_embedding_matrix_vocab),embedding_dim,mask_zero=True,trainable=False)
decoder_embedding_output = decoder_embedding_layer(decoder_inputs)

decoder_lstm_layer = keras.layers.LSTM(hidden_dim,return_sequences=True,return_state=True,dropout = dropout)
decoder_outputs, _,_ = decoder_lstm_layer(decoder_embedding_output,initial_state=encoder_states)
decoder_dense = keras.layers.Dense(target_vocab_size,activation="softmax")

y_prediction = decoder_dense(decoder_outputs)

NameError: name 'target_embedding_matrix_vocab' is not defined

In [44]:
model = tf.keras.Model([encoder_inputs,decoder_inputs],y_prediction)
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics="sparse_categorical_accuracy")

NameError: name 'y_prediction' is not defined

In [45]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 50)             6189200   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 50)             4999800   ['input_2[0][0]']             
                                                                                              

In [46]:
model_save_path = "./Grammar_correction_model.ckpt"

model_weights_callback = tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path,save_weights_only=True,verbose=1)

model_earlystopping_callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss",patience=3)

In [None]:
train_history = model.fit([X_train_padded,y_train_padded_input],y_train_padded_targets,batch_size=batch_size,epochs=epochs,
                          validation_split=0.1,callbacks=[model_weights_callback,model_earlystopping_callback])

Epoch 1/20
