In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM,Bidirectional,Embedding,Input,Dense

In [3]:
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu,True)

In [4]:
df = pd.read_csv("drive/MyDrive/Projects/Grammar_Correction/test_dataset_csv")

In [5]:
df.head()

Unnamed: 0,incorrect,correct
0,"Bitcoin is for $7,094 this morning, which Coin...","Bitcoin goes for $7,094 this morning, accordin..."
1,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
2,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
3,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
4,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...


In [6]:
df["length_incorr"] = [len(i) for i in df["incorrect"]]
df["length_corr"] = [len(i) for i in df["correct"]]

In [7]:
df.head()

Unnamed: 0,incorrect,correct,length_incorr,length_corr
0,"Bitcoin is for $7,094 this morning, which Coin...","Bitcoin goes for $7,094 this morning, accordin...",56,60
1,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ...",87,92
2,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...,334,355
3,Much many brands and sellers still in the market.,Many brands and sellers still in the market.,49,44
4,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...,54,55


In [8]:
df["length_incorr"].describe()

count    152175.000000
mean        132.274855
std         109.284164
min           2.000000
25%          62.000000
50%         106.000000
75%         172.000000
max       12705.000000
Name: length_incorr, dtype: float64

In [9]:
df["length_corr"].describe()

count    152175.000000
mean        132.002944
std          98.061967
min          12.000000
25%          62.000000
50%         106.000000
75%         173.000000
max        1160.000000
Name: length_corr, dtype: float64

In [10]:
def reduce_sentences(data_input,data_output):

  dataset_input = []
  dataset_output = []

  for i in range(len(data_input)):
    if(len(data_input[i])<=500 and len(data_output[i])<=500):
      dataset_input.append(data_input[i])
      dataset_output.append(data_output[i])

  return dataset_input,dataset_output

In [11]:
incorrect_sent,correct_sentences = reduce_sentences(df["incorrect"],df["correct"])

In [12]:
len(incorrect_sent),len(correct_sentences)

(150660, 150660)

In [13]:
X_train,X_test,y_train,y_test = train_test_split(incorrect_sent,correct_sentences,test_size=0.1,random_state=42)

In [14]:
df_new = pd.DataFrame({
    "incorrect":X_train,
    "correct":y_train
})

In [15]:
df_test = pd.DataFrame({
    "incorrect":X_test,
    "correct":y_test
})

In [16]:
df_new.head()

Unnamed: 0,incorrect,correct
0,"All our technical staff works with ES ｃcats, s...","All our technical staff works with ESD coat, s..."
1,"28:20 And David said to Solomon his son, Be st...","28:20 And David said to Solomon his son, Be st..."
2,"Actually strap rockets to our back, though tha...","Actually strap rockets to our back, although, ..."
3,Chart of the are day: You are Better Off Than ...,Chart of the Day: Are You Better Off Than You ...
4,Olly Murs added a series of new shows to his U...,Olly Murs has added a series of new shows to h...


In [17]:
df_new.isna().sum()

incorrect    0
correct      0
dtype: int64

In [18]:
df_new.duplicated().sum()

0

In [19]:
def clean_text(sent):
  sent = re.sub(r"([?.!,])",r" \1",sent)
  sent = re.sub(f"[' ']+"," ",sent)
  sent = sent.strip()
  return sent

In [20]:
df_new["incorrect"] = df_new["incorrect"].apply(clean_text)
df_new["correct"] = df_new["correct"].apply(clean_text)

In [21]:
df_test["incorrect"] = df_test["incorrect"].apply(clean_text)
df_test["correct"] = df_test["correct"].apply(clean_text)

In [22]:
df_new.head()

Unnamed: 0,incorrect,correct
0,"All our technical staff works with ES ｃcats , ...","All our technical staff works with ESD coat , ..."
1,"28:20 And David said to Solomon his son , Be s...","28:20 And David said to Solomon his son , Be s..."
2,"Actually strap rockets to our back , though th...","Actually strap rockets to our back , although ..."
3,Chart of the are day: You are Better Off Than ...,Chart of the Day: Are You Better Off Than You ...
4,Olly Murs added a series of new shows to his U...,Olly Murs has added a series of new shows to h...


In [23]:
df["incorrect"][0]

'Bitcoin is for $7,094 this morning, which CoinDesk says.'

In [24]:
def preparing_decoder_input(sentence):
    return "<sos> " + sentence

def preparing_decoder_target(sentence):
  return sentence + " <eos>"

def preprocessing_decoder(sentence):
  return "<sos> " + sentence + " <eos>"

In [25]:
df_new["decoder_input"] = df_new["correct"].apply(preparing_decoder_input)
df_new["decoder_target"] = df_new["correct"].apply(preparing_decoder_target)

In [26]:
df_test["decoder_input"] = df_test["correct"].apply(preparing_decoder_input)
df_test["decoder_target"] = df_test["correct"].apply(preparing_decoder_target)

In [27]:
df_new["correct"] = df_new["correct"].apply(preprocessing_decoder)

In [28]:
df_new.head()

Unnamed: 0,incorrect,correct,decoder_input,decoder_target
0,"All our technical staff works with ES ｃcats , ...",<sos> All our technical staff works with ESD c...,<sos> All our technical staff works with ESD c...,"All our technical staff works with ESD coat , ..."
1,"28:20 And David said to Solomon his son , Be s...",<sos> 28:20 And David said to Solomon his son ...,<sos> 28:20 And David said to Solomon his son ...,"28:20 And David said to Solomon his son , Be s..."
2,"Actually strap rockets to our back , though th...","<sos> Actually strap rockets to our back , alt...","<sos> Actually strap rockets to our back , alt...","Actually strap rockets to our back , although ..."
3,Chart of the are day: You are Better Off Than ...,<sos> Chart of the Day: Are You Better Off Tha...,<sos> Chart of the Day: Are You Better Off Tha...,Chart of the Day: Are You Better Off Than You ...
4,Olly Murs added a series of new shows to his U...,<sos> Olly Murs has added a series of new show...,<sos> Olly Murs has added a series of new show...,Olly Murs has added a series of new shows to h...


In [29]:
y = df_new["correct"]

In [30]:
X = df_new["incorrect"]
y_input = df_new["decoder_input"]
y_target = df_new["decoder_target"]

In [31]:
X_test = df_test["incorrect"]
y_input_test = df_test["decoder_input"]
y_target_test = df_test["decoder_target"]

In [32]:
input_tokenizer = Tokenizer(oov_token="<unk>")
input_tokenizer.fit_on_texts(X)

target_tokenizer = Tokenizer(oov_token="<unk>")
target_tokenizer.fit_on_texts(y)

In [33]:
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

print(f"input vocab size :- {input_vocab_size} and target vocab size :- {target_vocab_size}")

input vocab size :- 175989 and target vocab size :- 135929


In [34]:
X = input_tokenizer.texts_to_sequences(X)
y_input = target_tokenizer.texts_to_sequences(y_input)
y_target = target_tokenizer.texts_to_sequences(y_target)

In [35]:
X_test = input_tokenizer.texts_to_sequences(X_test)
y_input_test = target_tokenizer.texts_to_sequences(y_input_test)
y_target_test = target_tokenizer.texts_to_sequences(y_target_test)

In [36]:
padded_length = 1072
X_padded = pad_sequences(X,padded_length,padding="post")
y_input_padded = pad_sequences(y_input,padded_length,padding="post")
y_target_padded = pad_sequences(y_target,padded_length,padding="post")

In [37]:
padded_length = 1072
X_padded_test = pad_sequences(X_test,padded_length,padding="post")
y_input_padded_test = pad_sequences(y_input_test,padded_length,padding="post")
y_target_padded_test = pad_sequences(y_target_test,padded_length,padding="post")

In [38]:
hidden_dim = 64
dropout = 0.2
batch_size=64
epochs = 20
embedding_dim = 50

In [39]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [40]:
!unzip glove*.zip

Archive:  glove.6B.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove.6B.100d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [41]:
embeddings_index = {}

embedding_file = open("glove.6B.50d.txt",encoding="utf-8")
for i in embedding_file:
  values = i.split()
  word = values[0]
  coefs = np.asarray(values[1:],dtype="float32")
  embeddings_index[word] = coefs
embedding_file.close()

print(len(embeddings_index))

400000


In [42]:
input_word_index = input_tokenizer.word_index
target_word_index = target_tokenizer.word_index

In [43]:
embedding_matrix_ip = np.zeros((input_vocab_size,embedding_dim))
for word,i in input_word_index.items():
  embedding_vector_ip = embeddings_index.get(word)
  if i<input_vocab_size:
    if embedding_vector_ip is not None:
      embedding_matrix_ip[i] = embedding_vector_ip

In [44]:
embedding_matrix_op = np.zeros((target_vocab_size,embedding_dim))
for word,i in target_word_index.items():
  embedding_vector_op = embeddings_index.get(word)
  if i<input_vocab_size:
    if embedding_vector_op is not None:
      embedding_matrix_op[i] = embedding_vector_op

In [45]:

encoder_inputs = Input(shape=[None])
encoder_embedding_layer = Embedding(input_vocab_size,embedding_dim)
encoder_embedding_output = encoder_embedding_layer(encoder_inputs)

encoder_lstm_layer = LSTM(hidden_dim,return_state=True,dropout=dropout)
encoder_outputs, state_h, state_c = encoder_lstm_layer(encoder_embedding_output)
encoder_states = (state_h,state_c)

In [46]:
decoder_inputs = Input(shape=[None])
decoder_embedding_layer = Embedding(target_vocab_size,embedding_dim)
decoder_embedding_output = decoder_embedding_layer(decoder_inputs)

decoder_lstm_layer = LSTM(hidden_dim,return_sequences=True,return_state=True,dropout = dropout)
decoder_outputs, _,_ = decoder_lstm_layer(decoder_embedding_output,initial_state=encoder_states)
decoder_dense = Dense(target_vocab_size,activation="softmax")

y_prediction = decoder_dense(decoder_outputs)

In [47]:
model = Model([encoder_inputs,decoder_inputs],y_prediction)

In [48]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["sparse_categorical_accuracy"])

In [49]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 50)             8799450   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 50)             6796450   ['input_2[0][0]']             
                                                                                              

In [50]:
model.layers[2].set_weights([embedding_matrix_ip])
model.layers[2].trainable = False

In [51]:
model.layers[3].set_weights([embedding_matrix_op])
model.layers[3].trainable = False

In [52]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 50)             8799450   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 50)             6796450   ['input_2[0][0]']             
                                                                                              

In [53]:
train_history = model.fit([X_padded,y_input_padded],y_target_padded,batch_size=batch_size,epochs=epochs,
                          validation_split=0.1)

Epoch 1/20


ResourceExhaustedError: Graph execution error:

Detected at node model/dense/Tensordot/MatMul defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 377, in dispatch_queue

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 250, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 748, in __init__

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-53-4ee47038cb44>", line 1, in <cell line: 1>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1807, in fit

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 590, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 515, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/dense.py", line 244, in call

OOM when allocating tensor with shape[68608,135929] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/dense/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_5961]

In [None]:
loss,acc = model.evaluate((X_padded_test,y_input_padded_test),y_target_padded_test)

In [None]:
loss,acc

In [None]:
train_history.history

In [None]:
plt.plot(train_history.history["sparse_categorical_accuracy"])
plt.plot(train_history.history["val_sparse_categorical_accuracy"])

In [None]:
plt.plot(train_history.history["loss"])
plt.plot(train_history.history["val_loss"])