In [1]:
import numpy as np 
import pandas as pd
import torch
import torch.nn as nn
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from torch.utils.data import TensorDataset,DataLoader
from collections import Counter



In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

Using cuda


In [3]:
df = pd.read_csv("test_dataset_csv")

In [4]:
df.head()

Unnamed: 0,incorrect,correct
0,"Bitcoin is for $7,094 this morning, which Coin...","Bitcoin goes for $7,094 this morning, accordin..."
1,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
2,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
3,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
4,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...


In [5]:
df["incorrect"].shape, df["correct"].shape

((152175,), (152175,))

In [6]:
X = df["incorrect"].values
y = df["correct"].values

In [7]:
X[0],y[0]

('Bitcoin is for $7,094 this morning, which CoinDesk says.',
 'Bitcoin goes for $7,094 this morning, according to CoinDesk.')

In [9]:
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
X_train.shape, X_val.shape

((121740,), (30435,))

In [11]:
def clean_text(text):
    text = re.sub(r"[^\w\s]","",text)
    text = re.sub(r"\s+","",text)
    text = re.sub(r"\d","",text)
    return text

In [12]:
def tokenize_words(X_train,y_train):
    
    final_word_list_incorr = []
    final_word_list_corr = []
    
    for i in X_train:
        for j in i.lower().split():
            word = clean_text(j)
            if word !=" " or word!="":
                final_word_list_incorr.append(word)
                
    for i in y_train:
        for j in i.lower().split():
            word = clean_text(j)
            if word !=" " or word!="":
                final_word_list_corr.append(word)

    word_corpus_incorr = Counter(final_word_list_incorr)
    word_corpus_corr = Counter(final_word_list_corr)
    
    corpus_sorted_incorr = sorted(word_corpus_incorr,key=word_corpus_incorr.get,reverse=True)
    corpus_sorted_corr = sorted(word_corpus_corr,key=word_corpus_corr.get,reverse=True)
    
    X_train_incorr = []
    y_train_corr = []
    
    vocab_dict_incorr = {w:i+1 for i,w in enumerate(corpus_sorted_incorr)}
    vocab_dict_corr = {w:i+1 for i,w in enumerate(corpus_sorted_corr)}
    
    for i in X_train:
        X_train_incorr.append([vocab_dict_incorr[clean_text(word)] for word in i.lower().split() if clean_text(word) in vocab_dict_incorr.keys()])
        
    for i in y_train:
        y_train_corr.append([vocab_dict_corr[clean_text(word)] for word in i.lower().split() if clean_text(word) in vocab_dict_corr.keys()])
    
    
    
    return X_train_incorr,y_train_corr,vocab_dict_incorr,vocab_dict_corr

In [13]:
X_train,y_train,vocab_dict_incorr,vocab_dict_corr = tokenize_words(X_train,y_train)

In [14]:
len(vocab_dict_incorr),len(vocab_dict_corr)

(167616, 128433)

In [15]:
X_train_tt[0],y_train_tt[0]

('A drunken Barney Gumble at Simpsons shows street art could get in there.',
 'A drunken Barney Gumble from The Simpsons shows street art can go horizontal too.')

In [16]:
X_train

[[741, 2620, 39, 10016, 55, 19, 16340, 52, 202, 24, 639, 23, 4812, 2029],
 [47, 1005, 190, 1, 2, 27, 7, 29147, 539, 158],
 [626,
  11,
  355,
  4319,
  2406,
  29148,
  8565,
  19441,
  9101,
  4560,
  5,
  54291,
  13,
  60,
  33,
  62,
  7,
  8848,
  6,
  24674,
  3562,
  4813,
  6,
  578,
  687],
 [3563,
  18,
  1379,
  5,
  707,
  15,
  16,
  201,
  14,
  18,
  118,
  266,
  44,
  760,
  102,
  647,
  50,
  886,
  11,
  682,
  5,
  94,
  54292,
  5073],
 [1931,
  48,
  124,
  8,
  35,
  642,
  4,
  35,
  598,
  789,
  1357,
  2188,
  1387,
  2,
  3447,
  1,
  1260,
  26,
  6751,
  7,
  213,
  34,
  134,
  201,
  3,
  28,
  118,
  65,
  123,
  11,
  1,
  1507,
  4,
  54293,
  21698],
 [1736, 24675, 36787, 54294, 24676, 3, 628],
 [2931, 54295, 32, 7, 583, 9, 7, 116, 5, 140],
 [523,
  54296,
  24677,
  86,
  924,
  6,
  1445,
  10,
  11749,
  54297,
  13,
  5,
  1,
  3603,
  13,
  1,
  2121,
  9,
  849,
  55,
  4,
  1066,
  54298,
  800,
  76,
  7,
  4172,
  2086,
  5,
  1307,
  1,
  

In [17]:
y_train

[[704, 2660, 10551, 52, 21, 13072, 48, 85, 154, 22, 649, 24, 4793, 2170],
 [56, 12, 188, 1, 2, 27, 6, 28567, 539, 161],
 [578,
  11,
  351,
  4000,
  2399,
  28568,
  7374,
  21603,
  8737,
  2930,
  4,
  50554,
  13,
  62,
  81,
  59,
  6,
  8268,
  7,
  21604,
  3944,
  4724,
  7,
  548,
  664],
 [4141,
  19,
  1483,
  4,
  736,
  18,
  16,
  201,
  49,
  19,
  132,
  278,
  45,
  837,
  105,
  672,
  53,
  952,
  11,
  657,
  4,
  736,
  50555,
  5143],
 [2028,
  49,
  131,
  8,
  1,
  626,
  5,
  34,
  582,
  761,
  1177,
  2400,
  7686,
  2,
  3298,
  1,
  1270,
  26,
  7884,
  6,
  227,
  36,
  144,
  201,
  3,
  29,
  30,
  64,
  124,
  11,
  1,
  5467],
 [1476, 21605, 28569, 50556, 24438, 3, 677, 597],
 [4001, 1217, 9, 33, 28, 725, 18, 9, 6, 126, 5, 151],
 [514,
  50557,
  24439,
  79,
  890,
  7,
  28,
  1499,
  10,
  11912,
  50558,
  13,
  4,
  1,
  3905,
  13,
  1,
  2649,
  9,
  14536,
  52,
  5,
  1019,
  50559,
  848,
  77,
  6,
  3906,
  2458,
  4,
  1326,
  1,
  222,
 

In [18]:
def pad(sent,seq_len):
    features = np.zeros((len(sent),seq_len),dtype=int)
    for i, review in enumerate(sent):
        if (len(review) !=0):
            features[i,-len(review):] = np.array(review)[:seq_len]
    return features

In [19]:
len(X_train[0]),len(y_train[0])

(14, 14)

In [20]:
min_len = 99999
max_len = 0

for i in range(len(X_train)):
    if(len(X_train[i]) > max_len):
        max_len = len(X_train[i])
        
for i in range(len(X_train)):
    if(len(X_train[i]) < min_len):
        min_len = len(X_train[i])
        
print(f"Max Length X_train = {max_len} and Min Length X_train= {min_len}")


min_len = 99999
max_len = 0

for i in range(len(y_train)):
    if(len(y_train[i]) > max_len):
        max_len = len(y_train[i])
        
for i in range(len(y_train)):
    if(len(y_train[i]) < min_len):
        min_len = len(y_train[i])
        
print(f"Max Length y_train = {max_len} and Min Length y_train = {min_len}")

Max Length X_train = 2083 and Min Length X_train= 1
Max Length y_train = 149 and Min Length y_train = 5


In [21]:
rev_len = [len(i) for i in X_train]
pd.Series(rev_len).describe()

count    121740.000000
mean         22.090701
std          17.766910
min           1.000000
25%          11.000000
50%          18.000000
75%          29.000000
max        2083.000000
dtype: float64

In [22]:
rev_len = [len(i) for i in y_train]
pd.Series(rev_len).describe()

count    121740.000000
mean         21.926721
std          15.793291
min           5.000000
25%          11.000000
50%          18.000000
75%          29.000000
max         149.000000
dtype: float64

In [23]:
X_train = pad(X_train,2083)
y_train = pad(y_train,2083)

In [24]:
train_data = TensorDataset(torch.from_numpy(X_train),torch.from_numpy(y_train))         

In [25]:
batch_size = 64

In [26]:
train_loader = DataLoader(train_data,shuffle=True,batch_size=batch_size)

In [27]:
dataiter = iter(train_loader)
m,n = next(dataiter)

In [28]:
m,n

(tensor([[     0,      0,      0,  ...,     75,   1003,   1356],
         [     0,      0,      0,  ...,   7352,      3,  15112],
         [     0,      0,      0,  ...,    532, 137455,   1081],
         ...,
         [     0,      0,      0,  ...,   4020,    117,    884],
         [     0,      0,      0,  ...,      1,   1409,    467],
         [     0,      0,      0,  ...,      3,   2494,   1988]]),
 tensor([[     0,      0,      0,  ...,     73,    996,   1312],
         [     0,      0,      0,  ...,      3,  17459,    690],
         [     0,      0,      0,  ...,    494, 107690,   1357],
         ...,
         [     0,      0,      0,  ...,   4174,    115,    797],
         [     0,      0,      0,  ...,      1,   1477,    457],
         [     0,      0,      0,  ...,      3,   2321,   1789]]))

In [29]:
input_dim = len(vocab_dict_incorr)
output_dim = len(vocab_dict_corr)
enc_embedding_dim = 256
dec_embedding_dim = 256
hidden_dim = 512
n_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

In [30]:
input_dim,output_dim

(167616, 128433)

In [31]:
class Encoder(nn.Module):
    def __init__(self,input_dim,emb_dim,hidden_dim,n_layers,dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim,emb_dim)
        self.rnn = nn.LSTM(emb_dim,hidden_dim,n_layers,dropout=dropout)
    
    
    def forward(self,src_batch):
        
        embedded = self.embedding(src_batch)
        outputs, (hidden,cell) = self.rnn(embedded)
        
        return hidden,cell

In [32]:
encoder = Encoder(input_dim,enc_embedding_dim,hidden_dim,n_layers,enc_dropout)

In [33]:
encoder

Encoder(
  (embedding): Embedding(167616, 256)
  (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
)

In [34]:
hidden,cell = encoder(m)

In [35]:
hidden.shape,cell.shape

(torch.Size([2, 2083, 512]), torch.Size([2, 2083, 512]))

In [36]:
class Decoder(nn.Module):
    def __init__(self,output_dim,emb_dim,hidden_dim,n_layers,dropout):
        super().__init__()
        self.emb_dim  = emb_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim,emb_dim)
        self.rnn = nn.LSTM(emb_dim,hidden_dim,n_layers,dropout=dropout)
        self.out = nn.Linear(hidden_dim,output_dim)
        
    
    def forward(self,trg,hidden,cell):
        embedded = self.embedding(trg.unsqueeze(0))
        outputs, (hidden,cell) = self.rnn(embedded, (hidden,cell))
        prediction = self.out(outputs.squeeze(0))
        return prediction,hidden,cell

In [37]:
decoder = Decoder(output_dim,dec_embedding_dim,hidden_dim,n_layers,dec_dropout)

In [38]:
decoder

Decoder(
  (embedding): Embedding(128433, 256)
  (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  (out): Linear(in_features=512, out_features=128433, bias=True)
)

In [39]:
hidden,cell = encoder(m)

In [40]:
hidden.shape,cell.shape

(torch.Size([2, 2083, 512]), torch.Size([2, 2083, 512]))

In [41]:
n.shape

torch.Size([64, 2083])

In [42]:
prediction,h,c = decoder(n[0],hidden,cell)

In [43]:
prediction.shape, hidden.shape, cell.shape

(torch.Size([2083, 128433]),
 torch.Size([2, 2083, 512]),
 torch.Size([2, 2083, 512]))

In [44]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder,device=device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self,src_batch,trg_batch,teacher_forcing_ratio:float=0.5):
        
        max_len,batch_size = trg_batch.shape
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(max_len,batch_size,trg_vocab_size).to(self.device)
        hidden,self = self.encoder(src_batch)
        
        trg = trg_batch[0]
        
      
        for i in range(1, max_len):
            prediction, hidden, cell = self.decoder(trg, hidden, cell)
            outputs[i] = prediction
    
            if random.random() < teacher_forcing_ratio:
                trg = trg_batch[i]
            else:
                trg = prediction.argmax(1)
                
        return outputs

In [45]:
seq2seq = Seq2Seq(encoder,decoder,device).to(device)

In [46]:
seq2seq

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(167616, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(128433, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (out): Linear(in_features=512, out_features=128433, bias=True)
  )
)

In [47]:
outputs = seq2seq(m,n)

RuntimeError: [enforce fail at alloc_cpu.cpp:83] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 68486640384 bytes. Error code 12 (Cannot allocate memory)