<a href="https://colab.research.google.com/github/prakashradhakrish/NLP_comparison/blob/main/MLP_vs_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd
import time
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/LFD/IMDB_Dataset.csv')

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [45]:
corpus=[]
ps = PorterStemmer()
def clean_text_corpus(df):
  for i in range(0,len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['review'][i])
    review = review.lower().split()
    
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
  return corpus

In [46]:
st = time.time()
cp_t = clean_text_corpus(df)
print("- %s minutes - " % round((time.time() - st)/60,2))

- 20.75 minutes - 


In [100]:
with open('/content/drive/MyDrive/LFD/cp_t.txt', 'w') as f:
    for item in cp_t:
        f.write("%s\n" % item)

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2500)
X = cv.fit_transform(cp_t).toarray()

In [54]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['sentiment'])

In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = trainData(torch.FloatTensor(X), torch.FloatTensor(y))

In [56]:
batch_size = 16
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

In [58]:
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()

        self.layer_1 = nn.Linear(2500, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [59]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [60]:
model = binaryClassification()
model.to(device)
print(model)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

binaryClassification(
  (layer_1): Linear(in_features=2500, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [61]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [62]:
epoch = 50
model.train()
st = time.time()
for e in range(1, epoch+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

print("- %s minutes - " % round((time.time() - st)/60,2))

Epoch 001: | Loss: 0.36143 | Acc: 84.628
Epoch 002: | Loss: 0.30595 | Acc: 87.492
Epoch 003: | Loss: 0.27577 | Acc: 88.950
Epoch 004: | Loss: 0.24775 | Acc: 90.321
Epoch 005: | Loss: 0.21871 | Acc: 91.514
Epoch 006: | Loss: 0.19481 | Acc: 92.868
Epoch 007: | Loss: 0.18232 | Acc: 93.340
Epoch 008: | Loss: 0.17241 | Acc: 93.584
Epoch 009: | Loss: 0.15428 | Acc: 94.348
Epoch 010: | Loss: 0.14179 | Acc: 95.003
Epoch 011: | Loss: 0.13464 | Acc: 95.132
Epoch 012: | Loss: 0.11835 | Acc: 95.961
Epoch 013: | Loss: 0.12596 | Acc: 95.422
Epoch 014: | Loss: 0.11667 | Acc: 95.876
Epoch 015: | Loss: 0.11154 | Acc: 96.060
Epoch 016: | Loss: 0.11002 | Acc: 96.138
Epoch 017: | Loss: 0.10259 | Acc: 96.356
Epoch 018: | Loss: 0.10883 | Acc: 96.264
Epoch 019: | Loss: 0.09264 | Acc: 96.804
Epoch 020: | Loss: 0.08960 | Acc: 96.886
Epoch 021: | Loss: 0.08296 | Acc: 97.135
Epoch 022: | Loss: 0.08521 | Acc: 97.102
Epoch 023: | Loss: 0.08682 | Acc: 96.987
Epoch 024: | Loss: 0.08634 | Acc: 97.095
Epoch 025: | Los

In [64]:
import os
torch.save(model.state_dict(), os.path.join("/content/drive/MyDrive/LFD/","trained_model_18_12_2020.pth"))

In [73]:
test_data = ['movie is good']
test_data = pd.DataFrame(test_data,columns=['review'])
no_of_rows=len(test_data)
cp_test = clean_text_corpus(test_data)

In [74]:
test_X = cv.fit_transform(cp_test).toarray()

In [102]:
tp = torch.Tensor(test_X[-1])
tp = tp.reshape(1,len(tp))

In [103]:
model.eval()
test_y = model(tp.to(device))

In [104]:
test_y

tensor([[3.7805]], device='cuda:0', grad_fn=<AddmmBackward>)