<a href="https://colab.research.google.com/github/rushikeshnaik779/PracticeForNLP/blob/main/RNN_using_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# RNN Text Classification : Predict the sentiment of the IMDB movie reviews 


In [3]:
from pathlib import Path 

import pandas as pd 
import torch 
import torch.nn.functional as F 
import torch.nn as nn 
import torch.optim as optim 
from google_drive_downloader import GoogleDriveDownloader as gdd 
from sklearn.feature_extraction.text import CountVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# DOWNLOAD THE TRAINING DATA 

DATA_PATH = 'data/imdb_reviews.csv'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
        dest_path=DATA_PATH,
    )

Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... Done.


In [36]:
# PRocess the text 

class Sequences(Dataset):

    def __init__(self, path, max_seq_len):
        self.max_seq_len = max_seq_len 

        df = pd.read_csv(path)
        print(df.head())
        vectorizer = CountVectorizer(stop_words='english', min_df=0.015)
        vectorizer.fit(df.review.tolist())

        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1
        print(self.token2idx['<PAD>'])

        tokenizer = vectorizer.build_analyzer()
        self.encode = lambda x : [self.token2idx[token] for token in tokenizer(x)
                                                            if token in self.token2idx]

        self.pad = lambda x : x + (max_seq_len - len(x)) * [self.token2idx['<PAD>']]

        sequences = [self.encode(sequence)[: max_seq_len] for sequence in df.review.tolist()]
        sequences, self.labels = zip(*[(sequence, label) for sequence, label in zip(sequences, df.label.tolist()) if sequence])
        print(sequences, self.labels)
        self.sequences = [self.pad(sequence) for sequence in sequences]

    
    def __getitem__(self, i):
        assert len(self.sequences[i] )== self.max_seq_len
        return self.sequences[i], self.labels[i]

    def __len__(self):
        return len(self.sequences)


In [37]:
dataset = Sequences(DATA_PATH, max_seq_len=128)
dataset

                                              review  label
0  Once again Mr. Costner has dragged out a movie...      0
1  This is an example of why the majority of acti...      0
2  First of all I hate those moronic rappers, who...      0
3  Not even the Beatles could write songs everyon...      0
4  Brass pictures (movies is not a fitting word f...      0
1103


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<__main__.Sequences at 0x7f9163452b50>

In [38]:
len(dataset.token2idx)

1104

In [39]:
def collate(batch):
    inputs = torch.LongTensor([item[0] for item in batch])
    target = torch.FloatTensor([item[1] for item in batch])
    return inputs, target 

batch_size = 2048 
trainloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

In [40]:
class RNN(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        batch_size, 
        embedding_dimension = 100, 
        hidden_size=128, 
        n_layers = 1,
        device='cpu'):

        super(RNN, self).__init__()
        self.n_layers = n_layers 
        self.hidden_size = hidden_size
        self.device = device 
        self.batch_size = batch_size 


        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension, 
            hidden_size, 
            num_layers = n_layers ,
            batch_first = True
        )
        self.decoder = nn.Linear(hidden_size, 1)

    def init_hidden(self):
        return torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device)
    

    def forward(self, inputs):
        # Avoid breaking if the last batch has a different size 
        batch_size = inputs.size(0)
        if batch_size != self.batch_size: 
            self.batch_size = batch_size
        
        encoded = self.encoder(inputs)
        output, hidden = self.rnn(encoded, self.init_hidden())
        output = self.decoder(output[:,:,-1]).squeeze()

        return output


In [41]:
model = RNN(
    hidden_size = 128, 
    vocab_size=len(dataset.token2idx),
    device=device, 
    batch_size = batch_size,
)

model= model.to(device)
model

RNN(
  (encoder): Embedding(1104, 100)
  (rnn): GRU(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=1, bias=True)
)

In [42]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [43]:
model.train()
train_losses = []

for epoch in range(20):
    progress_bar = tqdm_notebook(trainloader, leave=False )
    losses = []
    total= 0 
    for inputs, target in progress_bar: 
        inputs, target = inputs.to(device), target.to(device)

        model.zero_grad()

        output = model(inputs)

        loss = criterion(output, target)

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()

        progress_bar.set_description(f'Loss: {loss.item() :.3f}')

        losses.append(loss.item())
        total +=1
    epoch_loss = sum(losses)/total
    train_losses.append(epoch_loss)

    tqdm.write(f'epoch {epoch+1}\ntrain Loss: {epoch_loss: .3f}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 1
train Loss:  0.755


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 2
train Loss:  0.712


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 3
train Loss:  0.707


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 4
train Loss:  0.695


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 5
train Loss:  0.681


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 6
train Loss:  0.683


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 7
train Loss:  0.612


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 8
train Loss:  0.583


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 9
train Loss:  0.509


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 10
train Loss:  0.480


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 11
train Loss:  0.452


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 12
train Loss:  0.432


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 13
train Loss:  0.417


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 14
train Loss:  0.403


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 15
train Loss:  0.393


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 16
train Loss:  0.382


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 17
train Loss:  0.374


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 18
train Loss:  0.366


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 19
train Loss:  0.358


HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))

epoch 20
train Loss:  0.351


In [44]:
def predict_sentiment(text):
    model.eval()

    with torch.no_grad():
        test_vector = torch.LongTensor([dataset.pad(dataset.encode(text))]).to(device)

        output = model(test_vector)
        prediction = torch.sigmoid(output).item()

        if prediction > 0.5 : 
            print(f'{prediction : 0.3}: positive sentiment')
        
        else : 
            print(f'{prediction: 0.3}: Negative Sentiment')

In [45]:
test_text = """Nice keyboard, loved it"""
predict_sentiment(test_text)

 0.986: positive sentiment


In [46]:
test_text = """Worst product"""
predict_sentiment(test_text)

 0.00712: Negative Sentiment


In [54]:
for i in range(10):
    test_text = """
    The soft crispness is extraordinary. Great evening snack. This karachi pusta bakery bisuit item we bought in july 3030
    """
    predict_sentiment(test_text)
    test_text = """
    The whole biscuits were crushed inside the box. only few biscuits were ok. paying so much money for the product, but not geting it in a proper way. No point of ordering food items from Amazon
    """
    predict_sentiment(test_text)
    print('\n\n')

 0.959: positive sentiment
 0.243: Negative Sentiment



 0.94: positive sentiment
 0.147: Negative Sentiment



 0.968: positive sentiment
 0.0919: Negative Sentiment



 0.98: positive sentiment
 0.0929: Negative Sentiment



 0.999: positive sentiment
 0.292: Negative Sentiment



 0.974: positive sentiment
 0.117: Negative Sentiment



 0.977: positive sentiment
 0.255: Negative Sentiment



 0.937: positive sentiment
 0.183: Negative Sentiment



 0.979: positive sentiment
 0.0915: Negative Sentiment



 0.956: positive sentiment
 0.23: Negative Sentiment





In [50]:
test_text = """
The whole biscuits were crushed inside the box. only few biscuits were ok. paying so much money for the product, but not geting it in a proper way. No point of ordering food items from Amazon
"""
predict_sentiment(test_text)

 0.104: Negative Sentiment
