<a href="https://colab.research.google.com/github/rushikeshnaik779/PracticeForNLP/blob/main/pytorch_BAGOFWORDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google_drive_downloader import GoogleDriveDownloader as gdd 

from pathlib import Path

In [21]:
import pandas as pd
import torch 
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset 
from sklearn.feature_extraction.text import CountVectorizer 
from tqdm import tqdm_notebook, tqdm

In [5]:
data_path = 'data/imdb_reviews.csv'
if not Path(data_path).is_file():
    gdd.download_file_from_google_drive(
        file_id = '1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
        dest_path=data_path
    )

Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... Done.


In [8]:
pd.read_csv(data_path).sample(4)

Unnamed: 0,review,label
59492,This is one of my two or three favorite Stooge...,1
52138,After the SuperFriends and Scooby Doo left the...,1
40309,A funny comedy from beginning to end! There ar...,1
35212,As horror fans we all know that blind rentals ...,0


In [11]:
class Sequences(Dataset):

    def __init__(self, path):
        df = pd.read_csv(path)
        self.vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
        self.sequences = self.vectorizer.fit_transform(df.review.tolist())
        self.labels = df.label.tolist()
        self.token2idx = self.vectorizer.vocabulary_ 
        self.idx2token = { idx : token for token, idx in self.token2idx.items()}


    
    def __getitem__(self, i):
        return self.sequences[i,: ].toarray(), self.labels[i]

    
    def __len__(self):
        return self.sequences.shape[0]

In [12]:
dataset = Sequences(data_path)
train_loader = DataLoader(dataset, batch_size=4096)
print(dataset[5][0])
print(dataset[5][0].shape)

[[0 0 0 ... 0 0 0]]
(1, 3028)


In [13]:
# MODEL DEFINITION 
class BagOfWordsClassifier(nn.Module):
    def __init__(self, vocab_size, hidden1, hidden2):
        super(BagOfWordsClassifier, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [14]:
model = BagOfWordsClassifier(len(dataset.token2idx), 128, 64)
model

BagOfWordsClassifier(
  (fc1): Linear(in_features=3028, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [15]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [16]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [22]:
model.train()

train_losses = []
for epoch in range(10):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []
    total = 0

    for inputs, target in progress_bar : 
        model.zero_grad()

        output = model(inputs)
        loss = criterion(output.squeeze(), target.float())

        loss.backward()

        nn.utils.clip_grad_norm(model.parameters(), 3)

        optimizer.step()
        progress_bar.set_description(f'loss: {loss.item():.3f}')

        losses.append(loss.item())
        total += 1

    
    epoch_loss = sum(losses)/total
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\ train loss :{ epoch_loss : .3f}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))



Epoch #1\ train loss : 0.668


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #2\ train loss : 0.640


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #3\ train loss : 0.585


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #4\ train loss : 0.515


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #5\ train loss : 0.444


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #6\ train loss : 0.388


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #7\ train loss : 0.348


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #8\ train loss : 0.322


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #9\ train loss : 0.305


HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

Epoch #10\ train loss : 0.293


In [23]:
def predict_sentiment(text):
    model.eval()
    with torch.no_grad():
        test_vector = torch.LongTensor(dataset.vectorizer.transform([text]).toarray())

        output = model(test_vector)
        prediction = torch.sigmoid(output).item()

        if prediction > 0.5:
            print(f'{ prediction: 0.3}: Positive Sentiment')
        
        else :
            print(f'{prediction : 0.3}: Negative Sentiment')


test_text = ""

In [24]:
test_text="""
from Bryan, OH, panned it, noting that it “does not come with pocket protector, or spare dignity … If you’re a whiny, blond, teenage farmer, this jacket is for you. However you’ll be forever banished to the ‘friend zone’ by every girl you see, or worse, the ‘brother zone.’ But you’ll always have Yavin!
"""

In [25]:
predict_sentiment(test_text)

 0.36: Negative Sentiment


In [28]:
test_text='very Awesome product but not good packing'
predict_sentiment(test_text)

 0.573: Positive Sentiment
