In [0]:
from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from google_drive_downloader import GoogleDriveDownloader as gdd
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

In [2]:
# Select the compute device: the GPU when PyTorch can see one, otherwise the CPU.
# NOTE(review): no cell visible in this notebook moves the model or the batches
# onto `device` - confirm whether that is intentional.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
device

device(type='cuda')

In [3]:
# Local path of the reviews CSV. The Google Drive download below only runs when
# no file exists at that path, so re-running this cell does not fetch it again.
DATA_PATH = 'data/imdb_reviews.csv'
if not Path(DATA_PATH).is_file():
  gdd.download_file_from_google_drive(
      file_id='1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
      dest_path = DATA_PATH,
  )

Downloading 1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz into data/imdb_reviews.csv... Done.


In [6]:
# View some example records: re-read the CSV and display 5 randomly chosen rows.
# No random_state is passed, so the rows shown differ from run to run.
pd.read_csv(DATA_PATH).sample(5)

Unnamed: 0,review,label
1208,"Very odd, this seems like a very average movie...",0
4815,I am not afraid of bad movies. I like bad movi...,0
51150,...though for a film that seems to be trying t...,0
48832,I've seen this film criticized with the statem...,1
18263,I've seen the 1973 movie Lost Horizons and rea...,1


In [23]:
# Number of whitespace-separated tokens in the first row's first column
# (presumably the review text - confirm against the CSV's column order).
len(pd.read_csv(DATA_PATH).iloc[0, 0].split())

168

In [0]:
class Sequences(Dataset):
  """Bag-of-words dataset built from a CSV of labelled reviews.

  The CSV at `path` must provide a `review` column (raw text) and a `label`
  column. Reviews are turned into token-count vectors by a `CountVectorizer`
  fitted on the whole file.

  Attributes set by the constructor:
    vectorizer: the fitted `CountVectorizer`.
    sequences: document-term count matrix, one row per review.
    labels: list of labels, aligned with the rows of `sequences`.
    token2idx: token -> column index mapping (the vectorizer's vocabulary).
    idx2token: the inverse mapping, column index -> token.
  """

  def __init__(self, path):
    frame = pd.read_csv(path)
    # English stop words are dropped, as are tokens found in more than 99% or
    # in fewer than 0.5% of the reviews.
    self.vectorizer = CountVectorizer(
        stop_words='english',
        max_df=0.99,
        min_df=0.005,
    )
    reviews = frame.review.tolist()
    self.sequences = self.vectorizer.fit_transform(reviews)
    self.labels = frame.label.tolist()
    self.token2idx = self.vectorizer.vocabulary_
    # Invert the vocabulary so a column index can be mapped back to its token.
    self.idx2token = dict(zip(self.token2idx.values(), self.token2idx.keys()))

  def __getitem__(self, i):
    """Return `(counts, label)` for row `i`; `counts` is a dense (1, vocab) array."""
    row = self.sequences[i, :]
    return row.toarray(), self.labels[i]

  def __len__(self):
    """Return the number of reviews, i.e. rows of the count matrix."""
    n_rows = self.sequences.shape[0]
    return n_rows

In [64]:
# Vectorise the whole corpus once, then serve it in mini-batches of 4096.
dataset = Sequences(DATA_PATH)
# shuffle=True draws a fresh random order every epoch; without it each epoch
# replays identical batches in file order, which biases the gradient steps if
# the CSV happens to be sorted (for example by label).
train_loader = DataLoader(dataset, batch_size=4096, shuffle=True)

# Sanity check: the feature array of a single item is (1, vocabulary size).
print(dataset[5][0].shape)

(1, 3028)


In [0]:
class BagOfWordsClassifier(nn.Module):
  """Three-layer feed-forward classifier over bag-of-words count vectors.

  Args:
    vocab_size: number of input features.
    hidden1: width of the first hidden layer.
    hidden2: width of the second hidden layer.

  The network ends in a single output unit and returns it as a raw logit;
  no sigmoid is applied inside the model.
  """

  def __init__(self, vocab_size, hidden1, hidden2):
    super().__init__()
    self.fc1 = nn.Linear(vocab_size, hidden1)
    self.fc2 = nn.Linear(hidden1, hidden2)
    self.fc3 = nn.Linear(hidden2, 1)

  def forward(self, inputs):
    """Map count vectors to one logit per sample.

    `inputs` has dimension 1 removed when that dimension has size 1, and is
    cast to float before the first linear layer.
    """
    features = inputs.squeeze(1).float()
    hidden = F.relu(self.fc1(features))
    hidden = F.relu(self.fc2(hidden))
    logits = self.fc3(hidden)
    return logits

In [58]:
# Build the classifier: one input feature per vocabulary token, followed by
# hidden layers of 128 and 64 units. The bare `model` line displays its layers.
model = BagOfWordsClassifier(len(dataset.token2idx), 128, 64)
model

BagOfWordsClassifier(
  (fc1): Linear(in_features=3028, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)

In [0]:
# Binary cross-entropy computed directly on the model's raw logits.
criterion = nn.BCEWithLogitsLoss()
# Adam over the parameters that require gradients, learning rate 0.001.
optimizer = optim.Adam(
    filter(lambda param: param.requires_grad, model.parameters()),
    lr=1e-3,
)

In [60]:
# List the shape of every trainable parameter tensor of the model.
for param in model.parameters():
  if not param.requires_grad:
    continue
  print(param.shape)

torch.Size([128, 3028])
torch.Size([128])
torch.Size([64, 128])
torch.Size([64])
torch.Size([1, 64])
torch.Size([1])


In [61]:
# Train for 3 epochs with gradient-norm clipping, showing a per-batch progress bar.
model.train()
train_losses = []  # mean batch loss of each epoch, in order
for epoch in range(3):
  progress_bar = tqdm_notebook(train_loader, leave=False)
  losses = []
  for inputs, target in progress_bar:
    model.zero_grad()

    output = model(inputs)
    # Drop only the trailing logit dimension. A bare squeeze() would also
    # remove the batch dimension of a one-sample final batch, and the loss
    # would then reject the shape mismatch with the target.
    loss = criterion(output.squeeze(-1), target.float())

    loss.backward()

    # Rescale the gradients so that their overall norm does not exceed 3.
    nn.utils.clip_grad_norm_(model.parameters(), 3)

    optimizer.step()

    batch_loss = loss.item()
    progress_bar.set_description(f'Loss: {batch_loss:.3f}')
    losses.append(batch_loss)

  # Unweighted mean over the batches of this epoch.
  epoch_loss = sum(losses) / len(losses)
  train_losses.append(epoch_loss)

  tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')



HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch #1	Train Loss: 0.719


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch #2	Train Loss: 0.683


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

Epoch #3	Train Loss: 0.663


In [62]:
# Train for 3 more epochs on the same model and optimizer. Unlike the previous
# cell, gradients are not clipped and no progress bar is shown.
model.train()
train_losses = []  # restarted here: mean batch loss of each epoch of this run
for epoch in range(3):
  losses = []
  for inputs, target in train_loader:
    model.zero_grad()

    output = model(inputs)
    # Drop only the trailing logit dimension. A bare squeeze() would also
    # remove the batch dimension of a one-sample final batch, and the loss
    # would then reject the shape mismatch with the target.
    loss = criterion(output.squeeze(-1), target.float())

    loss.backward()

    optimizer.step()

    losses.append(loss.item())

  # Unweighted mean over the batches of this epoch.
  epoch_loss = sum(losses) / len(losses)
  train_losses.append(epoch_loss)

  print(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')



Epoch #1	Train Loss: 0.609
Epoch #2	Train Loss: 0.525
Epoch #3	Train Loss: 0.441


In [0]:
def predict_sentiment(text):
  """Print the model's sentiment score for `text` together with a verdict.

  The text is vectorised with the vectorizer held by the global `dataset`,
  passed through the global `model` (switched to eval mode, no gradient
  tracking), and the sigmoid of the resulting logit is printed. Scores above
  0.5 are reported as positive, everything else as negative.
  """
  model.eval()
  with torch.no_grad():
    counts = dataset.vectorizer.transform([text]).toarray()
    test_vector = torch.LongTensor(counts)
    prediction = torch.sigmoid(model(test_vector)).item()

  message = (
      f'{prediction:0.3}: Positive sentiment'
      if prediction > 0.5
      else f'{prediction:0.3}: Negative sentiment'
  )
  print(message)

In [69]:
# Example 1: a hand-written review, scored by the trained model.
test_text = """
This poor excuse for a movie is terrible. It has been 'so good it's bad' for a
while, and the high ratings are a good form of sarcasm, I have to admit. But
now it has to stop. 
"""
predict_sentiment(test_text)

0.312: Negative sentiment


In [70]:
# Example 2: a short review, scored by the trained model.
test_text = """
Cool Cat Saves The Kids is a symbolic masterpiece directed by Derek Savage that
is not only satirical in the way it makes fun of the media and politics, but in
the way in questions as how we humans live life and how society tells us to
live life.
"""
predict_sentiment(test_text)

0.592: Positive sentiment


In [71]:
# Example 3: a longer, multi-paragraph review, scored by the trained model.
test_text = """
Don't let any bullies out there try and shape your judgment on this gem of a
title.

Some people really don't have anything better to do, except trash a great movie
with annoying 1-star votes and spread lies on the Internet about how "dumb"
Cool Cat is.

I wouldn't be surprised to learn if much of the unwarranted negativity hurled
at this movie is coming from people who haven't even watched this movie for
themselves in the first place. Those people are no worse than the Butch the
Bully, the film's repulsive antagonist.

As it just so happens, one of the main points of "Cool Cat Saves the Kids" is
in addressing the attitudes of mean naysayers who try to demean others who
strive to bring good attitudes and fun vibes into people's lives. The message
to be learned here is that if one is friendly and good to others, the world is
friendly and good to one in return, and that is cool. Conversely, if one is
miserable and leaving 1-star votes on IMDb, one is alone and doesn't have any
friends at all. Ain't that the truth?

The world has uncovered a great, new, young filmmaking talent in "Cool Cat"
creator Derek Savage, and I sure hope that this is only the first of many
amazing films and stories that the world has yet to appreciate.

If you are a cool person who likes to have lots of fun, I guarantee that this
is a movie with charm that will uplift your spirits and reaffirm your positive
attitudes towards life.
"""
predict_sentiment(test_text)

0.771: Positive sentiment


[-] sparse matrix CSR from  
[-] training 5 steps  
[-] nn.utils.clip_grad_norm_(model.parameters(), 3)  
