# BoW Classifier for sentiment classification

In [1]:
import re
import pandas as pd
import numpy as np

## Pytorch Imports
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data
import torch.nn.functional as F
import torch.optim as optim

## NLP Libraries
import spacy
from sklearn.model_selection import train_test_split
from nltk import download
import gensim
from nltk.corpus import stopwords

spacy_en = spacy.load('en')
download('stopwords')

Using TensorFlow backend.
  return f(*args, **kwds)


[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


False

## 1. Classifier architecture

## 2. Dataset
- more information here: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews
- 0 - negative    
- 1 - positive  

In [2]:
train = pd.read_pickle('train.pkl')
train.drop(labels='phrase_preprocessed', inplace=True,axis=1)
train.head()

Unnamed: 0,phrase,sentiment,sentiment_simple
0,A series of escapades demonstrating the adage ...,1,NEG
1,"This quiet , introspective and entertaining in...",4,POS
2,"Even fans of Ismail Merchant 's work , I suspe...",1,NEG
3,A positively thrilling combination of ethnogra...,3,POS
4,Aggressive self-glorification and a manipulati...,1,NEG


In [3]:
test = pd.read_pickle('test.pkl')
test.drop(labels='phrase_preprocessed', inplace=True,axis=1)
test.head()

Unnamed: 0,phrase
0,An intermittently pleasing but mostly routine ...
1,Kidman is really the only thing that 's worth ...
2,Once you get into its rhythm ... the movie bec...
3,I kept wishing I was watching a documentary ab...
4,"Kinnear does n't aim for our sympathy , but ra..."


## 3. Preprocessing

In [4]:
def transformText(text, do_stop=False, do_stem=False):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stops]
    else:
        filtered_words = [word for word in text.split()]

    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [5]:
train['phrase_preprocessed']=train['phrase'].apply(lambda x: transformText(x,do_stop=False, do_stem=False))
train.head()

Unnamed: 0,phrase,sentiment,sentiment_simple,phrase_preprocessed
0,A series of escapades demonstrating the adage ...,1,NEG,series escapades demonstrating the adage that ...
1,"This quiet , introspective and entertaining in...",4,POS,this quiet introspective and entertaining inde...
2,"Even fans of Ismail Merchant 's work , I suspe...",1,NEG,even fans ismail merchant work suspect would h...
3,A positively thrilling combination of ethnogra...,3,POS,positively thrilling combination ethnography a...
4,Aggressive self-glorification and a manipulati...,1,NEG,aggressive self glorification and manipulative...


In [6]:
test['phrase_preprocessed']=test['phrase'].apply(lambda x: transformText(x,do_stop=False, do_stem=False))
test.head()

Unnamed: 0,phrase,phrase_preprocessed
0,An intermittently pleasing but mostly routine ...,intermittently pleasing but mostly routine effort
1,Kidman is really the only thing that 's worth ...,kidman really the only thing that worth watchi...
2,Once you get into its rhythm ... the movie bec...,once you get into its rhythm the movie becomes...
3,I kept wishing I was watching a documentary ab...,kept wishing was watching documentary about th...
4,"Kinnear does n't aim for our sympathy , but ra...",kinnear does n t aim for our sympathy but rath...


## 4. Train/Test split, Vocab

In [7]:
test_size = 0.2

In [26]:
x_train, x_valid, y_train, y_valid = train_test_split(train['phrase_preprocessed'],
                                                      train['sentiment_simple'], 
                                                      test_size=0.2)

In [27]:
x_test=test['phrase_preprocessed']
x_test[0:5]

0    intermittently pleasing but mostly routine effort
1    kidman really the only thing that worth watchi...
2    once you get into its rhythm the movie becomes...
3    kept wishing was watching documentary about th...
4    kinnear does n t aim for our sympathy but rath...
Name: phrase_preprocessed, dtype: object

In [28]:
## Build Vocabulary
word_to_ix = {}
for sent in list(x_train) + list(x_valid) + list(x_test):
    for word in sent.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

In [29]:
word_to_ix

{'starts': 0,
 'intense': 1,
 'political': 2,
 'and': 3,
 'psychological': 4,
 'thriller': 5,
 'but': 6,
 'sabotaged': 7,
 'ticking': 8,
 'time': 9,
 'bombs': 10,
 'other': 11,
 'hollywood': 12,
 'action': 13,
 'cliches': 14,
 'while': 15,
 'kids': 16,
 'will': 17,
 'probably': 18,
 'eat': 19,
 'the': 20,
 'whole': 21,
 'thing': 22,
 'most': 23,
 'adults': 24,
 'way': 25,
 'ahead': 26,
 'plot': 27,
 'hoot': 28,
 'watching': 29,
 'rock': 30,
 'chomp': 31,
 'jumbo': 32,
 'ants': 33,
 'pull': 34,
 'arrow': 35,
 'out': 36,
 'his': 37,
 'back': 38,
 'leap': 39,
 'unscathed': 40,
 'through': 41,
 'raging': 42,
 'fire': 43,
 'how': 44,
 'washed': 45,
 'despite': 46,
 'all': 47,
 'that': 48,
 'project': 49,
 'prime': 50,
 'mystery': 51,
 'laggard': 52,
 'drama': 53,
 'wending': 54,
 'its': 55,
 'uninspired': 56,
 'philosophical': 57,
 'epiphany': 58,
 'feels': 59,
 'like': 60,
 'cold': 61,
 'old': 62,
 'man': 63,
 'going': 64,
 'motions': 65,
 'path': 66,
 'may': 67,
 'familiar': 68,
 'first':

In [30]:
print("Vocab size = {}".format(len(word_to_ix)))

Vocab size = 17454


In [31]:
label_to_ix = { "NEG": 0, "POS": 1 }

## 5. Making dataset iterable

In [32]:
batch_size = 50
n_iters = 3000
num_epochs = n_iters/(len(x_train) / batch_size)
num_epochs = int(num_epochs)

In [52]:
## iterable datasets
train_data=list(zip(x_train,y_train))
train_data

[('starts intense political and psychological thriller but sabotaged ticking time bombs and other hollywood action cliches',
  'NEG'),
 ('while kids will probably eat the whole thing most adults will way ahead the plot',
  'NEG'),
 ('hoot watching the rock chomp jumbo ants pull arrow out his back and leap unscathed through raging fire',
  'POS'),
 ('but how washed out despite all that the project prime mystery', 'NEG'),
 ('laggard drama wending its way uninspired philosophical epiphany', 'NEG'),
 ('feels like cold old man going through the motions', 'NEG'),
 ('while the path may familiar first time director denzel washington and top notch cast manage keep things interesting',
  'POS'),
 ('empty shell epic rather than the real deal', 'NEG'),
 ('reeboir varies between sweet smile and angry bark while said attempts wear down possible pupils through repetition',
  'NEG'),
 ('cloaks familiar anti feminist equation lrb career kids misery rrb tiresome romantic comedy duds',
  'NEG'),
 ('the d

In [54]:
valid_data=list(zip(x_valid,y_valid))
valid_data

[('compassionate moving portrait american lrb and america rrb always reaching for something just outside his grasp',
  'POS'),
 ('this story still seems timely and important', 'POS'),
 ('rewarding', 'POS'),
 ('probes light hearted way the romantic problems individuals for whom the yearning for passion spells discontent',
  'POS'),
 ('all three actresses are simply dazzling particularly balk who finally been given part worthy her considerable talents',
  'POS'),
 ('when not wallowing hormonal melodrama real women have curves sweet honest and enjoyable comedy drama about young woman who wants many things life but fears she ll become her mother before she gets fulfill her dreams',
  'POS'),
 ('lot more dimensional and complex than its sunny disposition would lead you believe',
  'POS'),
 ('superbly photographed and staged mendes with series riveting set pieces the likes which mainstream audiences have rarely seen',
  'POS'),
 ('all all reign fire will good lrb successful rrb rental', 'NEG

## 6. Model -  BoW Classifier

In [34]:
class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        
        ## Defining parameters for linear model
        self.linear = nn.Linear(vocab_size, num_labels)
    
    def forward(self, bow_vec):
        ## do the foward pass and implement non-linearity
        return F.log_softmax(self.linear(bow_vec))

In [35]:
def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence.split():
        vec[word_to_ix[word]] += 1
    return vec.view(1,-1)

In [36]:
def make_target(label, label_to_idx):
    return torch.LongTensor([label_to_idx[label]])

In [38]:
n=5118
sample_phrase=make_bow_vector(x_train[n],word_to_ix)
print(">> SENTENCE: {}".format(x_train[n]))
print(">> SENTIMENT: {}".format(y_train[n]))
print(">> INPUT SIZE: {}".format(sample_phrase.size()))
sample_phrase

>> SENTENCE: feeble comedy
>> SENTIMENT: NEG
>> INPUT SIZE: torch.Size([1, 17454])



    0     0     0  ...      0     0     0
[torch.FloatTensor of size 1x17454]

## 7. Training

In [39]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)

In [40]:
model = BoWClassifier(NUM_LABELS,VOCAB_SIZE)

In [41]:
for param in model.parameters():
    print(param)

Parameter containing:
1.00000e-03 *
-6.8583  6.4013  2.0233  ...   2.2908  0.2709 -3.6443
 6.1853  6.9782  1.0960  ...   7.3783 -1.8122  1.5992
[torch.FloatTensor of size 2x17454]

Parameter containing:
1.00000e-03 *
  5.8047
  0.0330
[torch.FloatTensor of size 2]



In [42]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = optim.SGD(params = model.parameters(), lr = learning_rate)

In [70]:
iter = 0
for epoch in range(num_epochs):
    for (sent,label) in train_data:
        # Step 1 - clear the gradients
        model.zero_grad()
        optimizer.zero_grad()
        
        ## Step 2- Prepare input and label
        bow_vec = Variable(make_bow_vector(sent, word_to_ix))
        target = Variable(make_target(label, label_to_ix))
        
        # Step 3 - Run forward pass
        output = model(bow_vec)
        #print("Log probabilities - {}".format(log_probs))
        
        # Step 4 - Compute loss, gradients, update parameters
        loss = loss_function(output, target)
        loss.backward()
        optimizer.step()
        
#        iter+=1
#  
#
#        if iter % 100 == 0:
#            correct = 0
#            total = 0
#            for (sent,label) in valid_data:
#                bow_vec = Variable(make_bow_vector(sent, word_to_ix))
#                target = Variable(make_target(label, label_to_ix))
#                output = model(bow_vec)
#                _,predicted = torch.max(output.data,1)
#                total += target.size(0)
#                correct += (predicted == label).sum()
#            accuracy = 100 * correct/total
#            print('Iterations: {}. Loss: {}. Accuracy: {}'.format(iter,loss.data[0],accuracy))

In [91]:
n=8
bow_vec = Variable(make_bow_vector(valid_data[n][0], word_to_ix))
print("-"*20 + " INPUT "+"-"*20)
print("LABEL = {}".format(valid_data[n][1]))
print("SENTENCE = {}".format(valid_data[n][0]))
print("-"*20 + " PREDICTION "+"-"*20)
log_probs = model(bow_vec)
_,predicted = torch.max(log_probs.data,1)
print("PRED = {}".format(predicted[0]))
print("LOG_PROB = {}".format(log_probs))
print("PROBS = {}".format(F.softmax(log_probs)))

-------------------- INPUT --------------------
LABEL = NEG
SENTENCE = all all reign fire will good lrb successful rrb rental
-------------------- PREDICTION --------------------
PRED = 0
LOG_PROB = Variable containing:
-0.4364 -1.0395
[torch.FloatTensor of size 1x2]

PROBS = Variable containing:
 0.6464  0.3536
[torch.FloatTensor of size 1x2]



## References:
- https://github.com/ilkarman/DeepLearningFrameworks/blob/lstm/LSTM_PyTorch_IMDB.ipynb
- https://www.kaggle.com/himako/lb-0-424-naive-lstm-model-and-glove-embeddings 
- https://github.com/vanzytay/pytorch_sentiment_rnn