
# Building a Sentiment Classifier

In [265]:
# Necessary imports for the code to function
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation, Dropout
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split

Defining global variables which we will measure and use later

In [266]:
global uniq_train_words
uniq_train_words=set()


In [267]:
from sklearn.datasets import load_files
# Read training files
reviews_set = load_files("aclImdb/data/")
# Lets get training reviews and training labels in sepearate lists
reviews, labels = reviews_set.data, reviews_set.target

# Let's understand the two lists: reviews (text_train) and their labels (y_train)
print("Data type of reviews: ",type(reviews))
print("Total number of reviews: ",len(reviews))
print("Text at index 6:\n ", reviews[6])
print("Label of the review at Index 6: ",labels[6])
# 0 for negative and 1 for positive

Data type of reviews:  <class 'list'>
Total number of reviews:  2002
Text at index 6:
  b'I thought this was a very clunky, uninvolving version of a famous Australian story. Heath Ledger and Orlando Bloom were very good in their roles, and gave their characters some personality; but the whole thing felt forced and mechanical.<br /><br />The beginning could have been a lot more involving; perhaps starting with a shootout, and then flashing back for a recap of how they got there or that sort of thing. And I felt like every scene was routinely predictable and signposted, like a very bad tv soap.<br /><br />I was really looking forward to this movie, and hoping for something a lot better. The only thing I can say in its favour is that it beats the Mick Jagger version, but not by much.'
Label of the review at Index 6:  0


In [268]:

train_rev, test_rev,train_lbls, test_lbls = train_test_split(reviews, labels, test_size=0.33)

 <h4>We need to get unique words to determine the vocabulary size based on the words in the training set.

In [269]:
import nltk
import string
from nltk.stem import PorterStemmer

#Custom function for text noise removal and stemming
def noise_removal(text_data, isTraining=True):
    
    stemmer = PorterStemmer()
    punctuation_list=list(string.punctuation)
    # additional punctuation marks
    punctuation_list.append("--") 
    punctuation_list.append("''")
    punctuation_list.append("``")
    text_data=text_data.replace("<br />", " ") 
    # some  stop words
    stop_words=['a','an','he','she','it','am','will','have','has','i','you','me','\'s','``','\'','(',')','*****','...']
    noise_list=punctuation_list +stop_words
    # making sure in testing we are keeping only words in training and removing noise in both training and testing
    filtered_words=[stemmer.stem(w) for w in nltk.word_tokenize(text_data) if w not in noise_list]
       
    if isTraining==True:
        for w in filtered_words:
            uniq_train_words.add(w )
    else:# during testing
        filtered_words=[w for w in filtered_words if w in uniq_train_words]
        
    return  " ".join(filtered_words) # convert the list of tokenize words to a string

In [270]:
max_length = 300

In [271]:

def pre_process(reviews,isTraining=True):
    reviews= [noise_removal(text.decode(),isTraining) for text in reviews]
    # Integer encode the documents
   
    vocab_size=len(uniq_train_words)
    print(vocab_size)
    
    print (reviews[0])
    encoded_reviews = [one_hot(review, vocab_size) for review in reviews]
    
    # pad documents to a max length of n words
    padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
    return padded_reviews

In [272]:
train_padded=pre_process(train_rev)
print (train_padded[1])

15304
Me be of irish origin love thi movi not onli wa the guy hot and funni wa also sincer and honest I love the girl who fell in love with too wa pretti they were such cute coupl the end wa so sad love thi movi although is littl dirti remind of british or irish version of prime If like thi movi should watch prime same stori line young guy fall for older women older women fall for young guy to A lot of path cross in the end the best decis is made or task is complet Do n't anyth els to say without ruin the whole movi all though I thought the french guy wa ugli less appeal to umm if like irish movi I would recommend circl of friend that movi is so good quick quot might not get unless watch well that my dinner ruin lol
[ 6751  2978  1739  9645  2030 11927  6353  2605 12127 14276  9613 15277
  2793  9645  9330  9703  8512  7867  9214 10253  7612  8891  4124  4848
 14968 12333   928  4124  2140 13995 11033  2612  4790 14968 14276  1704
  6751 12608 11927 15026 12333  2605  5856  7192 12333 

In [285]:
# Total vocabulary size
len(uniq_train_words)

15304

<H4>We have completed our pre-processing, it is now time to build the neural network based classifier. 

In [295]:
#from keras import models
vocab_size=len(uniq_train_words)
# define the model
model = Sequential()
# Define the embedding matrix dimensions. Each vector is of 8 dimensions and there will be total of vocab_size vectors
# The input length (window) is 300 words so the output from embedding layer will be a conactenated (flattened) vector of 
# 2400 dimensions
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(units=2400, activation='relu'))
model.add(Dropout(0.40))
model.add(Dense(units=1, activation='sigmoid'))
# compile the model with optimization algorithm and binary cross entropy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_25 (Embedding)     (None, 300, 8)            122432    
_________________________________________________________________
flatten_25 (Flatten)         (None, 2400)              0         
_________________________________________________________________
dense_54 (Dense)             (None, 2400)              5762400   
_________________________________________________________________
dropout_30 (Dropout)         (None, 2400)              0         
_________________________________________________________________
dense_55 (Dense)             (None, 1)                 2401      
Total params: 5,887,233
Trainable params: 5,887,233
Non-trainable params: 0
_________________________________________________________________
None


In [309]:
# Fit the model...I am trying with batch_size=30, you can delete it for default batch 
#size or change it to a bigger number
model.fit(train_padded, train_lbls, epochs=20, batch_size=30,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1fc01760550>

In [302]:
#pre process like training set
test_padded=pre_process(test_rev, isTraining=False)
print (test_padded[5])

15304
unwatch you ca n't even make past the first three minut and thi is come from huge adam sandler fan 1
[ 1259 11823  5178  9396  7323  1571  9703  2883 12365  1869  9613 14058
 12602  4124  8195  6867  7985  4124   765  5307  1259 12333  7928  1473
  9703 14673  4124  5456  9703  1231  3768  3357  1739  5245 14904 11621
  9977  6751  1856  3768 13362 14276 14716 13077  9232 12859 12173 14968
  2281 12519  6867  7985  7439  9613  4621  5595 14167 14699  7300  2593
  6253  3768  7439  1231 11069  9943  4124  7441 11973  7439  9613 10253
  9703  5595  7964  1435   488  8437  1856  3768 14029  4013  1224  6253
  9943  4124  7322  6629 10270  4352  9645  4431 10492  6002 12519  7928
 10138  2141 14699  2883 12365 10665   166  9645 12200 10013   540  8560
 14276  2605  6002 12411  2436  4124  8624  3433  3042  2927  1428  9543
  2605  9645 10678  8195 14673   166 14297  5429 14968  1413  7814  7700
  4124 11863 12333  8629  8104 14968  3744  5178  9396  7323  8011  7894
  9613 14029 1085

In [310]:
# evaluate the model
loss, accuracy = model.evaluate(test_padded, test_lbls, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 71.835098


In [284]:
from sklearn.metrics import classification_report
import numpy as np

predictions = model.predict(test_padded, batch_size=100, verbose=1)
predictions_bool = np.argmax(predictions, axis=1)

print(classification_report(test_lbls, predictions_bool))

              precision    recall  f1-score   support

           0       0.52      1.00      0.68       341
           1       0.00      0.00      0.00       320

    accuracy                           0.52       661
   macro avg       0.26      0.50      0.34       661
weighted avg       0.27      0.52      0.35       661



  'precision', 'predicted', average, warn_for)
