# Sentiment Analysis on IMDB moview review dataset
Poorly rated movies are labeled 0 and highly rated movies are labeled 1. 

### Data loading

In [9]:
import pandas as pd

In [10]:
df_train = pd.read_csv('train.csv', encoding='iso8859')
df_test = pd.read_csv('test.csv', encoding='iso8859')

In [11]:
df_test.columns = ['id','sentiment','review']

In [12]:
df_train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [13]:
x_train_list = df_train['review'].tolist()
x_test_list = df_test['review'].tolist()

### Clean the dataset 

In [14]:
stopwords = open('stopwords.txt', ).read().splitlines()

In [15]:
import numpy as np
import spacy
import re
import string

nlp = spacy.load("en", disable=['tagger', 'ner'])

parsed_text_train = []
parsed_text_test = []

for i in range(len(x_train_list)):
    for c in string.punctuation:
        text = x_train_list[i].replace(c, "") 
    text = text.lower()
    parsed_text_train.append(nlp(text))

for i in range(len(x_test_list)):
    for c in string.punctuation:
        text = x_test_list[i].replace(c, "") 
    text = text.lower()
    parsed_text_test.append(nlp(text))     

In [16]:
parsed_text_train_filtered = []
parsed_text_test_filtered = []
parsed_text_train_filtered_max = 0
parsed_text_test_filtered_max = 0

for i in range(len(parsed_text_train)):
    inner_list = []
    for token in parsed_text_train[i]:    
        if (str(token).lower() not in stopwords) & (token.is_alpha or token.is_digit):
            inner_list.append(str(token).lower())
    parsed_text_train_filtered.append(inner_list)
    if len(inner_list) > parsed_text_train_filtered_max:
        parsed_text_train_filtered_max = len(inner_list)
        
    
for i in range(len(parsed_text_test)):
    inner_list = []
    for token in parsed_text_test[i]:    
        if (str(token).lower() not in stopwords) & (token.is_alpha or token.is_digit):
            inner_list.append(str(token).lower())
    parsed_text_test_filtered.append(inner_list)    
    if len(inner_list) > parsed_text_test_filtered_max:
        parsed_text_test_filtered_max = len(inner_list)

print("parsed_text_train_filtered_max",parsed_text_train_filtered_max)  
print("parsed_text_test_filtered_max",parsed_text_test_filtered_max)        

parsed_text_train_filtered_max 504
parsed_text_test_filtered_max 456


In [17]:
for idx,i in enumerate(parsed_text_train_filtered[:5]):
    print("Review ",idx)
    print(i," ", end='')
    print("\n")

Review  0
['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'guilty', 'innocent', 'moonwalker', 'biography', 'feature', 'film', 'remember', 'going', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'press', 'obvious', 'message', 'drugs', 'bad', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'going', 'hate', 'boring', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'say', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', '20', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 'wants', 'mj', 'dead', 'bad', 'mj', 'overheard', 'plans', 'nah', 'joe', 'pesci', 'character', 'ranted', 'wanted',

### Turn movie reviews into vectors with pre-computed GloVe word embeddings

In [18]:
word_to_embedding = {}

# we will use the 50-dimensional embedding vectors
with open("./glove.6B.50d.txt", encoding='UTF-8') as f:
    # each row represents a word vector
    for line in f:
        values = line.split()
        # the first part is word
        word = values[0]
        # the rest of the values form the embedding vector
        embedding = np.asarray(values[1:], dtype='float32')
        word_to_embedding[word] = embedding

print('Found %s word vectors.' % len(word_to_embedding))

Found 400000 word vectors.


In [54]:
# i-th movie review should have an embedding given by the i-th row of the 2D array
embedding_dim = 50
vocab_size = len(parsed_text_train_filtered)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for review_idx in range(vocab_size):
    words_found_in_embedding = 0
    for word_idx in range(len(parsed_text_train_filtered[review_idx])):
        if parsed_text_train_filtered[review_idx][word_idx] in word_to_embedding:
            word = parsed_text_train_filtered[review_idx][word_idx] 
            embedding_matrix[review_idx] += word_to_embedding[word]
            words_found_in_embedding += 1
    embedding_matrix[review_idx] /= words_found_in_embedding

train_x = embedding_matrix   

In [53]:
# do the same for test set
vocab_size = len(parsed_text_test_filtered)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for review_idx in range(vocab_size):
    words_found_in_embedding = 0
    for word_idx in range(len(parsed_text_test_filtered[review_idx])):
        if parsed_text_test_filtered[review_idx][word_idx] in word_to_embedding:
            word = parsed_text_test_filtered[review_idx][word_idx] 
            embedding_matrix[review_idx] += word_to_embedding[word]
            words_found_in_embedding += 1
    embedding_matrix[review_idx] /= words_found_in_embedding

test_x = embedding_matrix      

In [61]:
# create train and test labels
train_y = df_train['sentiment'].values
test_y = df_test['sentiment'].values

### Train a polynomial kernel SVM using the review embeddings as feature vectors

In [77]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

num_folds = 5
k_fold = KFold(num_folds)
C_values = np.logspace(-4, 2, 3)
D_values = range(1, 4)

indices = range(train_x.shape[0])

arg_max = None
max_cross_val_score = -np.inf
for C in C_values:
    for d in D_values:
        fold_scores = []
        for train_indices, val_indices in k_fold.split(indices):
            classifier = SVC(kernel='poly',C=C,degree=d,gamma='auto')
            classifier.fit(train_x[train_indices], train_y[train_indices])
            train_y_pred = classifier.predict(train_x[val_indices])
            fold_score = accuracy_score(train_y[val_indices], train_y_pred)
            fold_scores.append(fold_score)
            print('C:', C, '/ d:', d, fold_score)

        cross_val_score = np.mean(fold_scores)
        print('MEAN C:', C, '/ d:', d, cross_val_score)
        if cross_val_score > max_cross_val_score:
            max_cross_val_score = cross_val_score
            arg_max = (C, d)
            
best_C, best_d = arg_max
print(best_C, best_d)

C: 0.0001 / d: 1 0.545
C: 0.0001 / d: 1 0.55
C: 0.0001 / d: 1 0.47
C: 0.0001 / d: 1 0.51
C: 0.0001 / d: 1 0.5125628140703518
MEAN C: 0.0001 / d: 1 0.5175125628140703
C: 0.0001 / d: 2 0.545
C: 0.0001 / d: 2 0.55
C: 0.0001 / d: 2 0.47
C: 0.0001 / d: 2 0.51
C: 0.0001 / d: 2 0.5125628140703518
MEAN C: 0.0001 / d: 2 0.5175125628140703
C: 0.0001 / d: 3 0.545
C: 0.0001 / d: 3 0.55
C: 0.0001 / d: 3 0.47
C: 0.0001 / d: 3 0.51
C: 0.0001 / d: 3 0.5125628140703518
MEAN C: 0.0001 / d: 3 0.5175125628140703
C: 0.1 / d: 1 0.545
C: 0.1 / d: 1 0.55
C: 0.1 / d: 1 0.47
C: 0.1 / d: 1 0.51
C: 0.1 / d: 1 0.5125628140703518
MEAN C: 0.1 / d: 1 0.5175125628140703
C: 0.1 / d: 2 0.545
C: 0.1 / d: 2 0.55
C: 0.1 / d: 2 0.47
C: 0.1 / d: 2 0.51
C: 0.1 / d: 2 0.5125628140703518
MEAN C: 0.1 / d: 2 0.5175125628140703
C: 0.1 / d: 3 0.545
C: 0.1 / d: 3 0.55
C: 0.1 / d: 3 0.47
C: 0.1 / d: 3 0.51
C: 0.1 / d: 3 0.5125628140703518
MEAN C: 0.1 / d: 3 0.5175125628140703
C: 100.0 / d: 1 0.765
C: 100.0 / d: 1 0.73
C: 100.0 / d: 1

In [79]:
model_svm = SVC(kernel='poly', gamma='auto', C=best_C, degree=best_d)
model_svm.fit(train_x, train_y)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [82]:
test_y_predict = model_svm.predict(test_x)
print("Accuracy:",accuracy_score(test_y_predict,test_y))

Accuracy: 0.7429718875502008
