# Sentiment analysis using IMDB data

## Getting the IMDB data

In [2]:
# Data from Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). 
# Learning Word Vectors for Sentiment Analysis. 
# The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

import pyprind 
import pandas as pd
import os 
pybar = pyprind.ProgBar(50000) # 50000 is the number fo files.

label = {'pos':1, 'neg':0}
df_train = pd.DataFrame()
df_test  = pd.DataFrame()
for testTrainFolders in ['test', 'train']: # Load only the test data
    for classesFolder in ['pos', 'neg']:
        path = './aclImdb/%s/%s' %(testTrainFolders, classesFolder)
        for f in os.listdir(path):
            with open(os.path.join(path, f), 'r') as infile:
                text = infile.read()
                if testTrainFolders == "test":
                    df_test = df_test.append([[text, label[classesFolder]]], ignore_index = True)
                    pybar.update()
                elif testTrainFolders == "train":
                    df_train = df_train.append([[text, label[classesFolder]]], ignore_index = True)
                    pybar.update()
                
df_test.columns = ['review', 'sentiment']
df_train.columns = ['review', 'sentiment']

0%                          100%
[##############################] | ETA[sec]: 0.000 
Total time elapsed: 458.021 sec


## Randomize the test data and store it in csv file

In [4]:
import numpy as np
np.random.seed(0)

df_train = df_train.reindex(np.random.permutation(df_train.index))
df_train.to_csv('./movie_train_data.csv', index=False)
df_train = pd.read_csv('./movie_train_data.csv')

df_test  = df_test.reindex(np.random.permutation(df_test.index))
df_test.to_csv('./movie_test_data.csv', index=False)
df_test = pd.read_csv('./movie_test_data.csv')

print df_train.head(3)
print df_test.head(3)

                                              review  sentiment
0  This is a generally nice film, with good story...          1
1  I just accidentally stumbled over this film on...          1
2  This movie is a good example of the extreme la...          0
                                              review  sentiment
0  hello all Denver fans!<br /><br />i couldn't a...          1
1  If you watched this film for the nudity (as I ...          1
2  The opening scene keeps me from rating at abso...          0


## CountVectorizer for getting the text as feature vectors

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
ctVectorizer = CountVectorizer()
allData = ['Sun rises in the east',
          'Sun sets in the west',
          'Japan is called the land of the rising sun.']
featureVectors = ctVectorizer.fit_transform(allData)
print featureVectors.toarray() 
print ctVectorizer.vocabulary_  # Note that CountVectorizer is case insensitive

[[0 1 1 0 0 0 0 1 0 0 1 1 0]
 [0 0 1 0 0 0 0 0 0 1 1 1 1]
 [1 0 0 1 1 1 1 0 1 0 1 2 0]]
{u'land': 5, u'rises': 7, u'sun': 10, u'is': 3, u'sets': 9, u'of': 6, u'rising': 8, u'west': 12, u'in': 2, u'japan': 4, u'the': 11, u'east': 1, u'called': 0}


## Term frequency and Inverse-term frequency: TfidfTransformer

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
norm_featureVectors = transformer.fit_transform(featureVectors.toarray())
print norm_featureVectors.toarray()

[[ 0.          0.55249005  0.42018292  0.          0.          0.          0.
   0.55249005  0.          0.          0.32630952  0.32630952  0.        ]
 [ 0.          0.          0.42018292  0.          0.          0.          0.
   0.          0.          0.55249005  0.32630952  0.32630952  0.55249005]
 [ 0.35934656  0.          0.          0.35934656  0.35934656  0.35934656
   0.35934656  0.          0.35934656  0.          0.21223587  0.42447173
   0.        ]]


## Term frequency and Inverse-term frequency: TfidfVectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfVectorizer = TfidfVectorizer()
print tfVectorizer.fit_transform(allData).toarray()

[[ 0.          0.55249005  0.42018292  0.          0.          0.          0.
   0.55249005  0.          0.          0.32630952  0.32630952  0.        ]
 [ 0.          0.          0.42018292  0.          0.          0.          0.
   0.          0.          0.55249005  0.32630952  0.32630952  0.55249005]
 [ 0.35934656  0.          0.          0.35934656  0.35934656  0.35934656
   0.35934656  0.          0.35934656  0.          0.21223587  0.42447173
   0.        ]]


## Cleaning data: Remove http tags and any non word characters

In [8]:
import re 

def preprocessor(text):
    text = re.sub(r'<[^>]*>', '', text) # Removing the http tags
    text = re.sub(r'\W+', ' ', text) # Removing all non words (even emoticons)
    return text

## Regular tokenizer

In [11]:
def tokenizer(text):
    return text.split()

## Porter tokenizer

In [12]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

print tokenizer_porter("fast fasts fasting")

tfVectorizer = TfidfVectorizer(tokenizer=tokenizer_porter)
print tfVectorizer.fit_transform(['run runs running', 'fast fasts fasting']).toarray()

[u'fast', u'fast', u'fast']
[[ 0.  1.]
 [ 1.  0.]]


## Logistic regression model for training

In [19]:
# Recap
# Data is in df_test. This was read from the files and then shuffled (shuffling was not needed because we are not mixing test and training data.)
# Use TfidfVectorizer to get the feature vectors ==> Use porterStemmer as tokenizer.
# Use logistic regression for fitting. 
# Use the learned logistic regression to predict.

# Before we create the pipeline, we need to get the data in numpy array from df_test.
# X_test should be a list of strings. This will get converted to n_sample X n_features numpy_array by TfidfVectorizer.
# y_test sould be n_sample numpy array.

X_train = df_train.loc[:10000, "review"].values     ## 25000 data points taking too much time. 
y_train = df_train.loc[:10000, "sentiment"].values

X_test = df_test.loc[:10000, "review"].values
y_test = df_test.loc[:10000, "sentiment"].values

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV


#param_grid = [{'vect__ngram_range'  : [(1,1)], 
#               'vect__tokenizer'    : [tokenizer_porter, tokenizer],
#               'vect__preprocessor' : [None, preprocessor],
#               'lr__penalty'        : ['l2'],
#               'lr__C'              : [1.0, 10]}]

param_grid = [{'vect__tokenizer'    : [tokenizer_porter, tokenizer],
               'lr__C'              : [1.0, 10]}]

tfidf_lr_pipeline = Pipeline([('vect', TfidfVectorizer(preprocessor=preprocessor)), 
                              ('lr', LogisticRegression(random_state=0))])
gs_tfidf_lr = GridSearchCV(tfidf_lr_pipeline, 
                           param_grid,
                           scoring='accuracy',
                           cv=5, 
                           verbose=1)

print "Starting to fit"
gs = gs_tfidf_lr.fit(X_train, y_train )
print "Done fitting"

Starting to fit
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Done fitting


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  4.6min finished


In [24]:
print ("Best score %.3f" % gs.best_score_)
print ("Best parameters %s" % gs.best_params_)

clf = gs.best_estimator_
print ("Test accuracy %.3f" % clf.score(X_test, y_test))

Best score 0.879
Best parameters {'vect__tokenizer': <function tokenizer at 0x8f30758>, 'lr__C': 10}
Test accuracy 0.877
