In [43]:
import numpy as np
import pandas as pd
import os
from io import StringIO
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from numpy.random import seed
from sklearn import datasets
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.lda import LDA
from sklearn.pipeline import Pipeline
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.metrics import accuracy_score,confusion_matrix 
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve,auc
from scipy import interp
from itertools import combinations
import pyprind
import re
from nltk import PorterStemmer
from nltk import download
from nltk.corpus import stopwords

In [4]:
# Assemble all text documents into CSV file
pbar = pyprind.ProgBar(50000) # number of documents to read
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = '/home/sandeepubuntu/Downloads/aclImdb/%s/%s' %(s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),'r') as infile:
                txt = infile.read()
                
            df = df.append([[txt,labels[l]]],ignore_index=True)
            pbar.update()
            
            
df.columns = ['review','sentiment']

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:04:42


In [5]:
# Shuffle the date and store it in CSV
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv',index=False)

In [9]:
# Read the CSV
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"Never, ever, have I been as impressed by a fil...",1
1,This piece of garbage belongs in the basement ...,0
2,Arg. The shuffling dinosaurs are back to take ...,0


In [17]:
# Bag of words model example 

count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining and the weather is sweet'])

bags = count.fit_transform(docs)
print(bags)
print(count.vocabulary_)
print(bags.toarray())

# Every column is the tf of the dictionary items

  (0, 2)	1
  (0, 1)	1
  (0, 3)	1
  (0, 5)	1
  (1, 4)	1
  (1, 6)	1
  (1, 1)	1
  (1, 5)	1
  (2, 0)	1
  (2, 4)	1
  (2, 6)	1
  (2, 2)	1
  (2, 1)	2
  (2, 3)	1
  (2, 5)	2
{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [18]:
# Assessing the weights of the words using TF-IDF

tfidf = TfidfTransformer()
np.set_printoptions(precision = 2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [22]:
# Cleaning the data 
print(df.loc[0,'review'][-50:])


, go and see it - the hours float by very quickly!


### Function : Preprocessing Text

In [25]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = re.sub('[\W]+',' ',text.lower()) + \
                        ' '.join(emoticons).replace('-','')
        
    return text

In [27]:
# Example: 
preprocessor(df.loc[0,'review'][-50:])

# Applying preprocessor to the entire dataset
df['review'] = df['review'].apply(preprocessor)

### Function: Tokenizer

In [30]:
def tokenizer(text):
    return text.split()

porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [34]:
# Example
print(tokenizer('runners like running and thus they run'))
print(tokenizer_porter('runners like running and thus they run'))

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']
['runner', 'like', 'run', 'and', 'thu', 'they', 'run']


In [39]:
# Stop words like 'is , are, the, and' 
stop = stopwords.words('english')
[w for w in tokenizer_porter('runners like running and thus they run') if w not in stop]

['runner', 'like', 'run', 'thu', 'run']

### Logistic regression model 

In [47]:
# Define the training and test data 
X_train = df.loc[:25000,'review'].values
y_train = df.loc[:25000,'sentiment'].values
X_test = df.loc[25000:,'review'].values
y_test = df.loc[25000:,'sentiment'].values


tfidf = TfidfVectorizer(strip_accents = None, 
                        lowercase= False,
                        preprocessor = None)


param_grid = [{'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop,None],
               'vect__tokenizer': [tokenizer,tokenizer_porter],
               'clf__penalty': ['l1','l2'],
               'clf__C': [1.0,10.0,100.0]}, 
              
             { 'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop,None],
               'vect__tokenizer': [tokenizer,tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1','l2'],
               'clf__C': [1.0,10.0,100.0]}
             ]


lr_tfidf = Pipeline([ ('vect',tfidf), 
                      ('clf', LogisticRegression(random_state=0))
                    ])

gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid, scoring='accuracy', cv = 5, verbose = 1,n_jobs=-1)

gs_lr_tfidf.fit(X_train,y_train)












Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 27.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 124.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 158.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', '...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1

In [48]:

# Analyzing the model and predictin result
print(gs_lr_tfidf.best_params_)
print('\nCV Best Score: %0.4f'%gs_lr_tfidf.best_score_)
clf  = gs_lr_tfidf.best_estimator_
print('\nTest Score: %0.4f'%( clf.score(X_test,y_test)))




{'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f376fabce18>}

CV Best Score: 0.8927

Test Score: 0.8986
