In [1]:
import pandas as pd
import os

In [None]:
# Extracting useful information from IMDb movie reviews dataset

# unable to conda install pyprind due to dependency issues, wait for upgrade

labels = {'pos':1, 'neg':0} # integer class label for positive and negative
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path='./aclImdb/%s/%s' % (s, l) 
        for file in os.listdir(path): # list all file names
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
                df = df.append([[txt, labels[l]]], ignore_index = True) # continuous index
df.columns = ['review', 'sentiment']

In [3]:
# Shuffle the dataframe
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False) # store in csv for convenience

In [2]:
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"Susie Q. is one of those rare, and sweet movie...",1
1,It starts out looking like it may be going som...,0
2,Two films are useful for scaring people to God...,0


In [3]:
# Bag-of-words model
# 1. create vocabulary of unique tokens - e.g. words from the entire set of documents
# 2. construct feature vector from each document that contains the counts of how often each word occurs in
# that particular document

# Transforming words into feature vectors

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer() # takes an array of text data and constructs bag-of-words model
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
bag = count.fit_transform(docs)

In [4]:
print(count.vocabulary_) # maps unique words to integer indices

{'sun': 3, 'and': 0, 'shining': 2, 'is': 1, 'sweet': 4, 'weather': 6, 'the': 5}


In [5]:
print(bag.toarray()) # feature vectors
# each row represents each document, and the count of words indexed according to vocabulary_
# raw term frequencies tf(t,d) - the number of times a term t occurs in a document d
# 1-gram / unigram model - each item or token in vocab represents single word. n-gram (more generally)
# to initialize 2-gram, CountVectorizer with ngram_range=(2,2)

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [6]:
# Assessing word relevancy via term frequency-inverse document frequency (tf-idf)
# assumption: frequently encountered words don't contain useful or discriminatory information
# downweight these words using tf-idf

# tf-idf: product of term frequency and the inverse document frequency:
# tf-idf(t,d) = tf(t,d) x idf(t,d)
# idf(t,d) = log(nd / (1 + df(d,t)))) ,
# where nd = total number of documents and df(d,t) = number of documents that contain term t
# + 1 to ensure non zero deonominator; log to ensure that low document frequencies not given too much weight

# TfidTransformer in Scikit-learn takes raw term frequencies from CountVectorizer as input and transforms them
# into tf-idfs:
# note that formula for calculating tf-idf slightly different in sklearn
# TfidTransformer normalizes tf-idfs directly, using default norm='l2', all vectors sum to 1
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [7]:
# Cleaning text data by stripping off unwanted characters

df.loc[0, 'review'][-50:] # display last 50 characters from first document


'would show it just ONE more time!^_^ Go Susie Q.!!'

In [8]:
# remove punctuation marks but keep emoticon characters
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove html tag [^>] means any character other than >
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) 
    # ?: means do not extract, returns concactenated string in array
    # add emoticon to the end and remove nose character -
    text = re.sub('[\W]+', ' ', text.lower())  + ' '.join(emoticons).replace('-', '') 
    return text

# resource for learning about regex in python:
# https://developers.google.com/edu/python/regular-expressions

In [9]:
# check if preprocessor works correctly
preprocessor(df.loc[0, 'review'][-50:])

'would show it just one more time _ go susie q '

In [10]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [11]:
# Apply preprocessor function to all movie reviews in dataframe
df['review'] = df['review'].apply(preprocessor)

In [12]:
# Processing documents into tokens
# split the text corpora into individual elements
# tokenize documents by spliting them to individual characters
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [13]:
# Another technique is word stemming -  transform a word into its root form that allows us to map related words
# to the same stem.
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [14]:
# stop-word removal
# stop-words: words that are extremely common in texts and bear little useful information, e.g. is, and, has
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/shunji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords
stop = stopwords.words('english')  # english stop words
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [16]:
# Training a logistic regression model for document classification

# divide dataframe of cleaned text documents into 25,000 documents for training and 25,000 documents for testing
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [18]:
# use GridSearchCV object to find the optimal set of parameters for logistic regression model using
# 5-fold stratified cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None, # character accents
                       lowercase=False, # convert to lowercase if True
                       preprocessor=None)

# vect and clf are variable names set by us, access parameter via name__parameter
param_grid = [{'vect__ngram_range': [(1,1)], # unigram
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range':[(1,1)],
             'vect__stop_words': [stop, None],
             'vect__tokenizer': [tokenizer,
                                tokenizer_porter],
             'vect__use_idf':[False], # train model based on raw frequencies
             'vect__norm': [None],
             'clf__penalty': ['l1','l2'],
             'clf__C':[1.0, 10.0, 100.0]}
             ]
lr_tfidf = Pipeline([('vect', tfidf),
                    ('clf', LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy',
                          cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 64.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 278.8min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 358.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__tokenizer': [<function tokenizer at 0x7f40fc92e1e0>, <function tokenizer_porter at 0x7f40fc92e598>], 'vect__ngram_range': [(1, 1)], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',...sn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'], None]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    

In [19]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'vect__tokenizer': <function tokenizer at 0x7f40fc92e1e0>, 'vect__ngram_range': (1, 1), 'clf__C': 10.0, 'vect__stop_words': None, 'clf__penalty': 'l2'} 


In [20]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

CV Accuracy: 0.895


In [21]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.899


In [53]:
# Working with bigger data - online algorithms and out of core learning
import numpy as np
import re
from nltk.corpus import stopwords

# clean unprocessed text data - separate into word tokens and remove stop words
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                          text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [54]:
# reads in and returns one document at a time
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [55]:
# verify that stream_docs works correctly
next(stream_docs(path='./movie_data.csv'))

('"Susie Q. is one of those rare, and sweet movies that give you a warm feeling. It\'s bittersweet, but wholesome, and it\'s characters are fun, and captivating. At first, I thought the movie would be the cliché cuddly movie that would bore me after five minutes, but was I wrong. It made me tear up at times, and it\'s plot was enticing, making me root for the good guys. I loved the movie, and still remember it today, 9 years later!! I recommend it highly to ANYONE, and the movie is family oriented, so you won\'t have to worry about unsuitable content. Truly, if Disney would show more movies that are up to par as Susie Q., it would be the most popular family oriented channel in the world. Now if only Disney would show it just ONE more time!^_^ Go Susie Q.!!"',
 1)

In [60]:
# Take a document stream from stream_docs and return a particular number of documents specified by the size
# parameter

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [61]:
# For out of core vectorizing, use hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

In [62]:
# Start out of core learning
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

In [64]:
# Use the last 5000 documents to evaluate the performance of our model
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.868


In [65]:
# update our model with last 5000 documents
clf = clf.partial_fit(X_test, y_test)

In [66]:
# Serialize fitted scikit-learn estimators
import pickle 
import os
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)