## Logistic Regression based classification

#### Import data from the IMDB dataset (training and test sets)

In [1]:
import pyprind
import pandas as pd
import os
# Read every review file under ./aclImdb/{test,train}/{pos,neg} into one
# DataFrame with columns 'review' (raw text) and 'sentiment' (1 = pos, 0 = neg).
pbar = pyprind.ProgBar(50000)
labels = {'pos': 1, 'neg': 0}
# Collect rows in a plain list and build the DataFrame once: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s, l)
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'rt', encoding="utf8") as infile:
                rows.append([infile.read(), labels[l]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
# Save for use in the later SGD step. The rows above are grouped by class, and
# the SGD step consumes the file sequentially (first lines for training, last
# lines for testing), so write a shuffled copy; `df` itself is left untouched.
df.sample(frac=1, random_state=0).to_csv('./movie_data.csv', index=False)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:02:34


#### Preprocess text and convert it into a bag of words

import numpy as np
# Shuffle the rows with a fixed seed so the ordering is repeatable.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

In [3]:
import re
def preprocessor(text):
    """Return *text* lower-cased with markup and punctuation removed.

    HTML-like tags are dropped, every run of non-word characters is replaced
    by a single space, and any emoticons found in the text (e.g. ``:)``,
    ``;-(``, ``=D``) are appended at the end with their ``-`` nose removed.
    """
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    smileys = ' '.join(emoticons).replace('-', '')
    # When the text ends in a word character there is no trailing space, so
    # add one; otherwise the first emoticon would be glued to the last word.
    if smileys and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + smileys

In [4]:
# Clean every review in place using preprocessor().
df['review'] = df['review'].map(preprocessor)

In [None]:
np.set_printoptions(precision=2)

# df was shuffled with reindex(), so its index labels are no longer in
# ascending order and label-based .loc[:k] / .loc[k:] slices would return
# arbitrary, overlapping subsets. Split by position instead.
# Lower n_train for a quick smoke test; the rows from n_train on are the test set.
n_train = 25000
X_train = df['review'].values[:n_train]
y_train = df['sentiment'].values[:n_train]

X_test = df['review'].values[n_train:]
y_test = df['sentiment'].values[n_train:]

import nltk
from nltk.corpus import stopwords
# Uncomment on the first run to fetch the stop-word corpus:
#nltk.download('stopwords')
# The corpus file id is lowercase; 'English' only resolves on
# case-insensitive filesystems.
stop = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer(text):
    """Split *text* on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens
def tokenizer_porter(text):
    """Split *text* on whitespace and return the Porter stem of each word."""
    return list(map(porter.stem, text.split()))

#### Set up the scikit-learn pipeline

In [None]:
# GridSearchCV lives in sklearn.model_selection; sklearn.grid_search was removed.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

count = CountVectorizer()
# strip_accents accepts 'ascii', 'unicode', a callable or None; None (not
# False) is the value that disables accent stripping.
tfidf = TfidfVectorizer(strip_accents=None, preprocessor=None, lowercase=False)

# Full search space: tf-idf features (first dict) and raw, un-normalised term
# counts (second dict), each crossed with stop-word handling, tokenizer,
# penalty and regularisation strength.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
]
# A single parameter combination, which is what the search below actually
# uses (presumably the winner of an earlier run over param_grid - confirm).
best_param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [None],
        'vect__tokenizer': [tokenizer],
        'clf__penalty': ['l2'],
        'clf__C': [10.0],
    }
]
# liblinear supports both the 'l1' and 'l2' penalties listed in the grids.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr_tfidf = Pipeline([('vect', tfidf), ('clf', lr)])

gs_lr_tfidf = GridSearchCV(lr_tfidf, best_param_grid, scoring='accuracy', n_jobs=1, cv=5, verbose=1)

#### Train the classifier

In [None]:
# Run the 5-fold cross-validated grid search on the training split.
gs_lr_tfidf.fit(X_train, y_train)

#### Choose the best estimator and run the classifier on the test set

In [None]:
# Take the best pipeline found by the grid search and score it on the
# held-out test split.
clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)

print("Test Accuracy: %0.3f" % test_accuracy)

# _Online Learning_ using Stochastic Gradient Descent

The above method is fine for a static, one-time training set. However, we would like to be able to fine-tune the classifier as new data becomes available, for example feedback provided by end users or additional training sets.

So for our final solution we use Stochastic Gradient Descent instead. It also happens to be way faster: on my laptop, training with 50,000 reviews took under a minute with SGD but took almost an hour with the best LR classifier.

Set up the tokenizer and stream the reviews in small batches

In [None]:
import numpy as np

import re
from nltk.corpus import stopwords
# NLTK's English stop-word list: tokenizer() below drops these words, and the
# list is pickled at the end of the notebook.
stop = stopwords.words('english')

def tokenizer(text):
    """Clean *text* and return its tokens with stop words removed.

    HTML-like tags are dropped, the text is lower-cased, every run of
    non-word characters becomes a single space, and emoticons found in the
    text (e.g. ``:)``, ``;-(``) are appended as separate tokens with their
    ``-`` nose removed. Tokens contained in the module-level ``stop`` list
    are filtered out.
    """
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # Explicit space before the emoticons: without it the first emoticon is
    # glued to the last word whenever the text ends in a word character.
    text = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from the CSV file at *path*.

    The first line is skipped as a header row. Every other line is split at
    its last comma: everything before it is returned as the text, exactly as
    stored in the file (any CSV quoting is left in place), and the part after
    it is converted to an ``int`` label. Blank lines are skipped.
    """
    # The file is written as UTF-8, so do not rely on the platform default.
    with open(path, 'r', encoding='utf-8') as csv_file:
        next(csv_file, None)  # skip the header; tolerate an empty file
        for line in csv_file:
            line = line.rstrip('\n')
            if not line:
                continue
            # Split on the last comma rather than fixed offsets, so a final
            # line without a trailing newline is parsed correctly too.
            text, _, label = line.rpartition(',')
            yield text, int(label)

def get_minibatch(doc_stream, size):
    """Pull up to *size* ``(text, label)`` pairs from *doc_stream*.

    Returns ``(docs, y)``, two parallel lists of texts and labels. If the
    stream runs out part-way through, the documents collected so far are
    returned as a shorter batch instead of being thrown away. ``(None, None)``
    is returned only when the stream is already exhausted and nothing could
    be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        if not docs:
            return None, None
    return docs, y

Set up the classifier

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer with 2**21 feature buckets; documents are tokenized by
# tokenizer() and it is only ever used through transform() below.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# Logistic-regression loss trained by SGD. 'log_loss' is the current name of
# the former 'log' loss (needs scikit-learn >= 1.1), and max_iter replaces
# the removed n_iter argument.
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)

doc_stream = stream_docs(path='./movie_data.csv')

import pyprind
num_batches = 45
pbar = pyprind.ProgBar(num_batches)
train_batch_size = 1000
test_batch_size = 5000

# All class labels must be passed to partial_fit explicitly.
classes = np.array([0, 1])

Run training batches

In [None]:
# Train incrementally: up to num_batches mini-batches of train_batch_size
# documents each, all pulled from the same doc_stream.
for _ in range(num_batches):
    X_train, y_train = get_minibatch(doc_stream, size=train_batch_size)
    # Stop early when get_minibatch returns nothing (None or an empty list).
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

# The next test_batch_size documents of the stream serve as the test set.
# NOTE(review): get_minibatch can return (None, None) if the stream is
# exhausted, in which case vect.transform below receives None.
X_test, y_test = get_minibatch(doc_stream, size=test_batch_size)
X_test = vect.transform(X_test)

print("Test Accuracy: %0.3f" % clf.score(X_test, y_test))

# After scoring, also train on the test batch so the final classifier has
# seen every document that was read.
clf = clf.partial_fit(X_test, y_test, classes=classes)

We have performed training on servers with large datasets. However, we want to run predictions in a web application 
(where users can interact), so we need the trained classifier on the web server. We achieve this by capturing the 
latest state of the relevant Python objects using Python "pickling" and copying them over to the web server.


In [None]:
import pickle, os

# Serialize the stop-word list and the trained classifier into
# movieclassifier/pickled_objects.
dest = os.path.join('movieclassifier', 'pickled_objects')
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(dest, exist_ok=True)
# Use context managers so each file is flushed and closed as soon as it has
# been written; the original left both handles open.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as fh:
    pickle.dump(stop, fh, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as fh:
    pickle.dump(clf, fh, protocol=4)