<h3>Working with bigger data using SGD - Online algorithms and out-of-core learning</h3>
<br>
<b>NOTE: This is being used to develop the web application as well</b>

In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
import pyprind
import pickle
import os

In [2]:
'''
Use the partial_fit method of SGDClassifier to stream documents directly
from the local drive and train a logistic regression model on small
mini-batches of documents.
'''
stop = stopwords.words('english')
def tokenizer(text):
    """Turn a raw review string into a list of lowercase tokens.

    HTML-style tags are removed, emoticons such as ``:)``, ``;-(`` or ``=D``
    are collected before punctuation is discarded, every run of non-word
    characters is collapsed to a single space, and the emoticons (with any
    ``-`` nose dropped) are appended to the end of the lowercased text.
    Tokens found in the module-level ``stop`` collection are filtered out.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, with the emoticons appended at the end.
    """
    # Raw strings stop the regex backslashes from being parsed as (invalid)
    # string escapes, which recent Python versions warn about.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # The emoticon string is appended without a separator, exactly as before,
    # so the produced tokens (and therefore the hashed features) are unchanged.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')

    return [w for w in text.split() if w not in stop]

In [3]:
#sample generator function
def generator():
    """Toy example of a generator: yields 1, 2 and 3 in that order."""
    yield from (1, 2, 3)

In [4]:
#generator function that reads in one document at a time
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one row at a time.

    The first line is treated as a header and skipped.  Every remaining
    non-empty line is expected to end with a comma followed by a single-digit
    integer label; everything before that comma is returned as the text.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The document text and its integer label.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # The default keeps StopIteration from escaping the generator (which
        # Python converts into RuntimeError) when the file is completely empty.
        next(csv_file, None)
        for line in csv_file:
            # Drop the line terminator first so that a final row without a
            # trailing newline is sliced the same way as every other row.
            row = line.rstrip('\r\n')
            if not row:
                continue  # ignore blank lines instead of failing on them
            yield row[:-2], int(row[-1])

In [5]:
# next(stream_docs(path='movie_data.csv'))

In [6]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; advanced with ``next``.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, labels)`` as two parallel lists.  If the stream ends before
        ``size`` documents were read, the documents collected so far are
        returned; ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a short final batch instead of silently throwing it away.
        if not docs:
            return None, None

    return docs, y

In [7]:
# Hashing vectorizer: tokens produced by `tokenizer` are hashed into
# 2**21 feature columns.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# Logistic regression trained with SGD.  The logistic loss is named
# 'log_loss' since scikit-learn 1.1; the old alias 'log' was removed in 1.3.
# (On scikit-learn < 1.1, use loss='log' instead.)
clf = SGDClassifier(loss='log_loss', random_state=1, n_jobs=-1)

# Generator over the documents; the following cells consume it incrementally.
doc_stream = stream_docs(path='movie_data.csv')

In [8]:
# 45 mini-batches of 1000 documents each are fed to partial_fit;
# the remaining 5 batches' worth of documents are kept for the test set.
n_train_batches = 45
batch_size = 1000

pbar = pyprind.ProgBar(n_train_batches)
classes = np.array([0, 1])  # label set passed to every partial_fit call

for _ in range(n_train_batches):
    x_train, y_train = get_minibatch(doc_stream, size=batch_size)
    if not x_train:
        # nothing came back from the stream, so stop training early
        break

    x_train = vect.transform(x_train)
    clf.partial_fit(x_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:28


In [9]:
# Draw the next 5000 documents from the stream as the held-out test set and
# hash them with the same vectorizer that was used for training.
x_test, y_test = get_minibatch(doc_stream, size=5000)
x_test = vect.transform(x_test)

# NOTE(review): x_train / y_train are whatever the training loop left behind,
# i.e. its last mini-batch, so "Training accuracy" is measured on that single
# batch rather than on all training documents.
print(f'Training accuracy: {clf.score(x_train, y_train)}')
print(f'Test accuracy: {clf.score(x_test, y_test)}')

Training accuracy: 0.888
Test accuracy: 0.8682


In [10]:
# Directory that will hold the serialized objects.
dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok=True creates the directory tree when it is missing and does nothing
# when it is already there, without a separate exists() check that could race.
os.makedirs(dest, exist_ok=True)

In [11]:
# Files are opened in binary mode ('wb') for pickle; the `with` blocks make
# sure each file is flushed and closed as soon as its dump has finished.
# HashingVectorizer need not be pickled, as discussed before.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)