In [1]:
# The goal is to process the same dataset in a more compute-friendly manner.
# These methods allow us to build tools that can be deployed online.
# We use an SGD classifier to learn from the data in small batches.
import numpy as np
import re
from nltk.corpus import stopwords

# Seed handed to SGDClassifier(random_state=...) further down the notebook.
rnd_seed = 1

In [2]:
# English stop words from NLTK; tokenizer() drops any token found in here.
# NOTE(review): tokenizer() only performs `in` checks against this object; if
# it is a plain list, wrapping it in a set would speed those lookups up —
# confirm nothing else relies on it being a list before changing it.
stop = stopwords.words('english')

def tokenizer(txt):
    """Split a raw review into lowercase tokens with stop words removed.

    Markup tags are stripped, emoticons such as ``:)`` or ``;-(`` are
    collected before punctuation is discarded, and they are appended after
    the words (with any ``-`` nose removed) as tokens of their own.

    Parameters
    ----------
    txt : str
        Raw document text.

    Returns
    -------
    list of str
        The words of the text followed by the emoticons, minus every token
        found in the module-level ``stop`` collection.
    """
    txt = re.sub(r'<[^>]*>', '', txt)  # drop markup such as <br />
    # Collect emoticons first: the \W+ substitution below would erase them.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', txt)
    words = re.sub(r'[\W]+', ' ', txt.lower())
    # The explicit ' ' separator keeps the first emoticon from being glued
    # onto the last word when the cleaned text ends in a word character.
    txt = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in txt.split() if w not in stop]  # filter stop words

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a two-column review CSV.

    The first line is treated as a header and skipped. Every other non-blank
    line is expected to end with ``,<digit>``: the final character is the
    integer label and everything before the separating comma is the review
    text (returned verbatim, including any surrounding quotes).

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The review text and its single-digit label.

    Raises
    ------
    ValueError
        If the last character of a non-blank line is not a digit.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # The default stops StopIteration from escaping the generator (which
        # Python converts into RuntimeError) when the file is empty.
        next(csv_file, None)
        for line in csv_file:
            # Strip the line ending first so the slicing below also works for
            # a final line that has no trailing newline.
            record = line.rstrip('\r\n')
            if not record:
                continue  # ignore blank lines
            yield record[:-2], int(record[-1])

# Smoke test: pull the first (text, label) pair from a fresh stream
next(stream_docs(path='movie_data.csv'))

('"I was taken to this film by a friend and was sceptical about a Swedish film with subtitles. However, I thoroughly enjoyed every minute of this beautiful film. The unnecessary cruelty that man is capable of was portrayed confidently without overwhelming images - although animal lovers may have to shield their eyes for a brief couple of seconds somewhere during the first 10 minutes. A traditional story of humility versus brutality and hope versus tragedy was illustrated from a satisfyingly fresh angle using a spectrum of characters with very natural flaws and features. I particularly liked how the film managed to address multiple aspects of hypocritical human behaviour that concern bias, discrimination and sanctimonious pretence. An absolute gem of a film that I will promote to all who will listen."',
 1)

In [3]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents and their labels off ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (str, int)
        Iterator yielding ``(text, label)`` pairs; it is advanced in place.
    size : int
        Maximum number of documents to read.

    Returns
    -------
    tuple
        ``(docs, labels)`` as two parallel lists. If the stream runs dry
        part-way through, the documents read so far are returned as a
        shorter batch rather than being thrown away. ``(None, None)`` is
        returned only when the stream was already exhausted and nothing
        could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            break  # keep whatever has been collected so far
        docs.append(text)
        y.append(label)
    # An empty result for a positive request means the stream had nothing left.
    if not docs and size > 0:
        return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# HashingVectorizer turns a collection of raw documents into a sparse feature
# matrix with a fixed number of columns (n_features). It plays the same role
# as CountVectorizer, and this notebook only ever calls transform() on it.
vect = HashingVectorizer(
    tokenizer=tokenizer,
    preprocessor=None,
    ngram_range=(1, 1),
    n_features=2**21,
    norm='l2',
    decode_error='ignore',
)

# One shared stream: the training and evaluation cells both consume from it.
doc_stream = stream_docs(path='movie_data.csv')

# Logistic-regression loss, trained incrementally via partial_fit().
clf = SGDClassifier(random_state=rnd_seed, loss='log_loss')

In [5]:
from tqdm import tqdm

n_batches = 45
batch_size = 1000

# partial_fit() is given the complete label set on every call, so a batch
# that happens to miss one class cannot confuse the classifier.
classes = np.array([0, 1])

# Out-of-core training: vectorize and learn from one mini-batch at a time.
# The context manager closes the progress bar even when the loop stops early.
with tqdm(total=n_batches) as pbar:
    for _ in range(n_batches):
        X_train, y_train = get_minibatch(doc_stream=doc_stream, size=batch_size)
        if not X_train:
            break  # stream exhausted before n_batches were read
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update(1)

100%|██████████| 45/45 [00:18<00:00,  2.42it/s]

In [6]:
# Whatever the training loop left in doc_stream is the hold-out set; the
# dataset is taken to contain 50000 reviews in total.
test_size = 50000 - (n_batches * batch_size)
X_test, y_test = get_minibatch(doc_stream, size=test_size)
if not X_test:
    # Fail with a readable message instead of an obscure error from
    # vect.transform() when the stream has nothing left to give.
    raise ValueError(
        f"No held-out documents available: doc_stream could not supply "
        f"the requested {test_size} reviews."
    )
X_test = vect.transform(X_test)
print(f"Test accuracy: {clf.score(X_test, y_test):.3f}")

Test accuracy: 0.865


This approach was slightly weaker, at just over 86% accuracy. However, we did not apply stemming, grid search, or any other real optimization; this was a simple mini-batch update. In exchange for a few percentage points of accuracy, the entire notebook ran in under a minute, in stark contrast to sentiment_analysis.ipynb.