In [1]:
import numpy as np
import re
from nltk.corpus import stopwords

In [2]:
# English stop-word list from NLTK; tokenizer() drops every token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop = stopwords.words('english')

In [3]:
# Peek at the first five stop words.
stop[:5]

['i', 'me', 'my', 'myself', 'we']

### Defining the tokenizer

In [4]:
def tokenizer(text):
    """Clean a raw document and split it into tokens without stop words.

    HTML tags are removed, emoticons such as ``:)``, ``;-(`` or ``=D`` are
    collected, the text is lowercased with every run of non-word characters
    collapsed into a single space, and the emoticons (with the ``-`` "nose"
    removed) are appended after the words.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Words in order of appearance followed by the emoticons, excluding
        every token contained in the module-level ``stop`` collection.
    """
    # Raw strings keep `\)`, `\(` and `\W` as regex escapes instead of
    # invalid Python string escapes (which trigger a SyntaxWarning).
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # Explicit separator: without it, a text ending in a word character
    # would fuse its last word with the first emoticon (e.g. 'it:)').
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
  text = (re.sub('[\W]+', ' ', text.lower()) +' '.join(emoticons).replace('-', ''))


### Generator function 

#### `stream_docs` reads in and returns one document at a time

In [5]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one line at a time.

    The first line is treated as a header and skipped. Every other non-blank
    line is split at its last comma: the part before it is yielded unchanged
    as the text (including any surrounding quotes), the part after it is
    parsed as the integer label.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The document text and its label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator (a RuntimeError under PEP 479).
        next(handle, None)
        for line in handle:
            # Strip the line terminator instead of slicing fixed offsets, so
            # a final line without a trailing newline is parsed correctly.
            row = line.rstrip('\r\n')
            if not row:
                continue  # ignore blank lines
            text, _, label = row.rpartition(',')
            yield text, int(label)

In [6]:
# Sanity check: pull the first (text, label) pair from a fresh stream.
next(stream_docs(path='movie_data.csv'))

('"This picture\'s following will only grow as time goes by. Better than any of the best picture nominees in 97 and it rewards repeated viewings. I\'ve seen it three times now so I know. Anderson was compared to some of the great American directors (Altman, Scorcese, Tarantino) and he may have those influences but chances are, after a few more films, he\'ll be considered part of that short list himself.<br /><br />One last note: Julianne Moore\'s ""Amber Waves"" will resonate in the memory long after other 90\'s movie characters have faded. THE best performance of the year -in any of the four categories."',
 1)

In [7]:
def get_minibatch(doc_stream, size):
    """Collect up to ``size`` documents from a document stream.

    Parameters
    ----------
    doc_stream : iterator of (str, int)
        Iterator yielding ``(text, label)`` pairs, e.g. the generator
        returned by ``stream_docs``. It is advanced by this call.
    size : int
        Maximum number of documents to take from the stream.

    Returns
    -------
    tuple
        ``(docs, y)`` with the texts and their labels. If the stream runs out
        before ``size`` documents were read, the documents collected so far
        are returned rather than discarded. ``(None, None)`` is returned only
        when the stream is exhausted and not a single document could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                # Nothing left at all: keep the sentinel callers test for.
                return None, None
            break  # hand back the final, shorter batch
        docs.append(text)
        y.append(label)
    return docs, y


In [8]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer that tokenizes with the custom `tokenizer` function and
# maps tokens into 2**21 feature columns.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

# Linear classifier with logistic loss; the fixed seed keeps runs repeatable.
clf = SGDClassifier(
    loss='log_loss',
    random_state=1,
)

# One shared generator over the data set; the cells below consume it in order.
doc_stream = stream_docs(path='movie_data.csv')

### Out-of-core learning

In [9]:
import pyprind
# Progress bar sized to the number of mini-batches processed below.
pbar = pyprind.ProgBar(45)
# Label set handed to partial_fit on every call of the loop.
classes = np.array([0, 1])

# Incrementally train on the stream, one mini-batch at a time.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000) # at most 45 mini-batches of 1000 documents each
    if not X_train:
        # No documents came back: the stream ran out, so stop early.
        break
    # Replace the list of raw texts with its vectorized representation.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()



In [10]:
# Take the next 5000 documents from the same stream as a held-out test set;
# doc_stream has already advanced past everything read by earlier cells.
# NOTE(review): get_minibatch can return (None, None) once the stream is
# exhausted; that case is not handled here.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
# Report the classifier's score on the held-out batch, to three decimals.
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

Accuracy: 0.881


#### Finally, we can use the last 5,000 documents to update our model!

In [11]:
# Update the model with the held-out batch as well. `classes` is not passed
# here, unlike in the training loop (presumably only needed on the first
# partial_fit call — confirm against the scikit-learn docs).
clf.partial_fit(X_test,y_test)