# Web Application

## Recap - Training a model for movie review classification

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stop-word list from NLTK; tokenizer() below drops every token found in it.
stop = stopwords.words('english')
# NOTE(review): `porter` is not referenced by any cell visible in this section.
porter = PorterStemmer()

def tokenizer(text):
    """Clean a raw review and split it into stop-word-free tokens.

    HTML-like tags are removed, the text is lower-cased, every run of
    non-word characters is collapsed into a single space, and the emoticons
    found in the text are appended with their '-' nose stripped.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Whitespace-separated tokens that do not appear in the module-level
        ``stop`` list.
    """
    # Raw string literals: the regex escapes must reach `re` untouched instead
    # of being parsed (and warned about) as invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # NOTE(review): the search runs on the lower-cased text, so the upper-case
    # 'D' / 'P' alternatives can never match (':D' has already become ':d').
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    # NOTE(review): nothing separates the cleaned text from the emoticons, so
    # when the cleaned text ends in a word character the first emoticon is
    # glued onto the last word. Kept unchanged so the tokens stay identical to
    # the copy of this function written to vectorizer.py.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The first line is treated as a header and skipped. Every following
    non-blank line is split at its last comma: everything before it is the
    review text, the value after it is parsed as the integer label.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw review text and its label.

    Raises
    ------
    ValueError
        If the field after the last comma is not an integer.
    """
    with open(path, 'r') as handle:
        # Default of None: an empty file simply ends the generator instead of
        # letting StopIteration escape from the generator body.
        next(handle, None)
        for line in handle:
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines, e.g. a trailing one
            # Split on the last comma rather than slicing fixed offsets, so a
            # final line without a trailing newline is still parsed correctly.
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [None]:
# Pull the first (text, label) pair from a fresh stream as a quick sanity check.
next(stream_docs(path='../movie_data.csv'))

In [None]:
def get_minibatch(doc_stream, size):
    """Collect up to ``size`` documents and labels from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (text, label)
        Stream to consume; it is advanced by at most ``size`` items.
    size : int
        Maximum number of documents to place in the batch.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. If the stream runs out part-way
        through, the documents gathered so far are returned as a shorter
        batch. ``(None, None)`` is returned only when the stream was already
        exhausted and not a single document could be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a partially filled final batch instead of throwing it away;
        # only signal "no data" when nothing at all was collected.
        if not docs:
            return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer: tokens produced by tokenizer() are hashed into one of
# 2**21 feature columns, so transform() can be called without a prior fit.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# Logistic-regression classifier that is trained incrementally with
# partial_fit. `n_iter` is intentionally not passed: the keyword was removed
# from scikit-learn, and partial_fit performs a single pass per call anyway.
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
# 'log' is kept for compatibility with older versions -- confirm the target.
clf = SGDClassifier(loss='log', random_state=1)
doc_stream = stream_docs(path='../movie_data.csv')

In [None]:
import pyprind
# Progress bar sized to the number of mini-batches processed below.
pbar = pyprind.ProgBar(45)

# Both class labels are handed to every partial_fit call via `classes`.
classes = np.array([0, 1])
# Out-of-core training: up to 45 mini-batches of 1000 documents each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    # A falsy batch means the stream had nothing more to give.
    if not X_train:
        break
    # Raw texts -> hashed feature matrix, then one incremental update.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
# Evaluate on the next 5000 documents from the same stream, i.e. documents
# the training loop above has not consumed.
# NOTE(review): if get_minibatch returns (None, None) here, vect.transform
# will fail -- confirm the stream still holds enough documents.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
# Now that accuracy has been reported, fold the evaluation batch into the model too.
clf = clf.partial_fit(X_test, y_test)

# Serializing fitted scikit-learn estimators

In [None]:
import pickle
import os

In [None]:
# Target directory for the serialized objects used by the web application.
dest = os.path.join('movie_classifier', 'pkl_objects')

if not os.path.exists(dest):
    os.makedirs(dest)

# Context managers flush and close each file deterministically instead of
# leaving the handles open until garbage collection.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as fh:
    pickle.dump(stop, fh, protocol=2)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as fh:
    pickle.dump(clf, fh, protocol=2)

In [None]:
%%writefile movie_classifier/vectorizer.py
from sklearn.feature_extraction.text import HashingVectorizer
import re 
import os
import pickle

# gives the directory of vectorizer
cur_dir = os.path.dirname(__file__)
# since the pkl objects parent dir is the same with vectorizer
# cur_dir works perfectly
stop = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \
                   + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

After executing the preceding code cells, we can now restart the IPython notebook kernel to check if the objects were serialized correctly.

In [None]:
import os
# Guarded so that re-running this cell is harmless: once we are inside
# movie_classifier/, a second relative chdir would fail.
if os.path.basename(os.getcwd()) != 'movie_classifier':
    os.chdir('movie_classifier/')

In [None]:
import pickle
import re
import os
from vectorizer import vect

# Context manager closes the file as soon as the estimator has been restored.
with open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb') as fh:
    clf = pickle.load(fh)

In [None]:
# Display the unpickled estimator as a quick check that it was restored.
clf

In [None]:
import numpy as np

In [None]:
# Maps the classifier's integer output to a human-readable sentiment.
label = {0:'negative', 1:'positive'}

example = ['I love this movie']
X = vect.transform(example)
# print() with one parenthesized argument behaves the same on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print('Prediction: %s\nProbability: %.2f%%'
      % (label[clf.predict(X)[0]], clf.predict_proba(X).max() * 100))

<br>
# Setting up a SQLite database for data storage

In [None]:
import sqlite3
import os

# Start from a clean slate so the cell can be re-run: remove any database
# file left behind by a previous execution.
if os.path.exists('reviews.sqlite'):
    os.remove('reviews.sqlite')

example1 = 'I love this movie'
example2 = 'I disliked this movie'

conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()
    c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')
    # One parameterized statement for both sample rows; the ? placeholders
    # keep the review text out of the SQL string itself.
    c.executemany(
        "INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))",
        [(example1, 1), (example2, 0)])
    conn.commit()
finally:
    # Release the database file even if one of the statements above fails.
    conn.close()

In [None]:
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()
    # `date` is a TEXT column filled by DATETIME('now'), so BETWEEN compares
    # the timestamps as strings against the fixed lower bound.
    c.execute("SELECT * FROM review_db WHERE date BETWEEN '2017-07-13 15:15' AND DATETIME('now')")
    results = c.fetchall()
finally:
    # Close the connection even when the query raises.
    conn.close()

In [None]:
# Function-call form works on both Python 2 and Python 3.
print(results)

# Developing a web application with Flask

## Turning the movie classifier into a web application