In [2]:
import numpy as np
import pandas as pd
import pyprind, os, re, nltk, pickle,sqlite3
from tqdm import tqdm

# Preparing the dataset

In [None]:
pbar = pyprind.ProgBar(50000)  #50000 number of documents to be read in 
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()

In [None]:
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = r'C:\Users\Samuel\Desktop\sentiment_ananalysis\sentiment_analysis\data\external\aclImdb\%s\%s'%(s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding="utf8") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()     
df.columns = ['review', 'sentiment']

In [None]:
#randomize the dataframe and save as movie dataset
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
output_path = r'C:\Users\Samuel\Desktop\sentiment_ananalysis\sentiment_analysis\data\processed\movie_data.csv'
df.to_csv(path, index=False)

# Preprocessing 

1. We create a vocabulary of unique tokens—for example, words—from the entire set of documents.

2. We construct a feature vector from each document that contains the counts of how often each word occurs in the particular document.

In [3]:
#cleaning text data
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) #remove all html markup
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) #keep emoticon character
    text = re.sub('[\W]+', ' ', text.lower())+' '.join(emoticons).replace('-', '') #remove non-word character, convert to lower case 
    return text

#apply preprocessor function to the movie dataset
tqdm.pandas()
df = pd.read_csv(r'C:\Users\Samuel\Desktop\sentiment_ananalysis\sentiment_analysis\data\processed\movie_data.csv')
df['review'] = df['review'].progress_apply(preprocessor)

100%|██████████████████████████████████████████████████████████████████████████| 50296/50296 [00:07<00:00, 6598.99it/s]


In [4]:
#Process document into tokens
def tokenizer(text):
    return text.split()

#word stemming; transfomring word to its root form
#Note: In order to install the NLTK, you can simply execute pip install nltk
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

#for example
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [5]:
#stop word removal e.g. is, has etc.(there are about 127 stopwords in NLTK)
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

#example
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samuel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

# Training logistic regression model and Using Grid Search

# Note: Do not run this section, the Grid Search might take time!

In [None]:
#25000 training and 25000 test sets
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

Next we will use a GridSearchCV object to find the optimal set of parameters for our logistic regression model using 5-fold stratified cross-validation:

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)

param_grid = [{'vect__ngram_range': [(1,1)],'vect__stop_words': [stop, None], 'vect__tokenizer': [tokenizer,tokenizer_porter],
'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}, {'vect__ngram_range': [(1,1)], 'vect__stop_words': [stop, None],
'vect__tokenizer': [tokenizer,tokenizer_porter],'vect__use_idf':[False], 'vect__norm':[None],'clf__penalty': ['l1', 'l2'],
'clf__C': [1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf), ('clf',LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1,n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

In [None]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

In [None]:
print('CV Accuracy: %.3f'% gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f'% clf.score(X_test, y_test))

# Working with bigger data – Online algorithms and out-of-core learning

Grid search can be computationally expensive, instead we will apply the out-of-core learning.

Instead, we will make use the partial_fit function of the SGDClassifier in scikit-learn to stream the documents directly from our local drive and train a logistic regression model using small minibatches of documents

In [6]:
#clean and tokenize
from nltk.corpus import stopwords
stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower())
    text = re.sub('[\W]+', ' ', text.lower())+ ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


#Next we define a generator function, stream_docs, that reads in and returns one document at a time:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

#verify if the stream_docs function work 
output_path = r'C:\Users\Samuel\Desktop\sentiment_ananalysis\sentiment_analysis\data\processed\movie_data.csv'
next(stream_docs(path=output_path))

('"An insane assault on viewers senses. This is a mish-mash of assorted Hindi and English movies - poorly done. The name carries over from a 70s\' multi star cast, which the 2002 version also boasts of. The story is taken from the 70s\' Sunil Dutt/Reena Roy starrer - ""Nagin"" and visual effects taken (a horrible attempt) from The Matrix, Terminator 2 and Mission Impossible II.<br /><br />Set in a college environment (Sunil Shetty, Akshaye Kumar, Manisha - college kids!!!???!!), Manisha Koirala is the victim, who mistakes a fatal assault on her by two students as a collective effort on the part of our heros. As it turns out Manisha is a Cobra (Nag) snake reborn as a girl in this life and her mate from the previous life, now a super powerful-all-and-any-shape-assuming (Ichadhari Nag) - Munish Kohli, is out looking for her in this life. Manisha appeals to him to avenge her violation and murder.<br /><br />So begins the mad killing spree, where the avenging lover starts singling each male

We will now define a function, get_minibatch, that will take a document stream from the stream_docs function and return a particular number of documents specified by the size parameter:

In [7]:
#take a document stream from the stream_docs function and return a particular 
##number of documents specified by the size parameter
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [8]:
#using hashing vectorizer for text processing
#Note CountVectorizer is not used because it requires holding all vocabulary in memory
#TfidfVectorizer is not used because it requires keeping all feature vectors of the
#training data in memory to calculate inverse document frequencies.

#We will use Hashingvectorizer, this is data independent
#uisng SGD classifier algorithm
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None,tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path=output_path)

In [9]:
#incremental training of 45000 documents
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:25


In [10]:
#evaluate performance of the model with 5000 documents left
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

#update the model with the prediction 
clf = clf.partial_fit(X_test, y_test)

Accuracy: 0.870




# Embedding a Machine Learning Model into a Web Application

In this section you will learn the following:

• Saving the current state of a trained machine learning model

• Using SQLite databases for data storage

• Developing a web application using the popular Flask web framework

• Deploying a machine learning application to a public web server

In [11]:
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
    pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'),'wb'), protocol=4)
    pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'),protocol=4)

Next create a new Python script for the Hashing vectorizer which will always be imported to the python session...save this
scipt as vectorizer.py

In [None]:
#quick check if the pickled files and vectorizer.py are working
from movieclassifier.vectorizer import vect
clf = pickle.load(open(os.path.join('./movieclassifier/pkl_objects','classifier.pkl'), 'rb'))

label = {0:'negative', 1:'positive'}
example = ['the movie is fair']
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%' %(label[clf.predict(X)[0]],np.max(clf.predict_proba(X))*100))

# Set up an SQLite Database

SQLite is an open source SQL database engine that doesn't require a separate server to operate, which makes it ideal for smaller projects and simple web applications...

In [12]:
#create an sqlite database using the sqlite3 API in Python
conn = sqlite3.connect('./movieclassifier/reviews.sqlites')

c = conn.cursor()
c.execute('CREATE TABLE review_db' ' (review TEXT, sentiment INTEGER, date TEXT)')
example1 = 'I love this movie'
c.execute("INSERT INTO review_db" " (review, sentiment, date) VALUES" " (?, ?, DATETIME('now'))", (example1, 1))
example2 = 'I disliked this movie'
c.execute("INSERT INTO review_db" " (review, sentiment, date) VALUES" " (?, ?, DATETIME('now'))", (example2, 0))
conn.commit()
conn.close()

In [13]:
#check if the entries has been stored
conn = sqlite3.connect('./movieclassifier/reviews.sqlites')
c = conn.cursor()
c.execute("SELECT * FROM review_db WHERE date" " BETWEEN '2015-01-01 00:00:00' AND DATETIME('now')")
results = c.fetchall()
conn.close()
print(results)

[('I love this movie', 1, '2019-11-20 15:44:54'), ('I disliked this movie', 0, '2019-11-20 15:44:54')]


# Developing a Web Application with Flask

1. install flask library in your python environment by executing: pip install flask

2. You will need to install WTForms (pip install wtforms) library in order to collect data (i.e. movie reviews) from users.

3. flask_app.py contain the main code that will be executed by the Python interpreter to run the Flask web application

4. templates directory is the directory in which Flask will look for static HTML files for rendering in the web browser

5. static directory has the CSS code for styling. 

6. You can improve the CSS and HTML file.



# Deploy the Web Application to a Public Server

For this tutorial, we will be using the PythonAnywhere web hosting service, which specializes in the hosting of Python web applications and makes it extremely simple and hassle-free.

To create an account:
    
1. https://www.pythonanywhere.com

2. Click on: Create a beginner account and create an account
    
3. We cannot ssh to the pythonanywhere server (for beginner account) so we have to use the Pythonanywhere interface to manage the web
    application.

4. Next click on the DAshboard button  in the top right corner, then click on web app and add a new web app.

5. Select the Flask framework and specify the python version.

6. Go to files on the interface and load the movieclassifier files from your local machine.

7. Go to Web and reload <username.pythonanywhere.com> button to propagate the changes and refresh the web application. 
Finally, the web app should now be up and running and publicly available via the address <username.pythonanywhere.com>