In [1]:
import tarfile
# Unpack the IMDb archive into the current working directory.
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    # Interpreters that ship extraction filters expose tarfile.data_filter.
    # The 'data' filter refuses members that would be written outside the
    # destination directory, and passing a filter explicitly avoids the
    # deprecation warning raised by a bare extractall() call.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        # older interpreters: no filter support, keep the original behaviour
        tar.extractall()

  tar.extractall()


In [1]:
import pyprind
import pandas as pd
import os
# change the 'basepath' to the directory of the
# unzipped movie dataset
basepath = 'aclImdb'
labels = {'pos': 1, 'neg': 0}
# the progress bar that shows when running (one tick per review file)
pbar = pyprind.ProgBar(50000)
# Read the test and train subdirectories and collect one (review, sentiment)
# record per file; 1 = positive, 0 = negative.
# The records are gathered in a plain list and turned into a DataFrame once at
# the end: concatenating onto the DataFrame inside the loop copies every
# previously loaded row on each iteration, which makes loading quadratic.
records = []
for subset in ('test', 'train'):
    for label_dir in ('pos', 'neg'):
        path = os.path.join(basepath, subset, label_dir)
        # sorted() keeps the row order independent of the directory listing order
        for filename in sorted(os.listdir(path)):
            with open(os.path.join(path, filename), 'r', encoding='utf-8') as infile:
                records.append((infile.read(), labels[label_dir]))
            pbar.update()
df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:09:23


In [2]:
import numpy as np
# the rows were loaded grouped by class, so shuffle them; seeding NumPy's
# global generator keeps the permutation reproducible
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
# write the shuffled reviews to disk (without the index column)
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

# read the file back to confirm the round trip and preview the first rows
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [3]:
# checking that the dataset has all 50,000 rows (and the two expected columns)
df.shape

(50000, 2)

In [4]:
# display the last 50 characters from the first document; the raw text still
# contains HTML markup and punctuation at this point
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [5]:
# remove punctuation and HTML markup while keeping emoticons
import re
def preprocessor(text):
    """Return ``text`` lowercased and stripped of HTML tags and punctuation.

    Emoticons such as ``:)``, ``:-(`` or ``;D`` are collected before the
    punctuation is removed and appended to the end of the cleaned text
    (with the ``-`` "nose" dropped), separated from the words by a space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the space-separated emoticons.
    """
    # remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)
    # find emoticons on the original casing (the pattern needs capital D / P)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # collapse every run of non-word characters into a single space
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # When the cleaned text ends in a word character, appending the emoticons
    # directly would fuse the first one onto the last word ("movie:)"), so add
    # the missing separator in that case only.
    if emoticon_suffix and cleaned and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [6]:
# testing the preprocessor function on the tail of the first review and on a
# short string that mixes an HTML tag, punctuation and emoticons
print(preprocessor(df.loc[0, 'review'][-50:]))
print(preprocessor("</a>This :) is :( a test :-)!"))

is seven title brazil not available
this is a test :) :( :)


In [7]:
# apply preprocessor function to every review in the DataFrame
# NOTE: this overwrites the 'review' column, so the raw text is replaced in df
df['review'] = df['review'].apply(preprocessor)

In [8]:
# tokenize the document by splitting words at their whitespaces
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    return text.split()
# testing the tokenizer function
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [9]:
# Porter stemming maps each word to its stem (e.g. 'running' -> 'run')
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems
# quick check of the stemming tokenizer on a sample sentence
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [10]:
# downloading NLTK's stop-word corpus so that stop-words can be removed later
# (the download is skipped when the package is already up to date)
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phoeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# stop-words are very common words that carry little meaning on their own
from nltk.corpus import stopwords
stop = stopwords.words('english')

# example: stem a sentence, keep at most its last ten tokens and drop the
# ones that appear in the stop-word list
sample_tokens = tokenizer_porter('a runner likes running and runs a lot')[-10:]
[w for w in sample_tokens if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [12]:
# dividing into train (first 25,000 rows) & test (remaining 25,000 rows) data
# Positional slicing is used on purpose: a label slice such as
# df.loc[:25000, ...] includes BOTH endpoints, which would put 25,001 rows in
# the training set and leave row 25000 in the training and the test set.
n_train = 25000
X_train = df['review'].values[:n_train]
y_train = df['sentiment'].values[:n_train]
X_test = df['review'].values[n_train:]
y_test = df['sentiment'].values[n_train:]

In [None]:
#  find the optimal set of parameters for our logistic regression
#  model using 5-fold stratified cross-validation

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# lowercasing and preprocessing are switched off here because the reviews
# were already cleaned by preprocessor()
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# settings explored by both grids
shared_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}
param_grid = [
    # grid 1: vectorizer defaults for idf weighting and normalisation
    dict(shared_grid),
    # grid 2: same settings on raw term frequencies (no idf, no normalisation)
    {**shared_grid, 'vect__use_idf': [False], 'vect__norm': [None]},
]

logreg = LogisticRegression(random_state=0, solver='liblinear')
lr_tfidf = Pipeline([('vect', tfidf), ('clf', logreg)])
gs_lr_tfidf = GridSearchCV(lr_tfidf,
                           param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
# report the hyper-parameter combination selected by the grid search
best_params = gs_lr_tfidf.best_params_
print('Best parameter set: %s ' % best_params)

In [None]:
# mean cross-validated accuracy of the best parameter setting (training data)
cv_accuracy = gs_lr_tfidf.best_score_
print('CV Accuracy: %.3f' % cv_accuracy)
# accuracy of the best pipeline on the held-out test data
clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print('Test Accuracy: %.3f' % test_accuracy)

## **Out-Of-Core Learning**

In [16]:
# tokenizer that also removes stop-words
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    """Clean raw review text and return its tokens without stop-words.

    HTML tags are removed, emoticons such as ``:)`` or ``:-D`` are collected
    (with the ``-`` dropped), the remaining text is lowercased and stripped of
    punctuation, and every token found in the module-level ``stop`` list is
    discarded. This definition replaces the whitespace-only ``tokenizer``
    defined earlier in the notebook.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Word tokens followed by the emoticons found in ``text``.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original casing: the pattern only lists capital "D" and "P",
    # so looking in lowercased text could never find ":D" or ":P".
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # keep the first emoticon from being fused onto the last word
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    tokenized = [w for w in (cleaned + emoticon_suffix).split() if w not in stop]
    return tokenized

In [17]:
# generator function that reads in and returns one document at a time
def stream_docs(path):
    """Yield ``(text, label)`` pairs from a CSV file, one line at a time.

    The first line is treated as the header and skipped. Every following line
    is split at its LAST comma: everything before it is yielded unchanged as
    the text (CSV quoting is left in place) and the part after it is converted
    to ``int``. Blank lines are skipped.

    Assumes every record occupies exactly one physical line; a quoted field
    containing a line break would be split across records.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The review text and its integer label.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # skip header; the default keeps an empty file from raising
        # StopIteration inside the generator
        next(csv_file, None)
        for line in csv_file:
            # strip only the line terminator so that a final line without a
            # trailing newline is parsed like every other line
            record = line.rstrip('\r\n')
            if not record:
                continue
            text, _, label = record.rpartition(',')
            yield text, int(label)

In [18]:
# testing the stream_docs() function: fetch the first (text, label) pair
# from movie_data.csv
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [19]:
# function that will take a document stream from the stream_docs() function
# and return a particular number of documents specified by the size parameter
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (text, label)
        Iterator such as the one returned by ``stream_docs``.
    size : int
        Maximum number of documents to return.

    Returns
    -------
    (list, list) or (None, None)
        The texts and their labels. When the stream runs out part-way through
        a batch, the documents read so far are returned as a smaller batch
        instead of being thrown away. ``(None, None)`` is returned only when
        the stream was already exhausted and no document could be read.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    if exhausted and not docs:
        return None, None
    return docs, y

In [20]:
# setting up out-of-core learning: a HashingVectorizer (it hashes tokens into a
# fixed number of features, so there is no vocabulary to fit and mini-batches
# can be transformed independently) and an SGD classifier with logistic loss
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)
clf = SGDClassifier(loss='log_loss', random_state=1)
# generator over the (text, label) rows stored in movie_data.csv
doc_stream = stream_docs(path='movie_data.csv')

In [21]:
# out-of-core learning: train on 45 mini-batches of 1,000 documents each
import pyprind
n_batches = 45
batch_size = 1000
pbar = pyprind.ProgBar(n_batches)
# partial_fit is given the full set of class labels on every call
classes = np.array([0, 1])
for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=batch_size)
    # stop early once the document stream has nothing left to offer
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:19


In [22]:
# using the last 5,000 documents to evaluate the model
# (doc_stream continues from where the training loop stopped reading)
X_test, y_test = get_minibatch(doc_stream, size=5000)
# hash the raw texts into the same feature space used for training
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.868


In [23]:
# using the last 5,000 documents to update our model
# (X_test was already vectorized when the model was evaluated)
clf = clf.partial_fit(X_test, y_test)

## **Latent Dirichlet Allocation (LDA)**


LDA is a generative probabilistic model that tries to find groups of words that
appear frequently together across different documents. These groups of frequently
co-occurring words represent our topics, under the assumption that each document
is a mixture of different topics and each topic is a distribution over words.

In [24]:
# loading the dataset into DF
# (movie_data.csv was written before preprocessor() was applied, so these are
# the raw, uncleaned reviews)
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [25]:
# creating bag-of-words matrix as input to the LDA
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',
                        max_df=.1,              # setting the max doc frequency to 10 percent
                        max_features=5000)
# X holds the word counts of every review in df
X = count.fit_transform(df['review'].values)

In [26]:
# fitting the LDA with 10 topics to the bag-of-words matrix
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
# X_topics is the transformed matrix; its columns are indexed per topic below
X_topics = lda.fit_transform(X)

In [27]:
# accessing the components of the LDA (matrix containing word importance):
# one row per topic and one column per vocabulary word
lda.components_.shape

(10, 5000)

In [28]:
# printing the 5 most important words for each of the 10 topics,
# most important word first
n_top_words = 5
feature_names = count.get_feature_names_out()
for topic_number, word_weights in enumerate(lda.components_, start=1):
    # argsort is ascending, so reverse it and keep the leading entries
    top_word_indices = word_weights.argsort()[::-1][:n_top_words]
    print("Topic %d:" % topic_number)
    print(" ".join(feature_names[i] for i in top_word_indices))

Topic 1:
horror effects budget special gore
Topic 2:
guy worst money minutes stupid
Topic 3:
version action japanese english match
Topic 4:
book audience human feel documentary
Topic 5:
series tv episode shows episodes
Topic 6:
family woman father mother girl
Topic 7:
music musical role performance song
Topic 8:
war police men murder action
Topic 9:
script comedy role actor performance
Topic 10:
comedy original action watched fan


In [29]:
# sanity-check topic 6 (column index 5), which the top words suggest is about
# family movies: order the reviews by their weight for that topic, highest
# first, and print the first 300 characters of the three top-ranked reviews
family = X_topics[:, 5].argsort()[::-1]
for rank, movie_idx in enumerate(family[:3], start=1):
    print('\nFamily movie #%d:' % rank)
    print(df['review'][movie_idx][:300], '...')


Family movie #1:
I don't know whether this film hits my heart the way it does because of the feelings of friendship, love, closeness to others or the warmth of that transformation Babette's cooking creates, but when the feast starts and for the rest of the movie, I choke up often. <br /><br />Yes, this is a feel-goo ...

Family movie #2:
The morbid Catholic writer Gerard Reve (Jeroen Krabbé) that is homosexual, alcoholic and has frequent visions of death is invited to give a lecture in the literature club of Vlissingen. While in the railway station in Amsterdam, he feels a non-corresponded attraction to a handsome man that embarks i ...

Family movie #3:
This was just another marvelous film of the Berlin Festival. But unlike "Yes", by Sally Potter, which I had seen some days before, where after leaving the cinema I felt a strong desire of wishing to embrace the whole world and was just happy to be alive, this time quite the opposite thing happened:  ...


## **PICKLE METHOD**

In [34]:
import pickle
import os
# creating a movieclassifier directory where we will later store the files and data for our web application.
# creating a pkl_objects subdirectory to save the serialized Python objects to our local hard drive or solid-state drive.
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
# serializing the trained classifier (clf) as well as the stop-word list from the Natural Language Toolkit (NLTK) library, so that
# we don't have to install the NLTK vocabulary on our server.
# Each file is opened in a with-block so that it is flushed and closed as soon
# as the dump finishes, instead of leaving the handle open.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)

In [35]:
import sqlite3
import os

# create a connection (conn) to an SQLite database file by calling the connect method of the sqlite3 library; the relative
# path means reviews.sqlite is created in the current working directory if it doesn't already exist.
conn = sqlite3.connect('reviews.sqlite')
# try/finally guarantees that the connection is closed even if one of the statements below raises
try:
    # the cursor lets us run SQL statements against the database
    c = conn.cursor()
    # (re)create the review_db table with three columns: review, sentiment, and date;
    # dropping it first keeps re-runs of this cell from failing or duplicating rows
    c.execute('DROP TABLE IF EXISTS review_db')
    c.execute('CREATE TABLE review_db'
              ' (review TEXT, sentiment INTEGER, date TEXT)')
    # store two example movie reviews and their class labels (sentiments)
    example1 = 'I love this movie'
    example2 = 'I disliked this movie'
    for review, sentiment in ((example1, 1), (example2, 0)):
        # values are bound through ? placeholders; DATETIME('now') adds the timestamp
        c.execute("INSERT INTO review_db"
                  " (review, sentiment, date) VALUES"
                  " (?, ?, DATETIME('now'))", (review, sentiment))
    # save the changes that we made to the database
    conn.commit()
finally:
    # closing the connection
    conn.close()

In [36]:
# verify that the rows were stored: reopen the database and select every row
# whose date lies between the beginning of 2017 and the current time

conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
date_window_query = ("SELECT * FROM review_db WHERE date"
                     " BETWEEN '2017-01-01 00:00:00' AND DATETIME('now')")
c.execute(date_window_query)
results = c.fetchall()
conn.close()
print(results)

[('I love this movie', 1, '2025-03-14 23:29:11'), ('I disliked this movie', 0, '2025-03-14 23:29:11')]


## **Turning the movie review classifier into a web application**