# Sentiment analysis of IMDB reviews

## Prerequisites

### Install requirements

In [6]:
import sys

print(sys.executable)

with open("requirements.txt") as requirements_file:
    requirements = [line.strip()
                    for line in requirements_file.readlines()]

for package in requirements:
    !{sys.executable} -m pip -q install {package}
print("Installation finished!")

/Users/sergiidenysiuk/.virtualenvs/ml-team-wow/bin/python3.7
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Installation finished!


### Defaults

In [4]:
%%writefile config.py

# Archive that holds the whole dataset.
DATASET_FILE_PATH = "aclImdb_v1.tar.gz"

# Glob patterns for the labelled test reviews.
DATASET_TEST_NEG_REVIEW = "aclImdb/test/neg/*.txt"
DATASET_TEST_POS_REVIEW = "aclImdb/test/pos/*.txt"

# Glob patterns for the labelled and unlabelled training reviews.
DATASET_TRAINING_NEG_REVIEW = "aclImdb/train/neg/*.txt"
DATASET_TRAINING_POS_REVIEW = "aclImdb/train/pos/*.txt"
DATASET_TRAINING_UNSUP_REVIEW = "aclImdb/train/unsup/*.txt"


Overwriting config.py


## The task

Sentiment analysis is a challenging subject in machine learning. People express their emotions in language that is often obscured by sarcasm, ambiguity, and plays on words, all of which could be very misleading for both humans and computers. This notebook is an example of sentiment analysis for movie reviews.

## The data

For this task, a set of 25,000 highly polar movie reviews is used for training, and another 25,000 for testing. There is additional unlabeled data for use as well. This dataset was collected in association with the following publication: http://ai.stanford.edu/~amaas/data/sentiment/

### Prepare dataset

In [7]:
import tarfile

import config

# The context manager closes the archive even when extraction fails part-way,
# which the explicit open()/close() pair did not guarantee.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this against the official dataset download.
with tarfile.open(config.DATASET_FILE_PATH, "r:gz") as tar:
    tar.extractall()


### Read dataset

In [9]:
%%writefile utils/read_data.py
import glob
import random
import typing

import pandas as pd

from parsers import base as base_parser


def read_and_parse(path_pattern: str,
                   parser: typing.Type[base_parser.BaseParser],
                   **kwargs: typing.Any) -> typing.Iterator[typing.Tuple[str, typing.Any]]:
    """
    Lazily read and parse every file whose path matches the given glob pattern.

    :param path_pattern: glob pattern of the files that must be processed
    :param parser: parser class; its ``parse`` classmethod receives each file's content
    :param kwargs: extra keyword arguments forwarded unchanged to ``parser.parse``
    :return: generator of ``(filename, parsed_data)`` tuples, one per matching file;
             the shape of ``parsed_data`` is whatever ``parser.parse`` returns
    """
    for filename in glob.glob(path_pattern):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, so the same files parse identically everywhere.
        with open(filename, 'r', encoding='utf-8') as file:
            yield filename, parser.parse(file.read(), **kwargs)


def concat_sets(positive_reviews: typing.Iterable[typing.Tuple[str, typing.Any]],
                negative_reviews: typing.Iterable[typing.Tuple[str, typing.Any]],
                is_join: bool = False,
                is_shuffle: bool = False,
                columns: typing.Sequence[str] = ("id", "text", "sentiment")) -> pd.DataFrame:
    """
    Build a labelled dataset from the given positive and negative review sets.

    :param positive_reviews: iterable with positive data, example [('file_1', 'positive review text'), ...]
    :param negative_reviews: iterable with negative data, example [('file_2', 'negative review text'), ...]
    :param is_join: join each review's items (words) into a single space-separated string
    :param is_shuffle: shuffle positive and negative reviews together
    :param columns: names of the three resulting columns (filename, review, sentiment)
    :return: table with filenames, reviews and their sentiment values
             (``True`` for positive, ``False`` for negative), respectively
    """
    data: typing.List[typing.Tuple[str, typing.Union[list, str], bool]] = []
    labelled_sets = ((positive_reviews, True), (negative_reviews, False))

    for review_set, sentiment in labelled_sets:
        data.extend(
            (
                filename,
                " ".join(filedata) if is_join else filedata,
                sentiment,
            ) for filename, filedata in review_set)

    if is_shuffle:
        # In-place shuffle so positive and negative rows are interleaved.
        random.shuffle(data)

    return pd.DataFrame(data, columns=list(columns))


Writing utils/read_data.py


### Parse data

In [None]:
from abc import ABCMeta
import re

from bs4 import BeautifulSoup
import nltk.data

try:
    # NLTK resources are looked up by "<category>/<name>"; the bare name
    # 'stopwords' is never found, which forced a download on every run.
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords

# English stopwords as a set for O(1) membership checks during parsing.
EN_STOPWORDS = set(stopwords.words('english'))


class BaseParser(metaclass=ABCMeta):
    """
    Collection of text-cleaning steps shared by concrete parsers.

    Every step is a classmethod, so parsers are used without instantiation.
    Subclasses must override :meth:`parse` to combine the steps.
    """

    @classmethod
    def clean_html_markup(cls, text, parser='html.parser'):
        """Remove HTML markup and return only the text content."""
        return BeautifulSoup(text, parser).get_text()

    @classmethod
    def remove_non_letters(cls, text):
        """Replace every character that is not an ASCII letter with a space."""
        return re.sub('[^a-zA-Z]', ' ', text)

    @classmethod
    def to_lower(cls, text):
        """Convert text to lowercase."""
        return text.lower()

    @classmethod
    def split_to_words(cls, text):
        """Split text into individual words on any whitespace."""
        return text.split()

    @classmethod
    def remove_stopwords(cls, words, stopwords_list=None):
        """
        Remove stopwords from a list of words.

        :param words: iterable of words to filter
        :param stopwords_list: container of words to drop; when ``None`` the
            module-level English stopword set is used
        :return: list of the words that are not in ``stopwords_list``
        """
        if stopwords_list is None:
            # Resolved at call time so the default follows the module constant.
            stopwords_list = EN_STOPWORDS
        # Filter against the argument; the old body always used EN_STOPWORDS
        # and silently ignored a caller-supplied list.
        return [w for w in words if w not in stopwords_list]

    @classmethod
    def parse(cls, text, *args, **kwargs):
        """Parse ``text``; must be overridden by concrete parsers."""
        raise NotImplementedError("Should have implemented this")


class WordsParser(BaseParser):
    """
    Turn raw HTML text into a list of cleaned words
    suitable for further learning.
    """

    @classmethod
    def parse(cls,
              text,
              is_remove_non_letters=True,
              is_remove_stopwords=True):
        """
        Extract cleaned, lower-cased words from a piece of text.

        :param text: text to parse
        :type text: string
        :param is_remove_non_letters: whether non-letter characters are replaced
        :type is_remove_non_letters: bool
        :param is_remove_stopwords: whether stopwords are filtered out
        :type is_remove_stopwords: bool
        :return: list with cleaned words
        :rtype: list
        """
        # Markup is always stripped first; the letter filter is optional.
        plain_text = cls.clean_html_markup(text)
        if is_remove_non_letters:
            plain_text = cls.remove_non_letters(plain_text)

        # Lower-case before splitting so the stopword check sees lowercase words.
        words = cls.split_to_words(cls.to_lower(plain_text))

        return cls.remove_stopwords(words) if is_remove_stopwords else words


In [5]:
# The path patterns live in config.py under the DATASET_* prefix; the former
# config.DATA_* attributes do not exist and raised AttributeError.
# NOTE(review): `utils` and `word_parser` are not imported in any visible
# cell — confirm the intended import lines and add them to the imports cell.
print("Read, clean and parse train data...")
train_data = utils.concat_sets(
    utils.read_and_parse(config.DATASET_TRAINING_POS_REVIEW, word_parser.WordsParser),
    utils.read_and_parse(config.DATASET_TRAINING_NEG_REVIEW, word_parser.WordsParser),
    columns=["id", "text", "sentiment"],
    is_join=True, is_shuffle=True)
print("Done.")

print("Read, clean and parse test data...")
test_data = utils.concat_sets(
    utils.read_and_parse(config.DATASET_TEST_POS_REVIEW, word_parser.WordsParser),
    utils.read_and_parse(config.DATASET_TEST_NEG_REVIEW, word_parser.WordsParser),
    columns=["id", "text", "sentiment"],
    is_join=True, is_shuffle=True)
print("Done.")

Read, clean and parse train data...
Done.
Read, clean and parse test data...
Done.


In [6]:
train_data.head()

Unnamed: 0,id,text,sentiment
0,aclImdb/train/neg/5925_3.txt,worse star trek tos episode maybe least gets v...,False
1,aclImdb/train/neg/1784_1.txt,ok taped tv missed start film seconds titles a...,False
2,aclImdb/train/neg/1682_2.txt,takashi shimizu great opportunity remake origi...,False
3,aclImdb/train/neg/7339_3.txt,maya woman without interests dreams life away ...,False
4,aclImdb/train/neg/4081_1.txt,worst film seen peter greenaway close dishonor...,False


In [7]:
test_data.head()

Unnamed: 0,id,text,sentiment
0,aclImdb/test/pos/11149_8.txt,recently purchased universal marlene dietrich ...,True
1,aclImdb/test/neg/9344_1.txt,want know writers movie consider funny robot c...,False
2,aclImdb/test/pos/10966_8.txt,ends declaration film seen improvisation makin...,True
3,aclImdb/test/neg/1672_1.txt,received movie pack called star movies cents m...,False
4,aclImdb/test/neg/5677_2.txt,show amazing fresh innovative idea first aired...,False


## Prepare text data (convert a collection of text documents to a matrix of token counts)

## Word embeddings

The problem with text is that machine learning algorithms cannot work with raw text directly. The text must be converted into numbers, specifically, vectors of numbers.


Bag of Words

## Intro

The **Bag of Words** (or **BoW**) model defines a vocabulary over the texts, then models each text by counting the number of times each word appears. It therefore throws away all of the order information in the words and focuses only on the occurrence of words in a document.

For example, consider the following two sentences:
* Sentence 1: "The cat sat on the hat"
* Sentence 2: "The dog ate the cat and the hat"

From these two sentences, vocabulary is as follows: *{ the, cat, sat, on, hat, dog, ate, and }*

To get bags of words, count the number of times each word occurs in each sentence.
* Sentence 1: { 2, 1, 1, 1, 1, 0, 0, 0 }
* Sentence 2: { 3, 1, 0, 0, 1, 1, 1, 1 }

In the IMDB data, there is a very large number of reviews, which will give a large vocabulary. To limit the size of the feature vectors, choose some maximum vocabulary size. Below, the $5000$ most frequent words are used (remember that stopwords have already been removed in the previous step).

**Note.** `CountVectorizer` comes with its own options to automatically do preprocessing, tokenization, and stop word removal: for each of these, instead of specifying `None`, it is possible to use a built-in method or a custom function. In this example, however, a custom parser is used for data cleaning.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

print("Creating the Bag Of Words...")
# Word-level tokens, vocabulary capped at the 5000 most frequent terms.
vectorizer = CountVectorizer(max_features=5000, analyzer="word")
print("Done.")

Creating the Bag Of Words...
Done.


In [9]:
%%time
# Learn the vocabulary from the training texts only; the test texts are
# later encoded with this same fitted vocabulary.
print("Learn a vocabulary from documents...")
vectorizer.fit(train_data["text"].tolist())
print("Done.")

Learn a vocabulary from documents...
Done.
CPU times: user 3.04 s, sys: 91.3 ms, total: 3.13 s
Wall time: 3.29 s


In [10]:
# Show the vocabulary learned by the fitted vectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the pinned version.
vectorizer.get_feature_names()

['abandoned',
 'abc',
 'abilities',
 'ability',
 'able',
 'abraham',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absurd',
 'abuse',
 'abusive',
 'abysmal',
 'academy',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'accident',
 'accidentally',
 'accompanied',
 'accomplished',
 'according',
 'account',
 'accuracy',
 'accurate',
 'accused',
 'achieve',
 'achieved',
 'achievement',
 'acid',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actions',
 'activities',
 'actor',
 'actors',
 'actress',
 'actresses',
 'acts',
 'actual',
 'actually',
 'ad',
 'adam',
 'adams',
 'adaptation',
 'adaptations',
 'adapted',
 'add',
 'added',
 'adding',
 'addition',
 'adds',
 'adequate',
 'admire',
 'admit',
 'admittedly',
 'adorable',
 'adult',
 'adults',
 'advance',
 'advanced',
 'advantage',
 'adventure',
 'adventures',
 'advertising',
 'advice',
 'advise',
 'affair',
 'affect',
 'affected',
 'afford',
 'aforementioned',
 'afraid',
 'africa',
 'african',
 'after

In [6]:
vectorizer.get_stop_words()

NameError: name 'vectorizer' is not defined

In [12]:
# Encode both splits with the vocabulary fitted on the training texts.
# .toarray() converts the transform result to a dense NumPy array.
# NOTE(review): a dense matrix of this size is memory-hungry — confirm that
# every downstream classifier really needs dense input.
print("Encode each train movie review document to vector...")
train_vectors = vectorizer.transform(train_data["text"].tolist()).toarray()
print("Done.")

print("Encode each test movie review document to vector...")
test_vectors = vectorizer.transform(test_data["text"].tolist()).toarray()
print("Done.")

Encode each train movie review document to vector...
Done.
Encode each test movie review document to vector...
Done.


In [7]:
# Preview the encoded training matrix; only the last expression of a cell is
# displayed, so the duplicated line was a no-op.
train_vectors

NameError: name 'train_vectors' is not defined

In [14]:
test_vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Train classifiers

In [None]:
# %load classifiers/sklearn.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier


def random_forest(train_review_bag_of_words, train_review_sentiments,
                  n_estimators=100, random_state=None):
    """
    Fit a Random Forest classifier.

    :param train_review_bag_of_words: training feature matrix
    :param train_review_sentiments: training labels, one per row of the matrix
    :param n_estimators: number of trees in the forest
    :param random_state: seed forwarded to the estimator; pass an int to get
        reproducible forests, leave ``None`` for the previous unseeded behaviour
    :return: the fitted ``RandomForestClassifier``
    """
    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    random_state=random_state)
    # fit() returns the estimator itself.
    return forest.fit(train_review_bag_of_words, train_review_sentiments)


def naive_bayes_gaussian(train_review_bag_of_words, train_review_sentiments):
    """
    Fit a Gaussian Naive Bayes classifier.

    :param train_review_bag_of_words: training feature matrix
    :param train_review_sentiments: training labels, one per row of the matrix
    :return: the fitted ``GaussianNB`` estimator
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return GaussianNB().fit(train_review_bag_of_words, train_review_sentiments)


def naive_bayes_multinomial(train_review_bag_of_words, train_review_sentiments):
    """
    Fit a Multinomial Naive Bayes classifier.

    :param train_review_bag_of_words: training feature matrix
    :param train_review_sentiments: training labels, one per row of the matrix
    :return: the fitted ``MultinomialNB`` estimator
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return MultinomialNB().fit(train_review_bag_of_words, train_review_sentiments)


def naive_bayes_bernoulli(train_review_bag_of_words, train_review_sentiments):
    """
    Fit a Bernoulli Naive Bayes classifier.

    :param train_review_bag_of_words: training feature matrix
    :param train_review_sentiments: training labels, one per row of the matrix
    :return: the fitted ``BernoulliNB`` estimator
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return BernoulliNB().fit(train_review_bag_of_words, train_review_sentiments)


def k_nearest_neighbors(train_review_bag_of_words, train_review_sentiments,
                        n_neighbors=100, weights='uniform', algorithm='auto'):
    """
    Fit a k-Nearest Neighbors classifier.

    :param train_review_bag_of_words: training feature matrix
    :param train_review_sentiments: training labels, one per row of the matrix
    :param n_neighbors: number of neighbours, passed through to the estimator
    :param weights: neighbour weighting scheme, passed through to the estimator
    :param algorithm: neighbour search algorithm, passed through to the estimator
    :return: the fitted ``KNeighborsClassifier``
    """
    estimator_options = dict(
        n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
    # fit() returns the estimator itself, so it can be returned directly.
    return KNeighborsClassifier(**estimator_options).fit(
        train_review_bag_of_words, train_review_sentiments)


In [16]:
print("Training the Random Forest...")
n_estimators = 100  # number of trees; also written to the summary file later
# Fit on the training vectors, then predict sentiments for the test vectors.
test_sentiments_predicted_rf = (
    sk_classifiers
    .random_forest(train_vectors, train_data["sentiment"], n_estimators=n_estimators)
    .predict(test_vectors)
)
print("Done.")

Training the Random Forest...
Done.


In [17]:
print("Training the Naive Bayes Gaussian...")
# Fit on the training vectors, then predict sentiments for the test vectors.
test_sentiments_predicted_nbg = (
    sk_classifiers
    .naive_bayes_gaussian(train_vectors, train_data["sentiment"])
    .predict(test_vectors)
)
print("Done.")

Training the Naive Bayes Gaussian...
Done.


In [18]:
print("Training the Naive Bayes Multinomial...")
# Fit on the training vectors, then predict sentiments for the test vectors.
test_sentiments_predicted_nbm = (
    sk_classifiers
    .naive_bayes_multinomial(train_vectors, train_data["sentiment"])
    .predict(test_vectors)
)
print("Done.")

Training the Naive Bayes Multinomial...
Done.


In [19]:
print("Training the Naive Bayes Bernoulli...")
# Fit on the training vectors, then predict sentiments for the test vectors.
test_sentiments_predicted_nbb = (
    sk_classifiers
    .naive_bayes_bernoulli(train_vectors, train_data["sentiment"])
    .predict(test_vectors)
)
print("Done.")

Training the Naive Bayes Bernoulli...
Done.


In [20]:
print("Training the k-Nearest Neighbors...")
n_neighbors = 100  # neighbourhood size; also written to the summary file later
# Fit on the training vectors, then predict sentiments for the test vectors.
test_sentiments_predicted_knn = (
    sk_classifiers
    .k_nearest_neighbors(train_vectors, train_data["sentiment"], n_neighbors=n_neighbors)
    .predict(test_vectors)
)
print("Done.")

Training the k-Nearest Neighbors...
Done.


## Check models' accuracy and save summary to files

In [21]:
# %load -s write_results_to_csv utils.py

In [22]:
# Output files: one CSV per model plus a plain-text summary.
filename_sklearn_rf = 'bag-of-words-sklearn-rf-model.csv'
filename_sklearn_nbg = 'bag-of-words-sklearn-nbg-model.csv'
filename_sklearn_nbm = 'bag-of-words-sklearn-nbm-model.csv'
filename_sklearn_nbb = 'bag-of-words-sklearn-nbb-model.csv'
filename_sklearn_knn = 'bag-of-words-sklearn-knn-model.csv'
filename_summary = 'bag-of-words-summary.txt'

# (display label, predicted sentiments, destination file) per model — one
# table instead of five copy-pasted stanzas; the printed log is unchanged.
results_by_model = (
    ("Random Forest", test_sentiments_predicted_rf, filename_sklearn_rf),
    ("Naive Bayes Gaussian", test_sentiments_predicted_nbg, filename_sklearn_nbg),
    ("Naive Bayes Multinomial", test_sentiments_predicted_nbm, filename_sklearn_nbm),
    ("Naive Bayes Bernoulli", test_sentiments_predicted_nbb, filename_sklearn_nbb),
    ("k-Nearest Neighbors", test_sentiments_predicted_knn, filename_sklearn_knn),
)

for model_label, predicted_sentiments, results_filename in results_by_model:
    print(f"Write {model_label} results to {results_filename}")
    utils.write_results_to_csv(
        test_data["id"],
        test_data["sentiment"],
        predicted_sentiments,
        results_filename)
    print("Done.")

Write Random Forest results to bag-of-words-sklearn-rf-model.csv
Done.
Write Naive Bayes Gaussian results to bag-of-words-sklearn-nbg-model.csv
Done.
Write Naive Bayes Multinomial results to bag-of-words-sklearn-nbm-model.csv
Done.
Write Naive Bayes Bernoulli results to bag-of-words-sklearn-nbb-model.csv
Done.
Write k-Nearest Neighbors results to bag-of-words-sklearn-knn-model.csv
Done.


In [23]:
# %load -s calculate_accuracy utils.py

In [24]:
# %load -s count_words utils.py

In [25]:
print(f"Write summary results to {filename_summary}")

# (display label, predicted sentiments) per model, in report order.
accuracy_report = (
    ("Random Forest", test_sentiments_predicted_rf),
    ("Naive Bayes Gaussian", test_sentiments_predicted_nbg),
    ("Naive Bayes Multinomial", test_sentiments_predicted_nbm),
    ("Naive Bayes Bernoulli", test_sentiments_predicted_nbb),
    ("k-Nearest Neighbors", test_sentiments_predicted_knn),
)

with open(filename_summary, "w") as file_summary:
    # Dataset sizes.
    print('Size of train dataset: {size}'.format(
        size=len(train_data["id"])), file=file_summary)

    print('Size of test dataset: {size}'.format(
        size=len(test_data["id"])), file=file_summary)

    print('', file=file_summary)

    # Hyper-parameters used by the training cells.
    print('Number of trees in Random Forest: {trees}'.format(
        trees=n_estimators), file=file_summary)

    print('Number of neighbors in KNN: {neighbors}'.format(
        neighbors=n_neighbors), file=file_summary)

    print('', file=file_summary)

    # One accuracy line per model, separated by blank lines as before.
    # The Random Forest line previously read "the the".
    for position, (model_label, predicted_sentiments) in enumerate(accuracy_report):
        if position:
            print('', file=file_summary)
        print('Accuracy of the {model} sklearn: {accuracy}'.format(
            model=model_label,
            accuracy=utils.calculate_accuracy(
                test_data["sentiment"], predicted_sentiments)), file=file_summary)

    print('', file=file_summary)

    print('Count of each word in train dataset: {counts}'.format(
        counts=utils.count_words(vectorizer.get_feature_names(), train_data["text"])), file=file_summary)
print("Done.")

Write summary results to bag-of-words-summary.txt
Done.


***