**Importing the required modules**

In [None]:
# Importing the standard-library helpers (random, Counter) and the NLTK library
import random
from collections import Counter

import nltk

In [None]:
# Download only the NLTK resources this notebook reads (the movie review corpus
# and the stopword lists). This is non-interactive, so "Restart & Run All" does
# not stall on the downloader prompt or fetch the entire 'all' collection.
# Add further resource ids to the tuple if later cells need them.
for resource in ("movie_reviews", "stopwords"):
    nltk.download(resource, quiet=True)

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to /root/nltk_data...
       |   Unzipping grammars/book_gr


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [None]:
# importing the Movie Review corpus
from nltk.corpus import movie_reviews

**Creating a list of documents**

In [None]:
# Build one (word_list, category) tuple per review. The corpus is walked
# category by category, so reviews of the same category end up contiguous
# in the resulting list.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

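**We will now comment out the shuffle function for the documents, so that the reviews stay grouped by category and we can later hold out a test set drawn from a single category**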

In [None]:
# Using the shuffle function of the random module to shuffle the elements of the documents list.
# random.shuffle(documents)

**Creating a list of all word tokens**

In [None]:
# Collect every word token of the movie_reviews corpus, lower-cased,
# into a single flat list (all_words_tokens)
all_words_tokens = [token.lower() for token in movie_reviews.words()]

**Frequency Distribution of all the word tokens**

In [None]:
# Get the frequency distribution of all the words
all_words_freq = nltk.FreqDist(all_words_tokens)

In [None]:
# Print out the number of distinct word tokens in the frequency distribution
print(len(all_words_freq))

39768


In [None]:
all_words_freq.most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

**Removing Stopwords and Punctuations**

In [None]:
# Importing stopwords and punctuations
from nltk.corpus import stopwords
import string               # for punctuations

In [None]:
# Getting the English stopwords
stopwords_eng = stopwords.words("english")

In [None]:
# Creating a function to remove stopwords and punctuations
def remove_punc_stopwords(txt, stop_words=None):
    """
    Remove punctuation tokens and stopwords from a sequence of word tokens.

    1. Tokens made up entirely of punctuation characters are dropped
       (this includes multi-character tokens such as "--" or "...").
    2. Tokens whose lower-cased form is a stopword are dropped.
    3. The remaining tokens are returned in their original order and case.

    Parameters
    ----------
    txt : iterable of str
        Word tokens to filter.
    stop_words : iterable of str, optional
        Lower-case stopwords to discard. When omitted, the notebook-level
        ``stopwords_eng`` list is used.

    Returns
    -------
    list of str
        The cleaned word tokens.
    """
    if stop_words is None:
        stop_words = stopwords_eng

    # Sets give constant-time membership tests; the corpus has over a million
    # tokens, so scanning a list for each one is needlessly slow.
    stop_set = set(stop_words)
    punct_chars = set(string.punctuation)

    cleaned = []
    for token in txt:
        # Per-character test: `token in string.punctuation` would be a
        # substring check and would let tokens like "--" slip through.
        if all(ch in punct_chars for ch in token):
            continue
        if token.lower() in stop_set:
            continue
        cleaned.append(token)
    return cleaned

In [None]:
# Running the function on all word tokens
all_words_tokens_cleaned = remove_punc_stopwords(all_words_tokens)

In [None]:
# Let's see the lengths of the all word tokens list prior and after removing stopwords and punctuations
print("Original len of all word tokens = ", len(all_words_tokens))
print("After removal of stopwords and punctuations,  len of all word tokens = ", len(all_words_tokens_cleaned))

Original len of all word tokens =  1583820
After removal of stopwords and punctuations,  len of all word tokens =  710578


In [None]:
# Frequency Distribution of all the word tokens after removing punctuations and stopwords
all_words_tokens_cleaned_freq = nltk.FreqDist(all_words_tokens_cleaned)

In [None]:
# Now let's see the top 15 most common words 
all_words_tokens_cleaned_freq.most_common(15)

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906)]

In [None]:
# Let's check the length of all freq words
print(len(all_words_tokens_cleaned_freq))

39586


In [None]:
# Most common words (2000 freq words)
most_common_word_tokens = all_words_tokens_cleaned_freq.most_common(2000)

# print top 10 most common words
print(most_common_word_tokens[:10])

[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049)]


In [None]:
# The 10 least frequent of the 2000 most common words (bottom 10 of the list)
print(most_common_word_tokens[1990:])

[('remain', 64), ('anna', 64), ('moved', 64), ('asking', 64), ('genuinely', 64), ('rain', 64), ('path', 64), ('aware', 64), ('causes', 64), ('international', 64)]


In [None]:
# Since the elements of the most_common_word_tokens list are in the form of tuples, we need to extract the first element of each tuple to get the words as word features
word_features = [token[0] for token in most_common_word_tokens]

# Print out the top 10 word features
print(word_features[:10])

['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much']


In [None]:
# Length of the word_features list
len(word_features)

2000

**Creating a Feature Set**

In [None]:
# Creating a function to get the features (words) in a dictionary
def doc_features(doc, vocabulary=None):
    """
    Build a word-presence feature dictionary for one document.

    Parameters
    ----------
    doc : iterable of str
        The word tokens of the document.
    vocabulary : iterable of str, optional
        The words to use as features. When omitted, the notebook-level
        ``word_features`` list is used.

    Returns
    -------
    dict
        Maps each vocabulary word to True if it occurs in ``doc`` and
        False otherwise, in vocabulary order.
    """
    if vocabulary is None:
        vocabulary = word_features

    # A set of the document's unique words makes each presence check cheap
    doc_words = set(doc)

    # One boolean per feature word: is the word present in this document?
    return {word: (word in doc_words) for word in vocabulary}

In [None]:
# Now, we are going to create a feature set which will contain the word features of each review and its corresponding category
feature_sets = [(doc_features(review), category) for (review, category) in documents]

In [None]:
# Print out the first element of the feature set
print(feature_sets[0])

({'film': True, 'one': True, 'movie': True, 'like': True, 'even': True, 'good': True, 'time': False, 'story': False, 'would': True, 'much': False, 'character': True, 'also': True, 'get': True, 'two': True, 'well': True, 'characters': True, 'first': False, '--': False, 'see': True, 'way': True, 'make': True, 'life': True, 'really': True, 'films': True, 'plot': True, 'little': True, 'people': True, 'could': False, 'scene': False, 'man': False, 'bad': True, 'never': False, 'best': False, 'new': True, 'scenes': True, 'many': False, 'director': True, 'know': True, 'movies': True, 'action': False, 'great': False, 'another': False, 'love': False, 'go': True, 'made': False, 'us': True, 'big': True, 'end': False, 'something': False, 'back': True, 'still': True, 'world': True, 'seems': True, 'work': False, 'makes': True, 'however': False, 'every': True, 'though': False, 'better': False, 'real': False, 'audience': True, 'enough': False, 'seen': False, 'take': False, 'around': False, 'going': True

**Model Training**

Now, we will create training and testing sets.

Positive data example 

In [None]:
# Training set and Testing set: the first 1900 feature sets are used for
# training and everything from index 1900 onwards is held out for testing.
# NOTE(review): the documents were appended category by category and the
# shuffle is commented out, so this held-out tail presumably contains reviews
# of a single category only (positive, per the heading above) — confirm.
train_data = feature_sets[:1900]
test_data = feature_sets[1900:]

In [None]:
# Length of training set
len(train_data)

1900

In [None]:
# Length of testing set
len(test_data)

100

In [None]:
(len(train_data)/2000, len(test_data)/2000)

(0.95, 0.05)

Negative data example

In [None]:
# Training set and Testing set: the first 100 feature sets are held out for
# testing and everything from index 100 onwards is used for training.
# NOTE(review): the documents were appended category by category and the
# shuffle is commented out, so this held-out head presumably contains reviews
# of a single category only (negative, per the heading above) — confirm.
train_data1 = feature_sets[100:]
test_data1 = feature_sets[:100]

In [None]:
# Length of training set
len(train_data1)

1900

In [None]:
# Length of testing set
len(test_data1)

100

In [None]:
(len(train_data1)/2000, len(test_data1)/2000)

(0.95, 0.05)

We will be using the **Naive Bayes Classifier** for our training.

In [None]:
# Importing the NaiveBayesClassifier from nltk
from nltk import NaiveBayesClassifier

# Creating an instance of our classifier and training the model
base_model = NaiveBayesClassifier.train(train_data)
base_model1 = NaiveBayesClassifier.train(train_data1)

In [None]:
# Importing classify from nltk
from nltk import classify

# Calculating the accuracy of the base model 
accuracy_score = classify.accuracy(base_model, test_data)
accuracy_score1 = classify.accuracy(base_model1, test_data1)
print("Accuracy Score of Base Model (+ve data example): {}%".format(100 * accuracy_score))
print("Accuracy Score of Base Model1 (-ve data example): {}%".format(100 * accuracy_score1))

Accuracy Score of Base Model (+ve data example): 76.0%
Accuracy Score of Base Model1 (-ve data example): 87.0%


**Importing other classifiers**

In [None]:
# Importing NLTK's SklearnClassifier, a wrapper that lets scikit-learn estimators be trained and used through the NLTK classifier interface
from nltk.classify.scikitlearn import SklearnClassifier

In [None]:
# Lets use some other types of Naive Bayes classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# Lets import some more classifiers
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [None]:
# Multinomial Naive Bayes classifier
multinomial_nb_model = SklearnClassifier(MultinomialNB())
multinomial_nb_model.train(train_data)

multinomial_nb_model1 = SklearnClassifier(MultinomialNB())
multinomial_nb_model1.train(train_data1)

<SklearnClassifier(MultinomialNB())>

In [None]:
# Bernoulli Naive Bayes classifier
bernoulli_nb_model = SklearnClassifier(BernoulliNB())
bernoulli_nb_model.train(train_data)

bernoulli_nb_model1 = SklearnClassifier(BernoulliNB())
bernoulli_nb_model1.train(train_data1)

<SklearnClassifier(BernoulliNB())>

In [None]:
# Logistic Regression classifier
logit_model = SklearnClassifier(LogisticRegression())
logit_model.train(train_data)

logit_model1 = SklearnClassifier(LogisticRegression())
logit_model1.train(train_data1)

<SklearnClassifier(LogisticRegression())>

In [None]:
# Stochastic Gradient Descent classifier
sgd_model = SklearnClassifier(SGDClassifier())
sgd_model.train(train_data)

sgd_model1 = SklearnClassifier(SGDClassifier())
sgd_model1.train(train_data1)

<SklearnClassifier(SGDClassifier())>

In [None]:
# C-Support Vector Classification classifier
svc_model = SklearnClassifier(SVC())
svc_model.train(train_data)

svc_model1 = SklearnClassifier(SVC())
svc_model1.train(train_data1)

<SklearnClassifier(SVC())>

In [None]:
# Nu-Support Vector Classification classifier
nu_svc_model = SklearnClassifier(NuSVC())
nu_svc_model.train(train_data)

nu_svc_model1 = SklearnClassifier(NuSVC())
nu_svc_model1.train(train_data1)

<SklearnClassifier(NuSVC())>

Accuracy on Positive data example

In [None]:
print("Accuracy Score of Base Model : {}%".format(100 * accuracy_score))
print("MultinomialNB Model Accuracy Score: {}%".format(100 * classify.accuracy(multinomial_nb_model, test_data)))
print("BernoulliNB Model Accuracy Score: {}%".format(100 * classify.accuracy(bernoulli_nb_model, test_data)))
print("LogisticRegression Model Accuracy Score: {}%".format(100 * classify.accuracy(logit_model, test_data)))
print("SGDClassifier Model Accuracy Score: {}%".format(100 * classify.accuracy(sgd_model, test_data)))
print("SVC Model Accuracy Score: {}%".format(100 * classify.accuracy(svc_model, test_data)))
print("NuSVC Model Accuracy Score: {}%".format(100 * classify.accuracy(nu_svc_model, test_data)))

Accuracy Score of Base Model : 76.0%
MultinomialNB Model Accuracy Score: 80.0%
BernoulliNB Model Accuracy Score: 76.0%
LogisticRegression Model Accuracy Score: 83.0%
SGDClassifier Model Accuracy Score: 86.0%
SVC Model Accuracy Score: 86.0%
NuSVC Model Accuracy Score: 85.0%


Accuracy on Negative data example

In [None]:
print("Accuracy Score of Base Model : {}%".format(100 * accuracy_score1))
print("MultinomialNB Model Accuracy Score: {}%".format(100 * classify.accuracy(multinomial_nb_model1, test_data1)))
print("BernoulliNB Model Accuracy Score: {}%".format(100 * classify.accuracy(bernoulli_nb_model1, test_data1)))
print("LogisticRegression Model Accuracy Score: {}%".format(100 * classify.accuracy(logit_model1, test_data1)))
print("SGDClassifier Model Accuracy Score: {}%".format(100 * classify.accuracy(sgd_model1, test_data1)))
print("SVC Model Accuracy Score: {}%".format(100 * classify.accuracy(svc_model1, test_data1)))
print("NuSVC Model Accuracy Score: {}%".format(100 * classify.accuracy(nu_svc_model1, test_data1)))

Accuracy Score of Base Model : 87.0%
MultinomialNB Model Accuracy Score: 85.0%
BernoulliNB Model Accuracy Score: 87.0%
LogisticRegression Model Accuracy Score: 76.0%
SGDClassifier Model Accuracy Score: 73.0%
SVC Model Accuracy Score: 82.0%
NuSVC Model Accuracy Score: 83.0%


**We will build our classifier class for combining different algorithms**

In [None]:
# Importing ClassifierI from the NLTK library
from nltk.classify import ClassifierI

In [None]:
# Importing mode from the statistics module
from statistics import mode

In [None]:
# Creating a class for voting classifier and inheriting from ClassifierI class
class VotingClassifier(ClassifierI):
    """
    Majority-vote ensemble over a group of already-trained classifiers.

    Every wrapped classifier is asked to classify the same feature set; the
    label that collects the most votes is the ensemble's prediction. When
    several labels tie, the one that was voted for first (in the order the
    classifiers were passed in) wins, so the result is deterministic.
    """

    def __init__(self, *classifiers):
        """
        Store the classifiers that take part in the vote.

        Parameters
        ----------
        *classifiers
            Trained classifier objects exposing ``classify(features)``.

        Raises
        ------
        ValueError
            If no classifier is given, since a vote would have no ballots.
        """
        if not classifiers:
            raise ValueError("VotingClassifier needs at least one classifier")
        self._classifiers = classifiers

    def labels(self):
        """
        Return the labels known to the wrapped classifiers, without duplicates
        and in first-seen order.

        NOTE(review): assumes every wrapped classifier implements ``labels()``
        — confirm for any custom classifier added to the ensemble.
        """
        known = []
        for clf in self._classifiers:
            for label in clf.labels():
                if label not in known:
                    known.append(label)
        return known

    def _tally(self, features):
        """Return ``(winning_label, winning_votes, total_votes)`` for ``features``."""
        votes = [clf.classify(features) for clf in self._classifiers]
        # most_common(1) yields the top label with its count; on a tie the
        # label encountered first is returned instead of raising an error.
        winner, winner_votes = Counter(votes).most_common(1)[0]
        return winner, winner_votes, len(votes)

    def classify(self, features):
        """Return the label chosen by the majority of the wrapped classifiers."""
        winner, _, _ = self._tally(features)
        return winner

    def confidence(self, features):
        """
        Return the fraction of classifiers that voted for the winning label
        (a value in the range (0, 1]).
        """
        _, winner_votes, total_votes = self._tally(features)
        return winner_votes / total_votes

We will use all seven of the models created above in the voting classifier.

Positive data example

In [None]:
# We will now use the voting classifier and get the performance
# as combination of algorithms (+ve data example)
voted_model = VotingClassifier(base_model,
                               multinomial_nb_model,
                               bernoulli_nb_model,
                               logit_model,
                               sgd_model,
                               svc_model,
                               nu_svc_model)

Negative data example

In [None]:
# We will now use the voting classifier and get the performance
# as combination of algorithms (-ve data example)
voted_model1 = VotingClassifier(base_model1,
                               multinomial_nb_model1,
                               bernoulli_nb_model1,
                               logit_model1,
                               sgd_model1,
                               svc_model1,
                               nu_svc_model1)

In [None]:
# Printing out the voted model accuracy
print("Voted Model Accuracy Score (+ve data example): {}%".format(100 * classify.accuracy(voted_model, test_data)))
print("Voted Model Accuracy Score (-ve data example): {}%".format(100 * classify.accuracy(voted_model1, test_data1)))

Voted Model Accuracy Score (+ve data example): 84.0%
Voted Model Accuracy Score (-ve data example): 85.0%
