**Importing the required modules**

In [None]:
# Importing the NLTK library and Random module
import nltk
import random

In [None]:
# Downloading all the NLTK files
nltk.download_shell()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Package abc is already up-to-date!
       | Downloading package alpino to /root/nltk_data...
       |   Package alpino is already up-to-date!
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Package averaged_perceptron_tagger is already up-to-date!
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Package averaged_perceptron_tagger_ru is already up-to-
       |       date!
       | Downloading package basque_grammars to /root/nltk_data...
       |   Package basque_grammars is already up-to-date!
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Package biocreative_ppi is already up-to-date!
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Package bllip_wsj_no_aux is already up-to-date!
       | Downloading package b


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [None]:
# Importing tokenizers
from nltk.tokenize import word_tokenize

**Uploading the text files in google colab**

In [None]:
# Improting file from google colab
from google.colab import files
uploaded = files.upload()

Saving negative.txt to negative.txt
Saving positive.txt to positive.txt


**Opening and reading the required text files (short reviews)**

In [None]:
# opening and reading the required text files
short_positives = open("/content/positive.txt", mode='r', encoding='ISO-8859-1').read()
short_negatives = open("/content/negative.txt", mode='r', encoding='ISO-8859-1').read()

**Storing the short reviews in a list**

In [None]:
# Creating an empty list for storing the documents
documents = []

# For each line of text in the short_positives and 
# short_negatives and appending them in documents list
for rev in short_positives.split('\n'):
  documents.append((rev, "pos"))

for rev in short_negatives.split('\n'):
  documents.append((rev, "neg"))

In [None]:
# Total reviews
len(documents)

10664

**Creating a list of all word tokens**

In [None]:
# Get all the word tokens of the short reviews and 
# store it in a list (all_words_tokens)
all_words_tokens = []
short_pos_word_tokens = word_tokenize(short_positives)
short_neg_word_tokens = word_tokenize(short_negatives)

for token in short_pos_word_tokens:
  all_words_tokens.append(token.lower())

for token in short_neg_word_tokens:
  all_words_tokens.append(token.lower())

**Frequency Distribution of all the word tokens**

In [None]:
# Get the frequency distribition of all the words
all_words_freq = nltk.FreqDist(all_words_tokens)

In [None]:
# Print out the length of the all frequent words list
print(len(all_words_freq))

20298


In [None]:
# Printing out the top 10 most common words 
all_words_freq.most_common(10)

[('.', 14010),
 ('the', 10113),
 (',', 10037),
 ('a', 7314),
 ('and', 6202),
 ('of', 6063),
 ('to', 4234),
 ('is', 3559),
 ("'s", 3537),
 ('it', 3422)]

In [None]:
# Since the elements of the most_common_word_tokens list are in the form of tuples, 
# we need to extract the first element of each tuple to get the words as word features
# we can call the keys of the tuples
word_features = list(all_words_freq.keys())[:5000]

# Print out the top 10 word features
print(word_features[:10])

['the', 'rock', 'is', 'destined', 'to', 'be', '21st', 'century', "'s", 'new']


In [None]:
# Length of the word_features list
len(word_features)

5000

**Creating a Feature Set**

In [None]:
# Creating a function to get the features (words) in a dictionary
def doc_features(doc):
    
    doc_words = word_tokenize(doc)
    
    # creating an empty features list
    features = {}
    
    # Will iterate through all the words present in the word_features list
    for word in word_features:
        
        # Get that word and see its presence in the document (will return a bollean value)
        features[word] = (word in doc_words)
    
    return features

In [None]:
# Now, we are going to create a feature set which will contain the word features of the review and its correspoding category
feature_sets = [(doc_features(review), category) for (review, category) in documents]

In [None]:
# Shuffling the feature_sets
random.shuffle(feature_sets)

In [None]:
# Length of the feature_sets
len(feature_sets)

10664

**Model Training**

Now, we will create training and testing sets.

In [None]:
# Training set and Testing set
train_data = feature_sets[:9000]
test_data = feature_sets[9000:]

In [None]:
# Length of training set
len(train_data)

9000

In [None]:
# Length of testing set
len(test_data)

1664

In [None]:
(len(train_data)/len(feature_sets), len(test_data)/len(feature_sets))

(0.8439609902475619, 0.15603900975243812)

We will be using the **Naive Bayes Classifier** for our training.

In [None]:
# Importing the NaiveBayesClassifier from nltk
from nltk import NaiveBayesClassifier

# Creating an instance of our classifier and training the model
base_model = NaiveBayesClassifier.train(train_data)

In [None]:
# Importing classify from nltk
from nltk import classify

# Calculating the accuracy of the base model 
accuracy_score = classify.accuracy(base_model, test_data)
print("Accuracy Score of Base Model : {}%".format(100 * accuracy_score))

Accuracy Score of Base Model : 72.35576923076923%


In [None]:
# Show 15 most informative features
print(base_model.show_most_informative_features(15))

Most Informative Features
              engrossing = True              pos : neg    =     18.9 : 1.0
              refreshing = True              pos : neg    =     13.6 : 1.0
                supposed = True              neg : pos    =     13.1 : 1.0
            refreshingly = True              pos : neg    =     12.3 : 1.0
               affecting = True              pos : neg    =     11.6 : 1.0
                powerful = True              pos : neg    =     11.2 : 1.0
           extraordinary = True              pos : neg    =     11.0 : 1.0
               inventive = True              pos : neg    =     11.0 : 1.0
               realistic = True              pos : neg    =     11.0 : 1.0
                touching = True              pos : neg    =     11.0 : 1.0
               wonderful = True              pos : neg    =     11.0 : 1.0
              thoughtful = True              pos : neg    =     10.6 : 1.0
                 quietly = True              pos : neg    =     10.3 : 1.0

**Importing Other Classifiers** 

In [None]:
# Importing scikit-learn module from NLTK (a wrapper for sklearn)
from nltk.classify.scikitlearn import SklearnClassifier

In [None]:
# Lets use some other types of Naive Bayes classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# Lets import some more classifiers
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

In [None]:
# Multinomial Naive Bayes classifier
multinomial_nb_model = SklearnClassifier(MultinomialNB())
multinomial_nb_model.train(train_data)

<SklearnClassifier(MultinomialNB())>

In [None]:
# Bernoulli Naive Bayes classifier
bernoulli_nb_model = SklearnClassifier(BernoulliNB())
bernoulli_nb_model.train(train_data)

<SklearnClassifier(BernoulliNB())>

In [None]:
# Stochastic Gradient Descent classifier
sgd_model = SklearnClassifier(SGDClassifier())
sgd_model.train(train_data)

<SklearnClassifier(SGDClassifier())>

In [None]:
# Linear Support Vector Classification classifier
linear_svc_model = SklearnClassifier(LinearSVC())
linear_svc_model.train(train_data)

<SklearnClassifier(LinearSVC())>

In [None]:
print("Accuracy Score of Base Model : {}%".format(100 * accuracy_score))
print("MultinomialNB Model Accuracy Score: {}%".format(100 * classify.accuracy(multinomial_nb_model, test_data)))
print("BernoulliNB Model Accuracy Score: {}%".format(100 * classify.accuracy(bernoulli_nb_model, test_data)))
print("SGDClassifier Model Accuracy Score: {}%".format(100 * classify.accuracy(sgd_model, test_data)))
print("LinearSVC Model Accuracy Score: {}%".format(100 * classify.accuracy(linear_svc_model, test_data)))

Accuracy Score of Base Model : 72.35576923076923%
MultinomialNB Model Accuracy Score: 71.15384615384616%
BernoulliNB Model Accuracy Score: 71.93509615384616%
SGDClassifier Model Accuracy Score: 70.85336538461539%
LinearSVC Model Accuracy Score: 70.85336538461539%


**We will build our classifier class for combining different algorithms**

In [None]:
# Importing ClassifierI from the NLTK library
from nltk.classify import ClassifierI

In [None]:
# Importing mode from the statistics module
from statistics import mode

In [None]:
# Creating a class for voting classifier and inheriting from ClassifierI class
class VotingClassifier(ClassifierI):

  # Next we will assign the list of classifiers that are passed to our class to self._classifiers
  def __init__(self, *classifiers):
     self._classifiers = classifiers


  # Creating our own classify method and we will name it 'classify' to invoke '.classify' later on
  def classify(self, features):

    # Creating a list for storing the votes
    votes = []

    # Will iterate through list of classifiers classify them on our feartures
    # and get the votes and append them in the votes list
    # and return the mode of votes (most popular vote)
    for algos in self._classifiers:
      v = algos.classify(features)
      votes.append(v)
    return mode(votes)

  # Creating a confidence method for getting the confidence indicator
  # We will tally the votes for and against the winning vote 
  def confidence(self, features):
    votes = []
    for algos in self._classifiers:
      v = algos.classify(features)
      votes.append(v)

    # Get the choice votes i.e. count the popular votes
    choice_votes = votes.count(mode(votes))
    
    # Get the confidence by dividing choice votes by total number of votes
    # and then return the confidence
    conf = choice_votes / len(votes)
    return conf

We will use all the models in the voting classifier.

In [None]:
# We will now use the voting classifier and get the performance
# as combination of algorithms
voted_model = VotingClassifier(base_model,
                               multinomial_nb_model,
                               bernoulli_nb_model,
                               sgd_model,
                               linear_svc_model)

In [None]:
# Printing out the voted model accuracy
print("Voted Model Accuracy Score: {}%".format(100 * classify.accuracy(voted_model, test_data)))

Voted Model Accuracy Score: 72.05528846153845%
