**Importing the required modules**

In [None]:
# Importing the NLTK library and Random module
import nltk
import random

In [None]:
# Downloading only the NLTK resources this notebook uses (the tokenizer models
# behind word_tokenize and the tagger model behind nltk.pos_tag).
# nltk.download() runs non-interactively, so "Restart & Run All" no longer
# stops at the downloader prompt that nltk.download_shell() opened, and the
# very large 'all' collection is not fetched.
# Both the classic and the newer resource identifiers are requested so the cell
# works across NLTK releases; an identifier the installed index does not know
# is skipped without raising.
for resource in ("punkt", "punkt_tab",
                 "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
    nltk.download(resource, quiet=True)

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to /root/nltk_data...
       |   Unzipping grammars/book_gr


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [None]:
# Importing tokenizers
from nltk.tokenize import word_tokenize

**Uploading the text files to Google Colab**

In [None]:
# Importing the Colab file helper and uploading the review files from the
# local machine (the recorded output shows negative.txt and positive.txt
# being saved)
from google.colab import files
uploaded = files.upload()

Saving negative.txt to negative.txt
Saving positive.txt to positive.txt


**Opening and reading the required text files (short reviews)**

In [None]:
# Opening and reading the required text files, decoded as ISO-8859-1.
# The with-blocks close each file handle as soon as its content has been read;
# the bare open(...).read() form never closed them.
with open("/content/positive.txt", mode='r', encoding='ISO-8859-1') as positive_file:
    short_positives = positive_file.read()
with open("/content/negative.txt", mode='r', encoding='ISO-8859-1') as negative_file:
    short_negatives = negative_file.read()

**Storing the short reviews in documents and all word tokens lists**

In [None]:
# Containers filled while iterating over the reviews:
#   documents        -> (review text, label) pairs
#   all_words_tokens -> lower-cased tokens that pass the PoS filter
documents, all_words_tokens = [], []

# Only tokens whose PoS tag starts with "J" (adjectives) are kept
allowed_pos_tag = ["J"]

In [None]:
# Iterating through each review (one per line) in short_positives
for rev in short_positives.split('\n'):

  # Skipping blank lines: a trailing newline in the file would otherwise add
  # an empty review to the documents list with the 'pos' label
  if not rev.strip():
    continue

  # appending the review to the documents list with the 'pos' label
  documents.append((rev, "pos"))

  # creating word tokens of the review
  word_tokens = word_tokenize(rev)

  # pos tagging of the word tokens -> list of (word, tag) pairs
  pos = nltk.pos_tag(word_tokens)

  # Taking only the tokens whose tag starts with an allowed prefix and
  # appending them, lower-cased, to the all_words_tokens list
  for word, tag in pos:
    if tag[0] in allowed_pos_tag:
      all_words_tokens.append(word.lower())

In [None]:
# Iterating through each review (one per line) in short_negatives
for rev in short_negatives.split('\n'):

  # Skipping blank lines: a trailing newline in the file would otherwise add
  # an empty review to the documents list with the 'neg' label
  if not rev.strip():
    continue

  # appending the review to the documents list with the 'neg' label
  documents.append((rev, "neg"))

  # creating word tokens of the review
  word_tokens = word_tokenize(rev)

  # pos tagging of the word tokens -> list of (word, tag) pairs
  pos = nltk.pos_tag(word_tokens)

  # Taking only the tokens whose tag starts with an allowed prefix and
  # appending them, lower-cased, to the all_words_tokens list
  for word, tag in pos:
    if tag[0] in allowed_pos_tag:
      all_words_tokens.append(word.lower())

In [None]:
# Importing pickle module
import pickle

In [None]:
# Saving the documents list; the with-block closes the file even if
# pickle.dump raises
with open("documents.pickle", "wb") as save_docs:
    pickle.dump(documents, save_docs)

**Frequency Distribution of all the word tokens**

In [None]:
# Get the frequency distribution (word -> number of occurrences) of all the
# collected word tokens
all_words_freq = nltk.FreqDist(all_words_tokens)

In [None]:
# Print out the number of distinct words in the frequency distribution
print(len(all_words_freq))

6178


In [None]:
# Displaying the 10 most common words as (word, count) pairs, most frequent first
all_words_freq.most_common(10)

[('good', 369),
 ('more', 331),
 ('little', 265),
 ('funny', 245),
 ('much', 234),
 ('bad', 234),
 ('best', 208),
 ('new', 206),
 ('own', 185),
 ('many', 183)]

In [None]:
# Taking the 5000 most common words as (word, count) pairs; these become the
# vocabulary used for the word features
most_common_word_tokens = all_words_freq.most_common(5000)

In [None]:
# most_common() yields (word, count) pairs; keep only the word of each pair
# to obtain the list of word features
word_features = [word for word, _count in most_common_word_tokens]

# Print out the first 10 word features
print(word_features[:10])

['good', 'more', 'little', 'funny', 'much', 'bad', 'best', 'new', 'own', 'many']


In [None]:
# Number of word features (recorded output: 5000)
len(word_features)

5000

In [None]:
# Saving the word_features list; the with-block closes the file even if
# pickle.dump raises
with open("word_features5k.pickle", "wb") as save_word_features:
    pickle.dump(word_features, save_word_features)

**Creating a Feature Set**

In [None]:
# Creating a function to get the features (words) in a dictionary
def doc_features(doc):
    """Map every word in ``word_features`` to whether it occurs in ``doc``.

    Args:
        doc: Review text (a string); it is split with ``word_tokenize``.

    Returns:
        dict: ``{word: bool}`` with one entry per word of the module-level
        ``word_features`` list, ``True`` when the word is one of the
        document's tokens.
    """
    # Tokens are lower-cased because the entries of word_features were collected
    # in lower case, so a capitalised occurrence ("Good") still matches "good".
    # A set makes each of the per-feature membership tests constant-time
    # instead of a scan over the token list.
    doc_words = {token.lower() for token in word_tokenize(doc)}

    # One boolean per vocabulary word, in word_features order
    return {word: (word in doc_words) for word in word_features}

In [None]:
# Now, we are going to create a feature set: one (features dict, category) pair
# per entry of documents, where the dict comes from doc_features(review)
feature_sets = [(doc_features(review), category) for (review, category) in documents]

In [None]:
# Shuffling the feature_sets in place, so that slicing them by position into
# training and testing sets mixes both categories.
# A dedicated generator with a fixed seed makes the order (and therefore the
# split) repeatable between runs without touching the global random state.
random.Random(42).shuffle(feature_sets)

In [None]:
# Saving feature sets; the with-block closes the file even if
# pickle.dump raises
with open("featuresets.pickle", "wb") as featuresets_f:
    pickle.dump(feature_sets, featuresets_f)

In [None]:
# Number of (features, category) pairs in feature_sets
len(feature_sets)

10664

**Model Training**

Now, we will create training and testing sets.

In [None]:
# The first 8000 shuffled feature sets are used for training,
# everything after them for testing
train_size = 8000
train_data, test_data = feature_sets[:train_size], feature_sets[train_size:]

In [None]:
# Number of feature sets in the training set
len(train_data)

8000

In [None]:
# Number of feature sets in the testing set
len(test_data)

2664

In [None]:
# Fractions of feature_sets that went to the training and testing sets
(len(train_data)/len(feature_sets), len(test_data)/len(feature_sets))

(0.7501875468867217, 0.24981245311327832)

We will be using the **Naive Bayes Classifier** for our training.

In [None]:
# Importing the NaiveBayesClassifier from nltk
from nltk import NaiveBayesClassifier

# Training the base model: train() is called on the class and its result,
# fitted on the (features, category) pairs in train_data, is kept as base_model
base_model = NaiveBayesClassifier.train(train_data)

In [None]:
# Importing classify from nltk
from nltk import classify

# Accuracy of the base model on test_data; accuracy_score keeps the raw value
# and the printed figure is that value scaled to a percentage
accuracy_score = classify.accuracy(base_model, test_data)
base_accuracy_pct = 100 * accuracy_score
print("Accuracy Score of Base Model : {}%".format(base_accuracy_pct))

Accuracy Score of Base Model : 72.07207207207207%


In [None]:
# Show 15 most informative features.
# show_most_informative_features() prints the table itself and returns None,
# so wrapping the call in print() only added a stray "None" line.
base_model.show_most_informative_features(15)

Most Informative Features
               wonderful = True              pos : neg    =     17.1 : 1.0
              engrossing = True              pos : neg    =     16.5 : 1.0
               inventive = True              pos : neg    =     14.4 : 1.0
                 routine = True              neg : pos    =     12.2 : 1.0
                powerful = True              pos : neg    =     12.2 : 1.0
                    imax = True              pos : neg    =     11.8 : 1.0
                  sexual = True              pos : neg    =     11.8 : 1.0
             masterpiece = True              pos : neg    =     11.1 : 1.0
             mesmerizing = True              pos : neg    =     11.1 : 1.0
                    loud = True              neg : pos    =     10.9 : 1.0
                  boring = True              neg : pos    =     10.6 : 1.0
              refreshing = True              pos : neg    =     10.4 : 1.0
                    flat = True              neg : pos    =     10.1 : 1.0

In [None]:
# Saving base_model (Naive Bayes Classifier); the with-block closes the file
# even if pickle.dump raises
with open("base_model_naivebayes5k.pickle", "wb") as save_classifier:
    pickle.dump(base_model, save_classifier)

**Importing Other Classifiers** 

In [None]:
# Importing scikit-learn module from NLTK (a wrapper for sklearn)
from nltk.classify.scikitlearn import SklearnClassifier

In [None]:
# Lets use some other types of Naive Bayes classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# Lets import some more classifiers
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [None]:
# Multinomial Naive Bayes classifier: train() hands back the fitted wrapper,
# so construction and fitting are chained; the last line echoes the model
multinomial_nb_model = SklearnClassifier(MultinomialNB()).train(train_data)
multinomial_nb_model

<SklearnClassifier(MultinomialNB())>

In [None]:
# Bernoulli Naive Bayes classifier: train() hands back the fitted wrapper,
# so construction and fitting are chained; the last line echoes the model
bernoulli_nb_model = SklearnClassifier(BernoulliNB()).train(train_data)
bernoulli_nb_model

<SklearnClassifier(BernoulliNB())>

In [None]:
# Stochastic Gradient Descent classifier.
# random_state is fixed because SGD shuffles the training data while fitting;
# without a seed the fitted model and its accuracy change from run to run.
sgd_model = SklearnClassifier(SGDClassifier(random_state=42))
sgd_model.train(train_data)

<SklearnClassifier(SGDClassifier())>

In [None]:
# Linear Support Vector Classification classifier: train() hands back the
# fitted wrapper, so construction and fitting are chained; the last line
# echoes the model
linear_svc_model = SklearnClassifier(LinearSVC()).train(train_data)
linear_svc_model

<SklearnClassifier(LinearSVC())>

In [None]:
# Nu-Support Vector Classification classifier: train() hands back the fitted
# wrapper, so construction and fitting are chained; the last line echoes the
# model
nu_svc_model = SklearnClassifier(NuSVC()).train(train_data)
nu_svc_model

<SklearnClassifier(NuSVC())>

In [None]:
# Accuracy of every trained model on test_data, printed as percentages.
# The base-model figure reuses accuracy_score instead of being recomputed.
print("Accuracy Score of Base Model (Naive Bayes) : {}%".format(100 * accuracy_score))

# (report line template, fitted model) pairs, in the order they are reported
sklearn_reports = [
    ("MultinomialNB Model Accuracy Score: {}%", multinomial_nb_model),
    ("BernoulliNB Model Accuracy Score: {}%", bernoulli_nb_model),
    ("SGDClassifier Model Accuracy Score: {}%", sgd_model),
    ("LinearSVC Model Accuracy Score: {}%", linear_svc_model),
    ("NuSVC Model Accuracy Score: {}%", nu_svc_model),
]
for report_template, model in sklearn_reports:
    print(report_template.format(100 * classify.accuracy(model, test_data)))

Accuracy Score of Base Model (Naive Bayes) : 72.07207207207207%
MultinomialNB Model Accuracy Score: 71.69669669669669%
BernoulliNB Model Accuracy Score: 71.47147147147147%
SGDClassifier Model Accuracy Score: 70.08258258258259%
LinearSVC Model Accuracy Score: 68.88138138138137%
NuSVC Model Accuracy Score: 71.05855855855856%


In [None]:
# Saving Multinomial Naive Bayes classifier; the with-block closes the file
# even if pickle.dump raises
with open("multinomial_nb_model5k.pickle", "wb") as save_classifier:
    pickle.dump(multinomial_nb_model, save_classifier)

In [None]:
# Saving Bernoulli Naive Bayes classifier; the with-block closes the file
# even if pickle.dump raises
with open("bernoulli_nb_model5k.pickle", "wb") as save_classifier:
    pickle.dump(bernoulli_nb_model, save_classifier)

In [None]:
# Saving Stochastic Gradient Descent classifier; the with-block closes the
# file even if pickle.dump raises
with open("sgd_model5k.pickle", "wb") as save_classifier:
    pickle.dump(sgd_model, save_classifier)

In [None]:
# Saving Linear Support Vector Classification classifier; the with-block
# closes the file even if pickle.dump raises
with open("linear_svc_model5k.pickle", "wb") as save_classifier:
    pickle.dump(linear_svc_model, save_classifier)

In [None]:
# Saving Nu-Support Vector Classification classifier; the with-block closes
# the file even if pickle.dump raises
with open("nu_svc_model5k.pickle", "wb") as save_classifier:
    pickle.dump(nu_svc_model, save_classifier)

**After creating the sentiment module and saving it as sentiment_module.py, we will now import the module**

In [None]:
from google.colab import files

In [None]:
# Copy sentiment_module.py from Google Drive into /content so that it can be imported.
# NOTE(review): no cell in view mounts Drive at /content/drive - confirm it is mounted before this runs.
!cp /content/drive/MyDrive/Colab\ Notebooks/NLP\ Internship\ /Data/sentiment_module.py /content

In [None]:
# Importing the sentiment module
import sentiment_module as sm

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [None]:
# Calling give_sentiment from the sentiment module on a clearly positive review.
# The recorded result is a (label, score) tuple; the score is presumably a
# confidence value - TODO confirm in sentiment_module.py
sm.give_sentiment("This is an awesome movie. The direction was brilliant, the acting was perfect and I loved every bit of it.")

('pos', 1.0)

In [None]:
# Same call on a clearly negative review (recorded result: ('neg', 1.0))
sm.give_sentiment("This is a bad movie. I didn't like it at all. The acting was not that good and the videography was utter disappointment.")

('neg', 1.0)

In [None]:
# A longer, mostly critical multi-sentence review (recorded result: ('neg', 0.8))
sm.give_sentiment("I understand the language was meant for the broad audience to understand but the producers could have made a better attempt at having the cast take on a heavier Italian accent. That was strike one for me. The acting was not atrocious but the script was unnecessarily extended in many acts, strike two. Too many blips in the storyline. There was no smooth transition of the time period progressions of Ferrucio's life, strike three. I know this was a direct to video movie but there was a lot of potential, A LOT of potential for this and it was a complete failure. Do better. Hopefully a different director/producer/writer/studio can portray the life of Lamborghini appropriately. You already are beat with the new Ferrari movie that isn't even out yet.")

('neg', 0.8)

In [None]:
# A two-word positive input (recorded result: ('pos', 1.0))
sm.give_sentiment("very beautiful")

('pos', 1.0)

In [None]:
# NOTE(review): the recorded result for this positive-sounding input is
# ('neg', 0.6), which looks like a misclassification worth investigating
sm.give_sentiment("good film")

('neg', 0.6)

In [None]:
# A short superlative sentence (recorded result: ('pos', 1.0))
sm.give_sentiment("This was the best movie.")

('pos', 1.0)

In [None]:
# Printing the results for one positive and one negative review from the same
# cell (print is needed because only the last expression of a cell is echoed)
print(sm.give_sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
print(sm.give_sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))

('pos', 1.0)
('neg', 1.0)
