# Text Classification
NLTK Movie Reviews

In [1]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


### Printing Movie Reviews

In [2]:
print(type(movie_reviews), movie_reviews)

<class 'nltk.corpus.util.LazyCorpusLoader'> <CategorizedPlaintextCorpusReader in '.../corpora/movie_reviews' (not loaded yet)>


In [3]:
print(movie_reviews.readme())

Sentiment Polarity Dataset Version 2.0
Bo Pang and Lillian Lee

http://www.cs.cornell.edu/people/pabo/movie-review-data/

Distributed with NLTK with permission from the authors.


Introduction

This README v2.0 (June, 2004) for the v2.0 polarity dataset comes from
the URL http://www.cs.cornell.edu/people/pabo/movie-review-data .


What's New -- June, 2004

This dataset represents an enhancement of the review corpus v1.0
described in README v1.1: it contains more reviews, and labels were
created with an improved rating-extraction system.


Citation Info 

This data was first used in Bo Pang and Lillian Lee,
``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
Based on Minimum Cuts'',  Proceedings of the ACL, 2004.

@InProceedings{Pang+Lee:04a,
  author =       {Bo Pang and Lillian Lee},
  title =        {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle =    "Proceedings of the ACL",
  year =      

### Total number of words in 'movie_reviews'

In [4]:
print(len(movie_reviews.words()))

1583820


### Frequency of words in 'movie_reviews'

In [5]:
all_words=nltk.FreqDist(movie_reviews.words())
all_words.most_common(20)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595),
 (')', 11781),
 ('(', 11664),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961)]

### Frequency of boring in 'movie_reviews'

In [6]:
print(f"Frequency of 'boring' :: {all_words['boring']}")

Frequency of 'boring' :: 270


###  Converting words to Features

In [7]:
len(movie_reviews.fileids('neg'))

1000

In [8]:
movie_reviews.words('neg/cv000_29416.txt')

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

### Creating a feature vector


In [9]:
common_words = all_words.most_common(3000)

feature_vector = []
for word,freq in common_words:
    feature_vector.append(word) 

### Function to tell whether a word is there in our positive or negative review documents, marking them as true or false

In [10]:
def find_features(document):
    words = set(document)
    features = {}
    for i in feature_vector:
        features[i] = (i in words)
    
    return features

In [11]:
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))



### Using Naive Bayes Classifier to find the accuracy of text classification

In [12]:
documents = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
feature_sets = [(find_features(words), category) for (words, category) in documents]
len(feature_sets)

2000

In [13]:
train = feature_sets[:1800]
test = feature_sets[1800:]

In [14]:
bayes = nltk.NaiveBayesClassifier.train(train)

In [15]:
print(f"Naive Bayes Classifier accuracy :: {nltk.classify.util.accuracy(bayes, test)}")

Naive Bayes Classifier accuracy :: 0.78


### Let's display some valuable words in this classifier!

In [16]:
bayes.show_most_informative_features(15)

Most Informative Features
                   mulan = True              pos : neg    =     11.2 : 1.0
             outstanding = True              pos : neg    =     11.0 : 1.0
                  seagal = True              neg : pos    =     10.9 : 1.0
                 idiotic = True              neg : pos    =     10.7 : 1.0
                   inept = True              neg : pos    =      7.4 : 1.0
                  finest = True              pos : neg    =      7.2 : 1.0
                   damon = True              pos : neg    =      7.1 : 1.0
                   flynt = True              pos : neg    =      7.1 : 1.0
             wonderfully = True              pos : neg    =      7.0 : 1.0
                    lame = True              neg : pos    =      6.4 : 1.0
              schumacher = True              neg : pos    =      5.9 : 1.0
                  alicia = True              neg : pos    =      5.6 : 1.0
                  prinze = True              neg : pos    =      5.6 : 1.0

## Let's implement some more Naive Bayes Classifiers  

In [17]:
import pandas as pd
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier

In [18]:
classifiers={}

In [19]:
MNB_clf = SklearnClassifier(MultinomialNB())
MNB_clf.train(train)
print("Multinomial Naive Bayes Classifier")
print(f"Accuracy : {nltk.classify.accuracy(MNB_clf, test)}")
classifiers["Multinomial Naive Bayes Classifier"]=nltk.classify.accuracy(MNB_clf, test)

Multinomial Naive Bayes Classifier
Accuracy : 0.8


In [20]:
BNB_clf = SklearnClassifier(BernoulliNB())
BNB_clf.train(train)
print("Bernoulli Naive Bayes Classifier")
print(f"Accuracy : {nltk.classify.accuracy(BNB_clf, test)}")
classifiers["Bernoulli Naive Bayes Classifier"]=nltk.classify.accuracy(BNB_clf, test)

Bernoulli Naive Bayes Classifier
Accuracy : 0.775


In [21]:
SVC_clf = SklearnClassifier(SVC())
SVC_clf.train(train)
print("Support Vector Classifier")
print(f"Accuracy : {nltk.classify.accuracy(SVC_clf, test)}")
classifiers["Support Vector Classifier"]=nltk.classify.accuracy(SVC_clf, test)

Support Vector Classifier
Accuracy : 0.805


In [22]:
ABClf = SklearnClassifier(AdaBoostClassifier())
ABClf.train(train)
print("Ada Boost Classifier")
print(f"Accuracy : {nltk.classify.accuracy(ABClf, test)}")
classifiers["Ada Boost Classifier"]=nltk.classify.accuracy(ABClf, test)

Ada Boost Classifier
Accuracy : 0.745


In [23]:
SGDClf = SklearnClassifier(SGDClassifier())
SGDClf.train(train)
print("Stochastic Gradient Descent Classifier")
print(f"Accuracy : {nltk.classify.accuracy(SGDClf, test)}")
classifiers["Stochastic Gradient Descent Classifier"]=nltk.classify.accuracy(SGDClf, test)

Stochastic Gradient Descent Classifier
Accuracy : 0.8


### Summary of all classifiers

In [24]:
log_cols=["Classifier", "Accuracy"]
pd.DataFrame(classifiers.items(),columns=log_cols)

Unnamed: 0,Classifier,Accuracy
0,Multinomial Naive Bayes Classifier,0.8
1,Bernoulli Naive Bayes Classifier,0.775
2,Support Vector Classifier,0.805
3,Ada Boost Classifier,0.745
4,Stochastic Gradient Descent Classifier,0.8
