In [170]:
## movie review dataset

In [171]:
from nltk.corpus import movie_reviews
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import pickle

In [172]:
# list of categories 
print(movie_reviews.categories())

['neg', 'pos']


In [173]:
for category in movie_reviews.categories():
    print(movie_reviews.fileids(category)[:3]) # sample of some of the file names in the various doocuments(['neg', 'pos'])
    print("number of the documents in " + str(category) + " : "  + str(len(movie_reviews.fileids(category))))

['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt']
number of the documents in neg : 1000
['pos/cv000_29590.txt', 'pos/cv001_18431.txt', 'pos/cv002_15918.txt']
number of the documents in pos : 1000


In [174]:
# list of count of the words, 
words = []
for word in movie_reviews.words():
    #if word not in set(stopwords.words('english')): # with the removal of stop words
    words.append(word.lower())
print(len(words))

1583820


In [175]:
# Removing special charaters
words = list(re.sub('[^a-zA-Z]','', word) for word in words )
print(len(words))
clean_word = []
for word in words:
    if word != '':
        clean_word.append(word)
print(len(clean_word))

1583820
1331109


In [176]:
# Removing stop words
stop_words = set(stopwords.words('english'))
filtered_word = [word for word in clean_word if word not in stop_words]

In [177]:
len(filtered_word)

702383

In [178]:
# now taking the frequency distribution of the filtered_words
freq_filtered_words = nltk.FreqDist(filtered_word)

In [179]:
len(freq_filtered_words)

38809

In [180]:
# getting top 3000 words out of 38809
top_3000_words = list(freq_filtered_words.keys())[:3000]

In [181]:
top_3000_words[:5]

['plot', 'two', 'teen', 'couples', 'go']

In [182]:
# now seeing the top 3000 words in every document to create a dataframe

In [183]:
features_list = []
def find_features(file_id):
    #print(file_id)
    words = list(set(movie_reviews.words(file_id)))
    #print(words)
    features = {}
    for word in top_3000_words:
        if word in words:
            features[word] = True
        else:
            features[word] = False
    features_list.append(features)


In [184]:
for categories in movie_reviews.categories():
    for file_id in movie_reviews.fileids(categories):
        #print(categories)
        find_features(file_id)

In [185]:
df = pd.DataFrame(features_list)

In [186]:
df.head()

Unnamed: 0,abc,aberdeen,able,abo,absent,absolutely,accent,accentuate,accident,accidentally,...,young,youngsters,youth,yuppie,zero,zombified,zone,zoologist,zwigoff,zzzzzzz
0,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [187]:
len(df)

2000

In [188]:
# converting the data frame to the format in which nltk.train would work : https://stackoverflow.com/questions/29337714/how-to-run-naive-bayes-from-nltk-with-python-pandas
# featuresets should be of the form [(featureset, label)] , where the featureset variable is a dict
train = df.to_dict(orient='records')

In [189]:
train_list = []
for dict in train[:1000]:
    train_list.append((dict,'neg'))

In [190]:
for dict in train[1000:]:
    train_list.append((dict,'pos'))

In [191]:
# train data
training_set = train_list[:1800]

# test data
testing_set = train_list[1800:]

In [192]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [193]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 81.5


In [194]:
classifier.show_most_informative_features(15)

Most Informative Features
                  annual = True              pos : neg    =     11.2 : 1.0
                 idiotic = True              neg : pos    =     10.7 : 1.0
                 frances = True              pos : neg    =      9.6 : 1.0
                   sucks = True              neg : pos    =      8.5 : 1.0
                bothered = True              neg : pos    =      7.7 : 1.0
                     ugh = True              neg : pos    =      7.7 : 1.0
                 cunning = True              pos : neg    =      7.1 : 1.0
                  turkey = True              neg : pos    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      6.7 : 1.0
                    lame = True              neg : pos    =      6.4 : 1.0
                  regard = True              pos : neg    =      6.2 : 1.0
                 singers = True              pos : neg    =      6.2 : 1.0
                 unravel = True              pos : neg    =      6.2 : 1.0

In [195]:
# creating a pickle file to save the classfier
save_classifier = open('mov_rev_naivebayes.pickle','wb')
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [196]:
# loading the pickle file
classifier_f = open('mov_rev_naivebayes.pickle','rb')
classifier = pickle.load(classifier_f)
classifier_f.close()

In [197]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from nltk.classify.scikitlearn import SklearnClassifier

In [198]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [199]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 81.5
Most Informative Features
                  annual = True              pos : neg    =     11.2 : 1.0
                 idiotic = True              neg : pos    =     10.7 : 1.0
                 frances = True              pos : neg    =      9.6 : 1.0
                   sucks = True              neg : pos    =      8.5 : 1.0
                bothered = True              neg : pos    =      7.7 : 1.0
                     ugh = True              neg : pos    =      7.7 : 1.0
                 cunning = True              pos : neg    =      7.1 : 1.0
                  turkey = True              neg : pos    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      6.7 : 1.0
                    lame = True              neg : pos    =      6.4 : 1.0
                  regard = True              pos : neg    =      6.2 : 1.0
                 singers = True              pos : neg    =      6.2 : 1.0
                 unravel 



LogisticRegression_classifier accuracy percent: 77.5




SGDClassifier_classifier accuracy percent: 80.5




SVC_classifier accuracy percent: 16.5
LinearSVC_classifier accuracy percent: 74.5
NuSVC_classifier accuracy percent: 77.5


In [200]:
## using other method to train
df_data = df.copy()

In [201]:
df_data.head()

Unnamed: 0,abc,aberdeen,able,abo,absent,absolutely,accent,accentuate,accident,accidentally,...,young,youngsters,youth,yuppie,zero,zombified,zone,zoologist,zwigoff,zzzzzzz
0,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [202]:
tag = []
for i in range(0,2000):
    if i < 1000:
        tag.append('neg')
    else:
        tag.append('pos')

In [203]:
df_data['tag'] = tag

In [225]:
from sklearn.model_selection import train_test_split
training, testing = train_test_split(df_data, test_size = .25, stratify = df_data['tag'], random_state = 20)

In [231]:
training = df_data[:1900]
testing = df_data[1900:]

In [232]:
training = training.copy()
y_train = training['tag']
training.drop(columns = ['tag'], inplace = True)
X_train = training

In [233]:
testing = testing.copy()
y_test = testing['tag']
testing.drop(columns = ['tag'], inplace = True)
X_test = testing

In [234]:
MNB_classifier = MultinomialNB()
MNB_classifier.fit(X_train, y_train)
y_pred_mnb = MNB_classifier.predict(X_test)
print("MNB_classifier accuracy percent:", accuracy_score(y_test, y_pred_mnb) * 100)

BNB_classifier = BernoulliNB()
BNB_classifier.fit(X_train, y_train)
y_pred_bnb = BNB_classifier.predict(X_test)
print("BernoulliNB accuracy percent:", accuracy_score(y_test, y_pred_bnb) * 100)

LogisticRegression_classifier = LogisticRegression()
LogisticRegression_classifier.fit(X_train, y_train)
y_pred_lr = LogisticRegression_classifier.predict(X_test)
print("Logistic Regression accuracy percent:", accuracy_score(y_test, y_pred_lr) * 100)

SGDClassifier_classifier = SGDClassifier()
SGDClassifier_classifier.fit(X_train, y_train)
y_pred_sgd = SGDClassifier_classifier.predict(X_test)
print("Sto. grad descent accuracy percent:", accuracy_score(y_test, y_pred_sgd) * 100)

MNB_classifier accuracy percent: 80.0
BernoulliNB accuracy percent: 80.0




Logistic Regression accuracy percent: 83.0
Sto. grad descent accuracy percent: 81.0
