In [2]:
import json
import os
import nltk
import random
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import pickle

In [31]:
filename = 'imdb_master.csv'
df = pd.read_csv(filename,encoding='latin1')

In [32]:
df_train = df[(df['type'] == 'train') & (df['label'] != 'unsup')]
df_test = df[(df['type'] == 'test') & (df['label'] != 'unsup')]

In [15]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
25000,25000,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
25001,25001,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
25002,25002,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
25003,25003,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
25004,25004,train,When I was little my parents took me along to ...,neg,10003_1.txt


In [16]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [33]:
def get_reviews_and_labels(df):
    reviews = df['review'].tolist()
    labels = df['label'].tolist()
    return reviews,labels

In [34]:
x_train, y_train = get_reviews_and_labels(df_train)
x_test, y_test = get_reviews_and_labels(df_test)

In [35]:
#clean all the transcripts (stop words and stemming)
def clean(x):
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    filtered_x = []

    for t in x:
         t = t.lower()
         t_tokens = word_tokenize(t)
         stopworded_t = [w for w in t_tokens if not w in stop_words]
         stemmed_t = []
         for word in stopworded_t:
            stemmed_t.append(ps.stem(word))
         filtered_x.append(stemmed_t)
    return filtered_x

In [36]:
cleaned_x_train = clean(x_train)
cleaned_x_test =  clean(x_test)

In [37]:
filtered_x = cleaned_x_train + cleaned_x_test

# collect fetures
all_words = []

for row in filtered_x:
    for w in row:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)


word_features = list(all_words.keys())[:5000]

In [28]:
# create featuresets
def find_features(t):
    features = {}
    for w in word_features:
        features[w] = (w in t)

    return features

In [28]:
featuresets_train = []
featuresets_test = []

for i in range(len(cleaned_x_train)):
    featuresets_train.append((find_features(cleaned_x_train[i]), y_train[i]))

for i in range(len(cleaned_x_test)):
    featuresets_test.append((find_features(cleaned_x_test[i]), y_test[i]))



In [29]:
# Model and test
classifier = nltk.NaiveBayesClassifier.train(featuresets_train)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, featuresets_test))*100)
classifier.show_most_informative_features(15)

Original Naive Bayes Algo accuracy percent: 84.416
Most Informative Features
                    2/10 = True              neg : pos    =     75.0 : 1.0
                    *1/2 = True              neg : pos    =     57.0 : 1.0
                    3/10 = True              neg : pos    =     43.6 : 1.0
                    boll = True              neg : pos    =     39.7 : 1.0
                     uwe = True              neg : pos    =     37.7 : 1.0
                    1/10 = True              neg : pos    =     22.7 : 1.0
                   dreck = True              neg : pos    =     20.4 : 1.0
                intermin = True              neg : pos    =     19.7 : 1.0
                   wayan = True              neg : pos    =     19.7 : 1.0
               terrible. = True              neg : pos    =     19.0 : 1.0
                semblanc = True              neg : pos    =     16.2 : 1.0
                 unwatch = True              neg : pos    =     16.1 : 1.0
                 stinke

In [30]:
with open("NaiveBayesClassifier_imdb.pickle", "wb") as f:
    pickle.dump(classifier, f)

In [15]:
with open("NaiveBayesClassifier_imdb.pickle", "rb") as f:
    classifier = pickle.load(f)

In [16]:
transcripts_x = []
transcripts_y = []

file_list = os.listdir('data')
for file in file_list:
    with open('data/' + file, 'r') as f:
        transcripts = json.load(f)
        transcripts_x.extend(transcripts['text'].values())
        transcripts_y.extend(transcripts['sentiment'].values())

In [20]:
#build a pandas df and drop all neutral rows
d = {'review': transcripts_x, 'label': transcripts_y}
df_transcripts = pd.DataFrame(data=d)

#drop all neutral rows
df_transcripts = df_transcripts[df_transcripts['label'] != 'neutral']

In [23]:
#replace positive and negative to pos and neg
df_transcripts['label'] = df_transcripts['label'].str.replace('positive','pos')
df_transcripts['label'] = df_transcripts['label'].str.replace('negative','neg')

In [25]:
df_transcripts

Unnamed: 0,review,label
10,Great. Thank you for taking my question. I gue...,pos
11,"Thank you, Justin. Yeah, let me just remind yo...",pos
12,I'll just remind you that the units – those do...,pos
13,"In-stock is very strong, especially as we head...",pos
14,"And, Justin, this is Dave. Just to add on to t...",neg
18,But I also point out that we launched Turkey i...,pos
21,Sure. Let me start with Q3 and this will be co...,pos
23,"In our fulfillment center world, we had grown ...",pos
25,"If you look at capital leases, which is where ...",pos
28,"Yeah, thanks for your question. This growth ra...",pos


In [44]:
transcripts_x = df_transcripts['review'].tolist()
transcripts_y = df_transcripts['label'].tolist()

In [45]:
cleaned_transcripts_x = clean(transcripts_x)
featuresets_transcripts = []

for i in range(len(cleaned_transcripts_x)):
    featuresets_transcripts.append((find_features(cleaned_transcripts_x[i]), transcripts_y[i]))

In [46]:
transcripts_features = [f[0] for f in featuresets_transcripts]
transcripts_labels = [f[1] for f in featuresets_transcripts]

In [47]:
transcripts_features[0]

{'stori': False,
 'man': False,
 'unnatur': False,
 'feel': False,
 'pig': False,
 '.': True,
 'start': False,
 'open': False,
 'scene': False,
 'terrif': False,
 'exampl': False,
 'absurd': False,
 'comedi': False,
 'formal': False,
 'orchestra': False,
 'audienc': False,
 'turn': False,
 'insan': False,
 ',': True,
 'violent': False,
 'mob': False,
 'crazi': False,
 'chant': False,
 "'s": True,
 'singer': False,
 'unfortun': False,
 'stay': False,
 'whole': False,
 'time': False,
 'gener': False,
 'narr': False,
 'eventu': False,
 'make': False,
 'put': False,
 'even': False,
 'era': False,
 'cryptic': False,
 'dialogu': False,
 'would': False,
 'shakespear': False,
 'seem': False,
 'easi': False,
 'third': False,
 'grader': False,
 'technic': False,
 'level': False,
 'better': False,
 'might': False,
 'think': False,
 'good': False,
 'cinematographi': False,
 'futur': False,
 'great': True,
 'vilmo': False,
 'zsigmond': False,
 'star': False,
 'salli': False,
 'kirkland': False,
 'f

In [48]:
transcripts_labels[0]

'pos'

In [49]:
predicted_labels = classifier.classify_many(transcripts_features)

In [52]:
print('acc: ' + str(nltk.classify.accuracy(classifier, featuresets_transcripts)*100))

acc: 80.73878627968337


In [50]:
from nltk.metrics import ConfusionMatrix
print(ConfusionMatrix(transcripts_labels, predicted_labels))

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |  <8> 54 |
pos |  19<298>|
----+---------+
(row = reference; col = test)

