In [1]:
# -*- coding: utf-8 -*-

import json
import os
import nltk
import random
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [3]:
x = []
y = []

file_list = os.listdir('data')
for file in file_list:
    with open('data/' + file, 'r') as f:
        transcripts = json.load(f)
        x.extend(transcripts['text'].values())
        y.extend(transcripts['sentiment'].values())

In [None]:
#clean all the transcripts (stop words and stemming)
stop_words = set(stopwords.words('english'))
stop_words.add(',')
stop_words.add('.')
ps = PorterStemmer()
filtered_x = []

for t in x:
     t = t.lower()
     t_tokens = word_tokenize(t)
     stopworded_t = [w for w in t_tokens if not w in stop_words]
     stemmed_t = []
     for word in stopworded_t:
        stemmed_t.append(ps.stem(word))
     filtered_x.append(stemmed_t)

In [3]:
# collect fetures
all_words = []

for row in filtered_x:
    for w in row:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)


word_features = list(all_words.keys())[:1500]

In [4]:
# create featuresets
def find_features(t):
    features = {}
    for w in word_features:
        features[w] = (w in t)

    return features

featuresets = []
for i in range(len(y)):
    featuresets.append((find_features(filtered_x[i]), y[i]))



random.shuffle(featuresets)


In [5]:
#partitioning(80/20, 622 reviews in total)
training_set = featuresets[:497]
testing_set =  featuresets[498:]

In [19]:
testing_set[1]

({'amazon.com': False,
  'inc.': False,
  '(': False,
  'nasdaq': False,
  ':': False,
  'amzn': False,
  ')': False,
  'q3': False,
  '2018': False,
  'earn': False,
  'call': False,
  'octob': False,
  '25': False,
  '5:30': False,
  'pm': False,
  'et': False,
  'good': False,
  'day': False,
  'everyon': False,
  'welcom': False,
  'financi': False,
  'result': False,
  'teleconfer': False,
  'time': False,
  'particip': False,
  'listen-onli': False,
  'mode': False,
  'present': False,
  'conduct': False,
  'question-and-answ': False,
  'session': False,
  'today': False,
  "'s": False,
  'record': False,
  'open': True,
  'remark': False,
  'turn': False,
  'director': False,
  'investor': False,
  'relat': False,
  'dave': False,
  'fild': False,
  'pleas': False,
  'go': False,
  'ahead': False,
  'hello': False,
  'confer': False,
  'join': False,
  'us': False,
  'answer': False,
  'question': True,
  'brian': False,
  'olsavski': False,
  'cfo': False,
  'listen': False,
  

In [6]:
# Model and test
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

Original Naive Bayes Algo accuracy percent: 64.51612903225806
Most Informative Features
               primarili = True           negati : positi =     16.9 : 1.0
                interact = True           negati : neutra =     15.8 : 1.0
                   lower = True           negati : positi =     13.8 : 1.0
                challeng = True           negati : positi =     13.8 : 1.0
                    less = True           negati : neutra =     10.9 : 1.0
                     due = True           negati : neutra =     10.9 : 1.0
                  though = True           negati : positi =     10.8 : 1.0
                  either = True           negati : positi =     10.1 : 1.0
                   creat = True           positi : neutra =      9.2 : 1.0
                 anticip = True           negati : neutra =      8.5 : 1.0
                 technic = True           negati : neutra =      8.5 : 1.0
                 program = True           negati : neutra =      8.5 : 1.0
            

In [20]:
testing_set_featuresets = [f[0] for f in testing_set]
testing_original_labels = [f[1] for f in testing_set]

In [21]:
testing_set_featuresets[0]

{'amazon.com': False,
 'inc.': False,
 '(': False,
 'nasdaq': False,
 ':': False,
 'amzn': False,
 ')': False,
 'q3': False,
 '2018': False,
 'earn': False,
 'call': False,
 'octob': False,
 '25': False,
 '5:30': False,
 'pm': False,
 'et': False,
 'good': True,
 'day': False,
 'everyon': False,
 'welcom': False,
 'financi': False,
 'result': False,
 'teleconfer': False,
 'time': False,
 'particip': False,
 'listen-onli': False,
 'mode': False,
 'present': False,
 'conduct': False,
 'question-and-answ': False,
 'session': False,
 'today': False,
 "'s": False,
 'record': False,
 'open': False,
 'remark': False,
 'turn': False,
 'director': False,
 'investor': False,
 'relat': False,
 'dave': False,
 'fild': False,
 'pleas': False,
 'go': False,
 'ahead': False,
 'hello': False,
 'confer': False,
 'join': False,
 'us': False,
 'answer': False,
 'question': False,
 'brian': False,
 'olsavski': False,
 'cfo': False,
 'listen': False,
 'encourag': False,
 'press': False,
 'releas': False,
 

In [22]:
testing_original_labels[0]

'neutral'

In [24]:
testing_predicted_labels = classifier.classify_many(testing_set_featuresets)

In [26]:
from nltk.metrics import ConfusionMatrix
print(ConfusionMatrix(testing_original_labels, testing_predicted_labels))

         |  n     p |
         |  e  n  o |
         |  g  e  s |
         |  a  u  i |
         |  t  t  t |
         |  i  r  i |
         |  v  a  v |
         |  e  l  e |
---------+----------+
negative | <1> 5  3 |
 neutral |  2<30>15 |
positive |  8 11<49>|
---------+----------+
(row = reference; col = test)

