In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize

In [2]:
short_pos = open("short_reviews/positive.txt",'r').read()
short_neg = open("short_reviews/negative.txt",'r').read()

In [3]:
allowed_word_types = ["J"] #adjective

In [4]:
documents = []
all_words = []
for r in short_pos.split('\n'):
    documents.append((r,"pos"))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
            
for r in short_neg.split('\n'):
    documents.append((r,"neg"))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

In [5]:
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(10))

[('good', 369), ('more', 331), ('little', 265), ('funny', 245), ('much', 234), ('bad', 234), ('best', 208), ('new', 206), ('own', 185), ('many', 183)]


In [6]:
word_features = list(all_words.keys())[:5000]

In [7]:
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [8]:
featuresets = [(find_features(rev), category) 
               for (rev, category) in documents]

In [9]:
random.shuffle(featuresets)

In [10]:
training_set = featuresets[:10000]
testing_set = featuresets[10000:]

In [11]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print((nltk.classify.accuracy(classifier, testing_set))*100)

74.69879518072288


In [12]:
classifier.show_most_informative_features(15)

Most Informative Features
              engrossing = True              pos : neg    =     21.0 : 1.0
                 generic = True              neg : pos    =     17.0 : 1.0
                    loud = True              neg : pos    =     16.3 : 1.0
                mediocre = True              neg : pos    =     16.3 : 1.0
                 routine = True              neg : pos    =     16.3 : 1.0
                    flat = True              neg : pos    =     15.0 : 1.0
                  boring = True              neg : pos    =     14.4 : 1.0
              refreshing = True              pos : neg    =     13.7 : 1.0
               inventive = True              pos : neg    =     12.4 : 1.0
               wonderful = True              pos : neg    =     12.2 : 1.0
                    dull = True              neg : pos    =     11.9 : 1.0
                    warm = True              pos : neg    =     11.8 : 1.0
             mesmerizing = True              pos : neg    =     11.7 : 1.0

In [13]:
#import pickle

#save = open("naivebayes.pkl", "wb")
#pickle.dump(classifier,save)
#save.close()

In [14]:
#class_file = open("naivebayes.pkl", "rb")
#classifier = pickle.load(class_file)
#save.close()

In [15]:
print((nltk.classify.accuracy(classifier, testing_set))*100)

74.69879518072288


In [16]:
from nltk.classify.scikitlearn import SklearnClassifier

In [17]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [18]:
mnbclassifier = SklearnClassifier(MultinomialNB()).train(training_set)
print((nltk.classify.accuracy(mnbclassifier, testing_set))*100)

73.19277108433735


In [19]:
#gaussclassifier = SklearnClassifier(GaussianNB())
#gaussclassifier.train(training_set)
#print((nltk.classify.accuracy(gaussclassifier, testing_set))*100)

In [20]:
bernclassifier = SklearnClassifier(BernoulliNB())
bernclassifier.train(training_set)
print((nltk.classify.accuracy(bernclassifier, testing_set))*100)

75.0


In [21]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [22]:
logisticclassifier = SklearnClassifier(LogisticRegression()).train(training_set)
print((nltk.classify.accuracy(logisticclassifier, testing_set))*100)

71.3855421686747


In [23]:
sgdclassifier = SklearnClassifier(SGDClassifier()).train(training_set)
print((nltk.classify.accuracy(sgdclassifier, testing_set))*100)

71.53614457831326


In [24]:
svcclassifier = SklearnClassifier(SVC()).train(training_set)
print((nltk.classify.accuracy(svcclassifier, testing_set))*100)

71.6867469879518


In [25]:
linearsvcclassifier = SklearnClassifier(LinearSVC()).train(training_set)
print((nltk.classify.accuracy(linearsvcclassifier, testing_set))*100)

70.03012048192771


In [26]:
nusvcclassifier = SklearnClassifier(NuSVC()).train(training_set)
print((nltk.classify.accuracy(nusvcclassifier, testing_set))*100)

72.89156626506023


In [27]:
from nltk.classify import ClassifierI
from statistics import mode

In [28]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes/ len(votes)
        return conf    

In [29]:
voted_classifier = VoteClassifier(classifier,nusvcclassifier,linearsvcclassifier,svcclassifier,
                                 sgdclassifier,logisticclassifier,bernclassifier,mnbclassifier )

In [30]:
print((nltk.classify.accuracy(voted_classifier, testing_set))*100)

72.89156626506023


In [31]:
def sentiment(text):
    feats = find_features(text)
    return (voted_classifier.classify(feats), voted_classifier.confidence(feats))

In [32]:
sentiment("This movie is great and plot was wonderful")

'pos'

In [33]:
sentiment("This movie is junk. There is no plot")

'neg'

In [None]:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json


#consumer key, consumer secret, access token, access secret.
ckey="asdfsafsafsaf"
csecret="asdfasdfsadfsa"
atoken="asdfsadfsafsaf-asdfsaf"
asecret="asdfsadfsadfsadfsadfsad"

from twitterapistuff import *

class listener(StreamListener):

    def on_data(self, data):
        all_data = json.loads(data)
        tweet = all_data["text"]
        sentiment_value, confidence = sentiment(tweet)
        print(tweet, sentiment_value, confidence)

        if confidence*100 >= 80:
            output = open("twitter-out.txt","a")
            output.write(sentiment_value)
            output.write('\n')
            output.close()

        return True

    def on_error(self, status):
        print(status)

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

twitterStream = Stream(auth, listener())
twitterStream.filter(track=["happy"])

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style

style.use('ggplot')

fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)

def animate(i):
    graph_data = open('twitter-out.txt','r').read()
    lines = graph_data.split('\n')
    xs = []
    ys = []
    x=0
    y=0
    for line in lines:
        x += 1
        if "pos" in line:
            y += 1
        elif "pos" in line:
            y -= 0.3
        xs.append(x)
        ys.append(y)
    ax1.clear()
    ax1.plot(xs, ys)
    
ani = animation.FuncAnimation(fig, animate, interval=1000)
plt.show()