### Bayes Network inference for Sentiment Analysis

This method uses basic bayes network to infer the sentiment of a phrases using a bootstrapped dataset.

In [1]:
%matplotlib inline
import pandas as pd
import glob

# Loading unlabelled data
paths = glob.glob("./data/realdonaldtrump/*.json")
frames = []

for path in paths:
    partial_df = pd.read_json(path)
    partial_df['created_at'] = pd.to_datetime(partial_df['created_at'])
    partial_df.index = partial_df['created_at']
    del partial_df['created_at']
    frames.append(partial_df)

df = pd.concat(frames)

df.head()

Unnamed: 0_level_0,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-12-31 23:59:55,21,550441250965708800,,False,8,Twitter for Android,"""@ronmeier123: @Macys Your APPAREL is UNPARALL..."
2014-12-31 23:59:22,18,550441111513493504,,False,5,Twitter for Android,"""@gillule4: @realDonaldTrump incredible experi..."
2014-12-31 23:57:56,44,550440752254562304,,False,33,Twitter for Android,"""@JobSnarechs: Negotiation tip #1: The worst t..."
2014-12-31 23:57:25,26,550440620792492032,,False,8,Twitter for Android,"""@joelmch2os: @realDonaldTrump announce your p..."
2014-12-31 23:57:02,31,550440523094577152,,False,9,Twitter for Android,"""@djspookyshadow: Feeling a deep gratitude for..."


In [2]:
# Loading unlabelled data
paths = glob.glob("./annotated/*.csv")
a_frames = []

for path in paths:
    partial_df = pd.read_csv(path, encoding = "ISO-8859-1") # fix weird encoding thing
    partial_df['created_at'] = pd.to_datetime(partial_df['created_at'])
    partial_df.index = partial_df['created_at']
    del partial_df['created_at']
    a_frames.append(partial_df)

annotated = pd.concat(a_frames)

annotated.head()

Unnamed: 0_level_0,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text,Sentiment
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014-04-20 07:07:00,15,4.57778e+17,,False,6,Twitter for Android,"""@gileshenley: @lukemckinney I read your @real...",P
2016-09-15 14:34:00,12773,7.76429e+17,,False,3643,Twitter for iPhone,Thank you for having me! I enjoyed the tour an...,P
2012-10-18 12:37:00,93,2.5891e+17,,False,391,Twitter Web Client,Obama's spending and borrowing is burying Amer...,N
2012-05-08 19:37:00,6,1.99946e+17,,False,12,Twitter Web Client,VOTE 4 @mariamenounos &amp; derekhough#01 toni...,P
2014-06-29 06:26:00,40,4.83134e+17,,False,17,Twitter for Android,"""@onlinebizxpress: We LOVE your business updat...",Z


In [3]:
annotated_n = len(annotated)
not_annotated_n = (len(df) - len(annotated))

print("%i annotated rows" % annotated_n)
print("%i non-annotated rows" % not_annotated_n)

perc = annotated_n/not_annotated_n * 100
print("Annotated: %f%s" % (perc, "%"))

1000 annotated rows
31592 non-annotated rows
Annotated: 3.165358%


### Bootstrapping this shit

In [51]:
from nltk.tokenize import sent_tokenize
import numpy as np

def format_sentence(sent):
    return({word: True for word in nltk.word_tokenize(sent)})

pos_tags = annotated.loc[annotated['Sentiment'] == 'P']['text']
neg_tags = annotated.loc[annotated['Sentiment'] == 'N']['text']
neutral_tags = annotated.loc[annotated['Sentiment'] == 'Z']['text']

formatted_pos_sents = [(format_sentence(row),'P') for row in pos_tags]
formatted_neg_sents = [(format_sentence(row),'N') for row in neg_tags]
formatted_neu_sents = [(format_sentence(row),'Z') for row in neutral_tags]

formatted_docs = formatted_pos_sents + formatted_neg_sents + formatted_neu_sents
np.random.shuffle(formatted_docs)


# Test set is first 100 rows (10%)
test_set = formatted_docs[:100]
# Train set is the other 900 rows (90%)
train_set = formatted_docs[100:]

In [52]:
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

In [53]:
classifier.show_most_informative_features()

Most Informative Features
                   Obama = True                N : P      =     20.8 : 1.0
                   Thank = True                P : N      =     20.5 : 1.0
                 Hillary = True                N : P      =     14.6 : 1.0
                   after = True                N : P      =     11.6 : 1.0
                 success = True                Z : N      =      9.8 : 1.0
                   media = True                N : P      =      9.6 : 1.0
                     her = True                N : P      =      9.6 : 1.0
                   China = True                N : P      =      9.4 : 1.0
             BarackObama = True                N : Z      =      8.3 : 1.0
                    life = True                Z : P      =      7.8 : 1.0


In [54]:
from nltk.classify.util import accuracy
print(accuracy(classifier, test_set))

0.66
