# Exploring Sentiment Analysis with ML

In [108]:
from __future__ import unicode_literals
import classifiers
import re
import nltk
from Politweet import get_tweets, get_transcript
import ratings
from sentiment import polarity_train, classify, prob_classify, plus_df, minus_df
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from nltk import sent_tokenize, word_tokenize, FreqDist, WordNetLemmatizer
from nltk.corpus import stopwords
import classifiers
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sentiment import plus_regex, minus_regex, plus_regex, minus_regex
from sklearn.feature_extraction.text import TfidfTransformer
pd.set_option('display.max_colwidth', 1200)

# Get the tweets
tweets = get_tweets("./datasets/tweets.tsv")

## Choosing data for training

For train and test, we only use the tweets that have been marked with the same rating by AMT.

In [109]:
def prepare_sentiment_data(tweets):
    neg = [
        (t, 'neg')
        for i,t in ratings.all(tweets, ratings.NEGATIVE).iterrows()]

    pos = [
        (t, 'pos')
        for i,t in ratings.all(tweets, ratings.POSITIVE).iterrows()]

    other = [
        (t, 'other')
        for i,t in ratings.all(tweets, ratings.OTHER).iterrows()]

    train, test = train_test_split(
        pos + neg + other, 
        test_size = .2, 
        random_state = 20)
    return train, test

#### Make sure data is tokenized

In [110]:
def featurize(tweet):
    tokens = [token['lemma'] for token in tweet['clean'] if token['lemma'] != '']
    return tokens

#### Running a pipeline
The strategy is to use the pipeline design pattern. The input is data and the out put is a trained classifier ready to predict

In [111]:
def run_pipeline(train, test, clsfr):
    # fit the classifier with training data
    train_x, train_y = zip(*train)
    test_x, test_y = zip(*test)
    clsfr.fit(train_x, train_y)
    # get accuracy on the test
    scr = clsfr.score(test_x, test_y)
    return scr

## TF-IDF + Polarity rules classifier (pipeline)

#### Rule Based features
This matches +1, -1.. in tweets and adds a new entry polarity(+) or polarity(-) if encountred. Engineering this feature is going to help us to get 100% accuracy on twits that have this pattern.

In [112]:
class RuleBasedSent(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self
    
    def featurize(self, document):
        features = {}
        # Positive/Negative polarity if contains a +/-
        features['polarity(+)'] = not not plus_regex.match(document["content"])
        features['polarity(-)'] = not not minus_regex.match(document["content"])
        return features

    def transform(self, docs):
        return [self.featurize(d) for d in docs]

In [113]:
pipeline_tfidf = Pipeline([
    ('features', FeatureUnion([
        ('ngram_tf_idf', Pipeline([
            ('counts', CountVectorizer(tokenizer = featurize, lowercase=False)),
            ('tf_idf', TfidfTransformer())
        ])),
        ('rule_based_system', Pipeline([
                ('match', RuleBasedSent()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),
    ])),
    ('classifier', LinearSVC())
])

## TF-IDF + (learned) MechTurks 

Since we already have the scores from AMT, we decided to learn on their labels and use as training set where they all agree

In [114]:
class MechTurks(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self
    
    def rating_to_score(self, rating):
        if (rating == ratings.POSITIVE):
            return 1
        elif (rating == ratings.NEGATIVE):
            return -1
        else:
            return 0
    
    def featurize(self, document):
        features = {}
        features['rating(1)'] = self.rating_to_score(document["rating.1"])
        features['rating(2)'] = self.rating_to_score(document["rating.2"])
        features['rating(3)'] = self.rating_to_score(document["rating.2"])
        features['rating(4)'] = self.rating_to_score(document["rating.2"])
        return features

    def transform(self, docs):
        return [self.featurize(d) for d in docs]

In [115]:
pipeline_amazon = Pipeline([
    ('features', FeatureUnion([
        ('ngram_tf_idf', Pipeline([
            ('counts', CountVectorizer(tokenizer = featurize, lowercase=False)),
            ('tf_idf', TfidfTransformer())
        ])),
        ('mechturks_pipe', Pipeline([
                ('mecturks', MechTurks()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),
        ('rule_based_system', Pipeline([
                ('match', RuleBasedSent()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ]))
    ])),
    ('classifier', LinearSVC())
])

## Results

In [116]:
train, test = prepare_sentiment_data(tweets)
score_tfidf = run_pipeline(train, test, pipeline_tfidf)
score_tfidf

0.81775700934579443

In [117]:
train, test = prepare_sentiment_data(tweets)
score_amazon = run_pipeline(train, test, pipeline_amazon)
score_amazon

0.99065420560747663

## Annotate tweets with sentiment

In [118]:
def df_sentiment(tweets):
    tweets["sent_amazon"] = pd.Series(pipeline_amazon.predict([t for i,t in tweets.iterrows()]), index=tweets.index)
    tweets["sent_tfidf"] = pd.Series(pipeline_tfidf.predict([t for i,t in tweets.iterrows()]), index=tweets.index)
    return tweets

df_sentiment(tweets)[["content", "sent_amazon", "sent_tfidf", "rating.1"]][:3]

Unnamed: 0_level_0,content,sent_amazon,sent_tfidf,rating.1
tweet.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
936469851,Watching by myself #tweetdebate Not drinking :( waiting to start cringing at McCain blunders,neg,neg,1
936470432,"@ahg3 @MichDot Yeah, slime was actually my second choice, can't say what the first one was. Okay, we're rolling...",neg,neg,1
936472030,Preparing to have a heart attack #tweetdebate,neg,neg,1


In [119]:
from Politweet import df_setminus

obama = tweets[tweets["content"].str.contains('obama', flags=re.IGNORECASE)]
mccain = tweets[tweets["content"].str.contains('mccain', flags=re.IGNORECASE)]
oba_and_mccain = tweets.reindex(obama.index & mccain.index)
oba_or_mccain = tweets.reindex(obama.index | mccain.index)
none = tweets[~(tweets["content"].str.contains('obama|mccain', flags=re.IGNORECASE))]
other = ratings.all(tweets, ratings.OTHER)

only_mccain = df_setminus(mccain, oba_and_mccain)
only_obama = df_setminus(obama, oba_and_mccain)
other_none = df_setminus(df_setminus(other, oba_or_mccain), oba_and_mccain)

oba = [
    (t, 'oba')
    for i,t in only_obama.iterrows()]
print "tagged obama", len(oba)

mcc = [
    (t, 'mcc')
    for i,t in only_mccain.iterrows()]
print "tagged mcc", len(mcc)

both = [
    (t, 'both')
    for i,t in oba_and_mccain.iterrows()]
print "tagged both", len(both)

other = [
    (t, 'none')
    for i,t in other_none.iterrows()]
print "tagged other", len(other)

train, test = train_test_split(
    oba + mcc + both + other, 
    test_size = .2, 
    random_state = 20)

other_none

tagged obama 605
tagged mcc 717
tagged both 475
tagged other 126


Unnamed: 0_level_0,pub.date.GMT,content,author.name,author.nickname,rating.1,rating.2,rating.3,rating.4,rating.5,rating.6,rating.7,rating.8,clean,tokens,sent_amazon,sent_tfidf
tweet.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
936468396,9/27/08 1:00,@current one minute to get your stuff together #current,mobile jones,,4,4,4,,,,,,"[{u'lemma': u'', u'token': u'', u'pos': u'IN'}, {u'lemma': u'current', u'token': u'current', u'pos': u'JJ'}, {u'lemma': u'one', u'token': u'one', u'pos': u'CD'}, {u'lemma': u'minute', u'token': u'minute', u'pos': u'NN'}, {u'lemma': u'get', u'token': u'get', u'pos': u'VB'}, {u'lemma': u'stuff', u'token': u'stuff', u'pos': u'NN'}, {u'lemma': u'together', u'token': u'together', u'pos': u'RB'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'current', u'token': u'current', u'pos': u'JJ'}]","[current, one, minute, get, stuff, together, current]",other,other
936468782,9/27/08 1:01,Anyone else actually kinda nervous? #tweetdebate,themurmish,A. Murmann,4,4,4,,,,,,"[{u'lemma': u'anyone', u'token': u'anyone', u'pos': u'NN'}, {u'lemma': u'else', u'token': u'else', u'pos': u'RB'}, {u'lemma': u'actually', u'token': u'actually', u'pos': u'RB'}, {u'lemma': u'kinda', u'token': u'kinda', u'pos': u'NN'}, {u'lemma': u'nervous', u'token': u'nervous', u'pos': u'JJ'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tweetdebate', u'token': u'tweetdebate', u'pos': u'NN'}]","[anyone, else, actually, kinda, nervous, tweetdebate]",other,other
936470593,9/27/08 1:02,"#tweetdebate ready, set, go!",themurmish,A. Murmann,4,4,4,,,,,,"[{u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tweetdebate', u'token': u'tweetdebate', u'pos': u'NN'}, {u'lemma': u'ready', u'token': u'ready', u'pos': u'RB'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'set', u'token': u'set', u'pos': u'VBN'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'go', u'token': u'go', u'pos': u'RB'}, {u'lemma': u'', u'token': u'', u'pos': u''}]","[tweetdebate, ready, set]",other,other
936471247,9/27/08 1:02,Any drinking game ideas for #tweetdebate?,danstuart,danstuart,4,4,4,,,,,,"[{u'lemma': u'drinking', u'token': u'drinking', u'pos': u'NN'}, {u'lemma': u'game', u'token': u'game', u'pos': u'NN'}, {u'lemma': u'idea', u'token': u'ideas', u'pos': u'NNS'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tweetdebate', u'token': u'tweetdebate', u'pos': u'JJ'}, {u'lemma': u'', u'token': u'', u'pos': u''}]","[drinking, game, idea, tweetdebate]",other,other
936473721,9/27/08 1:03,@mojosd refresh?,naum,Naum Trifanoff,4,4,4,,,,,,"[{u'lemma': u'', u'token': u'', u'pos': u'IN'}, {u'lemma': u'mojosd', u'token': u'mojosd', u'pos': u'VBN'}, {u'lemma': u'refresh', u'token': u'refresh', u'pos': u'JJ'}, {u'lemma': u'', u'token': u'', u'pos': u''}]","[mojosd, refresh]",other,other
936475560,9/27/08 1:05,#tweetdebate i never knew what the symbol of the Dems was till 12 yrs ago,themurmish,A. Murmann,4,4,4,,,,,,"[{u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tweetdebate', u'token': u'tweetdebate', u'pos': u'NN'}, {u'lemma': u'never', u'token': u'never', u'pos': u'RB'}, {u'lemma': u'know', u'token': u'knew', u'pos': u'VBD'}, {u'lemma': u'symbol', u'token': u'symbol', u'pos': u'NN'}, {u'lemma': u'dems', u'token': u'dems', u'pos': u'NNS'}, {u'lemma': u'till', u'token': u'till', u'pos': u'RB'}, {u'lemma': u'12', u'token': u'12', u'pos': u'CD'}, {u'lemma': u'yr', u'token': u'yrs', u'pos': u'NNS'}, {u'lemma': u'ago', u'token': u'ago', u'pos': u'RB'}]","[tweetdebate, never, know, symbol, dems, till, ago]",other,other
936475824,9/27/08 1:05,@starweaver what happened?,jani_s,jani_s,4,4,4,,,,,,"[{u'lemma': u'', u'token': u'', u'pos': u'IN'}, {u'lemma': u'starweaver', u'token': u'starweaver', u'pos': u'NN'}, {u'lemma': u'happen', u'token': u'happened', u'pos': u'VBN'}, {u'lemma': u'', u'token': u'', u'pos': u''}]","[starweaver, happen]",other,other
936480186,9/27/08 1:07,#tweetdebate,danstuart,danstuart,4,4,4,,,,,,"[{u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tweetdebate', u'token': u'tweetdebate', u'pos': u'NN'}]",[tweetdebate],other,other
936485053,9/27/08 1:10,Just started watching the debate. Got home a few mins late. Got to get #current and then #tweetdebate,damnneargenius,,4,4,4,,,,,,"[{u'lemma': u'start', u'token': u'started', u'pos': u'VBN'}, {u'lemma': u'watch', u'token': u'watching', u'pos': u'VBG'}, {u'lemma': u'debate', u'token': u'debate', u'pos': u'NN'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'get', u'token': u'got', u'pos': u'VBD'}, {u'lemma': u'home', u'token': u'home', u'pos': u'NN'}, {u'lemma': u'min', u'token': u'mins', u'pos': u'NNS'}, {u'lemma': u'late', u'token': u'late', u'pos': u'JJ'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'get', u'token': u'got', u'pos': u'VBD'}, {u'lemma': u'get', u'token': u'get', u'pos': u'VB'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'current', u'token': u'current', u'pos': u'JJ'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tweetdebate', u'token': u'tweetdebate', u'pos': u'NN'}]","[start, watch, debate, get, home, min, late, get, get, current, tweetdebate]",other,other
936487555,9/27/08 1:11,Web analyst @jowyang post on how to be an armchair political analyst http://tinyurl.com/3jdy67 #tweetdebate,Scott_Drummond,Scott_Drummond,4,4,4,,,,,,"[{u'lemma': u'web', u'token': u'web', u'pos': u'NN'}, {u'lemma': u'analyst', u'token': u'analyst', u'pos': u'NN'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'jowyang', u'token': u'jowyang', u'pos': u'NN'}, {u'lemma': u'post', u'token': u'post', u'pos': u'NN'}, {u'lemma': u'armchair', u'token': u'armchair', u'pos': u'NN'}, {u'lemma': u'political', u'token': u'political', u'pos': u'JJ'}, {u'lemma': u'analyst', u'token': u'analyst', u'pos': u'NN'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tinyurlcom3jdy67', u'token': u'tinyurlcom3jdy67', u'pos': u'NONE'}, {u'lemma': u'', u'token': u'', u'pos': u''}, {u'lemma': u'tweetdebate', u'token': u'tweetdebate', u'pos': u'NN'}]","[web, analyst, jowyang, post, armchair, political, analyst, tinyurlcom3jdy67, tweetdebate]",other,neg


In [120]:
obama_regex = re.compile(".*(obama|barack).*")
mccain_regex = re.compile(".*(mccain|mcpain|).*")

class RuleBasedCandidate(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def featurize(self, document):
        document_words = set(document["tokens"])

        features = {}
        features['candidate(obama)'] = not not obama_regex.match(document["content"])
        features['candidate(mccain)'] = not not mccain_regex.match(document["content"])

        return features

    def transform(self, docs):
        return [self.featurize(d) for d in docs]

pipeline_candidates = Pipeline([
    ('features', FeatureUnion([
        ('ngram_tf_id', Pipeline([
            ('count', CountVectorizer(tokenizer = featurize, lowercase=False)),
            ('tf_id', TfidfTransformer())
        ])),
        ('rule_based_syste', Pipeline([
                ('match', RuleBasedCandidate()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ]))
    ])),
    ('classifier', LinearSVC())
])


score_candidates = run_pipeline(train, test, pipeline_candidates)
score_candidates

0.91168831168831166

In [121]:
tweets["candidate"] = pd.Series(pipeline_candidates.predict([t for i,t in tweets.iterrows()]), index=tweets.index)
tweets[["content", "sent_tfidf", "candidate"]]

Unnamed: 0_level_0,content,sent_tfidf,candidate
tweet.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
936469851,Watching by myself #tweetdebate Not drinking :( waiting to start cringing at McCain blunders,neg,mcc
936470432,"@ahg3 @MichDot Yeah, slime was actually my second choice, can't say what the first one was. Okay, we're rolling...",neg,mcc
936472030,Preparing to have a heart attack #tweetdebate,neg,mcc
936472042,"no debate moderators under 50, sorry #tweetdebate",other,mcc
936472907,@current Now staring at black screen on http://www.current.com/debate grrrrrrrrrrrrrrr #current,other,none
936472928,Introducing McCain and Obama. Wait...McCain SHOWED UP! Wow...guess the crisis in Washington is over? #tweetdebate,neg,both
936474032,#tweetdebate nice how Obama held McCain hand and used his free hand to wave to the audience while keeping McCain from doing so,neg,both
936475544,Obama -2 it an occupation not war #tweetdebate,neg,oba
936476318,"@TDefren I'm very nervous, yes #tweetdebate",other,none
936476394,#tweetdebate no golden parachutes no padding CEOs - Obama,pos,oba
