MaxDiff Twitter Sentiment Lexicon - unigrams and bigrams
------
**What it does**: Generates a set of features from the MaxDiff Twitter Sentiment Lexicon. Features generated follow the ones generated in [Mohammad et. al 2013](http://www.aclweb.org/website/old_anthology/S/S13/S13-2.pdf#page=357), which are (for each word `w` and polarity/sentiment `p`):
- Total count of tokens in tweet with `score(w, p) > 0`
- Sum of score within tweet for each `p`
- Maximum token score for each `p`
- Score of last token in each tweet

Source:  http://saifmohammad.com/WebPages/lexicons.html#EmoLex4

**Strengths**:  Generates features with a sense of scale rather than counts. 

**Weaknesses**: Some features return negative values which can't be used in `BernoulliNB`. Not count values so `MultinomialNB` doesn't make *intuitive* sense, but will still function.

**Hyperparameters**:  None

In [76]:
from collections import OrderedDict, defaultdict, Counter
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
from itertools import zip_longest

In [2]:
sts_gold = pd.read_csv('../data/sts_gold_v03/sts_gold_tweet.csv', index_col='id', sep=';')

In [3]:
sts_gold.head()

Unnamed: 0_level_0,polarity,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467933112,0,the angel is going to miss the athlete this we...
2323395086,0,It looks as though Shaq is getting traded to C...
1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
1990283756,0,drinking a McDonalds coffee and not understand...
1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [29]:
tt.tokenize("don't do that")

["don't", 'do', 'that']

In [4]:
tweets = sts_gold['tweet']

In [43]:
wordDict = defaultdict(float)

with open('../lexicons/MaxDiff-Twitter-Lexicon/Maxdiff-Twitter-Lexicon_-1to1.txt', 'r') as f:
    for row in f.readlines():
        row = row.split()
        wordDict[' '.join(row[1:])] = float(row[0])

In [44]:
tt = TweetTokenizer()

In [145]:
def polarity(x):
    score = wordDict[x]
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    else:
        return 'none'

In [146]:
def count_tokens_with_polarity(string, tokenizer):
    
    scorelist = []
    tokenized = tt.tokenize(string)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    all_grams = tokenized + ngrams_list
    for token in all_grams:
        token = token.lower()
        score = polarity(token)
        scorelist.append(score)
        
    return dict(Counter(scorelist))

In [147]:
def polarity_sum(string, tokenizer):
    
    negList = []
    posList = []
    tokenized = tt.tokenize(string)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    all_grams = tokenized + ngrams_list
    
    for token in all_grams:
        token = token.lower()
        if polarity(token) == 'positive':
            posList.append(wordDict[token])
        elif polarity(token) == 'negative':
            negList.append(abs(wordDict[token]))
        
    return {'pos_sum' : sum(posList), 'neg_sum' : sum(negList)}

In [148]:
def max_token(string, tokenizer):
    
    negList = []
    posList = []
    
    for token in tt.tokenize(string):
        token = token.lower()
        if polarity(token) == 'positive':
            posList.append(wordDict[token])
        elif polarity(token) == 'negative':
            negList.append(wordDict[token])
        
        
    try:
        pos_max = max(posList)
    except ValueError:
        pos_max = 0
    try:
        neg_max = min(negList)
    except ValueError:
        neg_max = 0
        
    return {'pos_max' : pos_max, 'neg_max' : neg_max}

In [149]:
def last_token(string, tokenizer):
    
    negList = []
    posList = []
    
    tokenized = tt.tokenize(string)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    all_grams = tokenized + ngrams_list
    
    last_polarity_uni = 0
    last_polarity_bi = 0

    for unigram, bigram in list(zip_longest(reversed(tokenized), reversed(ngrams_list))):
        #unigram, bigram = unigram.lower(), bigram.lower()
        if polarity(unigram) != 'none' or polarity(bigram) != 'none':
            try:
                last_polarity_uni = wordDict[unigram]
            except KeyError:
                last_polarity_uni = 0
            try:
                last_polarity_bi = wordDict[bigram]
            except KeyError:
                last_polarity_uni = 0
            
            if abs(last_polarity_uni) > abs(last_polarity_bi):
                return {'last_polarity' : last_polarity_uni} 
            elif abs(last_polarity_uni) < abs(last_polarity_bi):
                return {'last_polarity' : last_polarity_bi} 
            elif abs(last_polarity_uni) == abs(last_polarity_bi):
                return {'last_polarity' : last_polarity_uni} 
            else:
                return {'last_polarity' : 0}
        else:
            continue
    
    else:
        return {'last_polarity' : 0}

In [150]:
def all_feats_dict(string, tokenizer):
    ct = count_tokens_with_polarity(string, tokenizer)
    pol = polarity_sum(string, tokenizer)
    max_tkn = max_token(string, tokenizer)
    last = last_token(string, tokenizer)
    
    complete = dict()
    
    for dictionary in [ct, pol, max_tkn, last]:
        complete.update(dictionary)
    return complete

In [151]:
emotionCounts = [all_feats_dict(tweet, tt) for tweet in tweets]

In [152]:
emotion_df = pd.DataFrame(emotionCounts, index=tweets.index)
emotion_df = emotion_df.fillna(0)

In [153]:
# Add percentages if desired per sentence

#for column in emotion_df.columns:
#    emotion_df[column + "_p"] = emotion_df[column] / emotion_df.sum(axis=1)

In [154]:
emotion_df.describe()

Unnamed: 0,last_polarity,neg_max,neg_sum,negative,none,pos_max,pos_sum,positive
count,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0
mean,0.094925,-0.234279,0.325528,0.929204,29.316618,0.40599,0.759318,2.185841
std,0.398565,0.296986,0.459732,1.033574,14.467123,0.282483,0.71557,1.706985
min,-0.938,-0.968,0.0,0.0,3.0,0.0,0.0,0.0
25%,-0.046,-0.5,0.0,0.0,17.0,0.156,0.188,1.0
50%,0.094,-0.046,0.078,1.0,28.0,0.406,0.586,2.0
75%,0.376,0.0,0.562,1.0,41.0,0.656,1.124,3.0
max,0.984,0.0,2.734,8.0,72.0,0.984,4.14,10.0


In [155]:
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.dummy import DummyClassifier

In [156]:
models = [('DUMMY', DummyClassifier(strategy='most_frequent')),
          ('mNB' , MultinomialNB()),
          ('bNB' , BernoulliNB()),
          ('svc' , SVC())
         ]

In [157]:
print('{0}\t{1:<1}\t{2:<4}\t{3:<4}'.format("MODEL", "MEAN CV", "MIN CV", "MAX CV"))

for name, model in models:    
    X, Y = emotion_df, (sts_gold['polarity'] == 4).ravel()
    
    if name == 'bNB' or 'mNB':
        X = abs(X)
        if name == 'bNB':
            binarize = Binarizer()
            X = binarize.fit_transform(X)
    elif name == 'svc':
        ss = StandardScaler()
        X = X.as_matrix()
        X = ss.fit_transform(X)
        
    cv = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    
    print('{0}\t{1:<3}\t{2:<4}\t{3:<4}'.format(name, round(cv.mean(), 4), round(cv.min(), 4), round(cv.max(), 4)))

MODEL	MEAN CV	MIN CV	MAX CV
DUMMY	0.6893	0.6887	0.6897
mNB	0.7645	0.7475	0.7843
bNB	0.7025	0.6863	0.7328
svc	0.7679	0.7512	0.7794
