NRC Hashtag Sentiments (unigrams)
------
**What it does**: Generates a set of features from the **Unigram** Lexicon of the NRC Hashtag Sentiment Lexicon. Features generated follow the ones generated in [Mohammad et. al 2013](http://www.aclweb.org/website/old_anthology/S/S13/S13-2.pdf#page=357), which are (for each word `w` and polarity/sentiment `p`):
- Total count of tokens in tweet with `score(w, p) > 0`
- Sum of score within tweet for each `p`
- Maximum token score for each `p`
- Score of last token in each tweet

Source:  http://saifmohammad.com/WebPages/lexicons.html#EmoLex5

**Strengths**:  Generates features with a sense of scale rather than counts. 

**Weaknesses**: Some features return negative values which can't be used in `BernoulliNB`. Not count values so `MultinomialNB` doesn't make *intuitive* sense, but will still function.

**Hyperparameters**:  None

In [1]:
from collections import OrderedDict, defaultdict, Counter
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer

In [2]:
sts_gold = pd.read_csv('../data/sts_gold_v03/sts_gold_tweet.csv', index_col='id', sep=';')

In [3]:
sts_gold.head()

Unnamed: 0_level_0,polarity,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467933112,0,the angel is going to miss the athlete this we...
2323395086,0,It looks as though Shaq is getting traded to C...
1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
1990283756,0,drinking a McDonalds coffee and not understand...
1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [4]:
tweets = sts_gold['tweet']

In [5]:
wordDict = defaultdict(float)

with open('../lexicons/NRC-Hashtag-Sentiment-Lexicon-v0.1/unigrams-pmilexicon.txt', 'r') as f:
    for row in f.readlines():
        row = row.split()
        wordDict[row[0]] = float(row[1])

In [6]:
tt = TweetTokenizer()

In [7]:
def polarity(x):
    score = wordDict[x]
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    else:
        return 'none'

In [8]:
def count_tokens_with_polarity(string, tokenizer):
    
    scorelist = []
    for token in tt.tokenize(string):
        token = token.lower()
        score = polarity(token)
        scorelist.append(score)
        
    return dict(Counter(scorelist))

In [9]:
def polarity_sum(string, tokenizer):
    
    negList = []
    posList = []
    for token in tt.tokenize(string):
        token = token.lower()
        if polarity(token) == 'positive':
            posList.append(wordDict[token])
        elif polarity(token) == 'negative':
            negList.append(abs(wordDict[token]))
        
    return {'pos_sum' : sum(posList), 'neg_sum' : sum(negList)}

In [10]:
def max_token(string, tokenizer):
    
    negList = []
    posList = []
    
    for token in tt.tokenize(string):
        token = token.lower()
        if polarity(token) == 'positive':
            posList.append(wordDict[token])
        elif polarity(token) == 'negative':
            negList.append(wordDict[token])
        
        
    try:
        pos_max = max(posList)
    except ValueError:
        pos_max = 0
    try:
        neg_max = min(negList)
    except ValueError:
        neg_max = 0
        
    return {'pos_max' : pos_max, 'neg_max' : neg_max}

In [11]:
def last_token(string, tokenizer):
    
    negList = []
    posList = []
    for token in reversed(tt.tokenize(string)):
        token = token.lower()
        if polarity(token) == 'positive' or polarity(token) == 'negative':
            return {'last_polarity' : wordDict[token]}
        else:
            continue

In [12]:
def all_feats_dict(string, tokenizer):
    ct = count_tokens_with_polarity(string, tokenizer)
    pol = polarity_sum(string, tokenizer)
    max_tkn = max_token(string, tokenizer)
    last = last_token(string, tokenizer)
    
    complete = dict()
    for dictionary in [ct, pol, max_tkn, last]:
        complete.update(dictionary)
    return complete

In [13]:
emotionCounts = [all_feats_dict(tweet, tt) for tweet in tweets]

In [14]:
emotion_df = pd.DataFrame(emotionCounts, index=tweets.index)
emotion_df = emotion_df.fillna(0)

In [15]:
# Add percentages if desired per sentence

#for column in emotion_df.columns:
#    emotion_df[column + "_p"] = emotion_df[column] / emotion_df.sum(axis=1)

In [16]:
emotion_df.describe()

Unnamed: 0,last_polarity,neg_max,neg_sum,negative,none,pos_max,pos_sum,positive
count,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0
mean,0.033,-0.902261,2.645272,7.284661,1.482793,0.908191,2.78726,7.948378
std,0.590253,0.70896,1.996062,4.536141,1.316941,0.606584,2.038789,4.608402
min,-4.999,-4.999,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.146,-1.202,1.06575,4.0,1.0,0.4785,1.28275,4.0
50%,0.034,-0.727,2.216,7.0,1.0,0.757,2.3745,7.0
75%,0.212,-0.486,3.75475,10.0,2.0,1.241,3.8025,11.0
max,3.043,0.0,11.006,26.0,8.0,5.0,15.521,26.0


In [17]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.ensemble import VotingClassifier

from sklearn.cross_validation import cross_val_score

In [18]:
models = [('DUMMY', DummyClassifier(strategy='most_frequent')),
          ('mNB' , MultinomialNB()),
          ('bNB' , BernoulliNB()),
          ('svc' , SVC(probability=True)),
          ('rf' , RandomForestClassifier()),
          ('lr' , LogisticRegressionCV())
         ]
models.append(('eclf', VotingClassifier(estimators=[models[i] for i in [1, 3, 4, 5]], voting='soft')))

In [19]:
print('{0}\t{1:<1}\t{2:<4}\t{3:<4}'.format("MODEL", "MEAN CV", "MIN CV", "MAX CV"))

for name, model in models:    
    X, Y = emotion_df, (sts_gold['polarity'] == 4).ravel()
    
    if name == 'bNB' or 'mNB':
        X = abs(X)
        if name == 'bNB':
            binarize = Binarizer()
            X = binarize.fit_transform(X)
    elif name == 'svc':
        ss = StandardScaler()
        X = X.as_matrix()
        X = ss.fit_transform(X)
        
    cv = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    
    print('{0}\t{1:<3}\t{2:<4}\t{3:<4}'.format(name, round(cv.mean(), 4), round(cv.min(), 4), round(cv.max(), 4)))

MODEL	MEAN CV	MIN CV	MAX CV
DUMMY	0.6893	0.6887	0.6897
mNB	0.737	0.6985	0.7685
bNB	0.6893	0.6765	0.6995
svc	0.7375	0.7181	0.7488
rf	0.7409	0.7315	0.7586
lr	0.7542	0.7328	0.7709
eclf	0.7488	0.7304	0.766
