NRC Hashtag Sentiments (bigrams)
------
**What it does**: Generates a set of features from the **Bigram** Lexicon of the NRC Hashtag Sentiment Lexicon. Features generated follow the ones generated in [Mohammad et. al 2013](http://www.aclweb.org/website/old_anthology/S/S13/S13-2.pdf#page=357), which are (for each word `w` and polarity/sentiment `p`):
- Total count of bigrams in tweet with `score(w, p) > 0`
- Sum of score within tweet for each `p`
- Maximum bigram score for each `p`
- Score of last bigram in each tweet

Source:  http://saifmohammad.com/WebPages/lexicons.html#EmoLex5

**Strengths**:  Generates features with a sense of scale rather than counts. 

**Weaknesses**: Some features return negative values which can't be used in `BernoulliNB`. Not count values so `MultinomialNB` doesn't make *intuitive* sense, but will still function.

**Hyperparameters**:  None

In [4]:
from collections import OrderedDict, defaultdict, Counter
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams

In [5]:
sts_gold = pd.read_csv('../data/sts_gold_v03/sts_gold_tweet.csv', index_col='id', sep=';')

In [6]:
sts_gold.head()

Unnamed: 0_level_0,polarity,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467933112,0,the angel is going to miss the athlete this we...
2323395086,0,It looks as though Shaq is getting traded to C...
1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
1990283756,0,drinking a McDonalds coffee and not understand...
1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [7]:
tweets = sts_gold['tweet']

In [8]:
wordDict = defaultdict(float)

with open('../lexicons/NRC-Hashtag-Sentiment-Lexicon-v0.1/bigrams-pmilexicon.txt', 'r') as f:
    for row in f.readlines():
        row = row.split()
        wordDict[row[0] +" " + row[1]] = float(row[2])

In [9]:
tt = TweetTokenizer()

In [10]:
def polarity(x):
    score = wordDict[x]
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    else:
        return 'none'

In [11]:
def count_tokens_with_polarity(string, tokenizer):
    
    scorelist = []
    tokenized = tokenizer.tokenize(string)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    for ngram in ngrams_list:
        ngram = ngram.lower()
        score = polarity(ngram)
        scorelist.append(score)
        
    return dict(Counter(scorelist))

In [12]:
def polarity_sum(string, tokenizer):
    
    negList = []
    posList = []
    tokenized = tokenizer.tokenize(string)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    for ngram in ngrams_list:
        ngram = ngram.lower()
        if polarity(ngram) == 'positive':
            posList.append(wordDict[ngram])
        elif polarity(ngram) == 'negative':
            negList.append(abs(wordDict[ngram]))
        
    return {'pos_sum' : sum(posList), 'neg_sum' : sum(negList)}

In [13]:
def max_token(string, tokenizer):
    
    negList = []
    posList = []
    
    tokenized = tokenizer.tokenize(string)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    for ngram in ngrams_list:
        ngram = ngram.lower()
        if polarity(ngram) == 'positive':
            posList.append(wordDict[ngram])
        elif polarity(ngram) == 'negative':
            negList.append(wordDict[ngram])
        
        
    try:
        pos_max = max(posList)
    except ValueError:
        pos_max = 0
    try:
        neg_max = min(negList)
    except ValueError:
        neg_max = 0
        
    return {'pos_max' : pos_max, 'neg_max' : neg_max}

In [14]:
def last_token(string, tokenizer):
    
    negList = []
    posList = []
    tokenized = tokenizer.tokenize(string)
    ngrams_list = [' '.join(i) for i in ngrams(tokenized, 2)]
    
    for token in reversed(ngrams_list):
        token = token.lower()
        if polarity(token) == 'positive' or polarity(token) == 'negative':
            return {'last_polarity' : wordDict[token]}
        else:
            continue
    
    return {'last_polarity' : 0}

In [15]:
def all_feats_dict(string, tokenizer):
    ct = count_tokens_with_polarity(string, tokenizer)
    pol = polarity_sum(string, tokenizer)
    max_tkn = max_token(string, tokenizer)
    last = last_token(string, tokenizer)
    
    complete = dict()
    
    for dictionary in [ct, pol, max_tkn, last]:
        complete.update(dictionary)
        
    return complete

In [16]:
emotionCounts = [all_feats_dict(tweet, tt) for tweet in tweets]

In [17]:
emotion_df = pd.DataFrame(emotionCounts, index=tweets.index)
emotion_df = emotion_df.fillna(0)

In [18]:
# Add percentages if desired per sentence

#for column in emotion_df.columns:
#    emotion_df[column + "_p"] = emotion_df[column] / emotion_df.sum(axis=1)

In [19]:
emotion_df.describe()

Unnamed: 0,last_polarity,neg_max,neg_sum,negative,none,pos_max,pos_sum,positive
count,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0,2034.0
mean,-0.001384,-1.293763,3.172424,4.951819,5.342183,1.13781,3.215226,5.421829
std,0.959042,1.269207,3.162562,3.779152,3.545075,0.865197,3.05873,3.87724
min,-4.999,-4.999,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.437,-1.6,0.7765,2.0,3.0,0.571,1.06275,3.0
50%,0.072,-0.987,2.2975,4.0,5.0,0.989,2.431,5.0
75%,0.478,-0.49525,4.54275,7.0,7.0,1.552,4.398,8.0
max,5.0,0.0,20.344,21.0,20.0,5.0,29.201,27.0


In [20]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.ensemble import VotingClassifier

from sklearn.cross_validation import cross_val_score

In [21]:
models = [('DUMMY', DummyClassifier(strategy='most_frequent')),
          ('mNB' , MultinomialNB()),
          ('bNB' , BernoulliNB()),
          ('svc' , SVC(probability=True)),
          ('rf' , RandomForestClassifier()),
          ('lr' , LogisticRegressionCV())
         ]
models.append(('eclf', VotingClassifier(estimators=[models[i] for i in [1, 3, 4, 5]], voting='soft')))

In [22]:
print('{0}\t{1:<1}\t{2:<4}\t{3:<4}'.format("MODEL", "MEAN CV", "MIN CV", "MAX CV"))

for name, model in models:    
    X, Y = emotion_df, (sts_gold['polarity'] == 4).ravel()
    
    if name == 'bNB' or 'mNB':
        X = abs(X)
        if name == 'bNB':
            binarize = Binarizer()
            X = binarize.fit_transform(X)
    elif name == 'svc':
        ss = StandardScaler()
        X = X.as_matrix()
        X = ss.fit_transform(X)
        
    cv = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    
    print('{0}\t{1:<3}\t{2:<4}\t{3:<4}'.format(name, round(cv.mean(), 4), round(cv.min(), 4), round(cv.max(), 4)))

MODEL	MEAN CV	MIN CV	MAX CV
DUMMY	0.6893	0.6887	0.6897
mNB	0.7134	0.6912	0.7463
bNB	0.6942	0.674	0.7108
svc	0.733	0.6995	0.7537
rf	0.7286	0.7181	0.7414
lr	0.7414	0.7132	0.7635
eclf	0.7532	0.7279	0.7783
