Named Entities Count (Bag of Named Entities)
------
**What it does**: Finds all named entities in a corpus, then flags the occurence of each entity within each tweet.

**Strengths**: 

**Weaknesses**: Takes a long time to compute.

**Hyperparameters**:  None

In [1]:
import pandas as pd
import nltk
import re

In [2]:
sts_gold = pd.read_csv('../data/sts_gold_v03/sts_gold_tweet.csv', index_col='id', sep=';')

In [3]:
sts_gold.head()

Unnamed: 0_level_0,polarity,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467933112,0,the angel is going to miss the athlete this we...
2323395086,0,It looks as though Shaq is getting traded to C...
1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
1990283756,0,drinking a McDonalds coffee and not understand...
1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [4]:
#We're going to combine all text together and form a Named Entity Dictionary for Count Vectorizer
alltext = ' '.join([i for i in sts_gold['tweet']])

#remove hashtags
alltext_nohash = re.sub(r'\#\w+','', alltext)

#remove mentions
alltext_nohash_nomentions = re.sub(r'\@\w+','', alltext_nohash)

In [5]:
tokens = nltk.word_tokenize(alltext_nohash_nomentions)
pos_tags = nltk.pos_tag(tokens)
chunked_nes = nltk.ne_chunk(pos_tags, binary=True)
nes = [' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]

In [6]:
ne_vocabulary = list(set(nes))
print(len(ne_vocabulary))

687


In [7]:
ne_df = pd.DataFrame()
for word in ne_vocabulary:
    ne_df[word] = sts_gold['tweet'].str.count(word)
    
print(ne_df.shape, ne_df.values.mean(), ne_df.values.max())

(2034, 687) 0.00157153714367 3


### Feature Evaluation

In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.ensemble import VotingClassifier

from sklearn.cross_validation import cross_val_score

In [9]:
models = [('DUMMY', DummyClassifier(strategy='most_frequent')),
          ('mNB' , MultinomialNB()),
          ('bNB' , BernoulliNB()),
          ('svc' , SVC(probability=True)),
          ('rf' , RandomForestClassifier()),
          ('lr' , LogisticRegressionCV())
         ]
models.append(('eclf', VotingClassifier(estimators=[models[i] for i in [1, 3, 4, 5]], voting='soft')))

In [10]:
print('{0}\t{1:<1}\t{2:<4}\t{3:<4}'.format("MODEL", "MEAN CV", "MIN CV", "MAX CV"))

for name, model in models:    
    X, Y = ne_df, (sts_gold['polarity'] == 4).ravel()
    
    if name == 'bNB':
        binarize = Binarizer()
        X = binarize.fit_transform(X)
    elif name == 'svc':
        ss = StandardScaler()
        #X = X.toarray()
        X = ss.fit_transform(X)
        
    cv = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    
    print('{0}\t{1:<3}\t{2:<4}\t{3:<4}'.format(name, round(cv.mean(), 4), round(cv.min(), 4), round(cv.max(), 4)))

MODEL	MEAN CV	MIN CV	MAX CV
DUMMY	0.6893	0.6887	0.6897
mNB	0.7173	0.7044	0.7315
bNB	0.7109	0.6961	0.7266
svc	0.7045	0.6985	0.7143
rf	0.7124	0.6897	0.7365
lr	0.708	0.6995	0.7167
eclf	0.7168	0.7083	0.7241
