In [15]:
from collections import OrderedDict
import pandas as pd
import nltk
import re

In [16]:
sts_gold = pd.read_csv('../data/sts_gold_v03/sts_gold_tweet.csv', index_col='id', sep=';')

In [17]:
sts_gold.head()

Unnamed: 0_level_0,polarity,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467933112,0,the angel is going to miss the athlete this we...
2323395086,0,It looks as though Shaq is getting traded to C...
1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
1990283756,0,drinking a McDonalds coffee and not understand...
1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [18]:
#We're going to combine all text together and form a Named Entity Dictionary for Count Vectorizer
alltext = ' '.join([i for i in sts_gold['tweet']])

#remove hashtags
alltext_nohash = re.sub(r'\#\w+','', alltext)

#remove mentions
alltext_nohash_nomentions = re.sub(r'\@\w+','', alltext_nohash)

In [19]:
tokens = nltk.word_tokenize(alltext_nohash_nomentions)
pos_tags = nltk.pos_tag(tokens)
chunked_nes = nltk.ne_chunk(pos_tags, binary=True)
nes = [' '.join(map(lambda x: x[0], ne.leaves())) for ne in chunked_nes if isinstance(ne, nltk.tree.Tree)]

In [20]:
ne_vocabulary = list(set(nes))
print(len(ne_vocabulary))

687


In [21]:
ne_df = pd.DataFrame()
for word in ne_vocabulary:
    ne_df[word] = sts_gold['tweet'].str.count(word)
    
print(ne_df.shape, ne_df.values.mean(), ne_df.values.max())

(2034, 687) 0.00157153714367 3


In [22]:
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.dummy import DummyClassifier

In [23]:
models = [('DUMMY', DummyClassifier(strategy='most_frequent')),
          ('mNB' , MultinomialNB()),
          ('bNB' , BernoulliNB()),
          ('svc' , SVC())]

In [25]:
for name, model in models:
    X, Y = ne_df, (sts_gold['polarity'] == 4).ravel()
    if name == 'bNB':
        binarize = Binarizer()
        X = binarize.fit_transform(X)
    elif name == 'svc':
        ss = StandardScaler()
        X = ss.fit_transform(X)
    print(name + ":\t", cross_val_score(model, X, Y).mean())

svc:	 0.700092819298
mNB:	 0.715335195585
bNB:	 0.706979448073
DUMMY:	 0.689282477424
