Hashtag BOW
------
**What it does**: Finds all hashtags in a tweet, splits their words into tokens, then runs a standard BOW model on the words contained within the hashtags.

ex. `#WhyImNotVotingForHillary` > `Why Im Not Voting For Hillary`

**Strengths**: 

**Weaknesses**: Does not find words not separated by propcaps

**Hyperparameters**:  None

In [1]:
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
sts_gold = pd.read_csv('../data/sts_gold_v03/sts_gold_tweet.csv', index_col='id', sep=';')

In [3]:
sts_gold.head()

Unnamed: 0_level_0,polarity,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1467933112,0,the angel is going to miss the athlete this we...
2323395086,0,It looks as though Shaq is getting traded to C...
1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
1990283756,0,drinking a McDonalds coffee and not understand...
1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


In [4]:
hashtagSeries = sts_gold['tweet'].apply(lambda x: re.findall(r'\#\w+', x))

In [5]:
def hashtag_splitter(hashtag_list):
    wordlist = []
    for hashtag in hashtag_list:
        token = re.findall('[A-Z][^A-Z]{1,}', hashtag.replace('\#',''))
        wordlist.extend(token)
    return ' '.join(wordlist)

In [6]:
hashtagSplitSeries = hashtagSeries.apply(lambda x: hashtag_splitter(x))

In [7]:
cv = CountVectorizer()
hashtagWords = cv.fit_transform(hashtagSplitSeries)
htColumns = ['#_' + _ for _ in cv.get_feature_names()]
hashtagSplit_df = pd.DataFrame(hashtagWords.toarray(), index=hashtagSplitSeries.index, columns=htColumns)

print(hashtagSplit_df.shape, hashtagSplit_df.values.mean(), hashtagSplit_df.values.max())

(2034, 21) 0.000538465140235 1


### Feature Evaluation

In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.ensemble import VotingClassifier

from sklearn.cross_validation import cross_val_score

In [13]:
models = [('DUMMY', DummyClassifier(strategy='most_frequent')),
          ('mNB' , MultinomialNB()),
          ('bNB' , BernoulliNB()),
          ('svc' , SVC(probability=True)),
          ('rf' , RandomForestClassifier()),
          ('lr' , LogisticRegressionCV())
         ]
models.append(('eclf', VotingClassifier(estimators=[models[i] for i in [1, 3, 4, 5]], voting='soft')))

In [14]:
print('{0}\t{1:<1}\t{2:<4}\t{3:<4}'.format("MODEL", "MEAN CV", "MIN CV", "MAX CV"))

for name, model in models:    
    X, Y = hashtagWords, (sts_gold['polarity'] == 4).ravel()
    
    if name == 'bNB':
        binarize = Binarizer()
        X = binarize.fit_transform(X)
    elif name == 'svc':
        ss = StandardScaler()
        X = X.toarray()
        X = ss.fit_transform(X)
        
    cv = cross_val_score(model, X, Y, cv=5, scoring='accuracy')
    
    print('{0}\t{1:<3}\t{2:<4}\t{3:<4}'.format(name, round(cv.mean(), 4), round(cv.min(), 4), round(cv.max(), 4)))

MODEL	MEAN CV	MIN CV	MAX CV
DUMMY	0.6893	0.6887	0.6897
mNB	0.6903	0.6887	0.6936
bNB	0.6903	0.6872	0.6936
svc	0.6893	0.6887	0.6897
rf	0.6908	0.6887	0.6936
lr	0.6898	0.6887	0.6912
eclf	0.6903	0.6887	0.6921


