In [42]:
# General libraries.
import re
import numpy as np
import pandas as pd


# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [43]:
# LOAD THE DATA
filename = "data/Labeled_Colorado_Flu_Study_Tweets_AvI_RvN_SvO.csv"
coloradoData = pd.read_csv(filename, sep="\t")
print(coloradoData.shape)

(5270, 5)


## EDA

In [44]:
coloradoData[['Tweet_Content', 'Related_Label']].head(5)

Unnamed: 0,Tweet_Content,Related_Label
0,"don't worry it's not swine flu, i already got ...",
1,muh. if i am getting sick and it's not swine f...,1.0
2,what is up with my boy? this morning i though...,
3,"getting better,no more piggy flu 4 me,it was n...",1.0
4,@robbsterr yay for man txting you.. in other n...,0.0


In [45]:
# Preprocessing
# get all data with labels present for the column we care about (Related/NotRelated)
coloradoVal = coloradoData.dropna(subset=["Related_Label"])
print(coloradoVal.shape)

# extract X and Y as np arrays, so that we can feed them to tensors. 
X = coloradoVal["Tweet_Content"]
Y = coloradoVal["Related_Label"]
print(X.values.shape)
print(Y.values.shape)

(4413, 5)
(4413,)
(4413,)


In [46]:
coloradoVal.head(5)

Unnamed: 0,Tweet_ID,Tweet_Content,Awareness_Label,Related_Label,Self_Label
1,5222838706,muh. if i am getting sick and it's not swine f...,0.0,1.0,1.0
3,5918860304,"getting better,no more piggy flu 4 me,it was n...",0.0,1.0,1.0
4,4631607800,@robbsterr yay for man txting you.. in other n...,0.0,0.0,1.0
5,6004314210,swine flu has arrived at my daughter's kinderg...,0.0,1.0,0.0
6,5225053106,i think im getting flu soon.,0.0,1.0,1.0


In [47]:
# Split into train, test, dev (80%,10%,10%) (3530, 441, 442 each)
np.random.seed(42)
# train_data, train_labels = X[:3530], Y[:3530]
# test_data, test_labels = X[3530:3971], Y[3530:3971]
# dev_data, dev_labels = X[3971:], Y[3971:]

train, test, dev = np.split(X.sample(frac=1), [int(.8*len(X)), int(.9*len(X))])

train_y = Y[train.index]
test_y = Y[test.index]
dev_y = Y[dev.index]

# convert to numpy arrays
train_data, test_data, dev_data, train_labels, test_labels, dev_labels = \
    train.values, test.values, dev.values, \
    train_y.values, test_y.values, dev_y.values 
train_labels = train_labels.astype(int)
test_labels = test_labels.astype(int)
dev_labels = dev_labels.astype(int)

print(type(train_data))
print 'training label shape:', train_labels.shape
print 'test label shape:', test_labels.shape
print 'dev label shape:', dev_labels.shape

<type 'numpy.ndarray'>
training label shape: (3530,)
test label shape: (441,)
dev label shape: (442,)


In [48]:
train_labels

array([0, 0, 0, ..., 0, 1, 1])

In [49]:
train_data

array([ 'who is getting their flu shot for the first time! just did it, man am i dizzy',
       "my arm is sooo soar after the swine flu shot!!! it hurts... but i'm getting used to pain because i'm in a lot of pain a lot... soo......",
       "got my seasonal flu shot today, even though i'm a big scaredy cat who is afraid of needles.",
       ...,
       'good thing were not worried about swine flu. its a great time to be playing beer pong --jimmy fallon',
       "think i'm getting the stomach flu my mom had..ugh!",
       'finally getting over the flu ... wow this one was wild!  i am counting on a sick free remainder of the year!'], dtype=object)

### Find optimized parameter settings for logistic regression and Multinomial Bayes

In [50]:
# Find the best parameters using CountVectorizer
cv = CountVectorizer(analyzer='word')
cvtrain = cv.fit_transform(train_data)
# print(train.shape) # [samples, features] (3530, 8553)

# Find the best parameters for C in logistic regression
logit = LogisticRegression() # default penalty='l2'
clist = {'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}
lr = GridSearchCV(logit,clist)
lr.fit(cvtrain, train_labels)
print "Best logistic regression C :", lr.best_params_, "best score: ",lr.best_score_

# for c in clist['C']:
#     logit2=LogisticRegression(C=c)
#     logit2.fit(cvtrain, train_labels)
#     weight=[]
#     for x in range(len(logit2.coef_)):
#         weight.append(sum(logit2.coef_[x]**2))
#     print "C=", c, ", sum of squared weight values:", weight
    
# Find the best parameters for alpha in Multinomial Bayes
mb = MultinomialNB()
alphas = {'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}        
mnb= GridSearchCV(mb, alphas)
mnb.fit(cvtrain, train_labels)
print "Best multinomial bayes alpha: ", mnb.best_params_,"best score: ", mnb.best_score_

# for a in alphas['alpha']:
#     mb2=MultinomialNB(alpha=a)
#     mb2.fit(cvtrain, train_labels)
#     weight=[]
#     for x in range(len(mb2.coef_)):
#         weight.append(sum(mb2.coef_[x]**2))
#     print "alpha=", a, ", sum of squared weight values:", weight


Best logistic regression C : {'C': 0.1} best score:  0.749008498584
Best multinomial bayes alpha:  {'alpha': 0.5} best score:  0.750991501416


In [51]:
# Find the best parameters using TfidfVectorizer
tf= TfidfVectorizer(analyzer='word')
tftrain=tf.fit_transform(train_data)

# Find the best parameters for C in logistic regression
logit = LogisticRegression() # default penalty='l2'
clist = {'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}
lr = GridSearchCV(logit,clist)
lr.fit(tftrain, train_labels)
print "Best logistic regression C :", lr.best_params_, "best score: ",lr.best_score_

# for c in clist['C']:
#     logit2=LogisticRegression(C=c)
#     logit2.fit(train, train_labels)
#     weight=[]
#     for x in range(len(logit2.coef_)):
#         weight.append(sum(logit2.coef_[x]**2))
#     print "C=", c, ", sum of squared weight values:", weight
    
# Find the best parameters for alpha in Multinomial Bayes
mb = MultinomialNB()
alphas = {'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}        
mnb= GridSearchCV(mb, alphas)
mnb.fit(tftrain, train_labels)
print "Best multinomial bayes alpha: ", mnb.best_params_,"best score: ", mnb.best_score_

# for a in alphas['alpha']:
#     mb2=MultinomialNB(alpha=a)
#     mb2.fit(tftrain, train_labels)
#     weight=[]
#     for x in range(len(mb2.coef_)):
#         weight.append(sum(mb2.coef_[x]**2))
#     print "alpha=", a, ", sum of squared weight values:", weight


Best logistic regression C : {'C': 0.5} best score:  0.747875354108
Best multinomial bayes alpha:  {'alpha': 0.3} best score:  0.738526912181


### Preprocessing Tweets

In [52]:
def better_preprocessor(s):
    s = s.lower()
    s = re.sub('^[^a-zA-z]*|[^a-zA-Z]*$','',s)
    s = re.sub('\s+', ' ', s).strip() 
    s = re.sub(r'\b\d+\b', '', s)
    s = re.sub(r'<.*?>', '', s)
    s = re.sub(r"\\", "", s)    
    s = re.sub(r"\'", "", s)    
    s = re.sub(r"\"", "", s) 
    return s
# TO DO: add specific twitter preprocessor
# https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/

def preprocess():
    # no processing
    vect = CountVectorizer(preprocessor=None) # set preprocessor to default none
    cvdata=vect.fit_transform(train_data)
    logit = LogisticRegression() # default penalty='l2'
    logit.fit(cvdata, train_labels)
    
    dev=vect.transform(dev_data)
    pred = logit.predict(dev)
    score = metrics.f1_score(dev_labels, pred, average='micro')
    print "Dictionary size without preprocessing: ", len(vect.vocabulary_) # without preprocessing
    print "F1 score without preprocessing: ", score
    
    # preprocessing
    cv = CountVectorizer(preprocessor=better_preprocessor)
    cvdata2=cv.fit_transform(train_data)
    logit2 = LogisticRegression() # default penalty='l2'
    logit2.fit(cvdata2, train_labels)
    
    dev2=cv.transform(dev_data)
    pred2 = logit2.predict(dev2)
    score2 = metrics.f1_score(dev_labels, pred2, average='micro')
    print "Dictionary size with preprocessing: ", len(cv.vocabulary_)
    print "F1 score with preprocessing: ", score2
    print "Dictionary size reduction: ", len(vect.vocabulary_)-len(cv.vocabulary_)


preprocess()

Dictionary size without preprocessing:  8553
F1 score without preprocessing:  0.739819004525
Dictionary size with preprocessing:  8487
F1 score with preprocessing:  0.733031674208
Dictionary size reduction:  66


### Testing Variations of Baseline Models

In [53]:
def cv_log(param, data, labels, processor=None):

    cv= CountVectorizer(analyzer='word', preprocessor=processor)
    cvdata=tf.fit_transform(train_data)
    logit1 = LogisticRegression(C=param)
    logit1.fit(cvdata, train_labels)
    
    cvdev=tf.transform(data)
    
    # predict classification
    predict= logit1.predict(cvdev)
    post_prob = logit1.predict_proba(cvdev)
    
    accuracy=metrics.f1_score(labels, predict, average='micro')
    loss=metrics.log_loss(labels, post_prob)
    
    return [accuracy, loss]

# ## STUDENT END ###

# print'Dev data accuracy:', cv_log(0.1, dev_data, dev_labels)[0]
print'Dev data processed accuracy:', cv_log(0.1, dev_data, dev_labels, better_preprocessor)[0]

# print'Dev entropy loss:', cv_log(0.1, dev_data, dev_labels)[1]
print'Dev data processed entropy loss:', cv_log(0.1, dev_data, dev_labels, better_preprocessor)[1]

# print'Test data accuracy:', cv_log(0.1, test_data, test_labels)[0]
print'Test data processed accuracy:', cv_log(0.1, test_data, test_labels,better_preprocessor)[0]

# print'Test entropy loss:', cv_log(0.1, test_data, test_labels)[1]
print'Test data processed entropy loss:', cv_log(0.1, test_data, test_labels,better_preprocessor)[1]

Dev data processed accuracy: 0.748868778281
Dev data processed entropy loss: 0.566095281728
Test data processed accuracy: 0.702947845805
Test data processed entropy loss: 0.596831628413


In [54]:
def tfid_log(param, data, labels, processor=None):

    tf= TfidfVectorizer(analyzer='word', preprocessor=processor)
    tfdata=tf.fit_transform(train_data)
    logit = LogisticRegression(C=param)
    logit.fit(tfdata, train_labels)
    
    tfdev=tf.transform(data)
    
    # predict classification
    predict= logit.predict(tfdev)
    post_prob = logit.predict_proba(tfdev)
    
    accuracy=metrics.f1_score(labels, predict, average='micro')
    loss=metrics.log_loss(labels, post_prob)
    return [accuracy, loss]


# print'dev data accuracy:', tfid_log(0.5, dev_data, dev_labels)[0]
print'dev data processed accuracy:', tfid_log(0.5, dev_data, dev_labels, better_preprocessor)[0]

# print'Dev entropy loss:', tfid_log(0.5, dev_data, dev_labels)[1]
print'Dev data processed entropy loss:', tfid_log(0.5, dev_data, dev_labels, better_preprocessor)[1]

# print'test data accuracy:', tfid_log(0.5, test_data, test_labels)[0]
print'test data processed accuracy:', tfid_log(0.5, test_data, test_labels, better_preprocessor)[0]

# print'Test entropy loss:', tfid_log(0.5, test_data, test_labels)[1]
print'Test data processed entropy loss:', tfid_log(0.5, test_data, test_labels, better_preprocessor)[1]

dev data processed accuracy: 0.755656108597
Dev data processed entropy loss: 0.509039494116
test data processed accuracy: 0.721088435374
Test data processed entropy loss: 0.5537123254


In [55]:
def cv_mnb(param, data, labels, processor=None):

    cv= CountVectorizer(analyzer='word', preprocessor=processor)
    cvdata=tf.fit_transform(train_data)
    mb = MultinomialNB(alpha=param)
    mb.fit(cvdata, train_labels)
    
    cvdev=tf.transform(data)
    
    # predict classification
    predict= mb.predict(cvdev)
    post_prob = mb.predict_proba(cvdev)
    
    accuracy=metrics.f1_score(labels, predict, average='micro')
    loss=metrics.log_loss(labels, post_prob)
    return [accuracy, loss]

# print'dev data accuracy:', cv_mnb(0.5, dev_data, dev_labels)[0]
print'Dev data processed accuracy:', cv_mnb(0.5, dev_data, dev_labels, better_preprocessor)[0]

# print'Dev entropy loss:', cv_mnb(0.5, dev_data, dev_labels)[1]
print'Dev data processed entropy loss:', cv_mnb(0.5, dev_data, dev_labels, better_preprocessor)[1]

# print'test data accuracy:', cv_mnb(0.5, test_data, test_labels)[0]
print'Test data processed accuracy:', cv_mnb(0.5, test_data, test_labels, better_preprocessor)[0]

# print'Test entropy loss:', cv_mnb(0.5, test_data, test_labels)[1]
print'Test data processed entropy loss:', cv_mnb(0.5, test_data, test_labels, better_preprocessor)[1]

Dev data processed accuracy: 0.739819004525
Dev data processed entropy loss: 0.524777142882
Test data processed accuracy: 0.718820861678
Test data processed entropy loss: 0.58813838173


In [56]:
def tf_mnb(param, data, labels, processor=None):

    tf= TfidfVectorizer(analyzer='word', preprocessor=processor)
    tfdata=tf.fit_transform(train_data)
    mb = MultinomialNB(alpha=param)
    mb.fit(tfdata, train_labels)
    
    tfdev=tf.transform(data)
    
    # predict classification
    predict= mb.predict(tfdev)
    post_prob = mb.predict_proba(tfdev)
    
    accuracy=metrics.f1_score(labels, predict, average='micro')
    loss=metrics.log_loss(labels, post_prob)
    
    return [accuracy, loss]


# print'dev data accuracy:', tf_mnb(0.3, dev_data, dev_labels)[0]
print'Dev data processed accuracy:', tf_mnb(0.3, dev_data, dev_labels, better_preprocessor)[0]

# print'Dev entropy loss:', tf_mnb(0.3, dev_data, dev_labels)[1]
print'Dev data processed entropy loss:', tf_mnb(0.3, dev_data, dev_labels, better_preprocessor)[1]

# print'test data accuracy:', tf_mnb(0.3, test_data, test_labels)[0]
print'Test data processed accuracy:', tf_mnb(0.3, test_data, test_labels, better_preprocessor)[0]

# print'Test entropy loss:', tf_mnb(0.3, test_data, test_labels)[1]
print'Test data processed entropy loss:', tf_mnb(0.3, test_data, test_labels, better_preprocessor)[1]

Dev data processed accuracy: 0.755656108597
Dev data processed entropy loss: 0.537005501462
Test data processed accuracy: 0.716553287982
Test data processed entropy loss: 0.592988489634


### Error Analysis

In [61]:
# Take TFidFVectroizer and multinomial bayes as baseline and do error analysis

tf= TfidfVectorizer(analyzer='word', preprocessor=better_preprocessor)
tfdata=tf.fit_transform(train_data)
mb = MultinomialNB(alpha=0.3)
mb.fit(tfdata, train_labels)
    
tfdev=tf.transform(test_data)
    
# predict classification
predictions= list(mb.predict(tfdev))
falsePositives = []
falseNegatives = []
for idx,i in enumerate(test.index.values):
    predictedLabel = predictions[idx]
    datarow = coloradoData.iloc[i][["Tweet_Content", "Related_Label"]]
    trueLabel = datarow["Related_Label"]
    if int(trueLabel) == 1:
        if int(predictedLabel)==0:
            falseNegatives.append((datarow["Tweet_Content"], trueLabel, predictedLabel))
    elif int(trueLabel) == 0:
        if int(predictedLabel) == 1:
            falsePositives.append((datarow["Tweet_Content"], trueLabel, predictedLabel))
print"All Test data #:"+str(len(test_y.index.values))
print"False Positives:"+str(int(len(falsePositives)*100.0/len(test_y.index.values)))+"%"
print"False Negatives:"+str(int(len(falseNegatives)*100.0/len(test_y.index.values)))+"%"
print ''
print"Some false positives (tweet, true label, predicted label):"
for row in falsePositives[:5]:
    print row
print ''
print"Some false negatives (tweet, true label, predicted label):"
for row in falseNegatives[:5]:
    print row


All Test data #:441
False Positives:21%
False Negatives:6%

Some false positives (tweet, true label, predicted label):
('getting sick :| i think ima get the turtle flu lmfaaooo', 0.0, 1)
("taking the compact carrs for their piggy flu shots. i'm so against the idea, this better not make them sick! #fb #worried #hateshots", 0.0, 1)
("rt @ewjjr: difference between bird flu & swine flu: for bird flu you get tweetment. for swine flu you get oinkment. /that's so bad it's good", 0.0, 1)
("i want to see obama, his lovely bride and kids get the flu shot first... i don't trust them... talk about fear tactics!  wake up america!", 0.0, 1)
("my irrational fear of needles is not helping me get my act together and get a flu shot today... :'(", 0.0, 1)

Some false negatives (tweet, true label, predicted label):
('#caringcurrents #h1n1 swine flu alert: adults ages 50 and older getting sicker, dying in higher numbers  http://bit.ly/13bjgf  #eldercare', 1.0, 0)
("concerned about swine flu, wolfson childr