In [1]:
# General libraries.
import re
import numpy as np
import pandas as pd


# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *



In [2]:
# LOAD THE DATA
filename = "data/Labeled_Colorado_Flu_Study_Tweets_AvI_RvN_SvO.csv"
coloradoData = pd.read_csv(filename, sep="\t")
print(coloradoData.shape)

(5270, 5)


In [3]:
coloradoData.head(5)

Unnamed: 0,Tweet_ID,Tweet_Content,Awareness_Label,Related_Label,Self_Label
0,5926297601,"don't worry it's not swine flu, i already got ...",1.0,,1.0
1,5222838706,muh. if i am getting sick and it's not swine f...,0.0,1.0,1.0
2,5233654812,what is up with my boy? this morning i though...,0.0,,0.0
3,5918860304,"getting better,no more piggy flu 4 me,it was n...",0.0,1.0,1.0
4,4631607800,@robbsterr yay for man txting you.. in other n...,0.0,0.0,1.0


In [4]:
# Preprocessing
# get all data with labels present for the column we care about (Related/NotRelated)
coloradoVal = coloradoData.dropna(subset=["Related_Label"])
print(coloradoVal.shape)

# extract X and Y as np arrays, so that we can feed them to tensors. 
X = coloradoVal["Tweet_Content"]
Y = coloradoVal["Related_Label"]
print(X.values.shape)
print(Y.values.shape)

(4413, 5)
(4413,)
(4413,)


In [5]:
# Split into train, test, dev (80%,10%,10%) (3530, 441, 442 each)
np.random.seed(42)
# train_data, train_labels = X[:3530], Y[:3530]
# test_data, test_labels = X[3530:3971], Y[3530:3971]
# dev_data, dev_labels = X[3971:], Y[3971:]

train_data, test_data, dev_data = np.split(X.sample(frac=1), [int(.8*len(X)), int(.9*len(X))])

train_labels = Y[train_data.index]
test_labels = Y[test_data.index]
dev_labels = Y[dev_data.index]

# convert to numpy arrays
train_data, test_data, dev_data, train_labels, test_labels, dev_labels = \
    train_data.values, test_data.values, dev_data.values, \
    train_labels.values, test_labels.values, dev_labels.values 
train_labels = train_labels.astype(int)
test_labels = test_labels.astype(int)
dev_labels = dev_labels.astype(int)

print(type(train_data))
print 'training label shape:', train_labels.shape
print 'test label shape:', test_labels.shape
print 'dev label shape:', dev_labels.shape
# print(train_labels)
# print(train_data)


<type 'numpy.ndarray'>
training label shape: (3530,)
test label shape: (441,)
dev label shape: (442,)


In [6]:
# Find the best parameters using CountVectorizer
cv = CountVectorizer(analyzer='word')
cvtrain = cv.fit_transform(train_data)
# print(train.shape) # [samples, features] (3530, 8553)

# Find the best parameters for C in logistic regression
logit = LogisticRegression() # default penalty='l2'
clist = {'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}
lr = GridSearchCV(logit,clist)
lr.fit(cvtrain, train_labels)
print "Best logistic regression C :", lr.best_params_, "best score: ",lr.best_score_

# for c in clist['C']:
#     logit2=LogisticRegression(C=c)
#     logit2.fit(cvtrain, train_labels)
#     weight=[]
#     for x in range(len(logit2.coef_)):
#         weight.append(sum(logit2.coef_[x]**2))
#     print "C=", c, ", sum of squared weight values:", weight
    
# Find the best parameters for alpha in Multinomial Bayes
mb = MultinomialNB()
alphas = {'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}        
mnb= GridSearchCV(mb, alphas)
mnb.fit(cvtrain, train_labels)
print "Best multinomial bayes alpha: ", mnb.best_params_,"best score: ", mnb.best_score_

# for a in alphas['alpha']:
#     mb2=MultinomialNB(alpha=a)
#     mb2.fit(cvtrain, train_labels)
#     weight=[]
#     for x in range(len(mb2.coef_)):
#         weight.append(sum(mb2.coef_[x]**2))
#     print "alpha=", a, ", sum of squared weight values:", weight


Best logistic regression C : {'C': 0.1} best score:  0.749008498584
Best multinomial bayes alpha:  {'alpha': 0.5} best score:  0.750991501416


In [7]:
# Find the best parameters using TfidfVectorizer
tf= TfidfVectorizer(analyzer='word')
tftrain=tf.fit_transform(train_data)

# Find the best parameters for C in logistic regression
logit = LogisticRegression() # default penalty='l2'
clist = {'C': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}
lr = GridSearchCV(logit,clist)
lr.fit(tftrain, train_labels)
print "Best logistic regression C :", lr.best_params_, "best score: ",lr.best_score_

# for c in clist['C']:
#     logit2=LogisticRegression(C=c)
#     logit2.fit(train, train_labels)
#     weight=[]
#     for x in range(len(logit2.coef_)):
#         weight.append(sum(logit2.coef_[x]**2))
#     print "C=", c, ", sum of squared weight values:", weight
    
# Find the best parameters for alpha in Multinomial Bayes
mb = MultinomialNB()
alphas = {'alpha': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 2.0, 10.0]}        
mnb= GridSearchCV(mb, alphas)
mnb.fit(tftrain, train_labels)
print "Best multinomial bayes alpha: ", mnb.best_params_,"best score: ", mnb.best_score_

# for a in alphas['alpha']:
#     mb2=MultinomialNB(alpha=a)
#     mb2.fit(tftrain, train_labels)
#     weight=[]
#     for x in range(len(mb2.coef_)):
#         weight.append(sum(mb2.coef_[x]**2))
#     print "alpha=", a, ", sum of squared weight values:", weight


Best logistic regression C : {'C': 0.5} best score:  0.747875354108
Best multinomial bayes alpha:  {'alpha': 0.3} best score:  0.738526912181


In [8]:
def better_preprocessor(s):
    s = s.lower()
    s = re.sub('^[^a-zA-z]*|[^a-zA-Z]*$','',s)
    s = re.sub('\s+', ' ', s).strip() 
    s = re.sub(r'\b\d+\b', '', s)
    s = re.sub(r'<.*?>', '', s)
    s = re.sub(r"\\", "", s)    
    s = re.sub(r"\'", "", s)    
    s = re.sub(r"\"", "", s) 
    return s
# TO DO: add specific twitter preprocessor
# https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/

def preprocess():
    # no processing
    vect = CountVectorizer(preprocessor=None) # set preprocessor to default none
    cvdata=vect.fit_transform(train_data)
    logit = LogisticRegression() # default penalty='l2'
    logit.fit(cvdata, train_labels)
    
    dev=vect.transform(dev_data)
    pred = logit.predict(dev)
    score = metrics.f1_score(dev_labels, pred, average='micro')
    print "Dictionary size without preprocessing: ", len(vect.vocabulary_) # without preprocessing
    print "F1 score without preprocessing: ", score
    
    # preprocessing
    cv = CountVectorizer(preprocessor=better_preprocessor)
    cvdata2=cv.fit_transform(train_data)
    logit2 = LogisticRegression() # default penalty='l2'
    logit2.fit(cvdata2, train_labels)
    
    dev2=cv.transform(dev_data)
    pred2 = logit2.predict(dev2)
    score2 = metrics.f1_score(dev_labels, pred2, average='micro')
    print "Dictionary size with preprocessing: ", len(cv.vocabulary_)
    print "F1 score with preprocessing: ", score2
    print "Dictionary size reduction: ", len(vect.vocabulary_)-len(cv.vocabulary_)


preprocess()

Dictionary size without preprocessing:  8553
F1 score without preprocessing:  0.739819004525
Dictionary size with preprocessing:  8487
F1 score with preprocessing:  0.733031674208
Dictionary size reduction:  66


In [9]:
def cv_log(param, data, labels, processor=None):

    tf= CountVectorizer(analyzer='word', preprocessor=processor)
    tfdata=tf.fit_transform(train_data)
    logit = LogisticRegression(C=param)
    logit.fit(tfdata, train_labels)
    
    tfdev=tf.transform(data)
    
    # predict classification
    predict= logit.predict(tfdev)

    accuracy=metrics.f1_score(labels, predict, average='micro')
    return accuracy

# ## STUDENT END ###

print('dev data accuracy:', cv_log(0.1, dev_data, dev_labels))
print('dev data processed accuracy:', cv_log(0.1, dev_data, dev_labels, better_preprocessor))
print('test data accuracy:', cv_log(0.1, test_data, test_labels))
print('test data processed accuracy:', cv_log(0.1, test_data, test_labels,better_preprocessor))

('dev data accuracy:', 0.75565610859728505)
('dev data processed accuracy:', 0.76244343891402711)
('test data accuracy:', 0.72108843537414979)
('test data processed accuracy:', 0.71882086167800452)


In [10]:
def tfid_log(param, data, labels, processor=None):

    tf= TfidfVectorizer(analyzer='word', preprocessor=processor)
    tfdata=tf.fit_transform(train_data)
    logit = LogisticRegression(C=param)
    logit.fit(tfdata, train_labels)
    
    tfdev=tf.transform(data)
    
    # predict classification
    predict= logit.predict(tfdev)
    
    accuracy=metrics.f1_score(labels, predict, average='micro')
    return accuracy


print('dev data accuracy:', tfid_log(0.5, dev_data, dev_labels))
print('dev data processed accuracy:', tfid_log(0.5, dev_data, dev_labels, better_preprocessor))
print('test data accuracy:', tfid_log(0.5, test_data, test_labels))
print('test data processed accuracy:', tfid_log(0.5, test_data, test_labels, better_preprocessor))

('dev data accuracy:', 0.76018099547511309)
('dev data processed accuracy:', 0.75565610859728505)
('test data accuracy:', 0.72335600907029485)
('test data processed accuracy:', 0.72108843537414979)


In [11]:
def cv_mnb(param, data, label, processor=None):

    tf= CountVectorizer(analyzer='word', preprocessor=processor)
    tfdata=tf.fit_transform(train_data)
    mb = MultinomialNB(alpha=param)
    mb.fit(tfdata, train_labels)
    
    tfdev=tf.transform(data)
    
    # predict classification
    predict= mb.predict(tfdev)
    
    accuracy=metrics.f1_score(label, predict, average='micro')
    return accuracy


print('dev data accuracy:', cv_mnb(0.5, dev_data, dev_labels))
print('dev data processed accuracy:', cv_mnb(0.5, dev_data, dev_labels, better_preprocessor))
print('test data accuracy:', cv_mnb(0.5, test_data, test_labels))
print('test data processed accuracy:', cv_mnb(0.5, test_data, test_labels, better_preprocessor))

('dev data accuracy:', 0.76923076923076927)
('dev data processed accuracy:', 0.76470588235294124)
('test data accuracy:', 0.74149659863945583)
('test data processed accuracy:', 0.73242630385487528)


In [12]:
def tf_mnb(param, data, label, processor=None):

    tf= TfidfVectorizer(analyzer='word', preprocessor=processor)
    tfdata=tf.fit_transform(train_data)
    mb = MultinomialNB(alpha=param)
    mb.fit(tfdata, train_labels)
    
    tfdev=tf.transform(data)
    
    # predict classification
    predict= mb.predict(tfdev)

    accuracy=metrics.f1_score(label, predict, average='micro')
    return accuracy


print('dev data accuracy:', tf_mnb(0.3, dev_data, dev_labels))
print('dev data processed accuracy:', tf_mnb(0.3, dev_data, dev_labels, better_preprocessor))
print('test data accuracy:', tf_mnb(0.3, test_data, test_labels))
print('test data processed accuracy:', tf_mnb(0.3, test_data, test_labels, better_preprocessor))

('dev data accuracy:', 0.76018099547511309)
('dev data processed accuracy:', 0.75565610859728505)
('test data accuracy:', 0.71201814058956925)
('test data processed accuracy:', 0.71655328798185947)
