In [1]:
from __future__ import division # ensure that all division is float division
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
#import all the necessary modules
import os, sys, re
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
sns.set_style("whitegrid")

# Load Dataset

In [2]:
os.chdir(r"C:\Users\Dominique Njinkeu\Documents\GitHub\Projects\notebook")
df=pd.read_csv("tweets_sample.csv")
df.Support.value_counts()

Neutral     1644
Panthers     827
Broncos      529
Name: Support, dtype: int64

In [3]:
# map the label values for scikit-learn
df["Label"]=df.Support.map({"Neutral":0,"Panthers":1,"Broncos":2})
#determine feature and label variables
X=df.text
y=df.Label

# Train/Test split

In [4]:
#perform train/test split with startify option to keep the proportion of teams the same for test and train variable
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2,stratify=y)

In [5]:
os.chdir(r"C:\Users\Dominique Njinkeu\Documents\GitHub\Projects\codes")
import ParseOutText

# Classifiers

In [6]:
#multinomialNB classifier
from sklearn.naive_bayes import MultinomialNB
clfNB=MultinomialNB()
#knn classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=15)
#linear SVM
from sklearn import svm
linear_svc=svm.SVC(kernel='linear')
#RBF SVM
rbf_svm=svm.SVC(kernel='rbf',C=470)
#Logistic Regression
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)

# Naive Bayes: find the optimal parameters and number of features


In [7]:
vect=CountVectorizer()
X_train_dtm=vect.fit_transform(X_train)
print "Features: ",X_train_dtm.shape[1]

Features:  5827


In [9]:
ParseOutText.find_best_nb_of_features("vect",5827,clfNB)

maximum attainable score and features:  [[0.84399999999999997, 1106]]


[[0.84399999999999997, 1106]]

In [None]:
#for substanstial reduction of features we use the tdidf features
ParseOutText.find_best_nb_of_features("tfidf",2000,clfNB)

In [8]:
#Naive Bayes: find the optimal number of feature vectors
vect=CountVectorizer()
print "with no bi-gram and stopwords"
ParseOutText.test_classifier(vect,clfNB)
print "with the addition of Bi-gram"
vect=CountVectorizer(ngram_range=(1,2),stop_words='english')
ParseOutText.test_classifier(vect,clfNB)
print "with the removal of stopwords"
vect=CountVectorizer(max_features=1106,ngram_range=(1,2),stop_words='english')
ParseOutText.test_classifier(vect,clfNB)


with no bi-gram and stopwords
Features:  5827
Accuracy:  0.801333333333
with the addition of Bi-gram
Features:  17561
Accuracy:  0.834666666667
with the removal of stopwords
Features:  1106
Accuracy:  0.844


In [15]:
#tfidf vectorizer
tfidf=TfidfVectorizer(max_features=21,ngram_range=(1,3),stop_words='english')
ParseOutText.test_classifier(tfidf,clfNB)

Features:  21
Accuracy:  0.824


# Linear SVM: find the optimal parameters and number of features

In [25]:
vect=CountVectorizer()
print "with no bi-gram and stopwords"
ParseOutText.test_classifier(vect,linear_svc)
print "with the addition of Bi-gram"
vect=CountVectorizer(ngram_range=(1,2),stop_words='english')
ParseOutText.test_classifier(vect,linear_svc)
print "with the removal of stopwords"
vect=CountVectorizer(max_features=900,ngram_range=(1,2),stop_words='english')
ParseOutText.test_classifier(vect,linear_svc)

with no bi-gram and stopwords
Features:  5827
Accuracy:  0.881333333333
with the addition of Bi-gram
Features:  17561
Accuracy:  0.888
with the removal of stopwords
Features:  900
Accuracy:  0.886666666667


In [26]:
ParseOutText.find_best_nb_of_features("vect",800,linear_svc)

maximum attainable score and features:  [[0.89866666666666661, 768]]


[[0.89866666666666661, 768]]

In [27]:
ParseOutText.find_best_nb_of_features("tfidf",800,linear_svc)

maximum attainable score and features:  [[0.89200000000000002, 591]]


[[0.89200000000000002, 591]]

In [28]:
vect=CountVectorizer(max_features=768,ngram_range=(1,2),stop_words='english')
ParseOutText.test_classifier(vect,linear_svc)

Features:  768
Accuracy:  0.898666666667


In [30]:
tfidf_SVM=TfidfVectorizer(max_features=591,ngram_range=(1,3),stop_words='english')
ParseOutText.test_classifier(tfidf,linear_svc)

Features:  591
Accuracy:  0.892


# Logistic regression: find the optimal parameters and number of features

In [38]:
vect=CountVectorizer()
print "with no bi-gram and stopwords"
ParseOutText.test_classifier(vect,logreg)
print "with the addition of Bi-gram"
vect=CountVectorizer(ngram_range=(1,2),stop_words='english')
ParseOutText.test_classifier(vect,logreg)
print "with the removal of stopwords"
vect=CountVectorizer(max_features=900,ngram_range=(1,2),stop_words='english')
ParseOutText.test_classifier(vect,logreg)

with no bi-gram and stopwords
Features:  5827
Accuracy:  0.872
with the addition of Bi-gram
Features:  29920
Accuracy:  0.884
with the removal of stopwords
Features:  900
Accuracy:  0.801333333333


In [39]:
ParseOutText.find_best_nb_of_features("vect",5827,logreg)

maximum attainable score and features:  [[0.88400000000000001, 4742]]


[[0.88400000000000001, 4742]]

In [42]:
vect=CountVectorizer(ngram_range=(1,2),stop_words='english',max_features=4742)
ParseOutText.test_classifier(vect,logreg)

Features:  4742
Accuracy:  0.884


In [55]:
ParseOutText.find_best_nb_of_features("tfidf",5827,logreg)

maximum attainable score and features:  [[0.8746666666666667, 5539]]


[[0.8746666666666667, 5539]]

In [62]:
tfidf=TfidfVectorizer(max_features=5539,ngram_range=(1,3),stop_words='english')
ParseOutText.test_classifier(tfidf,logreg)

Features:  5539
Accuracy:  0.874666666667


# knn: find the optimal parameters and number of features

In [72]:
knn = KNeighborsClassifier(n_neighbors=15)
ParseOutText.find_best_nb_of_features("vect",5827,knn)

maximum attainable score and features:  [[0.85199999999999998, 5730]]


[[0.85199999999999998, 5730]]

In [75]:
ParseOutText.find_best_nb_of_features("tfidf",5827,knn)

maximum attainable score and features:  [[0.85333333333333339, 21]]


[[0.85333333333333339, 21]]

In [81]:
tfidf=TfidfVectorizer(max_features=150,ngram_range=(1,3),stop_words='english')
ParseOutText.test_classifier(tfidf,logreg)

Features:  21
Accuracy:  0.84


In [82]:
vect_NB=CountVectorizer(max_features=1106,ngram_range=(1,2),stop_words='english')
tfidf_NB=TfidfVectorizer(max_features=21,ngram_range=(1,3),stop_words='english')
vect_SVM=CountVectorizer(max_features=768,ngram_range=(1,2),stop_words='english')
tfidf_SVM=TfidfVectorizer(max_features=591,ngram_range=(1,3),stop_words='english')
vect_logreg=CountVectorizer(ngram_range=(1,2),stop_words='english',max_features=4742)
tfidf_logreg=TfidfVectorizer(max_features=5539,ngram_range=(1,3),stop_words='english')
tfidf_knn=TfidfVectorizer(max_features=150,ngram_range=(1,3),stop_words='english')

In [130]:
vect_SVM=CountVectorizer(max_features=1000,ngram_range=(1,2),stop_words='english',
                         analyzer=ParseOutText.split_into_lemmas
            )


In [131]:
ParseOutText.test_classifier(vect_SVM,linear_svc)

Features:  1000
Accuracy:  0.852
