In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.tools as pytools
import cred
import plotly.plotly as py
import plotly.graph_objs as go
import random

In [44]:
pytools.set_credentials_file(username=cred.username, api_key=cred.api_key)

In [45]:
df=pd.read_csv('train.tsv', sep="\t")


In [46]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [47]:
df['Sentiment'].unique()

array([1, 2, 3, 4, 0])

In [48]:
type(df)

pandas.core.frame.DataFrame

In [49]:
df.describe()

Unnamed: 0,PhraseId,SentenceId,Sentiment
count,156060.0,156060.0,156060.0
mean,78030.5,4079.732744,2.063578
std,45050.785842,2502.764394,0.893832
min,1.0,1.0,0.0
25%,39015.75,1861.75,2.0
50%,78030.5,4017.0,2.0
75%,117045.25,6244.0,3.0
max,156060.0,8544.0,4.0


In [50]:
#Tokenize tex with scikit learn
#Convert a collection of text documents to a matrix of token counts
count_vector=CountVectorizer()
#Learn the vocabulary dictionary and return term-document matrix.
x_df_counts=count_vector.fit_transform(df['Phrase'])
x_df_counts.shape

(156060, 15240)

In [51]:
count_vector.vocabulary_.get(u"movie")

8791

In [52]:
count_vector.get_feature_names()[1:10]

['10', '100', '101', '102', '103', '104', '105', '10th', '11']

In [53]:
tf_transformer=TfidfTransformer(use_idf=False).fit(x_df_counts)
x_df_tf=tf_transformer.transform(x_df_counts)
tfidf_transformer=TfidfTransformer()
x_df_tfidf=tfidf_transformer.fit_transform(x_df_counts)

# Separating Training and Test set

In [54]:
random.seed(10)
msk = np.random.rand(len(df)) < 0.8
train = x_df_tfidf[msk]
test = x_df_tfidf[~msk]

# Multinomial Naive Bayes

In [55]:
#Training  Multinomial Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB


clf_MultinomialNB=MultinomialNB().fit(train, df[msk]['Sentiment'])
train_pred_MultinomialNB=clf_MultinomialNB.predict(train)
test_pred_MultinomialNB=clf_MultinomialNB.predict(test)


In [56]:
print(clf_MultinomialNB)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [57]:
#Model Performance Metrics
accuracy_MultinomialNB_train = accuracy_score( df[msk]['Sentiment'],train_pred_MultinomialNB)#TODO
accuracy_MultinomialNB_test = accuracy_score( df[~msk]['Sentiment'],test_pred_MultinomialNB)#TODO
print ("Train Accuracy: ",accuracy_MultinomialNB_train,"\nTest Accuracy:",accuracy_MultinomialNB_test)

confusion_MultinomialNB_train=confusion_matrix(df[msk]['Sentiment'],train_pred_MultinomialNB)
confusion_MultinomialNB_test=confusion_matrix(df[~msk]['Sentiment'],test_pred_MultinomialNB)


Train Accuracy:  0.6311603181581997 
Test Accuracy: 0.5897949297594057


In [58]:
#make training confusion matrix heat map
trace1 = go.Heatmap(z=confusion_MultinomialNB_train, x=[0,1,2,3,4], y=[0,1,2,3,4])
data1=[trace1]
layout1 = go.Layout(title="Multinomial Naive Bayes Confusion Matrix Training",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1, filename='mnb-train-con-heatmap')

In [59]:
#Making test confusion matrix heat map
trace2 = go.Heatmap(z=confusion_MultinomialNB_test, x=[0,1,2,3,4], y=[0,1,2,3,4])
data2=[trace2]
layout2 = go.Layout(title="Multinomial Naive Bayes Confusion Matrix Testing",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig2 = go.Figure(data=data2, layout=layout2)
py.iplot(fig2, filename='mnb-test-con-heatmap')

# Support Vector Machine (Linear Kernel)

In [60]:
from sklearn import svm 

clf_svm_Linear = svm.LinearSVC().fit(train, df[msk]['Sentiment'])
train_pred_svm_linear=clf_svm_Linear.predict(train)
test_pred_svm_linear=clf_svm_Linear.predict(test)

In [61]:
print(clf_svm_Linear)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


In [62]:
#Model Performance Metrics
accuracy_svm_linear_train= accuracy_score( df[msk]['Sentiment'],train_pred_svm_linear)
accuracy_svm_linear_test= accuracy_score( df[~msk]['Sentiment'],test_pred_svm_linear)#
print ("Train Accuracy: ",accuracy_svm_linear_train,"\nTest Accuracy:",accuracy_svm_linear_test)

confusion_svm_linear_train=confusion_matrix(df[msk]['Sentiment'],train_pred_svm_linear)
confusion_svm_linear_test=confusion_matrix(df[~msk]['Sentiment'],test_pred_svm_linear)


Train Accuracy:  0.7229145849154642 
Test Accuracy: 0.6479896657516551


In [63]:
#make training confusion matrix heat map
trace1 = go.Heatmap(z=confusion_svm_linear_train, x=[0,1,2,3,4], y=[0,1,2,3,4])
data1=[trace1]
layout1 = go.Layout(title="SVM Confusion Matrix Training",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1, filename='svm-train-con-heatmap')

In [64]:
#Making test confusion matrix heat map
trace2 = go.Heatmap(z=confusion_svm_linear_test, x=[0,1,2,3,4], y=[0,1,2,3,4])
data2=[trace2]
layout2 = go.Layout(title="SVM Confusion Matrix Testing",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig2 = go.Figure(data=data2, layout=layout2)
py.iplot(fig2, filename='svm-test-con-heatmap')

# Neural Network

In [65]:
from sklearn.neural_network import MLPClassifier


clf_mlp = MLPClassifier(solver='lbfgs',activation='logistic', 
                        alpha=1e-5,hidden_layer_sizes=(2), random_state=1)
clf_mlp=clf_mlp.fit(train, df[msk]['Sentiment'])
train_pred_mlp=clf_mlp.predict(train)
test_pred_mlp=clf_mlp.predict(test)

In [66]:
print(clf_mlp)

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=2, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [67]:
#Model Performance Metrics
accuracy_mlp_train= accuracy_score( df[msk]['Sentiment'],train_pred_mlp)
accuracy_mlp_test= accuracy_score( df[~msk]['Sentiment'],test_pred_mlp)#
print ("Train Accuracy: ",accuracy_mlp_train,"\nTest Accuracy:",accuracy_mlp_test)

confusion_mlp_train=confusion_matrix(df[msk]['Sentiment'],train_pred_mlp)
confusion_mlp_test=confusion_matrix(df[~msk]['Sentiment'],test_pred_mlp)

Train Accuracy:  0.6946400735441065 
Test Accuracy: 0.6481511383820442


In [68]:
#make training confusion matrix heat map
trace1 = go.Heatmap(z=confusion_mlp_train, x=[0,1,2,3,4], y=[0,1,2,3,4])
data1=[trace1]
layout1 = go.Layout(title="Neural Network Confusion Matrix Training",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1, filename='nn-train-con-heatmap')

In [69]:
#Making test confusion matrix heat map
trace2 = go.Heatmap(z=confusion_mlp_test, x=[0,1,2,3,4], y=[0,1,2,3,4])
data2=[trace2]
layout2 = go.Layout(title="Neural Network Confusion Matrix Testing",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig2 = go.Figure(data=data2, layout=layout2)
py.iplot(fig2, filename='nn-test-con-heatmap')

# Decision Trees

In [70]:
from sklearn import tree


clf_dtrees=tree.DecisionTreeClassifier().fit(train, df[msk]['Sentiment'])
train_pred_dtrees=clf_dtrees.predict(train)
test_pred_dtrees=clf_dtrees.predict(test)

In [71]:
print(clf_dtrees)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [72]:
#Model Performance Metrics
accuracy_dtrees_train= accuracy_score( df[msk]['Sentiment'],train_pred_dtrees)
accuracy_dtrees_test= accuracy_score( df[~msk]['Sentiment'],test_pred_dtrees)#
print ("Train Accuracy: ",accuracy_dtrees_train,"\nTest Accuracy:",accuracy_dtrees_test)

confusion_dtrees_train=confusion_matrix(df[msk]['Sentiment'],train_pred_dtrees)
confusion_dtrees_test=confusion_matrix(df[~msk]['Sentiment'],test_pred_dtrees)

Train Accuracy:  0.9528278508333666 
Test Accuracy: 0.580461811722913


In [73]:
#make training confusion matrix heat map
trace1 = go.Heatmap(z=confusion_dtrees_train, x=[0,1,2,3,4], y=[0,1,2,3,4])
data1=[trace1]
layout1 = go.Layout(title="Decision Trees Confusion Matrix Training",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1, filename='dt-train-con-heatmap')

In [74]:
#Making test confusion matrix heat map
trace2 = go.Heatmap(z=confusion_dtrees_test, x=[0,1,2,3,4], y=[0,1,2,3,4])
data2=[trace2]
layout2 = go.Layout(title="Decision Trees Confusion Matrix Testing",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig2 = go.Figure(data=data2, layout=layout2)
py.iplot(fig2, filename='dt-test-con-heatmap')

# Random Forest

In [75]:
from sklearn.ensemble import RandomForestClassifier

clf_forest=RandomForestClassifier(max_depth=None,random_state=0).fit(train, df[msk]['Sentiment'])
train_pred_forest=clf_forest.predict(train)
test_pred_forest=clf_forest.predict(test)

In [76]:
print(clf_forest)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


In [77]:
#Model Performance Metrics
accuracy_forest_train= accuracy_score( df[msk]['Sentiment'],train_pred_forest)
accuracy_forest_test= accuracy_score( df[~msk]['Sentiment'],test_pred_forest)#
print ("Train Accuracy: ",accuracy_forest_train,"\nTest Accuracy:",accuracy_forest_test)

confusion_forest_train=confusion_matrix(df[msk]['Sentiment'],train_pred_forest)
confusion_forest_test=confusion_matrix(df[~msk]['Sentiment'],test_pred_forest)

Train Accuracy:  0.9359766577401175 
Test Accuracy: 0.6261585661230421


In [78]:
#make training confusion matrix heat map
trace1 = go.Heatmap(z=confusion_forest_train, x=[0,1,2,3,4], y=[0,1,2,3,4])
data1=[trace1]
layout1 = go.Layout(title="Random Forest Confusion Matrix Training",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1, filename='rf-train-con-heatmap')

In [80]:
#make testing confusion matrix heat map
trace1 = go.Heatmap(z=confusion_forest_test, x=[0,1,2,3,4], y=[0,1,2,3,4])
data1=[trace1]
layout1 = go.Layout(title="Random Forest Confusion Matrix Training",
                xaxis=dict(title='Predicted Sentiment'),
                yaxis=dict(title='True Sentiment'))
fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1, filename='rf-test-con-heatmap')