In [1]:
# import required libraries

import re
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split

# plotting library
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import init_notebook_mode

plotly.tools.set_credentials_file(username='shresthamanjil21', api_key='LhGoc8Zt7yNBNWCNOZtc')
plotly.tools.set_config_file(world_readable=True, sharing='public')



In [2]:
# read training and testing files
train_datas = pd.read_csv('train.tsv', sep='\t')
test_datas = pd.read_csv('test.tsv', sep='\t')

# separate training message and labels
X = train_datas['Phrase']
y = train_datas['Sentiment']

# remove numbers from data
def replace_numbers(val):
    return re.sub("[^a-zA-Z]", " ", val)

X = X.apply(replace_numbers)

In [3]:
# split training and testing data in 75-25 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [4]:
# function to train the model
def train(classifier, X_train, X_test, y_train, y_test):         
    classifier.fit(X_train, y_train)    
    return classifier

In [5]:
# pipeline for vectorizing data and naive bayes classifier
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.05))
])

clf = train(trial1, X_train, X_test, y_train, y_test)

nb_score = clf.score(X_test, y_test)     # accuracy score for naive bayes 
print ("Accuracy nb: %s" % nb_score)

Accuracy nb: 0.6009227220299884


In [6]:
# pipeline for vectorizing data and linear regression
trial2 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', LinearRegression())
])

lg_clf = train(trial2, X_train, X_test, y_train, y_test)

lg_score = lg_clf.score(X_test, y_test)      # accuracy score for linear regression
print ("Accuracy lg: %s" % lg_score)

Accuracy lg: 0.42902678159937935


In [7]:
# extracting top20 most and least popular words with frequencies from tfidf vectorized dictionary
feature_vect = clf.named_steps['vectorizer'].vocabulary_

sorted_feature_vect_highest = sorted(feature_vect.items(), key=lambda t: t[1], reverse=True)[:20]
sorted_feature_vect_words_highest = [items[0] for items in sorted_feature_vect_highest]
sorted_feature_vect_frequencies_highest = [items[1] for items in sorted_feature_vect_highest]

sorted_feature_vect_lowest = sorted(feature_vect.items(), key=lambda t: t[1], reverse=True)[-20:]
sorted_feature_vect_words_lowest = [items[0] for items in sorted_feature_vect_lowest]
sorted_feature_vect_frequencies_lowest = [items[1] for items in sorted_feature_vect_lowest]

In [73]:
# showing the frequent words in bar diagram
fig1 = go.Bar(x=sorted_feature_vect_words_highest, y = sorted_feature_vect_frequencies_highest)
fig2 = go.Bar(x = sorted_feature_vect_words_lowest, y = sorted_feature_vect_frequencies_lowest)

fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Highest frequencies', 'Lowest frequencies'))

fig.append_trace(fig1, 1, 1)
fig.append_trace(fig2, 1, 2)

fig['layout'].update(height=600, width=1300, title='Highest & Lowest frequencies word!')

py.iplot(fig, filename='highest-lowest-frequency-word')


plotly.graph_objs.Font is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Font
  - plotly.graph_objs.layout.hoverlabel.Font
  - etc.




This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



In [52]:
# get the predictions for the testing data on the above naive bayes model
preds = clf.predict(test_datas.Phrase)

In [53]:
# getting counts of all classified data
neg_pred = np.count_nonzero(preds == 0)
somewhat_pos_pred = np.count_nonzero(preds == 1)
neut_pred = np.count_nonzero(preds == 2)
somewhat_neg_pred = np.count_nonzero(preds == 3)
pos_pred = np.count_nonzero(preds == 4)

In [54]:
# preparing data for pie chart
labels = ['Positive', 'Negative', 'Somewhat Positive', 'Somewhat NEgative', 'Neutral']
values = [pos_pred, neg_pred, somewhat_pos_pred, somewhat_neg_pred, neut_pred]
colors = ['#FEBFB3', '#E1396C', '#96D38C', '#67c29d', '#f6f3h1']

In [55]:
# plot classification amount in pie chart
trace = go.Pie(
            labels=labels, values=values,
            hoverinfo='label+percent', textinfo='value', 
            textfont=dict(size=20),
            marker=dict(colors=colors, 
            line=dict(color='#000000', width=2))
        )
py.iplot([trace], filename='styled_pie_chart')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~shresthamanjil21/0 or inside your plot.ly account where it is named 'styled_pie_chart'


In [56]:
# plot the comparison of naive bayes and linear regression accuracy scores
labels = ['Naive Bayes', 'Linear Regression']
values = [nb_score, lg_score]
colors = ['#FEBFB3', '#E1396C']

trace = go.Pie(
            labels=labels, values=values,
            hoverinfo='label+percent', textinfo='value', 
            textfont=dict(size=20),
            marker=dict(colors=colors, 
            line=dict(color='#000000', width=2))
        )
py.iplot([trace], filename='algorithm comparison')