In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_curve, auc

In [2]:
data_set = pd.read_csv('amazon_cells_labelled.txt', delimiter='\t', \
                       header=None, names=['comments', 'sentiment'])
pd.set_option('display.max_colwidth', 1000)
data_set['words'] = data_set.comments.str.strip().str.split('[\W_]+')
replace = {'\.':' ','\,':' ','\!':' ','\?':' '}
data_set['comments'].replace(replace,inplace=True,regex=True)
data_set.head()

Unnamed: 0,comments,sentiment,words
0,So there is no way for me to plug it in here in the US unless I go by a converter,0,"[So, there, is, no, way, for, me, to, plug, it, in, here, in, the, US, unless, I, go, by, a, converter, ]"
1,Good case Excellent value,1,"[Good, case, Excellent, value, ]"
2,Great for the jawbone,1,"[Great, for, the, jawbone, ]"
3,Tied to charger for conversations lasting more than 45 minutes MAJOR PROBLEMS,0,"[Tied, to, charger, for, conversations, lasting, more, than, 45, minutes, MAJOR, PROBLEMS, ]"
4,The mic is great,1,"[The, mic, is, great, ]"


In [3]:
rows = list()
for row in data_set[['sentiment', 'words']].iterrows():
    r = row[1]
    for word in r.words:
        rows.append((r.sentiment, word))

words = pd.DataFrame(rows, columns=['sentiment', 'word'])
words = words[words.word.str.len() > 0]
words['word'] = words.word.str.lower()
words.head()


Unnamed: 0,sentiment,word
0,0,so
1,0,there
2,0,is
3,0,no
4,0,way


In [4]:
counts = words.groupby('sentiment')\
    .word.value_counts()\
    .to_frame()\
    .rename(columns={'word':'n_w'})
#counts.sort_values(by='n_w', ascending=False)
word_sum = counts.groupby(level=0)\
    .sum()\
    .rename(columns={'n_w': 'n_d'})
tf = counts.join(word_sum)

tf['tf'] = tf.n_w/tf.n_d
tf.reset_index(drop=False, inplace=True, col_level=0, col_fill='')
a = ['the','and','i','it','is', 'a','this','to','my', 'of','for','that']
tf = tf[~tf['word'].isin(a)]
tf[tf['sentiment']==1].sort_values(by='tf', ascending=False)
keywords=['great','very','good','excellent','nice','best','recommend']



In [5]:
for key in keywords:

    data_set[str(key)] = data_set.comments.str.contains(str(key),
        case=False
    )
data_set['sentiment_bool'] = (data_set['sentiment'] == 1) 


In [6]:
data = data_set[keywords]
target = data_set['sentiment_bool']
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 286


In [15]:
#lift_data = data_set[data_set['sentiment']==1].count()
for i, key in enumerate(keywords):
    lift_data = data_set[data_set['sentiment']==1].count()
    tpr_cal = data_set[(data_set[key]==True)&(data_set['sentiment']==1)].count()\
    / lift_data[0]
    print(tpr_cal)
 



comments          0.184
sentiment         0.184
words             0.184
great             0.184
very              0.184
good              0.184
excellent         0.184
nice              0.184
best              0.184
recommend         0.184
sentiment_bool    0.184
dtype: float64
comments          0.16
sentiment         0.16
words             0.16
great             0.16
very              0.16
good              0.16
excellent         0.16
nice              0.16
best              0.16
recommend         0.16
sentiment_bool    0.16
dtype: float64
comments          0.124
sentiment         0.124
words             0.124
great             0.124
very              0.124
good              0.124
excellent         0.124
nice              0.124
best              0.124
recommend         0.124
sentiment_bool    0.124
dtype: float64
comments          0.052
sentiment         0.052
words             0.052
great             0.052
very              0.052
good              0.052
excellent         0.052
nice  

In [8]:
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in keywords:
    y_true = data_set[data_set['sentiment']==1]
    y_score = data_set[(data_set[i]==True)&(data_set['sentiment']==1)
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
roc_auc
# Compute micro-average ROC curve and ROC area
#fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
#roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

SyntaxError: invalid syntax (<ipython-input-8-3ad489c6e4c4>, line 8)