In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
import nltk
from nltk.corpus import stopwords

In [2]:
data_set = pd.read_csv('amazon_cells_labelled.txt', delimiter='\t', \
                       header=None, names=['commentz', 'sentiment'])
pd.set_option('display.max_colwidth', 1000)
data_set['words'] = data_set.commentz.str.strip().str.split('[\W_]+')
replace = {'\.':' ','\,':' ','\!':' ','\?':' '}
data_set['commentz'].replace(replace,inplace=True,regex=True)


In [3]:
rows = list()
for row in data_set[['sentiment', 'words']].iterrows():
    r = row[1]
    for word in r.words:
        rows.append((r.sentiment, word))

words = pd.DataFrame(rows, columns=['sentiment', 'word'])
words = words[words.word.str.len() > 0]
words['word'] = words.word.str.lower()



In [9]:
counts = words.groupby('sentiment')\
    .word.value_counts()\
    .to_frame()\
    .rename(columns={'word':'n_w'})
#counts.sort_values(by='n_w', ascending=False)
word_sum = counts.groupby(level=0)\
    .sum()\
    .rename(columns={'n_w': 'n_d'})
tf = counts.join(word_sum)

tf['tf'] = tf.n_w/tf.n_d
tf.reset_index(drop=False, inplace=True, col_level=0, col_fill='')
#stop_words = set(stopwords.words('english'))
parts = ['e','t','r','o','a','s','i','d','m','he','re','ve','ll']
tf = tf[~tf['word'].isin(parts)]
tf[tf['sentiment']==1].sort_values(by='tf', ascending=False)
keywords=['great','good','excellent','nice','very','best','recommend']
all_data_set = data_set

all_keywords = tf[tf['sentiment']==1]['word'].tolist()



In [8]:
for key in keywords:
    col_list = keywords
    data_set[key] = 0
    data_set['hit'] = data_set[col_list].sum(axis=1)
    np.where(data_set['hit']<2,\
             np.where(data_set.commentz.str.contains(key,case=False),1,0),0)

data_set
       




KeyError: "['good' 'excellent' 'nice' 'very' 'best' 'recommend'] not in index"

In [None]:
for all_key in all_keywords:
    all_data_set[all_key] = all_data_set.commentz.str.contains(all_key,case=False )


In [None]:
data_set['sentiment_bool'] = (data_set['sentiment'] == 1)
data = data_set[keywords]
target = data_set['sentiment_bool']
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

In [None]:
kw = list(keywords)
scores = dict()

for key in kw:
    scores.update({key:[data_set[(data_set[key]==1)\
            &(data_set['sentiment']==1)][key].count(),\
                        data_set[(data_set[key]==1)\
           &(data_set['sentiment']==0)][key].count(),\
                       data_set[(data_set[key]==0)\
           &(data_set['sentiment']==1)][key].count(),\
                       data_set[(data_set[key]==0)\
           &(data_set['sentiment']==0)][key].count()]})

tp = pd.DataFrame.from_dict(scores, orient='index')
tp.reset_index(inplace=True)
tp.rename({'index':'key_word',0:'true_pos',1:'false_neg',\
          2:'false_pos',3:'true_neg',4:'hit'},axis=1,inplace=True)
tp.sort_values(['true_pos'],ascending=False,inplace=True)

tp['perc'] = tp['true_pos'] / (tp['true_pos'] + tp['false_pos'])
tp['acc'] = (tp['true_pos'] + tp['true_neg']) / (tp['true_pos'] + tp['false_pos'] \
                                            + tp['false_neg'] + tp['true_neg'])

tp['f'] = 2 / ((1/tp['perc']) + (1/tp['true_pos']))


tp['tp_rate'] = tp['true_pos'] / (tp['true_pos'] + tp['false_neg'])
tp['fp_rate'] = tp['false_pos'] / (tp['false_pos'] + tp['true_neg'])

tp

In [None]:
plt.scatter(tp.fp_rate,tp.tp_rate,label='model')
plt.plot([0,1],[0,1],'b--', label='chance')
plt.title('Lift Curve\nAmazon Customer Reviews')
plt.xlabel('False Negative Rate\n(1-specificity)')
plt.xlim(0,1)
plt.ylabel('True Positive Rate\n(Sensitivity)')
plt.ylim(0,1)
plt.legend()
plt.grid()
plt.show()

In [None]:
all_kw = list(all_keywords)
all_scores = dict()

for key in all_kw:
    all_scores.update({key:[all_data_set[(all_data_set[key]==True)\
           &(all_data_set['sentiment']==1)][key].count(),\
                        all_data_set[(all_data_set[key]==True)\
           &(all_data_set['sentiment']==0)][key].count(),
                       all_data_set[(all_data_set[key]==False)\
           &(all_data_set['sentiment']==1)][key].count(),
                       all_data_set[(all_data_set[key]==False)\
           &(all_data_set['sentiment']==0)][key].count()]})

all_tp = pd.DataFrame.from_dict(all_scores, orient='index')
all_tp.reset_index(inplace=True)
all_tp.rename({'index':'key_word',0:'true_pos',1:'false_pos',\
          2:'false_neg',3:'true_neg'},axis=1,inplace=True)
all_tp.sort_values(['true_pos','false_pos'],inplace=True)
all_tp['cum_tp'] = all_tp['true_pos'].cumsum()
all_tp['cum_fp'] = all_tp['false_pos'].cumsum()
all_tp['cum_fn'] = all_tp['false_neg'].cumsum()
all_tp['cum_tn'] = all_tp['true_neg'].cumsum()

all_tp['perc'] = all_tp['cum_tp'] / (all_tp['cum_tp'] + all_tp['cum_fp'])
all_tp['acc'] = (all_tp['cum_tp'] + all_tp['cum_tn']) / (all_tp['cum_tp'] + all_tp['cum_fp'] \
                                            + all_tp['cum_fn'] + all_tp['cum_tn'])
all_tp['tp_rate'] = all_tp['cum_tp'] / (all_tp['cum_tp'] + all_tp['cum_fn'])
all_tp['fp_rate'] = all_tp['cum_fp'] / (all_tp['cum_fp'] + all_tp['cum_tn'])
all_tp['f'] = 2 / ((1/all_tp['perc']) + (1/all_tp['cum_tp']))


all_tp

In [None]:
plt.scatter(all_tp.fpr_cum,all_tp.tpr_cum,label='model')
plt.plot([0,1],[0,1],'b--', label='chance')
plt.title('Lift Curve\nAmazon Customer Reviews')
plt.xlabel('False Negative Rate\n(1-specificity)')
plt.xlim(0,1)
plt.ylabel('True Positive Rate\n(Sensitivity)')
plt.ylim(0,1)
plt.legend()
plt.grid()
plt.show()