In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_path = r'C:\Users\Toshiba P55w\sentiment labelled sentences\sentiment labelled sentences\yelp_labelled.txt'
yelp = pd.read_csv(data_path, delimiter='\t', header=None)
yelp.columns = ['Review', 'Positive']
yelp.head()

Unnamed: 0,Review,Positive
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Goal: build a naive bayes classifier that predicts whether a review is positive or negative

## Feature engineering overview:
I am going to identify words that are used a lot more in one type of review than the other. For each one of these words there will be a feature that shows whether the review contains the word. 

In [3]:
yelp['Word_list'] = yelp['Review'].str.lower().str.replace(r'[^\w\s]', '',).str.strip().str.split()

In [4]:
yelp.head()

Unnamed: 0,Review,Positive,Word_list
0,Wow... Loved this place.,1,"[wow, loved, this, place]"
1,Crust is not good.,0,"[crust, is, not, good]"
2,Not tasty and the texture was just nasty.,0,"[not, tasty, and, the, texture, was, just, nasty]"
3,Stopped by during the late May bank holiday of...,1,"[stopped, by, during, the, late, may, bank, ho..."
4,The selection on the menu was great and so wer...,1,"[the, selection, on, the, menu, was, great, an..."


In [5]:
word_count_positive_dct = {}

word_count_negative_dct = {}

#turn every column into list of words. steps: lower case, replace nonalphanumeric chars with space, strip, split at spaces
review_word_list_positive = yelp[yelp['Positive']==1]['Word_list']

review_word_list_negative = yelp[yelp['Positive']==0]['Word_list']
#iterate through word lists and count words in dictionary
for review in review_word_list_positive:
    for word in review:
        word_count_positive_dct.setdefault(word, 0)
        word_count_positive_dct[word] +=1

for review in review_word_list_negative:
    for word in review:
        word_count_negative_dct.setdefault(word, 0)
        word_count_negative_dct[word] +=1

In [6]:
all_words = set(list(word_count_positive_dct.keys())+list(word_count_negative_dct.keys()))
word_counts = pd.DataFrame(list(all_words), columns=['Word'])

word_counts['Positive_count'] = word_counts['Word'].apply(lambda x: word_count_positive_dct.get(x, 0))
word_counts['Negative_count'] = word_counts['Word'].apply(lambda x: word_count_negative_dct.get(x, 0))
word_counts['Total_count'] = word_counts['Positive_count'] + word_counts['Negative_count']

# ratio of reviews word is in that are positive
word_counts['P_ratio'] = round(word_counts['Positive_count']/(word_counts['Positive_count'] + word_counts['Negative_count']),4)

# ratio of reviews word is in that are negative
word_counts['N_ratio'] = round(word_counts['Negative_count']/(word_counts['Positive_count'] + word_counts['Negative_count']),4)


# use words that show up in one type of review at a minimum ratio and that show up in all review a minimum number of times
min_ratio = .8
min_total_count = 5
keywords_df = word_counts[((word_counts['P_ratio'] > min_ratio) | (word_counts['N_ratio'] > min_ratio)) &
                          (word_counts['Total_count'] > min_total_count)
                         ]

keywords_df

Unnamed: 0,Word,Positive_count,Negative_count,Total_count,P_ratio,N_ratio
22,awesome,12,0,12,1.0000,0.0000
74,5,7,1,8,0.8750,0.1250
90,waited,0,10,10,0.0000,1.0000
100,town,7,0,7,1.0000,0.0000
104,prices,9,1,10,0.9000,0.1000
177,her,0,6,6,0.0000,1.0000
195,delicious,23,0,23,1.0000,0.0000
197,great,70,0,70,1.0000,0.0000
243,think,3,13,16,0.1875,0.8125
246,fantastic,12,0,12,1.0000,0.0000


In [7]:
keywords = list(keywords_df['Word'])

for key in keywords:
    
    yelp[str(key)] = yelp['Word_list'].apply(lambda x: key in x)

In [8]:
data = yelp[keywords]
target = yelp['Positive']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("accuracy : {}".format(
    (target == y_pred).sum()/data.shape[0]
))

accuracy : 0.792
