In [2]:
import nltk
import numpy as np
from sklearn.utils import shuffle
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
#from bs4 import BeautifulSoup
import pandas as pd

In [3]:
positive_reviews = pd.read_csv('yelp_positive.csv')

negative_reviews = pd.read_csv('yelp_negative.csv')

In [4]:
negative_reviews.shape

(1676, 1)

In [5]:
positive_reviews.shape

(3337, 1)

In [6]:
wordnet_lemmatizer = WordNetLemmatizer()


In [7]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))


In [8]:
positive_reviews = positive_reviews[:len(negative_reviews)]

In [9]:
positive_reviews.head()

Unnamed: 0,review
0,My wife took me here on my birthday for breakf...
1,I have no idea why some people give bad review...
2,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
3,General Manager Scott Petello is a good egg!!!...
4,Drop what you're doing and drive here. After I...


In [10]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [11]:
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens

In [12]:
for reviews in positive_reviews['review']:
    orig_reviews.append(reviews)
    tokens = my_tokenizer(reviews)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for reviews in negative_reviews['review']:
    orig_reviews.append(reviews)
    tokens = my_tokenizer(reviews)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [13]:
negative_tokenized

[['wa',
  'worth',
  'salad',
  'pizza',
  'absolutely',
  'bad',
  'service',
  'maybe',
  'guy',
  'grandma',
  'died',
  "n't",
  'tell',
  'you',
  'mad',
  'experience',
  'pizza',
  'salad',
  'guy',
  'cared',
  'le',
  'sat',
  'looking',
  'hmm',
  'sign',
  'saying',
  'pizza',
  'salad',
  'wow',
  'nice',
  'guy',
  'told',
  'left',
  'hungry',
  'mad',
  'unsatisfied',
  'owner',
  'teach',
  'employee',
  'value',
  'upselling',
  'telling',
  'special',
  'affect',
  'customer',
  'experience',
  'negatively',
  'salad',
  'severely',
  'overpriced',
  "n't",
  'unless',
  'desperate'],
 ['check',
  'car',
  'wan',
  'buy',
  'wrong',
  'move',
  'car',
  'service',
  'biggest',
  'mistake',
  'life',
  'time',
  'girlfriend',
  'car',
  'oil',
  'service',
  'guess',
  'ripped',
  'girlfriend',
  'lying',
  'bad',
  'car',
  'fixing',
  'bring',
  'serious',
  'accident',
  'then',
  'brand',
  'tire',
  'timing',
  'belt',
  'brake',
  'pad',
  'worst',
  'changed',
 

In [15]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [16]:
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 17937


In [17]:
print(word_index_map)



In [18]:
N = len(positive_tokenized) + len(negative_tokenized)

In [20]:
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1


  


In [21]:
data=data[~np.isnan(data).any(axis=1)]

In [22]:
print(data)

[[0.01408451 0.01408451 0.01408451 ... 0.         0.         1.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.01149425 0.01149425 0.        ]]


In [88]:
np.random.shuffle(data)

X = data[:,:-1]
Y = data[:,-1]
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [89]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.7606890187634574
Test accuracy: 0.71


In [94]:
threshold = 0.5
for word, index in iter(word_index_map.items()):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

breakfast 0.539166271477585
wa -5.311243818518141
excellent 2.09217035329581
perfect 1.2706978654983652
waitress -0.6207108232767361
food -0.6293357973290714
looked -0.5176712959312461
you 0.9116379768656463
've 1.3045094635351937
fresh 1.3352931406638262
amazing 3.329446465617491
menu 0.539118021836512
tasty 0.6958368477224489
delicious 2.5983903523767298
meal 0.7222614766253636
n't -4.4856645029796045
wait 0.5592082425000111
people -0.8972722386002832
bad -1.92690129333864
review -0.8528350296086079
... -1.1464865594933589
server -0.5002280653627025
decided -0.518682635004006
pizza 0.6078513575203965
try 0.6795574453800768
awesome 2.2357385621801704
home 0.6155330450252205
love 5.207678508956052
wonderful 1.1523691346661458
clean 0.9796254253376105
manager -0.6637211027219659
staff 0.9495819446758721
customer -0.9079597563725598
day 0.672214126968047
little 0.6822457051291575
waiting -0.5959774108976805
dinner 0.5126628795607546
loved 1.4033022171951726
hot 0.581909726925716
yummy 0.