In [20]:
import pandas as pd
import spacy
import nltk
nlp = spacy.load('en_core_web_sm')

path = r'C:\Users\user\Desktop\Revature\Projects\Yelp\yelp_reviews_sample.csv'

yelpSmall = pd.read_csv(path)

In [21]:
yelpSmall['tokenized_sents'] = yelpSmall.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

print(yelpSmall['tokenized_sents'].head())

0    [Total, bill, for, this, horrible, service, ?,...
1    [I, *adore*, Travis, at, the, Hard, Rock, 's, ...
2    [I, have, to, say, that, this, office, really,...
3    [Went, in, for, a, lunch, ., Steak, sandwich, ...
4    [Today, was, my, second, out, of, three, sessi...
Name: tokenized_sents, dtype: object


In [22]:
yelpReviews = []

for i in range(1000):
    yelpReviews.append(yelpSmall['text'].iloc[i])

yelpRev = ' '.join(yelpReviews)

doc = nlp(yelpRev)
sentences = list(doc.sents)


In [23]:
from spacy import displacy
for i in range(20):
    displacy.render(nlp(str(yelpSmall['text'].iloc[i])), jupyter = True, style='ent')
#We could hone in on organizations, costs, time, etc.

  "__main__", mod_spec)


In [24]:
from collections import Counter
labels = [x.label_ for x in doc.ents]
Counter(labels)

Counter({'MONEY': 255,
         'PERSON': 866,
         'LOC': 79,
         'ORG': 775,
         'TIME': 394,
         'CARDINAL': 879,
         'GPE': 718,
         'PERCENT': 22,
         'DATE': 614,
         'ORDINAL': 258,
         'WORK_OF_ART': 34,
         'NORP': 332,
         'QUANTITY': 61,
         'PRODUCT': 62,
         'FAC': 70,
         'LAW': 7,
         'LANGUAGE': 8,
         'EVENT': 11})

In [25]:
#What follows below is absolute franken-code.
#As it turns out, sentences are unlabeled spans and (at least in my case) they don't play well with ents. So...I made every sentence its own doc.
for sentence in sentences:
    selection = sentence.text #turning the sentence object back to text to make it into its own doc.
    newQ = nlp(selection)
    for ent in newQ.ents:
        if ent.label_ == 'MONEY':
            print(newQ)
            break
#In theory, we could match price entities to sentiment analysis. We'd probably want to group by business somehow. Then we could give approximate pricing guidelines.

Over $8Gs.
These crooks actually had the nerve to charge us $69 for 3 pills.
I checked online the pills can be had for 19 cents EACH!
you can purchase for $80 something a year and this gave me 25% off all of my dental work, plus they helped me get signed up for care credit which I knew nothing about before this visit!  
A simple taro with coconut with tapioca pearls was like $5.25 or something.
Basically all the desserts were more than $5.
I can literally just make this dessert at home and for a bowl, it would probably cost like $0.50.
A few years ago, I think I can still get it for like $3-$4, which is more reasonable, but wow, more than $5 is a little over the top for this dessert.
The service is slow & my salad, which was $15, was as bad as it gets.


A bottle of Riesling, calamari app, two delicious entrees and dessert for $92! 

So...after waiting in line for a few hours we were notified the lower portion of the theater was reserved for paid customers who previously purchased wris

In [26]:
from textblob import TextBlob

for i in range(1000):
    yelp = yelpSmall['text'].iloc[i]
    stars = yelpSmall['stars'].iloc[i]
    blob = TextBlob(yelp)
    sentiment = blob.sentiment.polarity
    print(yelp, '\n Stars: ', stars, '\n Sentiment: ', sentiment)

d is pretty good overall.
Smokehouse wings are the best in C-town.
The Brewben is one of the best sandwiches I've ever had.
The root beer floats are an old timey treat for desert.
Giant portions,be warned. 
 Stars:  5.0 
 Sentiment:  0.5166666666666666
Very pleased with everything I've bought here: scallops, salmon, halibut, trout.  The owner provides great customer service. They just started carrying low acid coffee which we used to have to buy from Trader Joes in Chicago. Ask to receive the fish and meat emails. 
 Stars:  5.0 
 Sentiment:  0.4833333333333334
It's pretty rare for me to write fast food reviews. But considering I have always have superb experiences, it's only fair to give them the credit they deserve. As far as I know, until now, Vegas only had one Freddy's location. I was stoked to find out the new one was opening near me. For me, fast food consists of maybe 5 places- and this is one of them. 
They serve steak-burgers, so it may cost more than average, but not by much.

In [27]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

"The polarity score is a float within the range [-1.0, 1.0]."
#Even without a trained model, we can get sentiment for the sentences and metrics. This is just to get an idea.

sentiments = []
stars = []

for i in range(100000):
    yelp = yelpSmall['text'].iloc[i]
    star = yelpSmall['stars'].iloc[i]
    blob = TextBlob(yelp)
    sentiment = blob.sentiment.polarity
    sentiments.append(sentiment)
    stars.append(star)

y_true = []
for star in stars:
    y = (star - 3) / 2
    y_true.append(y)

y_pred = sentiments

print(mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred))
#The absolute error and r2 score show that TextBlob is doing worse than guessing. However, we know from looking at the data that TextBlob is assigning a positive sentiment most of the time. This drastically skews the results. We need an algorithm that is sensitive to negative results.

0.5463699021119978 0.26072096739600337


In [28]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

SIA = SentimentIntensityAnalyzer()

sentiments = []
stars = []

for i in range(100000):
    yelp = yelpSmall['text'].iloc[i]
    star = yelpSmall['stars'].iloc[i]
    ss = SIA.polarity_scores(yelp)
    sentiments.append(ss.get('compound'))
    stars.append(star)

y_true = []
for star in stars:
    y = (star - 3) / 2
    y_true.append(y)

y_pred = sentiments
print(mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred))
#The score is a little better than guessing here, but clearly we're going to need a trained model.

0.4208555909999999 0.25549700270116127


In [29]:
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

noStopYelp = yelpSmall['text']
noStopYelp = noStopYelp.transform(remove_stopwords)
noStopYelp = noStopYelp.transform(strip_punctuation)
noStopYelp = noStopYelp.str.lower()
print(noStopYelp.head())

0    total horrible service  over  8gs  these crook...
1    i  adore  travis hard rock s new kelly cardena...
2    i office together  organized friendly  dr  j  ...
3    went lunch  steak sandwich delicious  caesar s...
4    today second sessions i paid for  although ses...
Name: text, dtype: object


In [30]:
blob_sentiments = []
vader_sentiments = []
stars = []

for i in range(100000):
    yelp = noStopYelp.iloc[i]
    star = yelpSmall['stars'].iloc[i]
    blob = TextBlob(yelp)
    sentiment = blob.sentiment.polarity
    blob_sentiments.append(sentiment)
    ss = SIA.polarity_scores(yelp)
    vader_sentiments.append(ss.get('compound'))
    stars.append(star)

y_true = []
for star in stars:
    y = (star - 3) / 2
    y_true.append(y)

y_pred_blob = blob_sentiments
y_pred_vader = vader_sentiments

print(mean_absolute_error(y_true, y_pred_blob), r2_score(y_true, y_pred_blob))
print(mean_absolute_error(y_true, y_pred_vader), r2_score(y_true, y_pred_vader))

0.5386034804595622 0.2743067136924444
0.45896870900000003 0.14132268265967307
