In [1]:
# import libraries
import pandas as pd    
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.dates as mdates
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re
from sklearn.metrics import accuracy_score

from datetime import datetime

import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Load data

In [2]:
user_reviews = pd.read_pickle('../data/users_review_raw.pkl')

In [3]:
user_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
5,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,1.0,2,0,0,"This place used to be a cool, chill place. Now...",2018-01-21 04:41:03
6,28gGfkLs3igtjVy61lh77Q,Q8c91v7luItVB0cMFF_mRA,EXOsmAB1s71WePlQk0WZrA,2.0,0,0,0,"The setting is perfectly adequate, and the foo...",2006-04-16 02:58:44
9,KKVFopqzcVfcubIBxmIjVA,99RsBrARhhx60UnAC4yDoA,EEHhKSxUvJkoPSzeGKkpVg,5.0,0,0,0,I work in the Pru and this is the most afforda...,2014-05-07 18:10:21
18,btNWW2kdJYfwpTDyzJO3Iw,DECuRZwkUw8ELQZfNGef2Q,zmZ3HkVCeZPBefJJxzdJ7A,4.0,0,0,0,Nothing special but good enough. I like anoth...,2012-12-04 04:29:47


In [4]:
# reset index
user_reviews = user_reviews.reset_index()

In [5]:
# Check missing value
user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282296 entries, 0 to 2282295
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   index        int64  
 1   review_id    object 
 2   user_id      object 
 3   business_id  object 
 4   stars        float64
 5   useful       int64  
 6   funny        int64  
 7   cool         int64  
 8   text         object 
 9   date         object 
dtypes: float64(1), int64(4), object(5)
memory usage: 174.1+ MB


## Preprocessing text

1. Removing punctuations 
2. Lower casing
3. Removing stopword
4. Tokenization
5. Lemmatization


In [6]:
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
stopwords = nltk.corpus.stopwords.words('english')

In [8]:
# Declare functions for preprocessing
def preprocessing(text):
    punctuationfree= "".join([i for i in text if i not in string.punctuation]) ## remove punctuation
    sentence = punctuationfree.lower() #convert uppercase to lowercase
    nostopword = " ".join([word for word in sentence.split() if word not in (stopwords)])
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+') # extract the sentence that contain alphabat and numbers only.
    tokens = tokenizer.tokenize(nostopword) #tokenize the word
    lemma_words=[lemmatizer.lemmatize(w) for w in tokens]
    new_text = ' '.join(lemma_words) #rejoin the sentence
    return nostopword

In [9]:
user_reviews['clean_msg']= user_reviews['text'].apply(lambda x:preprocessing(x))
user_reviews.head()

Unnamed: 0,index,review_id,user_id,business_id,stars,useful,funny,cool,text,date,clean_msg
0,0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02,apparently prides osteria rough summer evidenc...
1,5,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,1.0,2,0,0,"This place used to be a cool, chill place. Now...",2018-01-21 04:41:03,place used cool chill place bunch neanderthal ...
2,6,28gGfkLs3igtjVy61lh77Q,Q8c91v7luItVB0cMFF_mRA,EXOsmAB1s71WePlQk0WZrA,2.0,0,0,0,"The setting is perfectly adequate, and the foo...",2006-04-16 02:58:44,setting perfectly adequate food comes close di...
3,9,KKVFopqzcVfcubIBxmIjVA,99RsBrARhhx60UnAC4yDoA,EEHhKSxUvJkoPSzeGKkpVg,5.0,0,0,0,I work in the Pru and this is the most afforda...,2014-05-07 18:10:21,work pru affordable tasty place food court dea...
4,18,btNWW2kdJYfwpTDyzJO3Iw,DECuRZwkUw8ELQZfNGef2Q,zmZ3HkVCeZPBefJJxzdJ7A,4.0,0,0,0,Nothing special but good enough. I like anoth...,2012-12-04 04:29:47,nothing special good enough like another one m...


In [13]:
# Calculate numbers of words
user_reviews['words'] = [len(x.split()) for x in user_reviews['clean_msg'].tolist()]

In [14]:
user_reviews = user_reviews.reset_index()

In [15]:
user_reviews.shape

(2282296, 13)

### Sentiment Analysis using hugging face

In [16]:
#!pip install --upgrade tensorflow
#!pip install transformers

In [17]:
from transformers import pipeline

In [18]:
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [87]:
user_reviews['sentiment_dict'] = user_reviews['clean_msg'].apply(lambda review: sentiment(review, truncation=True))

user_reviews

Unnamed: 0,index,review_id,user_id,business_id,stars,useful,funny,cool,text,date,clean_msg,words,sentiment_dict
0,35,nJTSr-EGNhhA5o146THkPg,T9O5pkKKlNvr-qqfefDlbA,luOZQ9YBrWwP8mYrS4rNoA,4.0,2,0,0,This place has some of the BEST chinese take-o...,2008-12-03 04:39:26,place best chinese takeout portland dont let i...,49,"[{'label': 'POSITIVE', 'score': 0.997427761554..."
1,42,lJ7rzbvT-l8KO8lHfEsXsg,LV1ME-ibA2h0IGyFUUWhaQ,H_RM2u1WWGU1HkKZrYq2Ow,5.0,0,0,0,Incredible donuts. Sometimes you have to go ea...,2017-08-07 19:34:13,incredible donuts sometimes go early full sele...,10,"[{'label': 'POSITIVE', 'score': 0.999814927577..."
2,45,m-9DK7NwYedIPj1RQ_sXdw,JuM-lH05m6Ln8OPUTg8p0g,H_RM2u1WWGU1HkKZrYq2Ow,5.0,0,0,0,"Dont bother going to voodoo, just come here in...",2016-09-18 17:06:01,dont bother going voodoo come instead unique f...,14,"[{'label': 'NEGATIVE', 'score': 0.877250492572..."
3,49,EO5rALvJMkK8QEvUNs1gxg,u2xPfv6_wcKt-lW-C1cV8A,9P-lp3AWDXGayDqJz9VPwQ,2.0,0,0,0,The ramen here is less than great. It came out...,2018-02-11 03:30:12,ramen less great came luke warm oily service r...,11,"[{'label': 'POSITIVE', 'score': 0.629967510700..."
4,67,OH9E5SaGBQsPX3IktM30mg,7mWnNVk2n99JxkvV3PW0nA,Un6u2cECyV4nZb_HGZ-uTA,4.0,1,0,1,It's crazy how establishments on the west coas...,2011-02-13 16:38:09,crazy establishments west coast many yelp revi...,101,"[{'label': 'POSITIVE', 'score': 0.958464384078..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
753050,8635355,CCv8DP1zZyoI0FvgEdeTgg,7pGvyqTBe5vcmKG29Gcz3Q,d69y3CN9_SQKrEnXXqQr8g,5.0,1,0,0,King burrito is my go to Burritos when I'm nea...,2020-10-05 18:30:46,king burrito go burritos im near enough area b...,79,"[{'label': 'POSITIVE', 'score': 0.995506882667..."
753051,8635371,pevp4H0U6Q7UKl-J9PBLLQ,yDeqOLo8pp1xpzHWBKFEfA,2JKien1H998FluEYs0xIrg,5.0,3,0,1,Where do I start with Buranko? Everything I tr...,2020-12-06 23:19:24,start buranko everything tried matcha cocktail...,18,"[{'label': 'NEGATIVE', 'score': 0.976583719253..."
753052,8635391,XpVatkv32ZiY3Mv_cR26Mw,UvlvbgQaADuIoE2bEQYJ1A,_VF1CWhsQWv77Yi92ORo1w,5.0,0,0,0,Wonderful! We stopped in twice while taking ou...,2021-01-20 20:07:19,wonderful stopped twice taking son bacon nonba...,60,"[{'label': 'POSITIVE', 'score': 0.994185268878..."
753053,8635397,FfhmA0G0zrRjHskp-7O8UQ,IlxM3NGJOtNXPz5cupqNDQ,dmkDZKPsK8lmwFuLiFQ0Zw,5.0,0,0,0,Yes please! We had the Arepas falafel style an...,2021-01-25 14:53:13,yes please arepas falafel style maccurles fres...,15,"[{'label': 'POSITIVE', 'score': 0.999734461307..."


### Create New column for distilbert analysis 'label'

In [32]:
user_reviews_final['sentiment_label'] = user_reviews_final['sentiment_dict'].apply(lambda score_dict: score_dict[0]['label'])

In [33]:
## Label POSITIVE = 1 , NEGATIVE = -1

user_reviews_final['sentiment_label'].replace({'POSITIVE': 1, 'NEGATIVE': -1},inplace=True)

### Create New column for distilbert analysis 'score'

In [34]:
user_reviews_final['sentiment_score'] = user_reviews_final['sentiment_dict'].apply(lambda score_dict: score_dict[0]['score'])

In [35]:
# Multiply sentiment_label * sentiment_score to get compound score.
user_reviews_final['sentiment_final'] = user_reviews_final['sentiment_label'] * user_reviews_final['sentiment_score']

In [36]:
### Export user_reviews with sentiment score
user_reviews_final.to_pickle('./data/user_reviews_sentiment_final.pkl')

### Calculate the mean sentiment score for each restaurants groupby 'businessid'

In [37]:
user_reviews_final.groupby(by='business_id')['sentiment_final'].mean()

business_id
--6COJIAjkQwSUZci_4PJQ    0.594357
--UNNdnHRhsyFUbDgumdtQ    0.417455
-00d-Qb0q2TcWn-8LBHDZg   -0.273694
-0Gbsd7ztvTyFpl7jF0DIw    0.297091
-0iqnv7MjKrgh7Q7bYRlUQ   -0.000902
                            ...   
zyNQhunb1mcSUUbnqVcU1w   -0.148889
zyauuvAYdVweBK4L7wBRmw   -0.118130
zzO0rjxjVAutcqFnI4VvAg   -0.483112
zzcdycb7S42VnnZkwE4yNA   -0.728696
zzpmoTVq4yn86U7ArHyFBQ    0.380740
Name: sentiment_final, Length: 18093, dtype: float64

In [45]:
# Create dataframe for the mean sentiment score
sentiment_score = user_reviews_final.groupby(by='business_id')['sentiment_final'].mean().to_frame().reset_index()

In [46]:
sentiment_score.head()

Unnamed: 0,business_id,sentiment_final
0,--6COJIAjkQwSUZci_4PJQ,0.594357
1,--UNNdnHRhsyFUbDgumdtQ,0.417455
2,-00d-Qb0q2TcWn-8LBHDZg,-0.273694
3,-0Gbsd7ztvTyFpl7jF0DIw,0.297091
4,-0iqnv7MjKrgh7Q7bYRlUQ,-0.000902


In [47]:
sentiment_score.describe()

Unnamed: 0,sentiment_final
count,18093.0
mean,0.079334
std,0.383843
min,-0.999254
25%,-0.170261
50%,0.118951
75%,0.353336
max,0.999456


In [49]:
## Export sentiment_score
sentiment_score.to_csv('./data/sentiment_score.csv', index =False)