# Sentiment Analysis of Restaurant reviews using BERT

## Install dependencies

In [None]:
# !pip install torch torchvision torchaudio

In [None]:
# !pip install transformers requests beautifulsoup4 pandas numpy.v

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import time

## Instantiate Model

In [4]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

tokenizer_config.json: 100%|██████████| 39.0/39.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config.json: 100%|██████████| 953/953 [00:00<00:00, 833kB/s]
vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 3.70MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 34.5kB/s]
pytorch_model.bin: 100%|██████████| 669M/669M [01:46<00:00, 6.30MB/s] 


## Encode and Calculate Sentiment

In [5]:
# Example

tokens = tokenizer.encode("I loved this movie very much", return_tensors='pt')

In [6]:
tokens

tensor([[  101,   151, 46747, 10372, 13113, 12495, 12977,   102]])

In [7]:
result = model(tokens)

In [8]:
result

SequenceClassifierOutput(loss=None, logits=tensor([[-2.4563, -2.3910, -0.6497,  1.6045,  3.0400]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [9]:
torch.argmax(result.logits)

tensor(4)

### Decoding the result of the model

The ratings from the model are from 1-5

In [10]:
int(torch.argmax(result.logits)) + 1

5

## Collecting Reviews


- We will be collecting reviews for Sushi Tomi in Mountain View California. This restaurant is considered one of the best sushi places in the Bay Area. Let's see what people think about it

In [11]:
# Initialize variables
reviews = []
regex = re.compile('.*comment.*') # extract the comments from the html document

# mimic a web browser by using headers
headers = {
   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}
NUM_PAGES = 251
for page_num in range(NUM_PAGES + 1):
    page_index = page_num*10
    url = "https://www.yelp.com/biz/sushi-tomi-mountain-view-2?start={}".format(page_index)

    success = False
    while not success:
        try:
            html_doc = requests.get(url).text
            success = True
        except ConnectionError:
            print("Connection failed. Retrying...")
            time.sleep(30)

    soup = BeautifulSoup(html_doc, 'html.parser')
    result_per_page = soup.find_all('p', {'class':regex})
    reviews_per_page = [result.text for result in result_per_page]
    reviews.extend(reviews_per_page)
    time.sleep(6)


In [12]:
reviews[:2]

['Great Japanese restaurant in the area, with well executed classics for an affordable price. But be prepared for a wait. Visited on a Sunday evening and the restaurant was packed. All tables full and maybe a 15 minute wait? Service was attentive but not doting, which is fine. A few highlights of the food: Sashimi is very good - huge portions and high quality. We got a few bento combos that had sashimi, and where most places would give you 4-5 2"x1"x0.5" pieces, here you get absolutely gigantic pieces, maybe 50% more fish per slice. Quality is super good, very fresh and well selected cuts. No stringiness! Major highlight was the tuna kama (pictured) which was a special. Huge, roasty, and rich fish flavor. Bring a few friends, because this is definitely meant to be shared. Would definitely come again, although maybe on a quieter night. (Sushi, sashimi, grill items, tuna kama)',
 'Service was incredibly slow and felt incredibly chaotic inside. Servers were running and rushing around, fel

In [13]:
len(reviews)

981

In [14]:
reviews[0]

'Great Japanese restaurant in the area, with well executed classics for an affordable price. But be prepared for a wait. Visited on a Sunday evening and the restaurant was packed. All tables full and maybe a 15 minute wait? Service was attentive but not doting, which is fine. A few highlights of the food: Sashimi is very good - huge portions and high quality. We got a few bento combos that had sashimi, and where most places would give you 4-5 2"x1"x0.5" pieces, here you get absolutely gigantic pieces, maybe 50% more fish per slice. Quality is super good, very fresh and well selected cuts. No stringiness! Major highlight was the tuna kama (pictured) which was a special. Huge, roasty, and rich fish flavor. Bring a few friends, because this is definitely meant to be shared. Would definitely come again, although maybe on a quieter night. (Sushi, sashimi, grill items, tuna kama)'

## Load Reviews into a Pandas DataFrame and get the sentiment Scores

In [15]:
df = pd.DataFrame(np.array(reviews), columns = ['review'])

# save the dataframe to .csv file for future usage 
filename = "sushi_tomi_yelp_reviews.csv"
df.to_csv(filename)

In [16]:
df.head(10)

Unnamed: 0,review
0,"Great Japanese restaurant in the area, with we..."
1,Service was incredibly slow and felt incredibl...
2,Came with a friend on a weekday for lunch. It ...
3,"We came around Wednesday dinner time, and ther..."
4,Looking for fresh sushi in downtown MV? This i...
5,Been coming here for the past few months and t...
6,I quite like this place!I guess I have been he...
7,One of a few Japanese restaurant that is owned...
8,My family and I came by for dinner and were ve...
9,Good quality fish at Mountain View prices. Per...


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  981 non-null    object
dtypes: object(1)
memory usage: 7.8+ KB


In [18]:
# check first review
df['review'].iloc[0]

'Great Japanese restaurant in the area, with well executed classics for an affordable price. But be prepared for a wait. Visited on a Sunday evening and the restaurant was packed. All tables full and maybe a 15 minute wait? Service was attentive but not doting, which is fine. A few highlights of the food: Sashimi is very good - huge portions and high quality. We got a few bento combos that had sashimi, and where most places would give you 4-5 2"x1"x0.5" pieces, here you get absolutely gigantic pieces, maybe 50% more fish per slice. Quality is super good, very fresh and well selected cuts. No stringiness! Major highlight was the tuna kama (pictured) which was a special. Huge, roasty, and rich fish flavor. Bring a few friends, because this is definitely meant to be shared. Would definitely come again, although maybe on a quieter night. (Sushi, sashimi, grill items, tuna kama)'

In [19]:
# define sentiment score

def sentiment_score(text):
    tokens = tokenizer.encode(text, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))+1


df['sentiment_score'] = df['review'].apply(lambda x: sentiment_score(x[:512])) # limited to 512 tokens per review

In [20]:
df.head(20)

Unnamed: 0,review,sentiment_score
0,"Great Japanese restaurant in the area, with we...",4
1,Service was incredibly slow and felt incredibl...,2
2,Came with a friend on a weekday for lunch. It ...,4
3,"We came around Wednesday dinner time, and ther...",4
4,Looking for fresh sushi in downtown MV? This i...,4
5,Been coming here for the past few months and t...,5
6,I quite like this place!I guess I have been he...,4
7,One of a few Japanese restaurant that is owned...,5
8,My family and I came by for dinner and were ve...,5
9,Good quality fish at Mountain View prices. Per...,4


In [21]:
df['sentiment_score'].mean()

3.8297655453618757

In [22]:
df['sentiment_score'].mode()

0    5
Name: sentiment_score, dtype: int64

In [23]:
df['sentiment_score'].median()

4.0