# Sentiment Analysis of Restaurant Reviews Using BERT

## Install dependencies

In [None]:
!pip install torch torchvision torchaudio

In [None]:
!pip install transformers requests beautifulsoup4 pandas numpy

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import time

## Instantiate Model

In [None]:
# One checkpoint name shared by the tokenizer and the model so the two
# cannot drift apart.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

## Encode and Calculate Sentiment

In [None]:
# Example: encode a single sentence into token ids.
# return_tensors='pt' requests a PyTorch tensor, which is what the model
# call in the following cells receives.

tokens = tokenizer.encode("I loved this movie very much", return_tensors='pt')

In [None]:
tokens

In [None]:
# Forward pass over the encoded sentence; the `.logits` attribute of the
# output is what the following cells read.
result = model(tokens)

In [None]:
result

In [None]:
# Position of the largest logit, i.e. the zero-based predicted class.
torch.argmax(result.logits)

### Decoding the result of the model

The model predicts a star rating from 1 to 5. `torch.argmax` returns a zero-based class index (0-4), so we add 1 to obtain the rating.

In [None]:
# Shift the zero-based class index by one to get the 1-5 rating.
int(torch.argmax(result.logits)) + 1

## Collecting Reviews


- We will collect reviews for Sushi Tomi in Mountain View, California. This restaurant is considered one of the best sushi places in the Bay Area. Let's see what people think about it.

In [None]:
# Scrape the review text from each listing page of the restaurant and
# accumulate it in `reviews`.
reviews = []
regex = re.compile('.*comment.*') # matches <p> tags whose class contains "comment"

# mimic a web browser by using headers
headers = {
   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}
NUM_PAGES = 251
PAGE_STEP = 10        # the `start` query offset advances by 10 per page
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled socket blocks forever
RETRY_DELAY = 30      # seconds to wait after a failed request
MAX_RETRIES = 5       # give up on a page instead of looping indefinitely
PAGE_DELAY = 6        # seconds between pages, to avoid hammering the server

# NOTE(review): range(NUM_PAGES + 1) requests offsets 0..NUM_PAGES*10
# inclusive, i.e. NUM_PAGES + 1 pages -- confirm the extra page is intended.
for page_num in range(NUM_PAGES + 1):
    page_index = page_num * PAGE_STEP
    url = "https://www.yelp.com/biz/sushi-tomi-mountain-view-2?start={}".format(page_index)

    html_doc = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # The headers must actually be sent for the browser disguise to work.
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # requests raises its own ConnectionError, which is not a subclass
            # of the builtin ConnectionError, so it must be named explicitly.
            if attempt < MAX_RETRIES:
                print("Connection failed. Retrying...")
                time.sleep(RETRY_DELAY)
        else:
            html_doc = response.text
            break

    if html_doc is None:
        print("Giving up on {} after {} attempts.".format(url, MAX_RETRIES))
        continue

    soup = BeautifulSoup(html_doc, 'html.parser')
    result_per_page = soup.find_all('p', {'class': regex})
    reviews_per_page = [paragraph.text for paragraph in result_per_page]
    reviews.extend(reviews_per_page)
    time.sleep(PAGE_DELAY)


In [None]:
reviews[:2]

In [None]:
len(reviews)

In [None]:
reviews[0]

## Load Reviews into a Pandas DataFrame and Get the Sentiment Scores

In [None]:
# Wrap the scraped review texts in a single-column DataFrame.
df = pd.DataFrame(
    data=np.array(reviews),
    columns=['review'],
)

# Write the frame to disk so the reviews can be reused later.
filename = "sushi_tomi_yelp_reviews.csv"
df.to_csv(path_or_buf=filename)

In [None]:
df.head(10)

In [None]:
df.info()

In [None]:
# check first review
df['review'].iloc[0]

In [None]:
# define sentiment score

def sentiment_score(text, max_length=512):
    """Return the model's 1-5 rating for ``text``.

    Args:
        text: Review text to score.
        max_length: Maximum number of tokens (special tokens included)
            passed to the model; longer inputs are truncated by the
            tokenizer. Defaults to 512, the per-review limit this
            notebook uses.

    Returns:
        int: The predicted rating, from 1 to 5.
    """
    # Truncate in token space: slicing the string by characters does not
    # bound the token count and discards text the model could still use.
    tokens = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(tokens)
    # argmax yields a zero-based class index; ratings start at 1.
    return int(torch.argmax(result.logits)) + 1


# The tokenizer limits each review to 512 tokens inside sentiment_score.
df['sentiment_score'] = df['review'].apply(sentiment_score)

In [None]:
df.head(20)

In [None]:
df['sentiment_score'].mean()

In [None]:
df['sentiment_score'].mode()