# BERT (Bidirectional Encoder Representations from Transformers)
### Robby Jeffries
### Reference: https://github.com/nicknochnack/BERTSentiment/blob/main/Sentiment.ipynb

# 1. Install and Import Dependencies

In [None]:
conda install pytorch torchvision torchaudio -c pytorch

In [None]:
conda install transformers requests beautifulsoup4 pandas numpy

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
import re

# 2. Instantiate Model

The pretrained model used below supports reviews written in six languages:
* English
* Dutch
* German
* French
* Italian
* Spanish

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the classifier are both
# loaded from it so they stay in sync.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Tokenizer that turns raw text into the token ids the model consumes.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model whose logits are converted to a score below.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# 3. Playground to Encode and Calculate Sentiment

In [None]:
# Encode a sample string with the tokenizer; return_tensors='pt' requests the
# token ids as a PyTorch tensor so they can be passed directly to the model.
tokens = tokenizer.encode('covid', return_tensors='pt')

In [None]:
# Run the encoded sample through the classifier; the output's `logits`
# attribute is inspected in the next cells.
result = model(tokens)

In [None]:
# Display the raw logits the model produced for the sample input.
result.logits

In [None]:
# torch.argmax gives the 0-based position of the largest logit (0-4);
# add one so that scores range from 1-5 instead of 0-4.
int(torch.argmax(result.logits))+1

# 4. Import Reviews

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Change the working directory so the relative CSV path used in the next cell
# resolves.
# NOTE(review): this absolute path is specific to one machine; adjust it
# before running the notebook anywhere else.
os.chdir('/Users/robbyjeffries/MSEA2022/Spring 2022/ECON 5763, Economic Analytics/Data')

In [None]:
# Load the cleaned Electronics reviews. Despite the .csv extension, the file
# is parsed as tab-separated (sep='\t').
raw = pd.read_csv('CSV_completed/Electronics_clean.csv', sep='\t')

In [None]:
# Build the working DataFrame with explicit column names.
# Copy `raw` and relabel its columns rather than rebuilding the frame from
# np.array(raw): converting a frame with mixed column types to a NumPy array
# yields a single object-dtype array, so every column would lose the dtype
# pandas parsed for it. Assigning `columns` still raises ValueError if the
# file does not contain exactly these 16 columns.
df = raw.copy()
df.columns = [
    'marketplace',
    'customer_id',
    'review_id',
    'product_id',
    'product_parent',
    'product_title',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
    'verified_purchase',
    'review_headline',
    'review_body',
    'review_date',
    'year',
    'product_category',
]

In [None]:
# Display the text of the first review.
df['review_body'].iloc[0]

# 5. Score Sentiment and Calculate RMSE

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def sentiment_score(review, max_length=512):
    """Return the predicted sentiment score (1-5) for a piece of text.

    Args:
        review: The text to score.
        max_length: Maximum number of tokens (special tokens included)
            passed to the model; longer inputs are truncated. Defaults to
            512, the maximum sequence length of BERT-base models.

    Returns:
        An int from 1 to 5: the 0-based index of the largest logit plus one.
    """
    # Truncate at the token level. Slicing the text by characters does not
    # bound the number of tokens, and an over-long sequence makes the model
    # fail instead of returning a score.
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: disable autograd so no gradient graph is built for
    # every review that is scored.
    with torch.no_grad():
        result = model(tokens)
    # Add one so that scores range from 1-5 instead of 0-4.
    return int(torch.argmax(result.logits)) + 1

In [None]:
# Try the scorer on the second review in the dataset.
sentiment_score(df['review_body'].iloc[1])

In [None]:
# Score every review, passing only the first MAX_REVIEW_CHARS characters of
# each review body to sentiment_score.
MAX_REVIEW_CHARS = 1001
df['sentiment'] = df['review_body'].apply(
    lambda body: sentiment_score(body[:MAX_REVIEW_CHARS])
)

In [None]:
# Display the DataFrame, which now includes the 'sentiment' column.
df

In [None]:
# Root-mean-square error between the star ratings in the data and the
# sentiment scores assigned by the model.
star_rating, sentiment = df['star_rating'], df['sentiment']
mse = mean_squared_error(star_rating, sentiment)
rms = np.sqrt(mse)
print('RMSE =', rms)