# BERT (Bidirectional Encoder Representations from Transformers)
### Robby Jeffries
### Reference: https://github.com/nicknochnack/BERTSentiment/blob/main/Sentiment.ipynb

# 1. Install and Import Dependencies

In [2]:
conda install pytorch torchvision torchaudio -c pytorch

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [3]:
conda install transformers requests beautifulsoup4 pandas numpy

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
import re

# 2. Instantiate Model

Multilingual support for
* English
* Dutch
* German
* French
* Italian
* Spanish

In [2]:
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# 3. Playground to Encode and Calculate Sentiment

In [7]:
tokens = tokenizer.encode('it''s alright', return_tensors='pt')

In [8]:
result = model(tokens)

In [9]:
result.logits

tensor([[-1.5823, -0.4876,  0.9463,  1.0014,  0.1038]],
       grad_fn=<AddmmBackward0>)

In [10]:
# Add one so that scores range from 1-5 instead of 0-4
int(torch.argmax(result.logits))+1

4

# 4. Import Reviews

In [38]:
import os
import numpy as np
import pandas as pd

In [44]:
os.chdir('/Users/robbyjeffries/MSEACapstone/Data')

In [54]:
raw = pd.read_csv('2015_top_1000.csv')

In [55]:
df = pd.DataFrame(np.array(raw), columns=['marketplace','customer_id','review_id','product_id','product_parent','product_title','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_body','review_date','year','product_category'])

In [56]:
df['review_body'].iloc[0]

'Cute'

# 5. Score Sentiment and Calculate RMSE

In [84]:
from sklearn.metrics import mean_squared_error

In [57]:
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [59]:
sentiment_score(df['review_body'].iloc[1])

5

In [87]:
# Calculate a sentiment score for every review
df['sentiment'] = df['review_body'].apply(lambda x: sentiment_score(x[:1001]))

In [88]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,year,product_category,sentiment
0,US,27234583,RYSGVGQU5HB5C,B0091JKY0M,941945933,Amazon.com Gift Card for Any Amount in a Santa...,5,0,0,N,Y,Five Stars,Cute,16436,2015,Gift_Card,4
1,US,41200446,R31Z6984SLA3AB,B004LLIKVU,473048287,Amazon.com eGift Cards,5,0,0,N,Y,Five Stars,Very good,16436,2015,Gift_Card,5
2,US,37861590,R2XEC6TIXZX2ZE,BT00CTP4TW,775486538,Amazon.com Gift Card in a Greeting Card (Vario...,5,0,0,N,Y,Five Stars,great,16436,2015,Gift_Card,5
3,US,1776698,R3IYIAD43DSX3B,B004LLIKVU,473048287,Amazon.com eGift Cards,5,0,0,N,Y,Five Stars,EXCELENT!,16436,2015,Gift_Card,5
4,US,38926751,R3LSH0P496X2FT,B0069VHL5Q,493863665,Amazon Gift Card - Print - Holiday Scene (Colo...,5,0,0,N,Y,Best printing option for B&W printer,I purchased 4 of these for Christmas gifts. Wh...,16436,2015,Gift_Card,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,US,2133051,R25Y65N46PJ5OK,B00B2TFSO6,89375983,Amazon eGift Card - Thank You (Note),5,0,0,N,Y,Five Stars,bien,16469,2015,Gift_Card,4
996,US,42245473,R2BC427TSS7PKD,B00CHSWGFU,728247461,Amazon eGift Card - Smile!,5,0,0,N,Y,Five Stars,IT WAS A WONDERFUL GIFT FROM A VERY LOVING FAM...,16469,2015,Gift_Card,5
997,US,15972562,R3IA8EI4LVH00N,B007RZ6DSO,473048287,Amazon.com eGift Cards,5,0,0,N,Y,My girlfreinds daughter liked hers too,"It's what it is supposed to be, a gift card. M...",16469,2015,Gift_Card,4
998,US,50026506,R260JKWUSH3UTD,B00PG40SDY,812178203,Amazon eGift Card - Deck the Halls,4,0,0,N,Y,Four Stars,it was received and spent.,16469,2015,Gift_Card,4


In [90]:
star_rating = df['star_rating']
sentiment = df['sentiment']
rms = np.sqrt(mean_squared_error(star_rating, sentiment))
print('RMSE =', rms)

RMSE = 0.8899438184514796
