# Sentiment Analysis on Amazon Food Reviews

### Importing required modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
import string

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

import nltk as nlp

### Reading Data

In [None]:
# Load the reviews CSV into a DataFrame (the path is relative to the working directory).
data = pd.read_csv('Data/Reviews.csv')

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# (rows, columns) of the full dataset.
data.shape

In [None]:
# Keep only the first 500 reviews for the rest of the notebook,
# printing the shape before and after so the reduction is visible.
print(data.shape)
data = data.head(500)
print(data.shape)

### Quick EDA

In [None]:
# Bar chart of how many reviews received each star rating, ordered by rating.
ax = (
    data['Score']
    .value_counts()
    .sort_index()
    .plot(kind='bar', title='Count of Reviews by Stars', figsize=(10, 5))
)
ax.set_xlabel('Review Stars')
plt.show()

### Basic NLTK

In [None]:
# Use the review text stored under index label 50 as the running example.
example = data.loc[50, 'Text']
print(example)

In [None]:
# Split the example review into sentences (`nlp` is the nltk package, imported under that alias).
nlp.sent_tokenize(example)

In [None]:
# Delete every character listed in string.punctuation from the example,
# then split the cleaned text into word tokens.
upd_example = example.translate(str.maketrans('', '', string.punctuation))
tokens = nlp.tokenize.word_tokenize(upd_example)
# Show the first ten tokens.
tokens[:10]

In [None]:
# Part-of-speech tag the tokens and show the first ten results.
tagged = nlp.pos_tag(tokens)
tagged[:10]

In [None]:
# Run NLTK's named-entity chunker over the tagged tokens and pretty-print the result.
chk = nlp.chunk.ne_chunk(tagged)
chk.pprint()

# VADER Sentiment Analysis

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from tqdm.notebook import tqdm

In [None]:
# VADER analyzer instance reused by every scoring cell below.
sia = SentimentIntensityAnalyzer()
# English stop-word list from NLTK.
# NOTE(review): `sw` is not referenced by any of the cells shown here — confirm it is still needed.
sw = stopwords.words('english')

In [None]:
# VADER polarity scores for the example review.
sia.polarity_scores(example)

In [None]:
# Run VADER over every review in `data`, keyed by the review's Id.
result = {
    review['Id']: sia.polarity_scores(review['Text'])
    for _, review in tqdm(data.iterrows(), total=len(data))
}

In [None]:
# One row per review Id with the VADER scores as columns, joined back onto
# the review data. `merge` is given no `on=`, so pandas joins on the columns
# the two frames have in common (presumably just 'Id' — verify).
vaders = (
    pd.DataFrame(result)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(data, how='left')
)
vaders.head(2)

In [None]:
import seaborn as sns

# Bar plot of the VADER compound score against the star rating.
ax = sns.barplot(data=vaders, x='Score', y='compound')
ax.set_title('Compound Score by Amazon Star Reviews')
plt.show()

# Using the RoBERTa Model

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Checkpoint name; the tokenizer and the classifier are both loaded from it
# so that they match each other.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER scores for the example review, shown again for comparison with the RoBERTa output.
print(example)
sia.polarity_scores(example)

In [None]:
# Output of Roberta Model
def polarity_scores_roberta(example):
    """Return RoBERTa sentiment scores for one piece of text.

    The text is encoded with the module-level ``tokenizer``, passed through
    the module-level ``model``, and the resulting scores are normalised with
    softmax.

    Args:
        example: Text to score; passed to ``tokenizer`` unchanged.

    Returns:
        dict mapping 'roberta-negative', 'roberta-neutral' and
        'roberta-positive' to the softmax-normalised scores at positions
        0, 1 and 2 of the model output.

    Note:
        No truncation is requested from the tokenizer, and errors raised by
        the model are not handled here; they propagate to the caller.
    """
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    # output[0] is the first element of the model output (the logits for a
    # sequence-classification head); the second [0] picks the single input row.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta-negative': scores[0],
        'roberta-neutral': scores[1],
        'roberta-positive': scores[2],
    }

In [None]:
# Score every review with both VADER and RoBERTa and store the merged
# scores under the review's Id.
result = {}
for _, review in tqdm(data.iterrows(), total=len(data)):
    try:
        myid = review['Id']
        text = review['Text']
        # Prefix the VADER keys so they stay distinguishable from the
        # 'roberta-*' keys once both sets share one dict.
        scores = {
            f'vader_{name}': score
            for name, score in sia.polarity_scores(text).items()
        }
        scores.update(polarity_scores_roberta(text))
        result[myid] = scores
    except RuntimeError:
        # Reviews that raise RuntimeError are reported and skipped
        # (presumably texts the RoBERTa model cannot process — verify).
        print(f'Problem occurred for id {myid}')

In [None]:
# One row per scored review Id with all score columns, joined back onto the
# review data. `merge` is given no `on=`, so pandas joins on the columns the
# two frames have in common (presumably just 'Id' — verify).
combined_results = (
    pd.DataFrame(result)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(data, how='left')
)

In [None]:
# combined_results now holds both sets of scores per review:
# the 'vader_*' columns and the 'roberta-*' columns.
combined_results.head()