In [None]:
import pandas as pd
import numpy as np

import nltk

In [None]:
# Load the review data and keep only the first 500 rows
# (presumably to keep the later model cells fast -- confirm before scaling up).
reviews_path = 'Reviews.csv'
df = pd.read_csv(reviews_path).head(500)
print(df.shape)

In [None]:
# Preview the first few rows of the loaded reviews to check the columns.
df.head()

## Quick EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Count how many reviews fall under each 'Score' value, ordered by score,
# then draw the counts as a bar chart.
star_counts = df['Score'].value_counts().sort_index()
star_counts.plot(kind='bar', title='Count of reviews by stars', figsize=(10,5))

## Basic NLTK

In [None]:
# Use the review stored under index label 50 as the running example text.
example = df.loc[50, 'Text']
print(example)

In [None]:
# Split the example review into tokens with NLTK's word tokenizer.
tokens = nltk.word_tokenize(example)
# Display only the first 10 tokens.
tokens[:10]

In [None]:
# One-time setup: uncomment the download below if NLTK reports that the
# tagger resource is missing.
# nltk.download('averaged_perceptron_tagger')
# Tag every token of the example review with its part of speech.
tagged = nltk.pos_tag(tokens) # POS = part of speech

In [None]:
# Display the first 10 entries of the tagging result.
tagged[:10]

In [None]:
# Download the 'maxent_ne_chunker' and 'words' NLTK resources.
# Presumably both are required by the named-entity chunking in the next cell -- confirm.
nltk.download('maxent_ne_chunker')
nltk.download('words')


In [None]:
# Run NLTK's named-entity chunker over the POS-tagged tokens.
entities = nltk.chunk.ne_chunk(tagged)
# Pretty-print the chunking result.
entities.pprint()

## Using VADER sentiment scoring

In [None]:
# Download the 'vader_lexicon' NLTK resource
# (presumably required by SentimentIntensityAnalyzer below -- confirm).
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Single analyzer instance reused by every VADER scoring cell below.
sia = SentimentIntensityAnalyzer()

In [None]:
# VADER polarity scores for the single example review.
sia.polarity_scores(example)

In [None]:
# Score every review with VADER; the result maps each review's Id to the
# dictionary returned by sia.polarity_scores for that review's Text.
res = {
    row['Id']: sia.polarity_scores(row['Text'])
    for _, row in df.iterrows()
}

In [None]:
# Turn the {Id: scores} mapping into one row per review, expose the Id as a
# regular column, and attach the original review columns via a left merge.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index':'Id'})
    .merge(df, how='left')
)

In [None]:
# now we have sentiment score and metadata
vaders.head()

## Plot VADER results

In [None]:
# Bar chart of the VADER compound score grouped by review Score, drawn on an
# explicitly created figure/axes pair.
compound_fig, compound_ax = plt.subplots()
sns.barplot(x='Score', y='compound', data=vaders, ax=compound_ax)
plt.show()

In [None]:
# One panel per VADER component (positive / neutral / negative), each showing
# that component's score grouped by the review Score.
fig, axs = plt.subplots(1, 3, figsize=(15,5))

# (column in `vaders`, panel title) for each subplot, left to right.
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for ax, (column, title) in zip(axs, panels):
    # seaborn selects the target subplot through the `ax=` keyword.
    sns.barplot(data=vaders, x='Score', y=column, ax=ax)
    ax.set_title(title)

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()

## Roberta Pretrained Model
<ul>
    <li>Use a model trained on a large corpus of data</li>
    <li>Transformer model accounts for the words but also the context related to other words.</li>
</ul>

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Checkpoint identifier passed to both from_pretrained calls below.
# Per the original note, this model was trained on Twitter comments.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Tokenizer and classification model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER results on example
print(example)
sia.polarity_scores(example)

In [None]:
# Run for Roberta Model

# Tokenize the example review into PyTorch tensors and run it through the model.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)

# output[0][0]: first element of the model output, first (only) item in the
# batch. Convert to NumPy and normalise with softmax.
scores = softmax(output[0][0].detach().numpy())

# Pair each score with its label, in the order negative / neutral / positive.
scores_dict = dict(zip(('roberta_neg', 'roberta_neu', 'roberta_pos'), scores))

print(scores_dict)

In [None]:
def polarity_scores_roberta(example):
    """Score one piece of text with the notebook's RoBERTa model.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    example : str
        Text to score.

    Returns
    -------
    dict
        Softmax-normalised scores under the keys ``'roberta_neg'``,
        ``'roberta_neu'`` and ``'roberta_pos'``.
    """
    model_inputs = tokenizer(example, return_tensors='pt')
    # First element of the model output, first (only) item in the batch.
    raw_scores = model(**model_inputs)[0][0].detach().numpy()
    probabilities = softmax(raw_scores)
    labels = ('roberta_neg', 'roberta_neu', 'roberta_pos')
    return {label: probabilities[position] for position, label in enumerate(labels)}


In [None]:
# Score every review with both VADER and RoBERTa and store the merged
# scores per review Id.
res = {}
for i, row in df.iterrows():
    # Read the fields before the try so `myid` is always bound in the handler.
    text = row['Text']
    myid = row['Id']
    try:
        vader_result = sia.polarity_scores(text)
        # Prefix the VADER keys so they cannot collide with the RoBERTa keys.
        vader_result_rename = {
            f"vader_{key}": value for key, value in vader_result.items()
        }
        # Run the (expensive) RoBERTa model exactly once per review.
        roberta_result = polarity_scores_roberta(text)
        both = { **vader_result_rename, **roberta_result }
        res[myid] = both
    except RuntimeError:
        # Best effort: a review that raises RuntimeError is reported and skipped
        # (presumably texts too long for the model -- confirm).
        print(f"Broke for id{myid}")

In [None]:
# One row per scored review: transpose the {Id: scores} mapping, expose the Id
# as a column, then left-merge the original review columns onto it.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={"index":"Id"})
    .merge(df, how="left")
)

## Compare Scores between models

In [None]:
# List the columns available for comparison (model scores plus review metadata).
results_df.columns

## Combine and Compare

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Score columns from both models to compare against each other.
sentiment_columns = [
    "vader_neg", "vader_neu", "vader_pos",
    "roberta_neg", "roberta_neu", "roberta_pos",
]

# Pairwise plots of the selected columns, coloured by the review Score.
sns.pairplot(
    data=results_df,
    vars=sentiment_columns,
    hue="Score",
    palette="tab10",
)
plt.show()

## Review Examples
<ul>
    <li>Positive 1-Star and Negative 5-Star Reviews</li>
</ul>
Looking at some examples where the model scoring and review score differ the most.

In [None]:
# Text of the Score == 1 review with the highest roberta_pos value.
(
    results_df
    .loc[lambda frame: frame["Score"] == 1]
    .sort_values("roberta_pos", ascending=False)["Text"]
    .values[0]
)

In [None]:
# Text of the Score == 1 review with the highest vader_pos value.
(
    results_df
    .loc[lambda frame: frame["Score"] == 1]
    .sort_values("vader_pos", ascending=False)["Text"]
    .values[0]
)

In [None]:
# Text of the Score == 5 review with the highest roberta_neg value.
(
    results_df
    .loc[lambda frame: frame["Score"] == 5]
    .sort_values("roberta_neg", ascending=False)["Text"]
    .values[0]
)

In [None]:
# Text of the Score == 5 review with the highest vader_neg value.
(
    results_df
    .loc[lambda frame: frame["Score"] == 5]
    .sort_values("vader_neg", ascending=False)["Text"]
    .values[0]
)

## The transformer pipeline


In [None]:
from transformers import pipeline

# Build a ready-made sentiment-analysis pipeline. No model name is passed, so
# the library chooses the checkpoint.
# NOTE(review): consider pinning an explicit model for reproducibility -- confirm.
sent_pipeline = pipeline("sentiment-analysis")