# Sentiment Analysis in Python
Sentiment analysis in Python using three different techniques:
1. VADER (Valence Aware Dictionary and sEntiment Reasoner) - bag-of-words approach
2. RoBERTa pretrained model from Hugging Face
3. Hugging Face pipeline

# Read in Data and NLTK Basics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk

In [None]:
# Load the review CSV and report the size of the full dataset
df = pd.read_csv('../input/amazon-fine-food-reviews/Reviews.csv')
print(df.shape)

# Keep only the first 500 rows so the later scoring loops run faster
df = df.iloc[:500]
print(df.shape)

In [None]:
# Preview the first rows of the reduced dataset
df.head() 

## Quick EDA

In [None]:
# Count reviews per star rating, ordered by star value rather than by frequency
star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot(
    kind="bar",
    title="Count of Reviews by Stars",
    figsize=(10, 5),
)
ax.set_xlabel("Review Stars")
plt.show()

## Basic NLTK

In [None]:
# Use the review text stored under index label 50 as the running example
example = df.loc[50, 'Text']
print(example)

In [None]:
# Tokenize the example review with NLTK and preview the first ten tokens
tokens = nltk.word_tokenize(example)
tokens[:10]

In [None]:
# Run NLTK's part-of-speech tagger over the tokens and preview the first ten results
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
# Run NLTK's named-entity chunker over the tagged tokens and pretty-print the result
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

# 1. VADER Sentiment Scoring
Using NLTK's `SentimentIntensityAnalyzer` to get the neg/neu/pos scores of the text.


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# One analyzer instance, reused by every VADER scoring cell that follows
sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a clearly positive sentence
sia.polarity_scores('I am so happy!')

In [None]:
# Sanity check: score a clearly negative sentence
sia.polarity_scores('This is the worst thing ever.')

In [None]:
# Score the example review selected earlier
sia.polarity_scores(example)

In [None]:
# Score every review with VADER, keyed by the review's Id.
# Only the Id and Text columns are needed, so walk those two in lockstep.
res = {}
for review_id, review_text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
    res[review_id] = sia.polarity_scores(review_text)

In [None]:
# One row per review Id with its VADER scores, then attach the original
# review columns (merge uses the columns the two frames have in common).
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

In [None]:
# Preview the VADER scores joined with the review metadata
vaders.head()

## Plotting VADER results

In [None]:
# Bar plot of the VADER compound score for each star rating
ax = sns.barplot(data = vaders, x='Score', y = 'compound')
ax.set_title('Compound Score by Amazon Star Reviews')
plt.show()

In [None]:
# Side-by-side bar plots of the pos / neu / neg VADER scores per star rating
fig, axs = plt.subplots(1, 3, figsize=(12, 5))
panels = (('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative'))
for axis, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()

# 2. Roberta Pretrained Model
- Using a model trained on a large corpus of data.


In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Identifier of the pretrained checkpoint; the tokenizer and the classifier
# are both loaded from this same identifier.
# (Plain string literal: the previous f-string prefix had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER results on the example review, shown again for comparison with the RoBERTa run
print(example)
sia.polarity_scores(example)

In [None]:
# Run the RoBERTa model on the example review
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
# output[0][0]: raw class scores for the single input (presumably logits —
# confirm against the model's output type); softmax turns them into
# values that sum to 1.
scores = softmax(output[0][0].detach().numpy())
scores_dict = dict(
    roberta_neg=scores[0],
    roberta_neu=scores[1],
    roberta_pos=scores[2],
)
print(scores_dict)

In [None]:
def polarity_scores_roberta(example):
    """Score one text with the module-level RoBERTa ``tokenizer`` and ``model``.

    The text is tokenized into PyTorch tensors, passed through the model, and
    the first row of the model's first output is softmax-normalized.

    Args:
        example: The text to score.

    Returns:
        A dict with the keys ``roberta_neg``, ``roberta_neu`` and
        ``roberta_pos`` holding the first, second and third softmax values.
    """
    model_inputs = tokenizer(example, return_tensors='pt')
    raw_scores = model(**model_inputs)[0][0].detach().numpy()
    probs = softmax(raw_scores)
    return {
        'roberta_neg': probs[0],
        'roberta_neu': probs[1],
        'roberta_pos': probs[2],
    }

In [None]:
# Score every review with both VADER and RoBERTa, keyed by review Id.
# A review whose scoring raises RuntimeError is reported and left out of `res`.
res = {}
for _, review in tqdm(df.iterrows(), total=len(df)):
    try:
        text = review['Text']
        myid = review['Id']
        # Prefix the VADER keys so they stay distinguishable from the
        # roberta_* keys once both dicts are merged.
        vader_scores = {
            f"vader_{name}": score
            for name, score in sia.polarity_scores(text).items()
        }
        res[myid] = {**vader_scores, **polarity_scores_roberta(text)}
    except RuntimeError:
        print(f'Broke for id{myid}')

In [None]:
# One row per scored review Id with the vader_* and roberta_* columns, then
# attach the original review columns (merge uses the shared columns).
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)
results_df.head()

## Compare Scores between models

In [None]:
# List the available columns before choosing which scores to compare
results_df.columns

# 3. Combine and compare

In [None]:
# Pairwise comparison of every VADER and RoBERTa score, colored by star rating
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=results_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()


# 4. Review Examples

- Positive 1-Star and Negative 5-Star Reviews


In [None]:
# Text of the 1-star review with the highest RoBERTa positive score
one_star = results_df[results_df['Score'] == 1]
one_star.sort_values('roberta_pos', ascending=False)['Text'].iloc[0]

In [None]:
# Text of the 1-star review with the highest VADER positive score
one_star = results_df[results_df['Score'] == 1]
one_star.sort_values('vader_pos', ascending=False)['Text'].iloc[0]

In [None]:
# Negative-sentiment 5-star reviews

In [None]:
# Text of the 5-star review with the highest RoBERTa negative score
five_star = results_df[results_df['Score'] == 5]
five_star.sort_values('roberta_neg', ascending=False)['Text'].iloc[0]

In [None]:
# Text of the 5-star review with the highest VADER negative score
five_star = results_df[results_df['Score'] == 5]
five_star.sort_values('vader_neg', ascending=False)['Text'].iloc[0]

# The Transformers Pipeline


In [None]:
from transformers import pipeline
# Build a sentiment-analysis pipeline by task name only; no model argument is passed here
sent_pipeline = pipeline('sentiment-analysis')

In [None]:
# Try the pipeline on a single sentence
sent_pipeline('I love sentiment analysis!')