In [None]:
import numpy as np
import pandas as pd


In [None]:
df=pd.read_csv('Reviews.csv')

In [None]:
df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
plt.style.use('ggplot')
import nltk 

In [None]:
df['Text'].values[0]

In [None]:
df.shape

In [None]:
df=df.head(500)
df.shape

In [None]:
df

In [None]:
##quick eda
df['Score'].value_counts().sort_index().plot(kind='bar', title='Count of Reviews by Stars',figsize=(10,5))

In [None]:
# Bar chart of how many reviews received each star rating.
star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
plt.show()

In [None]:
# Basic NLTK setup: fetch the tokenizer data and pick one review to experiment with.
import nltk
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' resource, so fetch both to keep word_tokenize working on a fresh kernel.
nltk.download('punkt_tab')

# Series lookup by index label 50 (the 51st row when df keeps its default integer index).
example = df['Text'][50]
print(example)

In [None]:
print(example)

In [None]:
nltk.word_tokenize(example)


In [None]:
tokens = nltk.word_tokenize(example)
tokens[:10]

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
# Recent NLTK releases look for the English tagger under this resource name;
# fetching both keeps pos_tag working regardless of the installed version.
nltk.download('averaged_perceptron_tagger_eng')
# Part-of-speech tag every token of the example review.
nltk.pos_tag(tokens)

In [None]:
tagged = nltk.pos_tag(tokens)
tagged[:10]


In [None]:

import nltk
nltk.download('maxent_ne_chunker')
# Recent NLTK releases load the chunker from the '_tab' resource instead;
# fetching both keeps ne_chunk working regardless of the installed version.
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
# Group the POS-tagged tokens into named-entity chunks and print the resulting tree.
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

In [None]:
#VADER Sentiment Scoring
#we will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text
#this uses a bag-of-words approach:
#1 stop words are removed
#2 each word is scored and the scores are combined into a total score


In [None]:
from tqdm.notebook import tqdm
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon, then build the analyzer reused by the cells below.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

In [None]:
sia

In [None]:
sia.polarity_scores('I am so happy')

In [None]:
sia.polarity_scores('This is the worst thing ever')

In [None]:
sia.polarity_scores(example)

In [None]:
# Run the VADER polarity scorer over the entire dataset, keyed by review Id.
res = {
    row['Id']: sia.polarity_scores(row['Text'])
    for _, row in tqdm(df.iterrows(), total=len(df))
}
    

In [None]:
res

In [None]:
# Turn the {Id: scores} dict into one row per review and attach the review metadata.
vaders = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
    # Join explicitly on the review Id instead of on whichever columns happen to share a name.
    .merge(df, on='Id', how='left')
)

In [None]:
#now we have sentiment score and meta data 
vaders

In [None]:
## Plot the VADER compound score against the star rating.
fig, ax = plt.subplots()
sns.barplot(data=vaders, x='Score', y='compound', ax=ax)
ax.set_title('Compound Score by Amazon Star Reviews')
plt.show()


In [None]:
# One panel per VADER component, each plotted against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (column, heading) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=panel_ax)
    panel_ax.set_title(heading)
plt.show()


In [None]:
#step 3: RoBERTa pretrained model
#use a model trained on a large corpus of data
#a transformer model accounts for the words but also for the context around each word
#e.g. a sarcastic sentence is not always negative; it may actually carry a positive meaning
#VADER was unable to catch that relationship between words, so we use a RoBERTa model from the Hugging Face
#transformers library, loaded through AutoTokenizer and AutoModelForSequenceClassification

In [None]:
!pip install transformers

In [None]:
!pip install scipy

In [None]:

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Install the PyTorch library
!pip install torch torchvision torchaudio


In [None]:
!pip install --ignore-installed tbb


In [None]:
# Install the TensorFlow library
!pip install tensorflow


In [None]:
# Install PyTorch
!pip install torch torchvision torchaudio


In [None]:
!pip install --ignore-installed tbb


In [None]:
# Verify PyTorch installation
import torch
print("PyTorch version:", torch.__version__)


In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Hugging Face Hub checkpoint used for both the tokenizer and the classifier.
# (Plain string: the original f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
import torch
print(torch.__version__)


In [None]:
import torch
print(torch.__version__)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint name shared by the tokenizer and the sequence-classification model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)


In [None]:
print(example)

In [None]:
sia.polarity_scores(example)

In [None]:
#run for roberta model
encoded_text =tokenizer(example, return_tensors='pt')

In [None]:
# Forward pass on the encoded example, then softmax the logits of the single input.
output = model(**encoded_text)
logits = output[0][0].detach().numpy()
scores = softmax(logits)

In [None]:
scores

In [None]:
# Label the three softmax outputs in index order: 0 -> neg, 1 -> neu, 2 -> pos.
scores_dict = dict(
    roberta_neg=scores[0],
    roberta_neu=scores[1],
    roberta_pos=scores[2],
)
print(scores_dict)

In [None]:
sia.polarity_scores(example)

In [None]:
#reading the text, we can see it is a negative comment; the VADER model scores it as neutral, while the
#RoBERTa model scores it as 97% negative, which suggests RoBERTa is the better model for this example
print(example)

In [None]:
def polarity_scores_roberta(example):
    """Score one text with the notebook-level RoBERTa sentiment model.

    Relies on the module-level ``tokenizer``, ``model``, ``softmax`` and ``torch``
    defined/imported by earlier cells.

    Parameters
    ----------
    example : str
        Text to classify.

    Returns
    -------
    dict
        ``roberta_neg``, ``roberta_neu`` and ``roberta_pos``: entries 0, 1 and 2
        of the softmax over the model's output logits for this text.

    Notes
    -----
    The text is tokenized without truncation. The scoring loop in this notebook
    catches ``RuntimeError`` around this call — presumably raised for inputs
    longer than the model supports; confirm before relying on it.
    """
    encoded_text = tokenizer(example, return_tensors='pt')
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }

In [None]:
# Score every review with both models; a row that raises RuntimeError is reported and skipped.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['Text']
        myid = row['Id']
        vader_result = sia.polarity_scores(text)
        # Prefix the VADER keys so they stay distinguishable from the RoBERTa ones.
        vader_result_rename = {
            f"vader_{key}": value for key, value in vader_result.items()
        }
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both
    except RuntimeError:
        print(f'Broke for id{myid}')

In [None]:
both

In [None]:
vader_result

In [None]:
roberta_result

In [None]:
# Turn the {Id: scores} dict into one row per review and attach the review metadata.
results_df = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
    # Join explicitly on the review Id instead of on whichever columns happen to share a name.
    .merge(df, on='Id', how='left')
)

In [None]:
results_df.head()

In [None]:
results_df.columns

In [None]:
# Compare scores between models: pairwise scatter of every VADER and RoBERTa score, coloured by stars.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=results_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()

In [None]:
#step 4
#Review Examples
#positive 1-star and negative 5-star reviews
#let's look at some examples where the model scoring and the review score differ the most


In [None]:
# Text of the 1-star review that RoBERTa rates most positive.
one_star = results_df.query('Score==1')
one_star.sort_values('roberta_pos', ascending=False)['Text'].values[0]

In [None]:
# Text of the 1-star review that VADER rates most positive.
one_star = results_df.query('Score==1')
one_star.sort_values('vader_pos', ascending=False)['Text'].values[0]

In [None]:
#negative sentiment 5 star reviews
# `ascending` must be the boolean False. The string 'False' is truthy, so it either
# sorts ascending or is rejected by pandas versions that validate the argument.
results_df.query('Score==5').sort_values('roberta_neg', ascending=False)['Text'].values[0]

In [None]:
#negative sentiment 5 star reviews
# `ascending` must be the boolean False. The string 'False' is truthy, so it either
# sorts ascending or is rejected by pandas versions that validate the argument.
results_df.query('Score==5').sort_values('vader_neg', ascending=False)['Text'].values[0]

In [None]:
#extra: The Transformers Pipeline
#quick and easy way to run sentiment predictions

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly so results do not depend on the library's default model.
# NOTE(review): this is believed to be the checkpoint the bare "sentiment-analysis"
# task falls back to — confirm against the warning printed by the original call.
sent_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
sent_pipeline('I love sentiment analysis')

In [None]:
sent_pipeline('make sure to like and subscribe')

In [None]:
sent_pipeline('boo')

In [None]:
sent_pipeline('who are you')

In [None]:
sent_pipeline('he is black')

In [None]:
sent_pipeline('he is white')

In [None]:
sent_pipeline(' ')