# Sentiment Analysis using below two technique
1. VADER (Valence Aware Dictionary and sEntiment Reasoner) - Bag of words approach
2. Roberta Pretrained Model from 🤗 (from Huggingface Pipeline)

In [1]:
# Loading required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import timedelta, datetime
import nltk
import json

In [2]:
# plt.style.use('ggplot')
plt.style.use('default')
plt.style.use('dark_background')

In [3]:
df_comments = pd.read_csv('comments.csv')

In [4]:
df_comments.shape

(548941, 7)

Step 1. VADER Seniment Scoring
We will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text.

This uses a "bag of words" approach:
Stop words are removed
each word is scored and combined to a total score.

Important - This method does not account for the relationship between words, which in human speech is very important, but would give general idea

In [124]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [125]:
sia = SentimentIntensityAnalyzer()

In [126]:
# some examples -- compound score goes from -1 to +1
sia.polarity_scores("I am so happy!")

{'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

In [127]:
sia.polarity_scores("This is the worst day ever!")

{'neg': 0.468, 'neu': 0.532, 'pos': 0.0, 'compound': -0.6588}

**Transformer method**

In [128]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [129]:
# Using pre-trained roberta-model
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [131]:
# Function to apply this model on all the comments data in reviews
def polarity_scores_roberta(text):
    encoded_txt = tokenizer(text, return_tensors='pt')
    try:
        output = model(**encoded_txt)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        scores_dict = {
            'roberta_neg' : scores[0],
            'roberta_neu' : scores[1],
            'roberta_pos' : scores[2]
        }
    except:
        scores_dict = {}
    return scores_dict

In [6]:
# Dropping null comments
df_comments.dropna(subset=['clean_comments'], inplace=True)

In [142]:
res = {}
for i, row in df_comments.iterrows():      
    try:
        text = row['clean_comments']
        # Applying vader --> will be useful, when roberta does not work on some comments
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        # Applying roberta
        roberta_result = polarity_scores_roberta(text)
        both = {'comment': text, **vader_result_rename, **roberta_result}
        res[i] = both
    except RuntimeError as ex:
        print(ex)
        print(f'Broke for id {i}')
    except Exception as ex:
        print(ex)
        print(i)

pd.DataFrame(res).T.to_csv('comments_with_sentiments.csv')

### Joining the vader and roberta sentiments scores to the data which contains reviews file other columns

In [5]:
# Making sure the number of rows are same before joining
df_comments_sentiments = pd.read_csv('comments_with_sentiments.csv')
df_comments_sentiments.shape

(547928, 9)

In [7]:
df_comments.shape

(547928, 7)

In [10]:
# By default join happens on indexes
df_comments_wsent = df_comments.join(df_comments_sentiments, how='left')[[
    'listing_id',
    'id',
    'date',
    'reviewer_id',
    'reviewer_name',
    'comments',
    'clean_comments',
    'vader_neg',
    'vader_neu',
    'vader_pos',
    'vader_compound',
    'roberta_neg',
    'roberta_pos',
    'roberta_neu'
]]

In [12]:
df_comments_wsent.to_csv('comments_with_sentiments.csv')