# Sentiment Analysis using below two technique
1. VADER (Valence Aware Dictionary and sEntiment Reasoner) - Bag of words approach
2. Roberta Pretrained Model from 🤗 (from Huggingface Pipeline)

In [2]:
# Loading required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import timedelta, datetime
import nltk
import json

In [34]:
# plt.style.use('ggplot')
plt.style.use('default')
plt.style.use('dark_background')

In [35]:
df_comments = pd.read_csv('./data_after_processing/reviews_related_data/comments_with_keyphrases.csv', parse_dates=['date'])

In [36]:
df_comments.shape

(546736, 9)

Step 1. VADER Seniment Scoring
We will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text.

This uses a "bag of words" approach:
Stop words are removed
each word is scored and combined to a total score.

Important - This method does not account for the relationship between words, which in human speech is very important, but would give general idea

In [37]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

In [38]:
sia = SentimentIntensityAnalyzer()

In [39]:
# some examples -- compound score goes from -1 to +1
sia.polarity_scores("I am so happy!")

{'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

In [40]:
sia.polarity_scores("This is the worst day ever!")

{'neg': 0.468, 'neu': 0.532, 'pos': 0.0, 'compound': -0.6588}

**Transformer method**

In [41]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [42]:
# Using pre-trained roberta-model
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [43]:
# Function to apply this model on all the comments data in reviews
def polarity_scores_roberta(text):
    encoded_txt = tokenizer(text, return_tensors='pt')
    try:
        output = model(**encoded_txt)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        scores_dict = {
            'roberta_neg' : scores[0],
            'roberta_neu' : scores[1],
            'roberta_pos' : scores[2]
        }
    except:
        scores_dict = {}
    return scores_dict

In [44]:
res = {}
for i, row in df_comments.iterrows():      
    try:
        text = row['clean_comments']
        # Applying vader --> will be useful, when roberta does not work on some comments
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        # Applying roberta
        roberta_result = polarity_scores_roberta(text)
        both = {'comment': text, **vader_result_rename, **roberta_result}
        res[i] = both
    except RuntimeError as ex:
        print(ex)
        print(f'Broke for id {i}')
    except Exception as ex:
        print(ex)
        print(i)

pd.DataFrame(res).T.to_csv('./data_after_processing/reviews_related_data/comments_with_sentiments.csv')

### Joining the vader and roberta sentiments scores to the data which contains reviews file other columns

In [3]:
# Making sure the number of rows are same before joining
df_comments_sentiments = pd.read_csv('./data_after_processing/reviews_related_data/comments_with_sentiments.csv')
df_comments_sentiments.shape

(546736, 15)

In [49]:
# By default join happens on indexes
df_comments_wsent = df_comments.join(df_comments_sentiments, how='left')[[
    'listing_id',
    'id',
    'date',
    'reviewer_id',
    'reviewer_name',
    'comments',
    'clean_comments',
    'keyphrases',
    'vader_neg',
    'vader_neu',
    'vader_pos',
    'vader_compound',
    'roberta_neg',
    'roberta_pos',
    'roberta_neu'
]]

In [50]:
# Final file which will have sentiments on the clean comments
df_comments_wsent.to_csv('./data_after_processing/reviews_related_data/comments_with_sentiments.csv')

## Load the Roberta scores from the earlier execute files

In [26]:
# earlier executed file
df_earlier_sentiment = pd.read_csv('./data_after_processing/comments_with_sentiments_earlier.csv')
df_latest_sentiment = pd.read_csv('./data_after_processing/reviews_related_data/comments_with_sentiments.csv')

In [27]:
df_latest_sentiment.shape

(546736, 15)

In [28]:
df_latest_sentiment[df_latest_sentiment.duplicated(subset = ['listing_id','date','id','reviewer_id'], keep = False)]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,keyphrases,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_pos,roberta_neu
298820,44208340.0,6.68e+17,7/10/2022,341534968,Tara,This is such a fun and unique hotel. I stayed ...,thi be such a fun and unique hotel i stay in a...,"[[""unique hotel"", 0.6277], [""hotel"", 0.6169], ...",0.0,0.763,0.237,0.8979,0.002169,0.969624,0.028207
298821,44208340.0,6.68e+17,7/10/2022,341534968,Tara,This Airbnb really makes you feel more like a ...,thi airbnb really make you feel more like a lo...,"[[""airbnb really"", 0.7086], [""thi airbnb"", 0.6...",0.0,0.792,0.208,0.699,0.002199,0.957361,0.04044
330369,6.09e+17,7.06e+17,9/1/2022,179330149,Tiffanie,"Clean, modern look. Comfortable. Easy check in...",clean modern look comfortable easy check in an...,"[[""clean modern"", 0.4293], [""downtown area"", 0...",0.022,0.692,0.286,0.9531,0.002766,0.958726,0.038508
330370,6.09e+17,7.06e+17,9/1/2022,179330149,Tiffanie,Great place.,great place,"[[""great place"", 1.0], [""place"", 0.6453], [""gr...",0.0,0.196,0.804,0.6249,0.006706,0.932469,0.060825
337636,6.34e+17,6.37e+17,5/28/2022,37630698,Ashley,We loved our stay here and highly recommend it...,we love our stay here and highly recommend it ...,"[[""decor beautiful"", 0.4506], [""condo"", 0.4118...",0.0,0.709,0.291,0.9633,0.001846,0.992267,0.005887
337967,6.34e+17,6.37e+17,5/28/2022,37630698,Ashley,This place is stunning. We were amazed at the ...,thi place be stun we be amaze at the design an...,"[[""downtown"", 0.5692], [""downtown highly"", 0.5...",0.0,0.773,0.227,0.9738,0.00148,0.974107,0.024413
337968,6.34e+17,6.4e+17,6/1/2022,333353279,Sarah,Absolutely AMAZING stay for our bachelorette w...,absolutely amaze stay for our bachelorette wee...,"[[""bachelorette weekend"", 0.5126], [""stay bach...",0.0,0.726,0.274,0.9703,0.001445,0.990881,0.007674
337969,6.34e+17,6.41e+17,6/3/2022,62461671,Tim,We had such a great time here. Check in proces...,we have such a great time here check in proces...,"[[""visit nashville"", 0.6359], [""nashville"", 0....",0.0,0.652,0.348,0.9949,0.001447,0.987834,0.010719
338026,6.34e+17,6.4e+17,6/1/2022,333353279,Sarah,Joes Place is spectacular! This place is exact...,joe place be spectacular thi place be exactly ...,"[[""view nashville"", 0.5825], [""nashville skyli...",0.03,0.706,0.264,0.9855,0.002739,0.971101,0.02616
338027,6.34e+17,6.41e+17,6/3/2022,62461671,Tim,Loved the design of this place - everything wa...,love the design of thi place everythe be high ...,"[[""rooftop area"", 0.532], [""perfect location"",...",0.0,0.746,0.254,0.9712,0.00143,0.986561,0.012008


In [29]:
df_earlier_sentiment.drop(columns = ['Unnamed: 0'], inplace=True)
df_earlier_sentiment[['listing_id', 'id']] = df_earlier_sentiment[['listing_id', 'id']].astype('float')
df_earlier_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 547928 entries, 0 to 547927
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   listing_id      547928 non-null  float64
 1   id              547928 non-null  float64
 2   date            547928 non-null  object 
 3   reviewer_id     547928 non-null  int64  
 4   reviewer_name   547927 non-null  object 
 5   comments        547928 non-null  object 
 6   clean_comments  547928 non-null  object 
 7   vader_neg       546921 non-null  float64
 8   vader_neu       546921 non-null  float64
 9   vader_pos       546921 non-null  float64
 10  vader_compound  546921 non-null  float64
 11  roberta_neg     546657 non-null  float64
 12  roberta_pos     546657 non-null  float64
 13  roberta_neu     546657 non-null  float64
dtypes: float64(9), int64(1), object(4)
memory usage: 58.5+ MB


In [31]:
df_latest_sentiment[df_latest_sentiment['roberta_neg'].isna()].shape

(249, 15)

In [32]:
df_earlier_sentiment[df_earlier_sentiment['roberta_neg'].isna()].shape

(1271, 14)

In [33]:
# do the set difference to identify records which have the roberta values in the first execution
df_merge_new = df_latest_sentiment[df_latest_sentiment['roberta_neg'].isna()].merge(df_earlier_sentiment, how = 'left', 
                                                                                    on = ['listing_id','id','reviewer_id'])
df_merge_new.shape

(249, 26)

In [37]:
df_merge_new.columns

Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments',
       'clean_comments', 'keyphrases', 'vader_neg_x', 'vader_neu_x',
       'vader_pos_x', 'vader_compound_x', 'roberta_neg_x', 'roberta_pos_x',
       'roberta_neu_x', 'roberta_neg_y', 'roberta_pos_y', 'roberta_neu_y'],
      dtype='object')

In [35]:
df_merge_new.drop(columns = ['date_y','reviewer_name_y','comments_y','clean_comments_y','vader_neg_y','vader_neu_y','vader_pos_y','vader_compound_y'],inplace=True)
df_merge_new.rename(columns = {'date_x' : 'date' , 'reviewer_name_x':'reviewer_name','comments_x' : 'comments', 'clean_comments_x' : 'clean_comments'}, inplace=True)

In [38]:
df_merge_new.head(2)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,keyphrases,vader_neg_x,vader_neu_x,vader_pos_x,vader_compound_x,roberta_neg_x,roberta_pos_x,roberta_neu_x,roberta_neg_y,roberta_pos_y,roberta_neu_y
0,17669492.0,485400987.0,7/10/2019,234139964,Sarah,Seth was a very communicative host and overall...,seth be a very communicative host and overall ...,"[[""place cleaner"", 0.4353], [""stay house"", 0.3...",0.049,0.814,0.137,0.9938,,,,0.001985,0.976501,0.021514
1,16827497.0,352682042.0,11/25/2018,33913805,Rebekah,My husband and I thoroughly enjoyed staying at...,my husband and i thoroughly enjoy stay at ambe...,"[[""recommend bath"", 0.5464], [""amber place"", 0...",0.01,0.779,0.211,0.9991,,,,0.001502,0.989222,0.009276


In [39]:
# Copy the roberta_y values into roberta_x and the one's which do have the roberta_y values also copy the vader score

def fill_values_neg(row): 
    if np.isnan(row['roberta_neg_y']):
       return row['vader_neg_x']
    else:
        return row['roberta_neg_y']    
   
df_merge_new['roberta_neg_x'] = df_merge_new.apply(fill_values_neg, axis = 1)

def fill_values_pos(row): 
    if np.isnan(row['roberta_pos_y']):
       return row['vader_pos_x']
    else:
        return row['roberta_pos_y']    
   
df_merge_new['roberta_pos_x'] = df_merge_new.apply(fill_values_pos, axis = 1)

def fill_values_neu(row): 
    if np.isnan(row['roberta_neu_y']):
       return row['vader_neu_x']
    else:
        return row['roberta_neu_y']    
   
df_merge_new['roberta_neu_x'] = df_merge_new.apply(fill_values_neu, axis = 1)

In [40]:
df_merge_new1 = df_merge_new[['listing_id','id','date','reviewer_id',
                                'reviewer_name','comments','clean_comments','keyphrases','vader_neg_x',
                                'vader_neu_x', 'vader_pos_x','vader_compound_x','roberta_neg_x','roberta_pos_x','roberta_neu_x']].\
                                    rename(columns={'vader_neg_x' : 'vader_neg', 'vader_neu_x' : 'vader_neu', 'vader_pos_x':'vader_pos',
                                                    'vader_compound_x':'vader_compound' ,'roberta_neg_x' : 'roberta_neg', 'roberta_pos_x':'roberta_pos','roberta_neu_x':'roberta_neu'}).copy()
df_merge_new1.head(2)                                                    

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,keyphrases,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_pos,roberta_neu
0,17669492.0,485400987.0,7/10/2019,234139964,Sarah,Seth was a very communicative host and overall...,seth be a very communicative host and overall ...,"[[""place cleaner"", 0.4353], [""stay house"", 0.3...",0.049,0.814,0.137,0.9938,0.001985,0.976501,0.021514
1,16827497.0,352682042.0,11/25/2018,33913805,Rebekah,My husband and I thoroughly enjoyed staying at...,my husband and i thoroughly enjoy stay at ambe...,"[[""recommend bath"", 0.5464], [""amber place"", 0...",0.01,0.779,0.211,0.9991,0.001502,0.989222,0.009276


In [42]:
for _, row in df_merge_new1.iterrows():
    index = df_latest_sentiment[(df_latest_sentiment['listing_id'] == row['listing_id']) 
    & (df_latest_sentiment['id'] == row['id'])
    & (df_latest_sentiment['reviewer_id'] == row['reviewer_id'])].index
    df_latest_sentiment.loc[index[0], 'roberta_neg'] = row['roberta_neg']
    df_latest_sentiment.loc[index[0], 'roberta_pos'] = row['roberta_pos']
    df_latest_sentiment.loc[index[0], 'roberta_neu'] = row['roberta_neu']


In [43]:
df_latest_sentiment[df_latest_sentiment.isnull().T.any()]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,keyphrases,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_pos,roberta_neu
154364,3710914.0,115989408.0,11/27/2016,24562860,,this house is amazing. My family really love t...,thi house be amaze my family really love thi c...,"[[""thi house"", 0.583], [""nice house"", 0.5484],...",0.0,0.579,0.421,0.9498,0.001727,0.990759,0.007514


In [44]:
# Now finally update the file 
df_latest_sentiment.to_csv('./data_after_processing/reviews_related_data/comments_with_sentiments_imputed.csv',index=False )