In this notebook, we use two different methods to analyze the sentiment of the comments extracted in step 1:

- [VADER](https://github.com/cjhutto/vaderSentiment) (Valence Aware Dictionary and sEntiment Reasoner), a dictionary- and rule-based sentiment analysis tool designed for sentiments expressed in social media.

- [RoBERTa](https://arxiv.org/abs/1907.11692), a pretrained transformer-based language model, used here to perform sentiment analysis on the comments.

Finally, we compare the results of the analysis: between the two methods, and between Reddit and YouTube.

## 1. Install Libraries and Load Comment Data

In [None]:
!pip install nltk
!pip install transformers

In [2]:
# import libraries
import pandas as pd
import numpy as np
import csv

In [3]:
# Load the raw comment file into a one-column dataframe, one row per physical line.
# NOTE: the file is split on line breaks rather than parsed as CSV, so any quote
# characters present in the file remain part of the comment text.
# The encoding is given explicitly so that non-ASCII characters in the comments
# are decoded the same way on every platform.
with open('./Reddit_YouTube Comments.csv', encoding='utf-8') as f:
  lines = f.read().splitlines()

df_inter = pd.DataFrame(lines, columns=['comments'])

df_inter

Mounted at /content/drive


Unnamed: 0,comments
0,Lectron CCS to Tesla Adapter : Dual-Level Tes...
1,"""I was a landlord for 20 years and this sounds..."
2,"""Really interesting video, great to see more d..."
3,This sounds like a nightmare of a business. Ku...
4,"""I only got through 6 trips on my 2019 model 3..."
...,...
5262,And Toyota is late to the party. So let’s get ...
5263,The title is super misleading. It’s just one T...
5264,BuT wHaT aBoUt ThE pAnEl GaPs??? /s
5265,Did they just decide to do this recently? Seem...


## 2. Sentiment Analysis

### 2.1. Sentiment analysis with VADER

In [4]:
import nltk
from collections import Counter  # used by the counting cells further down

# Download the VADER lexicon and initialize the sentiment analyzer
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Compute the VADER scores of every comment.
# 'scores_vader' holds the full score dict returned by polarity_scores,
# 'compound_vader' holds only its 'compound' entry.
df_inter['scores_vader'] = df_inter['comments'].apply(sid.polarity_scores)
df_inter['compound_vader']  = df_inter['scores_vader'].apply(lambda score_dict: score_dict['compound'])
# Label by the sign of the compound score: > 0 -> 'pos', < 0 -> 'neg', exactly 0 -> 'neu'
df_inter['comp_score_vader'] = df_inter['compound_vader'].apply(lambda c: 'pos' if c >0 else 'neg' if c <0 else 'neu')
df_inter

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,comments,scores_vader,compound_vader,comp_score_vader
0,Lectron CCS to Tesla Adapter : Dual-Level Tes...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu
1,"""I was a landlord for 20 years and this sounds...","{'neg': 0.172, 'neu': 0.703, 'pos': 0.125, 'co...",-0.7894,neg
2,"""Really interesting video, great to see more d...","{'neg': 0.107, 'neu': 0.767, 'pos': 0.126, 'co...",-0.2377,neg
3,This sounds like a nightmare of a business. Ku...,"{'neg': 0.0, 'neu': 0.766, 'pos': 0.234, 'comp...",0.7003,pos
4,"""I only got through 6 trips on my 2019 model 3...","{'neg': 0.106, 'neu': 0.871, 'pos': 0.023, 'co...",-0.7626,neg
...,...,...,...,...
5262,And Toyota is late to the party. So let’s get ...,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",0.4019,pos
5263,The title is super misleading. It’s just one T...,"{'neg': 0.119, 'neu': 0.708, 'pos': 0.173, 'co...",0.2960,pos
5264,BuT wHaT aBoUt ThE pAnEl GaPs??? /s,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu
5265,Did they just decide to do this recently? Seem...,"{'neg': 0.204, 'neu': 0.796, 'pos': 0.0, 'comp...",-0.3182,neg


### 2.2. Sentiment analysis with RoBERTa

In [5]:
# Import libraries
import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

In [None]:
# Load the pretrained tokenizer, config and sequence-classification model
# identified by MODEL (plain string; no f-string formatting is needed here)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)  # its id2label is used to decode predictions
robert_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [7]:
# Replace user mentions and links with fixed placeholders
def preprocess(text):
    """Mask user mentions and links in a piece of text.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. All other tokens are kept unchanged, and the
    tokens are joined back with single spaces.
    """
    def _mask(token):
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Analyzes the sentiment of the text.
def sentiment(text):
    """Classify one comment with the RoBERTa sentiment model.

    The comment is cut to its first 512 characters, passed through
    ``preprocess``, tokenized and fed to ``robert_model``; the resulting
    scores are converted to probabilities with softmax.

    Args:
        text: The raw comment string.

    Returns:
        A ``(label, score)`` tuple: the ``config.id2label`` entry of the most
        probable class and its softmax probability. If the comment cannot be
        scored, the input and the error are printed and ``('neutral', nan)``
        is returned, so a failure shows up as a missing score instead of
        being recorded as a confident prediction.
    """
    try:
        text = preprocess(text[:512])
        # Also truncate at token level: 512 characters can still encode to more
        # tokens than max_length, and an over-long input would otherwise end
        # up in the except branch below.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        output = robert_model(**encoded_input)
        scores = softmax(output[0][0].detach().numpy())
        best = int(np.argmax(scores))
        return config.id2label[best], scores[best]
    except Exception as e:
        print(text, e)
        # Keep a string label so the counting cells still work, but mark the
        # score as missing rather than inventing a probability.
        return 'neutral', np.float32(np.nan)

In [8]:
# Set lists to store analysis results
sent_label = []   # predicted sentiment label of each comment
sent_value = []   # probability of the predicted label of each comment

# Analyze the sentiment of each comment.
# Iterating over the column (whose length is known) lets tqdm report
# percentage and remaining time, and avoids building a Series for every row.
for comment in tqdm.tqdm(df_inter['comments'], total=len(df_inter)):
  label, value = sentiment(comment)
  sent_label.append(label)
  sent_value.append(value)

5267it [16:32,  5.31it/s]


In [9]:
# Store the RoBERTa labels and their probabilities as two new columns of df_inter
roberta_columns = {
    'roberta_sent_label': sent_label,
    'roberta_sent_value': sent_value,
}
for column_name, column_values in roberta_columns.items():
    df_inter[column_name] = column_values
df_inter

Unnamed: 0,comments,scores_vader,compound_vader,comp_score_vader,roberta_sent_label,roberta_sent_value
0,Lectron CCS to Tesla Adapter : Dual-Level Tes...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu,neutral,0.911012
1,"""I was a landlord for 20 years and this sounds...","{'neg': 0.172, 'neu': 0.703, 'pos': 0.125, 'co...",-0.7894,neg,negative,0.569149
2,"""Really interesting video, great to see more d...","{'neg': 0.107, 'neu': 0.767, 'pos': 0.126, 'co...",-0.2377,neg,positive,0.955658
3,This sounds like a nightmare of a business. Ku...,"{'neg': 0.0, 'neu': 0.766, 'pos': 0.234, 'comp...",0.7003,pos,negative,0.679217
4,"""I only got through 6 trips on my 2019 model 3...","{'neg': 0.106, 'neu': 0.871, 'pos': 0.023, 'co...",-0.7626,neg,negative,0.847127
...,...,...,...,...,...,...
5262,And Toyota is late to the party. So let’s get ...,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",0.4019,pos,neutral,0.589608
5263,The title is super misleading. It’s just one T...,"{'neg': 0.119, 'neu': 0.708, 'pos': 0.173, 'co...",0.2960,pos,negative,0.829832
5264,BuT wHaT aBoUt ThE pAnEl GaPs??? /s,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu,neutral,0.558695
5265,Did they just decide to do this recently? Seem...,"{'neg': 0.204, 'neu': 0.796, 'pos': 0.0, 'comp...",-0.3182,neg,neutral,0.550887


In [10]:
# Write df_inter, with all sentiment columns added so far, to Sentiment_result.csv
result_path = '/content/drive/MyDrive/Web_Intel_test/Sentiment_result.csv'
df_inter.to_csv(path_or_buf=result_path)

## 3. Analysis Result Comparison

In [11]:
# Import plotly to plot comparison results
import plotly.graph_objs as go

### 3.1. Compare analysis results between Reddit and YouTube, according to VADER and RoBERTa respectively

In [12]:
# Add a column to df_inter to indicate the source of the comments.
# The first rows are labelled 'youtube' and the last N_REDDIT rows 'reddit'.
N_REDDIT = 3562  # number of Reddit comments at the end of the dataframe
# Derive the YouTube count from the actual number of rows instead of hard-coding the total
n_youtube = len(df_inter) - N_REDDIT
assert n_youtube >= 0, "df_inter has fewer rows than the expected number of Reddit comments"
df_inter['source'] = ['youtube'] * n_youtube + ['reddit'] * N_REDDIT
df_inter

Unnamed: 0,comments,scores_vader,compound_vader,comp_score_vader,roberta_sent_label,roberta_sent_value,source
0,Lectron CCS to Tesla Adapter : Dual-Level Tes...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu,neutral,0.911012,youtube
1,"""I was a landlord for 20 years and this sounds...","{'neg': 0.172, 'neu': 0.703, 'pos': 0.125, 'co...",-0.7894,neg,negative,0.569149,youtube
2,"""Really interesting video, great to see more d...","{'neg': 0.107, 'neu': 0.767, 'pos': 0.126, 'co...",-0.2377,neg,positive,0.955658,youtube
3,This sounds like a nightmare of a business. Ku...,"{'neg': 0.0, 'neu': 0.766, 'pos': 0.234, 'comp...",0.7003,pos,negative,0.679217,youtube
4,"""I only got through 6 trips on my 2019 model 3...","{'neg': 0.106, 'neu': 0.871, 'pos': 0.023, 'co...",-0.7626,neg,negative,0.847127,youtube
...,...,...,...,...,...,...,...
5262,And Toyota is late to the party. So let’s get ...,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",0.4019,pos,neutral,0.589608,reddit
5263,The title is super misleading. It’s just one T...,"{'neg': 0.119, 'neu': 0.708, 'pos': 0.173, 'co...",0.2960,pos,negative,0.829832,reddit
5264,BuT wHaT aBoUt ThE pAnEl GaPs??? /s,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu,neutral,0.558695,reddit
5265,Did they just decide to do this recently? Seem...,"{'neg': 0.204, 'neu': 0.796, 'pos': 0.0, 'comp...",-0.3182,neg,neutral,0.550887,reddit


#### 3.1.1 VADER

In [13]:
%matplotlib inline

In [14]:
# Per-source VADER label counts: {source: {label: count}}
label_order = ['neg', 'neu', 'pos']
source_vader = {}

# Tally the labels of each source; labels that never occur are reported as 0
for source_name, group in df_inter.groupby("source"):
    tally = Counter(group["comp_score_vader"].tolist())
    source_vader[source_name] = {label: tally[label] for label in label_order}

print(source_vader)

{'reddit': {'neg': 951, 'neu': 1056, 'pos': 1555}, 'youtube': {'neg': 218, 'neu': 257, 'pos': 1230}}


In [15]:
# Load data
source_vader_reddit = source_vader['reddit']
source_vader_youtube = source_vader['youtube']

# Totals per source, computed once, used to express counts as percentages
total_vader_reddit = sum(source_vader_reddit.values())
total_vader_youtube = sum(source_vader_youtube.values())

# Create a trace for source_vader_reddit
trace1 = go.Bar(
    x=list(source_vader_reddit.keys()),
    y=[(val / total_vader_reddit)*100 for val in source_vader_reddit.values()],
    name='Reddit',
)

# Create a trace for source_vader_youtube
trace2 = go.Bar(
    x=list(source_vader_youtube.keys()),
    y=[(val / total_vader_youtube)*100 for val in source_vader_youtube.values()],
    name='Youtube',
)

# Plot bar chart.
# The title font is set through title.font; the older `titlefont` shorthand is
# deprecated and is not accepted by newer Plotly releases.
layout = go.Layout(
    title=dict(text='VADER result between Reddit and Youtube', font=dict(size=26)),
    xaxis=dict(title=dict(text='Sentiment')),
    yaxis=dict(title=dict(text='% of total')),
    barmode='group'
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

#### 3.1.2 RoBERTa

In [16]:
# Per-source RoBERTa label counts: {source: {label: count}}
label_order = ['negative', 'neutral', 'positive']
source_roberta = {}

# Tally the labels of each source; labels that never occur are reported as 0
for source_name, group in df_inter.groupby("source"):
    tally = Counter(group["roberta_sent_label"].tolist())
    source_roberta[source_name] = {label: tally[label] for label in label_order}

print(source_roberta)

{'reddit': {'negative': 1339, 'neutral': 1514, 'positive': 709}, 'youtube': {'negative': 230, 'neutral': 478, 'positive': 997}}


In [17]:
# Load data
source_roberta_reddit = source_roberta['reddit']
source_roberta_youtube = source_roberta['youtube']

# Totals per source, computed once, used to express counts as percentages
total_roberta_reddit = sum(source_roberta_reddit.values())
total_roberta_youtube = sum(source_roberta_youtube.values())

# Create a trace for source_roberta_reddit
trace1 = go.Bar(
    x=list(source_roberta_reddit.keys()),
    y=[(val / total_roberta_reddit)*100 for val in source_roberta_reddit.values()],
    name='Reddit',
)

# Create a trace for source_roberta_youtube
trace2 = go.Bar(
    x=list(source_roberta_youtube.keys()),
    y=[(val / total_roberta_youtube)*100 for val in source_roberta_youtube.values()],
    name='Youtube',  # legend label (was misspelled 'Yotube')
)

# Plot bar chart.
# The title font is set through title.font; the older `titlefont` shorthand is
# deprecated and is not accepted by newer Plotly releases.
layout = go.Layout(
    title=dict(text='RoBERTa result between Reddit and Youtube', font=dict(size=26)),
    xaxis=dict(title=dict(text='Sentiment')),
    yaxis=dict(title=dict(text='% of total')),
    barmode='group'
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()


### 3.2. Compare VADER and RoBERTa analysis results on all comments

In [18]:
# Distinct labels and how often each occurs over all comments, per method
vader_values, vader_counts = np.unique(df_inter['comp_score_vader'].values, return_counts=True)
roberta_values, roberta_counts = np.unique(df_inter['roberta_sent_label'].values, return_counts=True)

# Translate both label sets to the same names ('negative', 'neutral', 'positive')
vader_mapping = dict(neg='negative', neu='neutral', pos='positive')
roberta_mapping = {name: name for name in ('negative', 'neutral', 'positive')}

# Build {unified label: count} dictionaries.
# NOTE: this rebinds source_vader / source_roberta, which held per-source counts before.
source_vader = dict(zip((vader_mapping[value] for value in vader_values), vader_counts))
source_roberta = dict(zip((roberta_mapping[value] for value in roberta_values), roberta_counts))

print('VADER', source_vader)
print('RoBERTa', source_roberta)

VADER {'negative': 1169, 'neutral': 1313, 'positive': 2785}
RoBERTa {'negative': 1569, 'neutral': 1992, 'positive': 1706}


In [19]:
# Totals per method, computed once, used to express counts as percentages
total_vader = sum(source_vader.values())
total_roberta = sum(source_roberta.values())

# Create a trace for source_vader
trace1 = go.Bar(
    x=list(source_vader.keys()),
    y=[(val / total_vader)*100 for val in source_vader.values()],
    name='VADER',
)

# Create a trace for source_roberta
trace2 = go.Bar(
    x=list(source_roberta.keys()),
    y=[(val / total_roberta)*100 for val in source_roberta.values()],
    name='RoBERTa'
)


# Plot bar chart to compare.
# The title font is set through title.font; the older `titlefont` shorthand is
# deprecated and is not accepted by newer Plotly releases.
layout = go.Layout(
    title=dict(text='Comparison of sentiment analysis between VADER and RoBERTa', font=dict(size=26)),
    xaxis=dict(title=dict(text='Sentiment')),
    yaxis=dict(title=dict(text='% of total')),
    barmode='group'
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()