In [11]:
import pandas as pd
from spark_session_manager import SparkSessionManager

In [12]:
spark = SparkSessionManager().get_spark_session()

## Load data

In [13]:
# emotions = pd.read_parquet("data/results/emotion-english-distilroberta-base-split_min.parquet")
emotions = spark.read.parquet("data/results/emotion-english-distilroberta-base-split_min.parquet")

emotions.head()

emotions.dtypes

[('video_id', 'string'),
 ('anger', 'double'),
 ('disgust', 'double'),
 ('fear', 'double'),
 ('joy', 'double'),
 ('neutral', 'double'),
 ('sadness', 'double'),
 ('surprise', 'double'),
 ('highest_confidence_emotion', 'string')]

In [14]:
# remove duplicates
emotions = emotions.dropDuplicates(subset=['video_id'])

In [15]:
emotions.head(20)

                                                                                

[Row(video_id='--SvHNpSvpk', anger=0.005676752887666225, disgust=0.005355197470635176, fear=0.006694482173770666, joy=0.07837428897619247, neutral=0.5112354159355164, sadness=0.2456246316432953, surprise=0.1470392346382141, highest_confidence_emotion='neutral'),
 Row(video_id='--gJDs10ShA', anger=0.9241438508033752, disgust=0.021352430805563927, fear=0.008110920898616314, joy=0.0019977663177996874, neutral=0.008527128957211971, sadness=0.0054464456625282764, surprise=0.030421489849686623, highest_confidence_emotion='anger'),
 Row(video_id='--hjHKgm67g', anger=0.21622449159622192, disgust=0.3774898648262024, fear=0.07863780111074448, joy=0.006555378437042236, neutral=0.20383568108081818, sadness=0.1043320968747139, surprise=0.012924706563353539, highest_confidence_emotion='disgust'),
 Row(video_id='-0PZSxZuAXQ', anger=0.011317762546241283, disgust=0.003688700497150421, fear=0.0034214965999126434, joy=0.23058640956878662, neutral=0.5751965641975403, sadness=0.06553573906421661, surprise=

In [16]:
# read parquet file
# engagement = pd.read_parquet("data/processed/engagement.parquet")
engagement = spark.read.parquet("data/processed/engagement.parquet")

# show the first 5 rows
engagement.head()

# show the schema
engagement.dtypes

[('id', 'bigint'),
 ('video_id', 'string'),
 ('likes', 'int'),
 ('dislikes', 'int'),
 ('comment_count', 'int'),
 ('view_count', 'int'),
 ('view_to_like_ratio', 'double'),
 ('like_to_comment_ratio', 'double'),
 ('engagement_rate', 'double'),
 ('comments_per_view', 'double'),
 ('average_views_per_day', 'double')]

In [17]:
# show first 50 rows, asc by video_id
engagement.orderBy("video_id").show(50)

+------+-----------+------+--------+-------------+----------+------------------+---------------------+------------------+--------------------+---------------------+
|    id|   video_id| likes|dislikes|comment_count|view_count|view_to_like_ratio|like_to_comment_ratio|   engagement_rate|   comments_per_view|average_views_per_day|
+------+-----------+------+--------+-------------+----------+------------------+---------------------+------------------+--------------------+---------------------+
| 60792|--14w5SOEUs|122830|     867|         9539|   1076217| 8.761841569649109|   12.876611804172345| 12.29947120329822|0.008863454117524625|            1076217.0|
| 60991|--14w5SOEUs|167034|    1755|        12998|   2146104|12.848306332842416|   12.850746268656716|  8.38878264986226|0.006056556439016935|            2146104.0|
| 61197|--14w5SOEUs|202153|    2518|        14721|   3317372|16.410204152300484|   13.732287208749405| 6.537524281268425|0.004437548758475082|            1658686.0|
| 61398|--

In [18]:
from pyspark.sql import functions as F

# join emotions and engagement metrics and data on column 'video_id', leave only one video_id column
# Join the dataframes on 'video_id'
df = engagement.join(emotions, on='video_id', how='inner')

df.orderBy("video_id").show(50)


+-----------+------+------+--------+-------------+----------+------------------+---------------------+------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------------+
|   video_id|    id| likes|dislikes|comment_count|view_count|view_to_like_ratio|like_to_comment_ratio|   engagement_rate|   comments_per_view|average_views_per_day|               anger|             disgust|                fear|                 joy|             neutral|             sadness|            surprise|highest_confidence_emotion|
+-----------+------+------+--------+-------------+----------+------------------+---------------------+------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------

## Descriptive Stats for Emotions

In [40]:
# import color palette
from style.emotions_palette import palette

# map the color palette to the emotions
emotion_dict = {
    'anger': 'red',
    'disgust': 'green',
    'fear': 'black',
    'joy': 'yellow',
    'neutral': 'grey',
    'sadness': 'blue',
    'surprise': 'orange'
}

# map the color palette to the emotions
emotions_palette = {k: palette[v] for k, v in emotion_dict.items()}

### Average views by emotions

In [42]:
# calculate average views by emotion

## group by emotion and calculate the average views
average_views_by_emotion = df.groupby('highest_confidence_emotion').agg({'view_count': 'mean'})

# change name of column 'highest_confidence_emotion' to 'emotion'
average_views_by_emotion = average_views_by_emotion.withColumnRenamed('highest_confidence_emotion', 'emotion')

In [43]:
average_views_by_emotion.show()

+--------+------------------+
| emotion|   avg(view_count)|
+--------+------------------+
|     joy| 2842972.042824706|
| neutral|2681463.9558491763|
|   anger| 2223559.851560428|
|    fear|3568719.0237273816|
|surprise|3316297.3522978346|
| sadness|1744728.3174656765|
| disgust|1944047.1853321576|
+--------+------------------+



In [51]:
from style.chart_defaults import *


In [54]:
# Assuming 'average_views_by_emotion' is your PySpark DataFrame
emotion_pd_df = average_views_by_emotion.toPandas()

import plotly.graph_objects as go

# Create a bar chart using Plotly
fig = go.Figure(go.Bar(
    x=emotion_pd_df['emotion'],  # Emotion names on the x-axis
    y=emotion_pd_df['avg(view_count)'],  # Average view counts on the y-axis
    marker=dict(
        color=emotion_pd_df['avg(view_count)'],  # Color the bars by average view counts
        colorscale='Viridis'  # Choose a colorscale that visually represents data well
    )
))

# Update the layout for a more customized look
fig.update_layout(
    title='Average YouTube View Count by Emotion',
    xaxis_title='Emotion',
    yaxis_title='Average View Count',
    font=dict(size=12)
)

# Show the figure
fig.show()

# save as html, use cdn
fig.write_html("charts/emotions/mean_views_by_emotion.html", include_plotlyjs='cdn')


### Emotion Distribution Across Categories

In [61]:
df.show()

+-----------+---+------+--------+-------------+----------+------------------+---------------------+------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------------+
|   video_id| id| likes|dislikes|comment_count|view_count|view_to_like_ratio|like_to_comment_ratio|   engagement_rate|   comments_per_view|average_views_per_day|               anger|             disgust|                fear|                 joy|             neutral|             sadness|            surprise|highest_confidence_emotion|
+-----------+---+------+--------+-------------+----------+------------------+---------------------+------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------

In [65]:
import plotly.express as px
from pyspark.sql import functions as F

# Print the schema to verify the structure
df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- id: long (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- view_to_like_ratio: double (nullable = true)
 |-- like_to_comment_ratio: double (nullable = true)
 |-- engagement_rate: double (nullable = true)
 |-- comments_per_view: double (nullable = true)
 |-- average_views_per_day: double (nullable = true)
 |-- anger: double (nullable = true)
 |-- disgust: double (nullable = true)
 |-- fear: double (nullable = true)
 |-- joy: double (nullable = true)
 |-- neutral: double (nullable = true)
 |-- sadness: double (nullable = true)
 |-- surprise: double (nullable = true)
 |-- highest_confidence_emotion: string (nullable = true)



In [67]:
emotion_category_distribution = df.groupby('highest_confidence_emotion').agg(F.count('video_id').alias('count'))

In [71]:
from style.emotions_palette import palette
# Create a stacked bar chart using the Pandas DataFrame
fig = px.bar(emotion_category_distribution,
             x='category',
             y='count',
                color='emotion',
                barmode='stack',
                title='Emotion Distribution Across Categories',
                color_discrete_map=palette
                )

fig.update_layout(template='plotly_dark')


# save as svg
fig.write_image(
    "charts/emotions/emotion_distribution_across_categories.svg", width=1200, height=800)

# save as html
fig.write_html(
    "charts/emotions/emotion_distribution_across_categories.html")

fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['highest_confidence_emotion', 'count'] but received: category

### Engagement Metrics by Emotion


### Average Engagement Metrics by Category

In [47]:
final_df.head()

Unnamed: 0,video_id,anger,disgust,fear,joy,neutral,sadness,surprise,highest_confidence_emotion,title,categoryTitle,view_to_like_ratio,like_to_comment_ratio,engagement_rate,comments_per_view,average_views_per_day,likes,dislikes,comment_count,view_count
0,3C66w5Z0ixs,0.502659,0.089732,0.012764,0.007059,0.16835,0.180872,0.038564,anger,I ASKED HER TO BE MY GIRLFRIEND...,People & Blogs,9.652879,4.443349,12.691088,0.023315,1514614.0,156908,5855,35313,1514614
1,3C66w5Z0ixs,0.502659,0.089732,0.012764,0.007059,0.16835,0.180872,0.038564,anger,I ASKED HER TO BE MY GIRLFRIEND...,People & Blogs,9.652879,4.443349,12.691088,0.023315,1514614.0,183592,9506,41374,3017834
2,3C66w5Z0ixs,0.502659,0.089732,0.012764,0.007059,0.16835,0.180872,0.038564,anger,I ASKED HER TO BE MY GIRLFRIEND...,People & Blogs,9.652879,4.443349,12.691088,0.023315,1514614.0,195344,10648,44555,3721729
3,3C66w5Z0ixs,0.502659,0.089732,0.012764,0.007059,0.16835,0.180872,0.038564,anger,I ASKED HER TO BE MY GIRLFRIEND...,People & Blogs,9.652879,4.443349,12.691088,0.023315,1514614.0,200215,10904,45461,3853404
4,3C66w5Z0ixs,0.502659,0.089732,0.012764,0.007059,0.16835,0.180872,0.038564,anger,I ASKED HER TO BE MY GIRLFRIEND...,People & Blogs,9.652879,4.443349,12.691088,0.023315,1514614.0,201808,10975,45574,3898795


In [48]:
# Calculate Average Engagement Metrics in PySpark


import numpy as np
import matplotlib.pyplot as plt
# !pip install palettable
from palettable import colorbrewer
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from pyspark.sql.functions import avg

# Assuming df is a PySpark DataFrame with your data
metrics = final_df.groupby('categoryTitle').agg(
    avg('view_count').alias('average_views'),
    avg('like_count').alias('average_likes'),
    avg('dislike_count').alias('average_dislikes'),
    avg('comment_count').alias('average_comments'),
    avg('engagement_rate').alias('average_engagement_rate')
)


# Convert to Pandas DataFrame for visualization
metrics_pd = metrics.toPandas()

# ------------------------- normalize for radar chart ------------------------ #

scaler = MinMaxScaler()

metrics_normalized = pd.DataFrame(scaler.fit_transform(metrics_pd.iloc[:, 1:]),
                                  columns=metrics_pd.columns[1:],
                                  index=metrics_pd['categoryTitle'])

# Reset index to keep 'categoryId' as a column for plotting
metrics_normalized.reset_index(inplace=True)

# use color pallette from brewer

sequential_palette = colorbrewer.sequential.YlGnBu_9.hex_colors


# Plot Radar Chart


categories = list(metrics_normalized.columns[1:])
N = len(categories)

angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

for index, row in metrics_normalized.iterrows():
    data = metrics_normalized.iloc[index].drop('categoryId').tolist()
    data += data[:1]
    ax.plot(angles, data, linewidth=2,
            linestyle='solid', label=row['categoryId'])
    ax.fill(angles, data, alpha=0.1)

ax.set_thetagrids(np.degrees(angles[:-1]), categories)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# # add names of labels at the end of each line
# for i in range(len(metrics_normalized)):
#     ax.text(angles[-1], data[-1], metrics_normalized['categoryId'][i], fontsize=12, color=sequential_palette[i])

plt.show()


# save as svg


# save as svg
fig.write_image(
    "charts/eng_metrics/average_engagement_metrics.svg", width=1200, height=800)

TypeError: Column.__init__() missing 1 required positional argument: 'jc'

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Assume metrics_normalized_df has been normalized and contains categories as rows
# and metrics as columns, as per your previous data structure.

# Get unique categories
unique_categories = metrics_normalized['categoryId'].unique()

# Define the number of rows and columns for subplots
num_rows = int(len(unique_categories) ** 0.5)
num_cols = (len(unique_categories) + num_rows -
            1) // num_rows  # Ceiling division

# Create subplots, each subplot is a radar chart
fig = make_subplots(
    rows=num_rows, cols=num_cols,
    specs=[[{'type': 'polar'}] * num_cols] * num_rows,
    subplot_titles=unique_categories
)

# Plot each category in a separate radar chart
for i, category in enumerate(unique_categories):
    category_data = metrics_normalized[metrics_normalized['categoryId']
                                       == category].iloc[0]

    fig.add_trace(
        go.Scatterpolar(
            r=category_data[metrics].tolist() + [category_data[metrics[0]]],
            theta=metrics,
            fill='toself',
            name=category
        ),
        row=(i // num_cols) + 1,  # Row in subplot grid
        col=(i % num_cols) + 1    # Col in subplot grid
    )

# Update layout for all subplots
fig.update_layout(
    title='Average Engagement Metrics by Category',
    template='plotly_dark',
    polar=dict(radialaxis=dict(visible=True)),
    showlegend=False
)

# hide avg_controversy_index etc labels outside radar
fig.update_polars(radialaxis=dict(visible=True, tickangle=45))


# save as svg
fig.write_image(
    "charts/eng_metrics/radar_chart_average_engagement_metrics.svg", width=1900, height=1200)

# save as html
fig.write_html(
    "charts/eng_metrics/radar_chart_average_engagement_metrics.html", auto_open=True)

In [None]:
# comments rate = comments / views for category

### Correlation Heatmap

In [None]:
# Convert to Pandas DataFrame (if not already in Pandas)
import plotly.figure_factory as ff
df_pd = df.select("view_count", "likes", "dislikes", "comment_count",
                                                            "comment_rate", "dislike_rate", "dislike_ratio", "controversy_index").toPandas()

# Calculate Correlation Matrix
corr_matrix = df_pd.corr()

# Plot Heatmap with Plotly

fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=corr_matrix.round(2).values,
    colorscale='Viridis',
    showscale=True
)

fig.update_layout(title_text='Correlation Heatmap of Engagement Metrics',
                  xaxis_title='Metrics',
                  yaxis_title='Metrics',
                  template='plotly_dark')

# save as svg
fig.write_image(
    "charts/eng_metrics/Correlation_Heatmap_of_Engagement_Metrics.svg", width=1200, height=800)

In [None]:
import plotly.figure_factory as ff
import numpy as np

# Assuming df_pd is your Pandas DataFrame
corr_matrix = df_pd.corr()

# Mask to display only upper half
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Heatmap
fig = ff.create_annotated_heatmap(
    z=corr_matrix.where(mask).values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=corr_matrix.where(mask).round(2).astype(str).values,
    colorscale='Viridis',
    showscale=True
)

# Update layout to include definitions
metric_definitions = """
<b>Definitions</b>:
<br>Comment Rate = comments / views || Dislike Rate = dislikes / view
<br>Dislike Ratio = dislikes / (likes + dislikes) || Controversy Index = comment rate * dislike ratio
"""

fig.update_layout(
    title='Correlation Heatmap of Engagement Metrics<br>',
    xaxis=dict(title='Metrics'),
    yaxis=dict(title='Metrics'),
    template='plotly_dark'
)

# inert annotations at the bottom

fig.add_annotation(
    text=metric_definitions,
    xref="paper", yref="paper",
    x=0.1, y=-0.1, showarrow=False,
    font=dict(size=14, color="white")
)

# save as svg
fig.write_image(
    "charts/eng_metrics/Correlation_Heatmap_of_Engagement_Metrics2.svg", width=1200, height=800)

In [None]:
# correlation of emotions and engagement metrics

# Calculate Correlation Matrix
corr_matrix = df.select("view_count", "likes", "dislikes", "comment_count", "comment_rate", "dislike_rate", "dislike_ratio", "controversy_index") \
    .toPandas().corr()


corr_matrix

### Emotions per category

### create df for emotions

In [None]:
# emotions per category
emotions_per_category = df.groupBy(
    'categoryId', 'emotion').count().orderBy('categoryId', 'emotion')

# convert to pandas
emotions_per_category_pd = emotions_per_category.toPandas()

# head
emotions_per_category_pd.head()

In [None]:
# show whole dataframe, do not truncate
# import pandas as pd
# pd.set_option('display.max_rows', None)
# emotions_per_category_pd

In [None]:
# chart emotions per category
# use stacked bar plot

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create a figure with subplots

fig = make_subplots(rows=1, cols=2, subplot_titles=(
    "Emotions per Category", "Emotions per Category (Normalized)"))

# Create a bar plot for the absolute counts

for i, category in enumerate(emotions_per_category_pd['categoryId'].unique()):
    data = emotions_per_category_pd[emotions_per_category_pd['categoryId'] == category]
    fig.add_trace(go.Bar(x=data['emotion'],
                  y=data['count'], name=category), row=1, col=1)

# Create a bar plot for the normalized counts
for i, category in enumerate(emotions_per_category_pd['categoryId'].unique()):
    data = emotions_per_category_pd[emotions_per_category_pd['categoryId'] == category]
    fig.add_trace(go.Bar(x=data['emotion'], y=data['count'] /
                  data['count'].sum(), name=category), row=1, col=2)

### Sankey chart

In [None]:
import pandas as pd
from pySankey.sankey import sankey
import matplotlib.pyplot as plt

# Assuming df_subset_pd is your DataFrame with 'emotion' and 'categoryId' columns ready for plotting

# Create a Sankey chart
sankey(df_subset_pd['emotion'],
       df_subset_pd['categoryId'], aspect=20, fontsize=10)

# Get current figure
fig = plt.gcf()

# Set the size of the figure in inches [width, height]
# Adjust these values as needed to avoid label collision
fig.set_size_inches(12, 12)

# Set the color of the background to black
fig.set_facecolor("black")

# set title "Emotions per Category (Sankey Diagram)"
plt.title("Emotions per Category (Sankey Diagram)", fontsize=14, color="white")

# set color of the labels to white
plt.rcParams['text.color'] = 'white'

# Save the figure to a file, adjust the dpi to increase the resolution
fig.savefig("charts/sankey-emotions-per-category.png", bbox_inches="tight", dpi=300)

# save plt to file 'html
# plt.savefig("charts/sankey-emotions-per-category.html", format='html')

In [None]:
# Re-importing necessary library for the Sankey diagram
import plotly.graph_objects as go

# Assuming a simplified version of the provided data for demonstration purposes

# Correcting the data input issue and focusing on including colors for the emotions

# Simplified data structure for creating the Sankey diagram with colored emotions
category_ids = emotions_per_category_pd['categoryId'].astype(
    'category').cat.codes
emotion_ids = emotions_per_category_pd['emotion'].astype(
    'category').cat.codes + category_ids.max() + 1  # Offset by max category id
counts = emotions_per_category_pd['count']

# Unique categories and emotions for labeling
unique_categories = emotions_per_category_pd['categoryId'].astype(
    'category').cat.categories

unique_emotions = emotions_per_category_pd['emotion'].astype(
    'category').cat.categories

node_colors = [emotions_palette[emotion]
               if emotion in emotions_palette else 'grey' for emotion in unique_emotions]

# Creating the Sankey diagram with colors
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=list(unique_categories) + list(unique_emotions),
        # Default grey for categories, colored for emotions
        color=['grey']*len(unique_categories) + node_colors
    ),
    link=dict(
        source=category_ids,
        target=emotion_ids,
        value=counts
    ))])

fig.update_layout(
    title_text="YouTube Sentiment Analysis by Category and Emotion with Color", font_size=12)

# save as html, use cdn
fig.write_html("charts/sankey.html", include_plotlyjs='cdn')

## proportions of emotions within each YouTube category

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming emotions_per_category_pd is your DataFrame

# Pivot the DataFrame to get a matrix where categories are rows, emotions are columns, and values are counts
pivot_df = emotions_per_category_pd.pivot(
    index='categoryId', columns='emotion', values='count').fillna(0)

# Normalize the counts by row to get proportions
proportions = pivot_df.div(pivot_df.sum(axis=1), axis=0)

# Plot a 100% stacked bar chart
proportions.plot(kind='bar', stacked=True, figsize=(10, 7))

# Configure the plot with titles, labels, etc.
plt.title('Proportion of Emotions per YouTube Category')
plt.xlabel('Category')
plt.ylabel('Proportion of Emotions')
plt.legend(title='Emotion')
plt.tight_layout()

# change color of legend
plt.legend(title='Emotion', title_fontsize='12',
           fontsize='12', facecolor='black', edgecolor='black', loc='center left', bbox_to_anchor=(1, 0.5))
# Show the plot
plt.show()

In [None]:
# Pivot the DataFrame to get a matrix where categories are rows, emotions are columns, and values are counts
# pivot_df = emotions_per_category_pd.pivot(index='categoryId', columns='emotion', values='count').fillna(0)

# Normalize the counts by row to get proportions
proportions = pivot_df.div(pivot_df.sum(axis=1), axis=0)

# Create a list of traces for the stacked bar chart, one for each emotion
traces = []

# Pivot the DataFrame to have categories as rows and emotions as columns, filling NaNs with 0
category_emotion_counts = emotions_per_category_pd.pivot_table(
    index='categoryId',
    columns='emotion',
    values='count',
    aggfunc='sum',
    fill_value=0
)

# Calculate the proportions
category_emotion_proportions = category_emotion_counts.div(
    category_emotion_counts.sum(axis=1), axis=0)

# Create the plotly figure with one trace for each emotion column
fig = go.Figure()

for emotion in category_emotion_proportions.columns:
    fig.add_trace(go.Bar(
        name=emotion,
        x=category_emotion_proportions.index,
        y=category_emotion_proportions[emotion],
        hoverinfo='none'  # Disable default hoverinfo
    ))

# Update the layout of the figure to stack the bars and adjust the y-axis to show percentage
fig.update_layout(
    barmode='stack',
    title='Proportion of Emotions per YouTube Category',
    xaxis_title='Category',
    yaxis=dict(
        title='Proportion',
        tickformat=',.0%'
    ),
    hoverlabel=dict(  # Customize hover label
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    ),
    legend_title_text='Emotion'
)

# Define the custom hovertemplate with HTML
# {emotion} in {category}: {y:.2%} <extra></extra>
hovertemplate = "<b>%{y:.2%}</b> of <b>%{x}</b> emits <b>%{fullData.name}</b><extra></extra>"

# Apply custom hovertemplate to each trace
for trace in fig.data:
    trace.hovertemplate = hovertemplate

# change colors of the bars using the emotions_palette
for i, emotion in enumerate(category_emotion_proportions.columns):
    fig.data[i].marker.color = emotions_palette[emotion]

# Save the figure to an HTML file
fig.write_html("charts/stacked_bar.html", include_plotlyjs='cdn')

## Engagement metrics by emotion 

### Stewise regression

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize a list to keep track of variables added and their R²
selected_variables = []
current_r2 = -float("inf")
improvement = True

# List all potential predictors
potential_predictors = ["comment_rate", "like_rate", "dislike_rate", "dislike_ratio", "controversy_index"]

while improvement:
    improvement = False
    best_r2_for_step = current_r2
    
    for predictor in potential_predictors:
        # Temporary model with the current selected variables plus the new potential predictor
        assembler = VectorAssembler(inputCols=selected_variables + [predictor], outputCol="features")
        
        # Fit and evaluate the model
        lr = LinearRegression(featuresCol="features", labelCol="view_count")
        pipeline = Pipeline(stages=[assembler, lr])
        model = pipeline.fit(train_data)  # Assuming train_data is already defined
        predictions = model.transform(test_data)  # Assuming test_data is already defined
        
        evaluator = RegressionEvaluator(labelCol="view_count", predictionCol="prediction", metricName="r2")
        r2 = evaluator.evaluate(predictions)
        
        # Check if this model is the best so far
        if r2 > best_r2_for_step:
            best_r2_for_step = r2
            best_predictor = predictor
            improvement = True
            
    # If there was an improvement, update the list of selected variables and the current best R²
    if improvement:
        selected_variables.append(best_predictor)
        current_r2 = best_r2_for_step
        potential_predictors.remove(best_predictor)
        print(f"Added {best_predictor}. New R² is {best_r2_for_step}.")


In [None]:
# print the selected variables
print("Selected variables:", selected_variables)

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# Assuming df_subset is your DataFrame that includes 'categoryId', 'emotion', and engagement metrics

# List of engagement metrics to include as features
feature_cols = ["comment_rate", "like_rate", "dislike_rate", "dislike_ratio", "controversy_index"]

# Initialize an evaluator for R2
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="view_count", metricName="r2")

# Retrieve unique categories and emotions
categories = df.select("categoryId").distinct().rdd.flatMap(lambda x: x).collect()
emotions = df.select("emotion").distinct().rdd.flatMap(lambda x: x).collect()

results = []

for category in categories:
    for emotion in emotions:
        # Filter the DataFrame for the current category and emotion
        df_filtered = df.filter((df.categoryId == category) & (df.emotion == emotion))
        
        # Check if the filtered DataFrame is not empty
        if df_filtered.count() > 0:
            assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
            lr = LinearRegression(featuresCol="features", labelCol="view_count")
            
            # Define the pipeline
            pipeline = Pipeline(stages=[assembler, lr])
            
            # Split the data
            train_data, test_data = df_filtered.randomSplit([0.7, 0.3], seed=42)
            
            # Fit the model
            model = pipeline.fit(train_data)
            
            # Make predictions
            predictions = model.transform(test_data)
            
            # Evaluate the model
            r2 = evaluator.evaluate(predictions)
            
            # Store the results
            results.append(((category, emotion), r2))
        else:
            results.append(((category, emotion), None))

# Display the results
for result in results:
    print(f"Category: {result[0][0]}, Emotion: {result[0][1]}, R2: {result[1]}")


## Word cloud

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud


