In [None]:
import pandas as pd
from spark_session_manager import SparkSessionManager

In [None]:
# Load the emotion results parquet file into a pandas DataFrame.
# NOTE(review): the cells below operate on a PySpark DataFrame named `df`, which is not
# created anywhere in the visible notebook — confirm where `df` is built (presumably
# from this file through SparkSessionManager; verify).
df_with_emotions = pd.read_parquet("data/results/emotion-english-distilroberta-base.parquet")

## Emotions

In [None]:
from pyspark.sql import functions as F

# Mean view count per emotion, largest mean first.
# `df` must already be a PySpark DataFrame exposing 'emotion' and 'view_count'.
emotion_views = (
    df.groupBy('emotion')
    .agg(F.mean('view_count').alias('mean_view_count'))
    .orderBy('mean_view_count', ascending=False)
)

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px

# Collect the per-emotion aggregate as a pandas DataFrame.
emotion_views_pd = emotion_views.toPandas()

# Min-max scale the mean view counts so they can be fed to a colormap.
mean_views = emotion_views_pd['mean_view_count']
min_views, max_views = mean_views.min(), mean_views.max()
emotion_views_pd['normalized_views'] = mean_views.sub(min_views).div(max_views - min_views)

# One RGBA tuple per emotion, sampled from matplotlib's viridis colormap.
cmap = plt.get_cmap('viridis')
colors = list(map(cmap, emotion_views_pd['normalized_views']))

#### convert to pandas

In [None]:
# Preview the first five rows of the pandas subset.
# NOTE(review): `df_subset_pd` is not defined in any cell visible above — confirm where it is created.
df_subset_pd.head(n=5)

## Descriptive Stats for Emotions

In [None]:
import matplotlib.colors as mcolors
import plotly.graph_objects as go

# Plot the aggregated frame (one row per emotion) so the bars really are the
# average views promised by the title, and so there is exactly one color per bar.
# Plotly does not interpret matplotlib's RGBA float tuples as colors, so each
# entry of `colors` is converted to a hex string first.
bar_colors = [mcolors.to_hex(rgba) for rgba in colors]

fig = go.Figure(data=go.Bar(
    x=emotion_views_pd['emotion'],
    y=emotion_views_pd['mean_view_count'],
    marker_color=bar_colors,  # one viridis color per emotion, scaled by mean views
))

fig.update_layout(
    title='Average Views by Emotion',
    xaxis_title='Emotion',
    yaxis_title='Average Views',
    template='plotly_dark'
)

fig.show()

# Export an interactive HTML copy of the chart.
fig.write_html(
    "charts/emotions/mean_views_by_emotion.html")

# Export a static PNG copy of the chart.
fig.write_image(
    "charts/emotions/mean_views_by_emotion.png")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Average view count for every emotion, sorted from most to least viewed.
# `df` is expected to be a PySpark DataFrame at this point.
emotion_views = (
    df.groupBy("emotion")
    .agg(F.mean("view_count").alias("mean_view_count"))
    .orderBy("mean_view_count", ascending=False)
)

# Materialise the small aggregate in pandas so it can be plotted.
emotion_views_pd = emotion_views.toPandas()

In [None]:
import plotly.express as px

# Min-max scale 'mean_view_count' and store it next to the raw means.
mean_view_counts = emotion_views_pd['mean_view_count']
min_view_count, max_view_count = mean_view_counts.min(), mean_view_counts.max()
emotion_views_pd['normalized_view_count'] = (
    mean_view_counts.sub(min_view_count).div(max_view_count - min_view_count)
)

# Plotly's sequential Viridis palette (a list of color strings).
colors = px.colors.sequential.Viridis

In [None]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import plotly.io as pio

# Every Plotly figure created from here on defaults to the dark template.
pio.templates.default = "plotly_dark"

# Translate each normalized view count into a hex color from matplotlib's viridis map.
cmap = plt.get_cmap('viridis')
emotion_views_pd['color'] = [
    mcolors.to_hex(cmap(level))
    for level in emotion_views_pd['normalized_view_count']
]

# One single-bar trace per emotion so that every bar carries its own color.
fig = go.Figure()
for emotion_label, bar_height, bar_color in zip(
        emotion_views_pd['emotion'],
        emotion_views_pd['mean_view_count'],
        emotion_views_pd['color']):
    fig.add_trace(
        go.Bar(x=[emotion_label], y=[bar_height], marker_color=bar_color)
    )

fig.update_layout(
    title='Average Views by Emotion',
    xaxis_title='Emotion',
    yaxis_title='Average Views',
    template='plotly_dark',
    showlegend=False,
)

# HTML export is currently disabled:
# fig.write_html("charts/emotions/mean_views_by_emotion.html")

# Export the chart as an SVG.
fig.write_image(
    "charts/emotions/mean_views_by_emotion.svg", width=1200, height=800)

### Emotion Distribution Across Categories

In [None]:
import plotly.express as px
from pyspark.sql import functions as F

# Number of videos for every (category, emotion) pair, collected into pandas.
emotion_category_distribution = (
    df.groupBy("categoryId", "emotion")
    .agg(F.count("*").alias("counts"))
    .toPandas()
)

# Stacked bars: one bar per category, one colored segment per emotion.
fig = px.bar(
    emotion_category_distribution,
    x='categoryId',
    y='counts',
    color='emotion',
    title="Emotion Distribution Across Categories",
    labels={'counts': 'Number of Videos', 'categoryId': 'Category ID'},
)
fig.update_layout(template='plotly_dark')

# Export the chart as an SVG.
fig.write_image(
    "charts/emotions/emotion_distribution_across_categories.svg", width=1200, height=800)

### Engagement Metrics by Emotion


In [None]:
# Average controversy index per emotion, drawn as one Viridis-colored bar per emotion.
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql import functions as F

average_controversy_by_emotion = (
    df.groupBy("emotion")
    .agg(F.avg("controversy_index").alias("average_controversy"))
    .toPandas()
)

# Min-max scale 'average_controversy'. When every emotion has the same average
# (or there is nothing to scale) the range is zero/NaN; dividing by it would
# produce NaN and break the color lookup below, so fall back to 0 instead.
min_controversy = average_controversy_by_emotion['average_controversy'].min()
max_controversy = average_controversy_by_emotion['average_controversy'].max()
controversy_range = max_controversy - min_controversy
if controversy_range > 0:
    average_controversy_by_emotion['normalized_controversy'] = (
        average_controversy_by_emotion['average_controversy'] - min_controversy) / controversy_range
else:
    average_controversy_by_emotion['normalized_controversy'] = (
        average_controversy_by_emotion['average_controversy'] * 0.0)

# Plotly's sequential Viridis palette (a list of color strings).
viridis = px.colors.sequential.Viridis


def _viridis_color(level):
    """Return the Viridis swatch for a 0-1 `level`; 'grey' when the level is NaN."""
    if np.isnan(level):
        return 'grey'
    return viridis[int(level * (len(viridis) - 1))]


average_controversy_by_emotion['color'] = (
    average_controversy_by_emotion['normalized_controversy'].apply(_viridis_color))

# One single-bar trace per emotion so that every bar carries its own color.
fig = go.Figure()

for index, row in average_controversy_by_emotion.iterrows():
    fig.add_trace(go.Bar(
        x=[row['emotion']],
        y=[row['average_controversy']],
        marker_color=row['color'],
        name=row['emotion']
    ))

fig.update_layout(
    title='Average Controversy Index by Emotion with Viridis Color Scale',
    xaxis_title='Emotion',
    yaxis_title='Average Controversy Index',
    template='plotly_dark',
    showlegend=False  # the x axis already names every bar
)

# Export the chart as an SVG.
fig.write_image(
    "charts/emotions/average_controversy_index_by_emotion.svg", width=1200, height=800)

### Average Engagement Metrics by Category

In [None]:
# Average engagement metrics per category, drawn as a single matplotlib radar chart.

import numpy as np
import matplotlib.pyplot as plt
from palettable import colorbrewer
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from pyspark.sql.functions import avg

# Per-category averages computed in Spark (`df` must be a PySpark DataFrame).
metrics = df.groupBy('categoryId') \
    .agg(
        avg('comment_rate').alias('avg_comment_rate'),
        avg('dislike_rate').alias('avg_dislike_rate'),
        avg('dislike_ratio').alias('avg_dislike_ratio'),
        avg('controversy_index').alias('avg_controversy_index')
)

# The aggregate has one row per category, so it is collected into pandas for plotting.
metrics_pd = metrics.toPandas()

# ------------------------- normalize for radar chart ------------------------ #

# Scale every metric column (everything after 'categoryId') to a common 0-1 range.
scaler = MinMaxScaler()

metrics_normalized = pd.DataFrame(scaler.fit_transform(metrics_pd.iloc[:, 1:]),
                                  columns=metrics_pd.columns[1:],
                                  index=metrics_pd['categoryId']).reset_index()

# ColorBrewer sequential palette (currently not applied to the lines below).
sequential_palette = colorbrewer.sequential.YlGnBu_9.hex_colors

# ------------------------------ radar chart --------------------------------- #

categories = list(metrics_normalized.columns[1:])
N = len(categories)

# One spoke per metric; the first angle is repeated to close each polygon.
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

for index, row in metrics_normalized.iterrows():
    data = row[categories].tolist()
    data += data[:1]  # close the polygon
    ax.plot(angles, data, linewidth=2,
            linestyle='solid', label=row['categoryId'])
    ax.fill(angles, data, alpha=0.1)

ax.set_thetagrids(np.degrees(angles[:-1]), categories)
ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# `fig` is a matplotlib Figure, which has no `write_image` method (that is the
# Plotly API); matplotlib figures are written with `savefig`. Saving happens
# before `plt.show()` so the exported file always contains the drawn chart.
fig.savefig("charts/eng_metrics/average_engagement_metrics.svg",
            format="svg", bbox_inches="tight")

plt.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Grid of radar charts: one polar subplot per category, built from `metrics_normalized`
# (one row per category, one normalized metric per column).

# Names of the metric columns. These are taken from the normalized frame itself;
# the variable `metrics` defined earlier is a Spark DataFrame and cannot be used
# to index a pandas row or serve as the angular axis.
metric_names = [column for column in metrics_normalized.columns
                if column != 'categoryId']
# Repeat the first metric so that theta has the same length as the closed `r` list.
closed_theta = metric_names + metric_names[:1]

unique_categories = metrics_normalized['categoryId'].unique()

# Roughly square grid; at least 1x1 so make_subplots stays valid for empty input.
num_rows = max(1, int(len(unique_categories) ** 0.5))
num_cols = max(1, (len(unique_categories) + num_rows - 1) // num_rows)  # ceiling division

fig = make_subplots(
    rows=num_rows, cols=num_cols,
    specs=[[{'type': 'polar'} for _ in range(num_cols)] for _ in range(num_rows)],
    # Titles and trace names are passed as plain strings (category ids may be numeric).
    subplot_titles=[str(category) for category in unique_categories]
)

for i, category in enumerate(unique_categories):
    category_data = metrics_normalized[metrics_normalized['categoryId']
                                       == category].iloc[0]
    radii = category_data[metric_names].tolist()

    fig.add_trace(
        go.Scatterpolar(
            r=radii + radii[:1],  # repeat the first value to close the polygon
            theta=closed_theta,
            fill='toself',
            name=str(category)
        ),
        row=(i // num_cols) + 1,  # row in the subplot grid (1-based)
        col=(i % num_cols) + 1    # column in the subplot grid (1-based)
    )

fig.update_layout(
    title='Average Engagement Metrics by Category',
    template='plotly_dark',
    polar=dict(radialaxis=dict(visible=True)),
    showlegend=False
)

# Apply the radial-axis settings (visible, tilted tick labels) to every polar subplot.
fig.update_polars(radialaxis=dict(visible=True, tickangle=45))

# Export the chart as an SVG.
fig.write_image(
    "charts/eng_metrics/radar_chart_average_engagement_metrics.svg", width=1900, height=1200)

# Export an HTML copy and open it in the browser.
fig.write_html(
    "charts/eng_metrics/radar_chart_average_engagement_metrics.html", auto_open=True)

In [None]:
# comments rate = comments / views for category

### Correlation Heatmap

In [None]:
# Correlation heatmap of the engagement metrics.
import plotly.figure_factory as ff

# Pull only the columns of interest out of Spark and into pandas.
engagement_columns = [
    "view_count", "likes", "dislikes", "comment_count",
    "comment_rate", "dislike_rate", "dislike_ratio", "controversy_index",
]
df_pd = df.select(*engagement_columns).toPandas()

# Pairwise correlation between all selected columns.
corr_matrix = df_pd.corr()

# Annotated heatmap: every cell shows its correlation rounded to two decimals.
axis_labels_x = corr_matrix.columns.tolist()
axis_labels_y = corr_matrix.index.tolist()
fig = ff.create_annotated_heatmap(
    z=corr_matrix.to_numpy(),
    x=axis_labels_x,
    y=axis_labels_y,
    annotation_text=corr_matrix.round(2).to_numpy(),
    colorscale='Viridis',
    showscale=True,
)

fig.update_layout(
    title_text='Correlation Heatmap of Engagement Metrics',
    xaxis_title='Metrics',
    yaxis_title='Metrics',
    template='plotly_dark',
)

# Export the chart as an SVG.
fig.write_image(
    "charts/eng_metrics/Correlation_Heatmap_of_Engagement_Metrics.svg", width=1200, height=800)

In [None]:
import plotly.figure_factory as ff
import numpy as np

# Triangular correlation heatmap of the engagement metrics, with metric definitions.
# `df_pd` is the pandas frame holding the engagement-metric columns.
corr_matrix = df_pd.corr()

# Boolean mask that keeps the diagonal and everything above it.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Hidden cells become NaN in `z` (drawn empty) and get an empty annotation;
# converting the masked frame to str directly would print the text "nan" there.
masked_corr = corr_matrix.where(mask)
annotation_text = corr_matrix.round(2).astype(str).where(mask, "")

fig = ff.create_annotated_heatmap(
    z=masked_corr.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=annotation_text.values,
    colorscale='Viridis',
    showscale=True
)

# Definitions of the derived metrics, rendered below the chart.
metric_definitions = """
<b>Definitions</b>:
<br>Comment Rate = comments / views || Dislike Rate = dislikes / view
<br>Dislike Ratio = dislikes / (likes + dislikes) || Controversy Index = comment rate * dislike ratio
"""

fig.update_layout(
    title='Correlation Heatmap of Engagement Metrics<br>',
    xaxis=dict(title='Metrics'),
    yaxis=dict(title='Metrics'),
    template='plotly_dark'
)

# Insert the definitions as an annotation at the bottom of the figure
# (paper coordinates, so the position is independent of the data axes).
fig.add_annotation(
    text=metric_definitions,
    xref="paper", yref="paper",
    x=0.1, y=-0.1, showarrow=False,
    font=dict(size=14, color="white")
)

# Export the chart as an SVG.
fig.write_image(
    "charts/eng_metrics/Correlation_Heatmap_of_Engagement_Metrics2.svg", width=1200, height=800)

In [None]:
# Correlation matrix of the engagement metrics, displayed as a table.
# (Only numeric engagement columns are selected here; 'emotion' is not part of it.)
engagement_metric_columns = [
    "view_count", "likes", "dislikes", "comment_count",
    "comment_rate", "dislike_rate", "dislike_ratio", "controversy_index",
]
corr_matrix = df.select(*engagement_metric_columns).toPandas().corr()

corr_matrix

### Emotions per category

### create df for emotions

In [None]:
# Video count for every (category, emotion) pair, sorted by category then emotion.
emotions_per_category = (
    df.groupBy('categoryId', 'emotion')
    .count()
    .orderBy('categoryId', 'emotion')
)

# Collect the aggregate into pandas for the charts below.
emotions_per_category_pd = emotions_per_category.toPandas()

# Preview the first rows.
emotions_per_category_pd.head()

In [None]:
# show whole dataframe, do not truncate
# import pandas as pd
# pd.set_option('display.max_rows', None)
# emotions_per_category_pd

In [None]:
# Emotions per category as two side-by-side bar charts: raw counts (left) and
# counts normalized within each category (right).
# Note: this cell only builds `fig`; it neither displays nor saves it.

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Emotions per Category", "Emotions per Category (Normalized)"),
)

# Slice the frame once per category, in order of first appearance.
per_category_frames = [
    (category,
     emotions_per_category_pd[emotions_per_category_pd['categoryId'] == category])
    for category in emotions_per_category_pd['categoryId'].unique()
]

# Left panel: absolute counts, one trace per category.
for category, data in per_category_frames:
    fig.add_trace(
        go.Bar(x=data['emotion'], y=data['count'], name=category),
        row=1, col=1,
    )

# Right panel: each category's counts divided by that category's total.
for category, data in per_category_frames:
    category_total = data['count'].sum()
    fig.add_trace(
        go.Bar(x=data['emotion'], y=data['count'].div(category_total), name=category),
        row=1, col=2,
    )

### Sankey chart

In [None]:
import pandas as pd
from pySankey.sankey import sankey
import matplotlib.pyplot as plt

# Static Sankey diagram (emotion -> category) on a black background.
# `df_subset_pd` must provide the 'emotion' and 'categoryId' columns.

# matplotlib reads rcParams['text.color'] when a text object is created, so the
# default has to be switched to white BEFORE the diagram is drawn; setting it
# afterwards leaves the labels in the old color on the black background.
# This changes the global default and therefore also affects later matplotlib plots.
plt.rcParams['text.color'] = 'white'

# Draw the diagram: left side = emotions, right side = categories.
sankey(df_subset_pd['emotion'],
       df_subset_pd['categoryId'], aspect=20, fontsize=10)

# pySankey draws on the current figure; grab it for sizing and saving.
fig = plt.gcf()

# Figure size in inches [width, height]; enlarge if labels collide.
fig.set_size_inches(12, 12)

# Black figure background.
fig.set_facecolor("black")

plt.title("Emotions per Category (Sankey Diagram)", fontsize=14, color="white")

# Save as PNG; dpi=300 for a high-resolution export.
fig.savefig("charts/sankey-emotions-per-category.png", bbox_inches="tight", dpi=300)

In [None]:
# Interactive Sankey diagram (category -> emotion) with colored emotion nodes.
import plotly.graph_objects as go

# Encode both columns as pandas categoricals: the integer codes become node
# indices and the category labels become node labels.
category_column = emotions_per_category_pd['categoryId'].astype('category')
emotion_column = emotions_per_category_pd['emotion'].astype('category')

category_ids = category_column.cat.codes
# Emotion nodes are numbered after the category nodes.
emotion_ids = emotion_column.cat.codes + category_ids.max() + 1
counts = emotions_per_category_pd['count']

unique_categories = category_column.cat.categories
unique_emotions = emotion_column.cat.categories

# Palette color for each emotion node; grey when the palette has no entry.
# NOTE(review): `emotions_palette` is not defined in the visible notebook — confirm where it comes from.
node_colors = [
    emotions_palette[emotion] if emotion in emotions_palette else 'grey'
    for emotion in unique_emotions
]

# Node list = all categories (grey) followed by all emotions (palette colors).
node_spec = dict(
    pad=15,
    thickness=20,
    line=dict(color='black', width=0.5),
    label=list(unique_categories) + list(unique_emotions),
    color=['grey'] * len(unique_categories) + node_colors,
)
# One link per (category, emotion) row, weighted by the video count.
link_spec = dict(
    source=category_ids,
    target=emotion_ids,
    value=counts,
)

fig = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])

fig.update_layout(
    title_text="YouTube Sentiment Analysis by Category and Emotion with Color", font_size=12)

# Export as HTML, loading plotly.js from the CDN.
fig.write_html("charts/sankey.html", include_plotlyjs='cdn')

## proportions of emotions within each YouTube category

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 100% stacked bar chart of the emotion mix inside every category (matplotlib).

# Category x emotion count matrix; missing combinations count as 0.
pivot_df = emotions_per_category_pd.pivot(
    index='categoryId', columns='emotion', values='count').fillna(0)

# Divide each row by its total so every category sums to 1.
category_totals = pivot_df.sum(axis=1)
proportions = pivot_df.div(category_totals, axis=0)

# pandas returns the Axes it drew on; configure the chart through it.
ax = proportions.plot(kind='bar', stacked=True, figsize=(10, 7))
ax.set_title('Proportion of Emotions per YouTube Category')
ax.set_xlabel('Category')
ax.set_ylabel('Proportion of Emotions')
ax.legend(title='Emotion')
ax.figure.tight_layout()

# Final legend: dark box placed to the right of the plotting area.
ax.legend(
    title='Emotion',
    title_fontsize='12',
    fontsize='12',
    facecolor='black',
    edgecolor='black',
    loc='center left',
    bbox_to_anchor=(1, 0.5),
)

plt.show()

In [None]:
# Interactive 100% stacked bar chart of the emotion mix inside every category.
import plotly.graph_objects as go

# Category x emotion count matrix; missing combinations count as 0.
category_emotion_counts = emotions_per_category_pd.pivot_table(
    index='categoryId',
    columns='emotion',
    values='count',
    aggfunc='sum',
    fill_value=0
)

# Divide each row by its total so every category sums to 1.
category_emotion_proportions = category_emotion_counts.div(
    category_emotion_counts.sum(axis=1), axis=0)

# Hover text, e.g. "<share> of <category> emits <emotion>"; <extra></extra>
# suppresses the secondary hover box.
hovertemplate = "<b>%{y:.2%}</b> of <b>%{x}</b> emits <b>%{fullData.name}</b><extra></extra>"

fig = go.Figure()

# One trace per emotion column, colored from `emotions_palette`.
for emotion in category_emotion_proportions.columns:
    # Same fallback as the Sankey chart: grey for emotions missing from the
    # palette, instead of failing on the lookup.
    bar_color = emotions_palette[emotion] if emotion in emotions_palette else 'grey'
    fig.add_trace(go.Bar(
        name=emotion,
        x=category_emotion_proportions.index,
        y=category_emotion_proportions[emotion],
        marker_color=bar_color,
        hoverinfo='none',  # default hover content is replaced by the template
        hovertemplate=hovertemplate
    ))

# Stack the bars and show the y axis as percentages.
fig.update_layout(
    barmode='stack',
    title='Proportion of Emotions per YouTube Category',
    xaxis_title='Category',
    yaxis=dict(
        title='Proportion',
        tickformat=',.0%'
    ),
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    ),
    legend_title_text='Emotion'
)

# Export as HTML, loading plotly.js from the CDN.
fig.write_html("charts/stacked_bar.html", include_plotlyjs='cdn')

## Engagement metrics by emotion 

### Correlation of eng. metrics

#### basic model

In [None]:
# Prepare the data for modelling.

# Discard every row that is missing one of the raw engagement counts, so the
# feature assembly further down is not fed nulls from these columns.
required_columns = ["comment_count", "view_count", "likes", "dislikes"]
df = df.dropna(subset=required_columns)

# Alternative (disabled): keep the rows and replace the nulls with 0.
# df_subset = df_subset.na.fill(value=0, subset=["comment_count", "view_count", "likes", "dislikes"])

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Engagement metrics used as predictors of the view count.
feature_columns = [
    "comment_rate",
    "like_rate",
    "dislike_rate",
    "dislike_ratio",
    "controversy_index",
]

In [None]:
# Pack the predictor columns into a single 'features' vector column.
# handleInvalid="skip" filters out rows holding invalid (null/NaN) feature values.
assembler = VectorAssembler(
    inputCols=list(feature_columns),
    outputCol="features",
    handleInvalid="skip",
)

# `df` plus the assembled 'features' column.
df_features = assembler.transform(df)

In [None]:
# Basic linear regression of view_count on the assembled engagement features.

# Reproducible 70/30 train/test split.
train_data, test_data = df_features.randomSplit(weights=[0.7, 0.3], seed=42)

# Configure the estimator, fit it on the training part only ...
lr = LinearRegression(labelCol="view_count", featuresCol="features")
lr_model = lr.fit(train_data)

# ... and score the held-out part (adds a 'prediction' column).
predictions = lr_model.transform(test_data)

Evaluate Model Effectiveness

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root mean squared error of the predictions against the true view counts.
evaluator_rmse = RegressionEvaluator(
    metricName="rmse", labelCol="view_count", predictionCol="prediction")
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data =", rmse)

# Coefficient of determination on the same predictions.
evaluator_r2 = RegressionEvaluator(
    metricName="r2", labelCol="view_count", predictionCol="prediction")
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R2) on test data =", r2)


Coefficients

In [None]:
# Sanity check: list every fitted coefficient next to the feature it belongs to.
print("## Coefficients")
for position in range(len(feature_columns)):
    print(feature_columns[position], ":", lr_model.coefficients[position])

#### Model with interaction terms

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Linear model with a categoryId x emotion interaction, expressed as an R formula.

# Remove every row that contains a null in any column.
df = df.dropna()

# Stage 1: RFormula builds the 'features' vector from the formula below
# (interaction term, the four engagement metrics, and both main effects).
model_formula = "view_count ~ categoryId:emotion + comment_rate + dislike_rate + dislike_ratio + controversy_index + categoryId + emotion"
r_formula = RFormula(formula=model_formula)

# Stage 2: linear regression on those features, predicting view_count.
lr = LinearRegression(labelCol="view_count", featuresCol="features")

pipeline = Pipeline(stages=[r_formula, lr])

# Reproducible 70/30 train/test split.
train_data, test_data = df.randomSplit(weights=[0.7, 0.3], seed=42)

# Fit both stages on the training part, then score the held-out part.
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

model evaluation

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score `predictions` with two regression metrics, printing each as it is computed.

# Root mean squared error.
evaluator_rmse = RegressionEvaluator(
    predictionCol="prediction", labelCol="view_count").setMetricName("rmse")
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data =", rmse)

# Coefficient of determination.
evaluator_r2 = RegressionEvaluator(
    predictionCol="prediction", labelCol="view_count").setMetricName("r2")
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R2) on test data =", r2)


coefficients

In [None]:
# Print the intercept and the named coefficients of the fitted pipeline `model`.

# The last pipeline stage is the fitted LinearRegression model.
lr_model = model.stages[-1]

coefficients = lr_model.coefficients
intercept = lr_model.intercept

# The feature vector of this model is built by RFormula (dummy-coded categories,
# interaction terms, ...), so its entries do not line up with `feature_columns`.
# The real names are read from the ML attribute metadata that the pipeline
# attaches to the 'features' column; only the schema is inspected here, no job runs.
features_metadata = model.transform(test_data).schema["features"].metadata
attribute_groups = features_metadata.get("ml_attr", {}).get("attrs", {})
feature_names = {
    attribute["idx"]: attribute.get("name", f"feature_{attribute['idx']}")
    for attributes in attribute_groups.values()
    for attribute in attributes
}

print("Intercept: ", intercept)
print("Coefficients: ")

for i, coefficient in enumerate(coefficients.toArray()):
    # Fall back to a positional label if the metadata has no name for this slot.
    print(feature_names.get(i, f"feature_{i}"), ":", coefficient)

### Stepwise regression

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# Forward stepwise selection: in every round, try each remaining predictor on
# top of the ones already chosen and keep the one that raises R² the most.
# R² is measured on `test_data`; models are fitted on `train_data`
# (both must already exist, as must `Pipeline`).

selected_variables = []        # predictors chosen so far, in order of selection
current_r2 = -float("inf")     # best R² reached with `selected_variables`

# Candidate predictors that have not been selected yet.
potential_predictors = ["comment_rate", "like_rate", "dislike_rate", "dislike_ratio", "controversy_index"]

# The same R² evaluator is used for every candidate model.
evaluator = RegressionEvaluator(labelCol="view_count", predictionCol="prediction", metricName="r2")

improvement = True
while improvement:
    improvement = False
    best_r2_for_step = current_r2

    for predictor in potential_predictors:
        # Candidate model = already selected variables + this predictor.
        candidate_columns = selected_variables + [predictor]
        assembler = VectorAssembler(inputCols=candidate_columns, outputCol="features")
        lr = LinearRegression(featuresCol="features", labelCol="view_count")
        pipeline = Pipeline(stages=[assembler, lr])

        model = pipeline.fit(train_data)
        predictions = model.transform(test_data)
        r2 = evaluator.evaluate(predictions)

        # Only a strictly better R² replaces the best candidate of this round.
        if not r2 > best_r2_for_step:
            continue
        best_r2_for_step = r2
        best_predictor = predictor
        improvement = True

    # No candidate helped: the loop condition ends the search.
    if not improvement:
        continue

    # Commit the winner of this round and remove it from the candidates.
    selected_variables.append(best_predictor)
    current_r2 = best_r2_for_step
    potential_predictors.remove(best_predictor)
    print(f"Added {best_predictor}. New R² is {best_r2_for_step}.")


In [None]:
# Report the predictors chosen by the stepwise search, in selection order.
print(f"Selected variables: {selected_variables}")

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# Fit one linear regression per (category, emotion) segment of `df` and record
# its test-set R². Segments that cannot be fitted or scored are recorded as None.

# Engagement metrics used as predictors.
feature_cols = ["comment_rate", "like_rate", "dislike_rate", "dislike_ratio", "controversy_index"]

# R² evaluator shared by all segments.
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="view_count", metricName="r2")

# Distinct category and emotion values present in the data.
categories = df.select("categoryId").distinct().rdd.flatMap(lambda x: x).collect()
emotions = df.select("emotion").distinct().rdd.flatMap(lambda x: x).collect()

# The (unfitted) pipeline is identical for every segment, so it is built once.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
lr = LinearRegression(featuresCol="features", labelCol="view_count")
pipeline = Pipeline(stages=[assembler, lr])

results = []  # list of ((category, emotion), r2 or None)

for category in categories:
    for emotion in emotions:
        # Rows belonging to the current segment.
        df_filtered = df.filter((df.categoryId == category) & (df.emotion == emotion))

        # Segment-local split; separate names keep the global train/test split
        # used by the earlier cells intact.
        segment_train, segment_test = df_filtered.randomSplit([0.7, 0.3], seed=42)

        # A non-empty segment can still end up with an empty training or test
        # part after the random split, which would break fitting or scoring,
        # so both parts are checked (this also covers an empty segment).
        if not segment_train.take(1) or not segment_test.take(1):
            results.append(((category, emotion), None))
            continue

        model = pipeline.fit(segment_train)
        predictions = model.transform(segment_test)
        r2 = evaluator.evaluate(predictions)

        results.append(((category, emotion), r2))

# Display the results
for result in results:
    print(f"Category: {result[0][0]}, Emotion: {result[0][1]}, R2: {result[1]}")
