In [27]:
# Regression analysis: which emotions and engagement metrics matter, and how they correlate with each other

In [28]:
# import libraries
import pandas as pd

from spark_session_manager import SparkSessionManager
from pyspark.sql import functions as F


In [29]:
# Obtain the Spark session through the project's session manager.
session_manager = SparkSessionManager()
spark = session_manager.get_spark_session()

In [30]:
# Read the emotion-classifier results from parquet.
emotion_results_path = "data/results/emotion-english-distilroberta-base.parquet"
df = spark.read.parquet(emotion_results_path)

# Peek at the first five rows without truncating cell values ...
df.show(n=5, truncate=False)

# ... and print the column names and types.
df.printSchema()

+----------+-----------+------------------------------------------------------------+--------------------+------------------------+-------------+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------------+-----------------+------------------+----------------------------------------------+-----------------+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [31]:
# split column emotion into multiple columns
#  |-- emotions: struct (nullable = true)
#  |    |-- anger: double (nullable = true)
#  |    |-- disgust: double (nullable = true)
#  |    |-- fear: double (nullable = true)
#  |    |-- joy: double (nullable = true)
#  |    |-- neutral: double (nullable = true)
#  |    |-- sadness: double (nullable = true)
#  |    |-- surprise: double (nullable = true)
#  |    |-- highest_confidence_emotion: string (nullable = true)


In [32]:
from pyspark.sql.functions import col

# Flatten the `emotions` struct: one top-level column per struct field.
emotion_fields = [
    "anger",
    "disgust",
    "fear",
    "joy",
    "neutral",
    "sadness",
    "surprise",
    "highest_confidence_emotion",
]

if "emotions" in df.columns:
    # First run: pull every field out of the struct in a single projection
    # (one select instead of eight chained withColumn calls).
    emotion_columns = [col(f"emotions.{field}").alias(field) for field in emotion_fields]
else:
    # Re-run: this cell reassigns `df` without the struct, so select the
    # already-flattened columns instead of failing on `emotions.*`.
    emotion_columns = [col(field) for field in emotion_fields]

# Keep only 'video_id' and the emotion columns
df = df.select("video_id", *emotion_columns)

# Show the flattened DataFrame to verify the result
df.show(truncate=False)


+-----------+---------------------+---------------------+---------------------+---------------------+--------------------+---------------------+--------------------+--------------------------+
|video_id   |anger                |disgust              |fear                 |joy                  |neutral             |sadness              |surprise            |highest_confidence_emotion|
+-----------+---------------------+---------------------+---------------------+---------------------+--------------------+---------------------+--------------------+--------------------------+
|3C66w5Z0ixs|0.502659022808075    |0.08973177522420883  |0.01276390627026558  |0.007058965973556042 |0.16834966838359833 |0.18087247014045715  |0.03856417536735535 |anger                     |
|M9Pmf9AB4Mo|0.027806859463453293 |0.517167329788208    |0.002150328131392598 |0.07300199568271637  |0.362636536359787   |0.008639412932097912 |0.008597424253821373|disgust                   |
|J78aPJ3VyNs|0.019757458940148354 |

In [36]:
# Persist `df` as parquet, replacing any previous output at this path.
split_output_path = "data/results/emotion-english-distilroberta-base-split_min.parquet"
df.write.parquet(split_output_path, mode="overwrite")

                                                                                

## Preparing the Data

To predict view counts based on the levels of detected emotions in video content, you can build a regression model using the PySpark MLlib library. This involves a few key steps:

* Preparing the Data: Format your data into a features vector and a label (target variable).
* Splitting the Data: Divide your data into training and test sets.
* Building the Model: Choose a regression model and train it on your data.
* Evaluating the Model: Assess the model's performance.

In [None]:
# Read the engagement-metrics table from parquet.
engagement_path = "data/results/engage_metrics_data.parquet"
df_engage = spark.read.parquet(engagement_path)


In [None]:
# Preparing the Data
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Emotion scores used as the regression features.
emotion_features = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]

# Define the assembler
assembler = VectorAssembler(inputCols=emotion_features, outputCol="features")

# `df` was cut down to 'video_id' + emotion columns earlier in the notebook, so the
# regression target is not on it. withColumnRenamed is a silent no-op for a missing
# column, which would leave the model without a 'label'. Bring the engagement
# columns in first; the guard keeps the cell safe to re-run.
if "view_count" not in df.columns:
    # NOTE(review): assumes df_engage carries 'video_id' and 'view_count' with one
    # row per video; duplicate video_ids would multiply rows in the join -- confirm.
    missing = [name for name in ("video_id", "view_count") if name not in df_engage.columns]
    if missing:
        raise ValueError(f"df_engage is missing required column(s): {missing}")
    new_columns = [name for name in df_engage.columns if name not in df.columns]
    df = df.join(df_engage.select("video_id", *new_columns), on="video_id", how="inner")

# Define the label column on a separate frame so `df` keeps 'view_count',
# which later cells in this notebook still reference.
labelled = df.withColumnRenamed("view_count", "label")

# Assemble the features
data = assembler.transform(labelled)


## Splitting the Data

In [None]:
# 80/20 train/test split. The seed makes the split repeatable across runs and
# matches the seed used by the other randomSplit calls in this notebook.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

## Building the Model

In [None]:
# Linear regression reading the 'features' vector and the 'label' column
lr = LinearRegression(labelCol="label", featuresCol="features")

# Fit on the training split
lr_model = lr.fit(train_data)

## Evaluating the Model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out split with the fitted model
predictions = lr_model.transform(test_data)

# Evaluator that compares 'prediction' against 'label' using RMSE
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="label",
)

# Compute and report the RMSE
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

In [33]:
# build base model

In [34]:
## evaluate model

In [35]:
# build improved model (maybe with interactions?)

### Correlation of eng. metrics

#### basic model

In [None]:
# Prepare data: drop every row that has a null in one of the raw engagement counts.
# The intent is to keep nulls away from the VectorAssembler used further down.
# (Alternative to dropping: fill these columns with 0 via df.na.fill.)
# NOTE(review): these four columns must already be present on `df`; the emotion-only
# select earlier in the notebook does not provide them -- confirm which frame is meant.
engagement_count_columns = ["comment_count", "view_count", "likes", "dislikes"]
df = df.na.drop(subset=engagement_count_columns)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Step 3: names of the engagement columns used as model features
feature_columns = [
    "comment_rate",
    "like_rate",
    "dislike_rate",
    "dislike_ratio",
    "controversy_index",
]

In [None]:
# Assemble the engagement features into a single 'features' vector column.
# The inputs come from `feature_columns` (defined in the previous cell) instead of a
# second copy of the list, so the vector order always matches the names that are
# printed next to the coefficients.
# handleInvalid="skip" filters out rows with invalid (null/NaN) feature values
# instead of raising.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
    handleInvalid="skip",
)

# Transform the DataFrame to include a features vector column
df_features = assembler.transform(df)

In [None]:
# Step 4: split, fit, predict

# 70/30 train/test split with a fixed seed
train_data, test_data = df_features.randomSplit([0.7, 0.3], seed=42)

# Linear regression of 'view_count' on the assembled 'features' vector
lr = LinearRegression(labelCol="view_count", featuresCol="features")
lr_model = lr.fit(train_data)

# Predictions for the test split
predictions = lr_model.transform(test_data)

Evaluate Model Effectiveness

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Both evaluators compare 'prediction' with 'view_count'; only the metric differs.
evaluator_columns = {"predictionCol": "prediction", "labelCol": "view_count"}
evaluator_rmse = RegressionEvaluator(metricName="rmse", **evaluator_columns)
evaluator_r2 = RegressionEvaluator(metricName="r2", **evaluator_columns)

# Root mean squared error
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data =", rmse)

# Coefficient of determination
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R2) on test data =", r2)


Coefficients

In [None]:
# Checks: print each fitted coefficient next to the feature it belongs to.
print("## Coefficients")
for feature, coefficient in zip(feature_columns, lr_model.coefficients.toArray()):
    print(feature, ":", coefficient)

#### Model with interaction terms

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Columns referenced by the formula below (label first).
formula_columns = [
    "view_count",
    "categoryId",
    "emotion",
    "comment_rate",
    "dislike_rate",
    "dislike_ratio",
    "controversy_index",
]

# The formula refers to 'emotion', but the flattening cell earlier in the notebook
# names the predicted emotion 'highest_confidence_emotion'. Expose it under the name
# the formula expects; the original column is kept and the guard makes this re-runnable.
if "emotion" not in df.columns and "highest_confidence_emotion" in df.columns:
    df = df.withColumn("emotion", col("highest_confidence_emotion"))

# Drop rows with nulls only in the columns the model actually uses. A bare
# na.drop() would also discard rows for nulls in unrelated columns.
df = df.na.drop(subset=formula_columns)

# RFormula builds the 'features' vector, including the categoryId:emotion interaction.
# NOTE(review): if categoryId is stored as a number, RFormula treats it as numeric
# rather than categorical -- confirm this is intended (cast to string otherwise).
r_formula = RFormula(formula="view_count ~ categoryId:emotion + comment_rate + dislike_rate + dislike_ratio + controversy_index + categoryId + emotion")

# Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="view_count")

# Define Pipeline: feature construction followed by the regression
pipeline = Pipeline(stages=[r_formula, lr])

# Split the data into training and test sets (fixed seed for repeatability)
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

# Fit the pipeline to the training data
model = pipeline.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

model evaluation

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE of 'prediction' against 'view_count'
evaluator_rmse = RegressionEvaluator(
    labelCol="view_count",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator_rmse.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data =", rmse)

# R-squared of 'prediction' against 'view_count'
evaluator_r2 = RegressionEvaluator(
    labelCol="view_count",
    predictionCol="prediction",
    metricName="r2",
)
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R2) on test data =", r2)


coefficients

In [None]:
# 'model' is the fitted pipeline; its last stage is the fitted LinearRegression model.
lr_model = model.stages[-1]

# Extracting coefficients and intercept
coefficients = lr_model.coefficients
intercept = lr_model.intercept

# The pipeline's feature vector is built by RFormula (encoded categories and
# interaction terms), so it does not line up with `feature_columns` from the basic
# model. Read the real per-slot names from the ML attribute metadata that is attached
# to the 'features' column of the pipeline output.
ml_attrs = predictions.schema["features"].metadata.get("ml_attr", {}).get("attrs", {})
feature_names = {
    attr["idx"]: attr.get("name", f"feature_{attr['idx']}")
    for attr_group in ml_attrs.values()
    for attr in attr_group
}

print("Intercept: ", intercept)
print("Coefficients: ")

# One line per coefficient; fall back to a positional name if metadata is missing.
for i in range(len(coefficients)):
    print(feature_names.get(i, f"feature_{i}"), ":", coefficients[i])