## Goal
Detect the emotions (and, in the second half, the sentiment) expressed in the titles of trending YouTube videos.

In [1]:
# create the Spark session; the data is read in the next cell from data/processed/tranformed_US_youtube_trending_data_min.parquet

from spark_session_manager import SparkSessionManager

spark = SparkSessionManager.get_spark_session()


24/04/01 03:04:44 WARN Utils: Your hostname, gr00stl-Legion-Y540-15IRH-PG0 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
24/04/01 03:04:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/01 03:04:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:

# read data
df = spark.read.parquet("data/processed/tranformed_US_youtube_trending_data_min.parquet")

# show schema
df.printSchema()

root
 |-- categoryId: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: double (nullable = true)
 |-- likes: double (nullable = true)
 |-- dislikes: double (nullable = true)
 |-- comment_count: double (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryTitle: string (nullable = true)



In [3]:
df.head(2)

[Row(categoryId='20', video_id='_I9T4hxmZXU', title='Lunar New Year Magic Show | Clash of Clans Official', publishedAt='2023-01-18T07:59:02Z', channelId='UCD1Em4q90ZUK2R5HKesszJg', channelTitle='Clash of Clans', trending_date='2023-01-22T00:00:00Z', tags='clash of clans|COC|Clash of Clans Gameplay|Clash of Clans Strategy|Clash of Clans Animation|Clash of Clans Commercial|Clash of Clans Attacks|Clash of Clans Town Hall|Hog Rider|PEKKA|Clan Wars|season challenges|clan war leagues|clash on|clan game|clan games|clash-a-rama|clasharama|lunar new year|year of the rabbit|rabbit archer queen|Magic Theatre Scenery|magic theatre|Magic Show Statue|magic show', view_count=14.698371488939815, likes=10.99964683504317, dislikes=0.0, comment_count=8.09376775793108, thumbnail_link='https://i.ytimg.com/vi/_I9T4hxmZXU/default.jpg', comments_disabled='False', ratings_disabled='False', description='Celebrate the Year of the Rabbit with the brand new Lunar Queen skin, the Magic Show statue, as well as the M

In [4]:
# trial - sample 100 random rows
from pyspark.sql.functions import rand

# Sample 100 random rows from the DataFrame
# sample_df = df.orderBy(rand()).limit(100)

Define a Pandas UDF for Emotion Classification


In [5]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
import pandas as pd
from transformers import pipeline

# Instantiate the emotion classifier; analyze_emotions calls this module-level object by name.
# return_all_scores=True asks the pipeline for a score per label rather than only the top label,
# which is why the UDF iterates over predictions[0].
# NOTE(review): recent transformers releases deprecate return_all_scores in favour of top_k=None;
# confirm the return nesting still matches the UDF before switching.
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

  _torch_pytree._register_pytree_node(
2024-04-01 03:04:53.827046: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 03:04:54.122561: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [6]:

# Output schema of the emotion UDF: one double column per emotion label,
# followed by the label that received the highest score.
schema = StructType(
    [
        StructField(label, DoubleType())
        for label in ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
    ]
    + [StructField("highest_confidence_emotion", StringType())]
)

@pandas_udf(schema, PandasUDFType.SCALAR)
def analyze_emotions(titles: pd.Series) -> pd.DataFrame:
    """Score every title of a batch with the module-level emotion classifier.

    Args:
        titles: Batch of video titles. Entries may be null, because the
            ``title`` column is nullable in the source data.

    Returns:
        A DataFrame with one row per input title: one score column per
        emotion declared in ``schema`` plus ``highest_confidence_emotion``.
        A null title yields a row of nulls instead of failing the whole batch.
    """
    # Every schema field except the last one is an emotion score column.
    emotion_names = schema.names[:-1]
    results_data = {name: [] for name in schema.names}

    for title in titles:
        # The classifier cannot handle a missing title; emit a null row instead.
        if pd.isna(title):
            for name in schema.names:
                results_data[name].append(None)
            continue

        # The pipeline wraps the per-label predictions of a single input in a list.
        predictions = classifier(title)[0]

        # Emotions the classifier does not report keep a score of 0.0.
        scores = {emotion: 0.0 for emotion in emotion_names}
        for pred in predictions:
            scores[pred['label']] = pred['score']
        for emotion, score in scores.items():
            results_data[emotion].append(score)

        # The label with the largest score is the highest-confidence emotion.
        results_data['highest_confidence_emotion'].append(
            max(predictions, key=lambda pred: pred['score'])['label']
        )

    # Fix the column order to the schema's field order.
    return pd.DataFrame(results_data, columns=schema.names)






Apply the UDF to Your DataFrame

In [7]:
from pyspark.sql.functions import col

In [8]:
# create dataframe that contains only id, title
# df = sample_df.select("video_id", "title")

# Apply the UDF to the DataFrame
df_with_emotions = df.withColumn("emotions", analyze_emotions(df["title"]))

In [9]:
# Show the results
df_with_emotions.show(5)

  _torch_pytree._register_pytree_node(
2024-04-01 03:05:05.267083: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 03:05:05.302897: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-04-01 03:05:12.825648: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 03:05:12.866245: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operati

+----------+-----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------------+------------------+--------+------------------+--------------------+-----------------+----------------+--------------------+--------------+--------------------+
|categoryId|   video_id|               title|         publishedAt|           channelId|  channelTitle|       trending_date|                tags|        view_count|             likes|dislikes|     comment_count|      thumbnail_link|comments_disabled|ratings_disabled|         description| categoryTitle|            emotions|
+----------+-----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------------+------------------+--------+------------------+--------------------+-----------------+----------------+--------------------+--------------+--------------------+
|        20|_I9T4hxmZXU|Luna

TODO: reduce the DataFrame to `video_id`, `title`, and the results, so that the whole dataset is not shuffled.

In [10]:
# Print the schema to verify the new column and its struct fields
df_with_emotions.printSchema()


root
 |-- categoryId: string (nullable = true)
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: double (nullable = true)
 |-- likes: double (nullable = true)
 |-- dislikes: double (nullable = true)
 |-- comment_count: double (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryTitle: string (nullable = true)
 |-- emotions: struct (nullable = true)
 |    |-- anger: double (nullable = true)
 |    |-- disgust: double (nullable = true)
 |    |-- fear: double (nullable = true)
 |    |-- joy: double (nullable = true)
 |    |-- neutral: double (nullable = true)
 |    |-- sadness: do

In [11]:
# show
df_with_emotions.show(5)

  _torch_pytree._register_pytree_node(
2024-04-01 03:05:22.265643: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 03:05:22.301869: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


+----------+-----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------------+------------------+--------+------------------+--------------------+-----------------+----------------+--------------------+--------------+--------------------+
|categoryId|   video_id|               title|         publishedAt|           channelId|  channelTitle|       trending_date|                tags|        view_count|             likes|dislikes|     comment_count|      thumbnail_link|comments_disabled|ratings_disabled|         description| categoryTitle|            emotions|
+----------+-----------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+------------------+------------------+--------+------------------+--------------------+-----------------+----------------+--------------------+--------------+--------------------+
|        20|_I9T4hxmZXU|Luna

                                                                                

save results

In [None]:
from pyspark.sql.functions import col

# Flatten the `emotions` struct of df_with_emotions: keep `video_id` and promote
# every struct field (anger ... surprise, highest_confidence_emotion) to a
# top-level column, in the struct's field order.
# A single select with star-expansion replaces a chain of withColumn calls,
# each of which would add another projection to the query plan.
# NOTE: this rebinds df_with_emotions to the flattened frame, so the cell can
# only be re-run after the cell that creates the `emotions` column.
df_with_emotions = df_with_emotions.select("video_id", "emotions.*")

In [12]:
# save the results as parquet
df_with_emotions.write.mode("overwrite").parquet("data/results/emotion-english-distilroberta-base.parquet")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-04-01 03:07:52.641931: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 03:07:52.722255: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-01 03:07:53.032158: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find 

# [distilbert finetuned](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english)

Define a pandas UDF for sentiment classification

In [13]:
# re load data
df = spark.read.parquet("data/processed/tranformed_US_youtube_trending_data_min.parquet")

In [14]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
import pandas as pd
from transformers import pipeline

# Sentiment classifier backed by the SST-2 fine-tuned DistilBERT checkpoint.
# NOTE: this rebinds the module-level name `classifier` that the emotion section also uses.
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Output schema of the sentiment UDF: the list of (label, score) predictions
# and the label with the highest score. NOTE: this rebinds `schema` as well.
schema = StructType([
    StructField(
        "sentiment_predictions",
        ArrayType(StructType([
            StructField("label", StringType()),
            StructField("score", DoubleType()),
        ])),
        nullable=True,
    ),
    StructField("highest_confidence_sentiment", StringType(), nullable=True),
])
@pandas_udf(schema, PandasUDFType.SCALAR)
def analyze_sentiments(titles: pd.Series) -> pd.DataFrame:
    """Classify the sentiment of every title of a batch.

    Args:
        titles: Batch of video titles. Entries may be null, because the
            ``title`` column is nullable in the source data.

    Returns:
        A DataFrame with one row per input title and two columns:
        ``sentiment_predictions`` (the classifier's predictions sorted by
        descending score) and ``highest_confidence_sentiment`` (the label of
        the first prediction). A null title yields nulls in both columns
        instead of failing the whole batch.
    """
    sentiments_list = []
    highest_confidence_list = []

    for title in titles:
        # The classifier cannot handle a missing title; emit nulls instead.
        if pd.isna(title):
            sentiments_list.append(None)
            highest_confidence_list.append(None)
            continue

        predictions = classifier(title)
        # NOTE(review): the results displayed in this notebook contain a single
        # prediction per title, so this sort is effectively a no-op unless the
        # pipeline is configured to return all labels — confirm the intent.
        sorted_predictions = sorted(predictions, key=lambda pred: pred['score'], reverse=True)
        sentiments_list.append(sorted_predictions)
        # After sorting, the first entry carries the highest score.
        highest_confidence_list.append(sorted_predictions[0]['label'])

    return pd.DataFrame({
        "sentiment_predictions": sentiments_list,
        "highest_confidence_sentiment": highest_confidence_list
    })


In [15]:

# Apply the UDF to the DataFrame
df = df.withColumn("sentiment_data", analyze_sentiments(df["title"]))

# Display the result
df.select("title", "sentiment_data.sentiment_predictions", "sentiment_data.highest_confidence_sentiment").show(truncate=False)


  _torch_pytree._register_pytree_node(
2024-04-01 04:43:05.920552: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 04:43:05.977487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-04-01 04:43:17.319765: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 04:43:17.378885: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operati

+--------------------------------------------------------------------------------------------------+--------------------------------+----------------------------+
|title                                                                                             |sentiment_predictions           |highest_confidence_sentiment|
+--------------------------------------------------------------------------------------------------+--------------------------------+----------------------------+
|Lunar New Year Magic Show | Clash of Clans Official                                               |[{POSITIVE, 0.9874634742736816}]|POSITIVE                    |
|LAST PERSON TO MISS A PENALTY WINS $10,000!                                                       |[{NEGATIVE, 0.5848234295845032}]|NEGATIVE                    |
|Coi Leray - Players (DJ Smallz 732 - Jersey Club Remix) (Official Music Video)                    |[{POSITIVE, 0.5808926224708557}]|POSITIVE                    |
|I CHEATED As The WITH

                                                                                

In [16]:
df.count()

260787

In [17]:
# Repartition the DataFrame based on the size of your data and the number of available cores
# NOTE(review): in the physical plan printed by df.explain(), the Exchange created by this
# repartition sits above ArrowEvalPython, i.e. rows are redistributed after the UDF has been
# evaluated — confirm whether the repartition was meant to happen before applying the UDF.
df = df.repartition(200)

In [18]:
# Coalesce into a smaller number of partitions before writing
# df = df.coalesce(10)  # The number here depends on the size of your data.

In [19]:
spark.conf.set("spark.sql.adaptive.enabled", "true")

In [20]:
# Optimize the number of shuffle partitions
spark.conf.set("spark.sql.shuffle.partitions", "200")  # Adjust based on your environment


In [21]:
spark.conf.set("spark.sql.files.maxRecordsPerFile", 100000)

In [22]:
df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(200), REPARTITION_BY_NUM, [plan_id=191]
   +- Project [categoryId#222, video_id#223, title#224, publishedAt#225, channelId#226, channelTitle#227, trending_date#228, tags#229, view_count#230, likes#231, dislikes#232, comment_count#233, thumbnail_link#234, comments_disabled#235, ratings_disabled#236, description#237, categoryTitle#238, pythonUDF0#319 AS sentiment_data#257]
      +- ArrowEvalPython [analyze_sentiments(title#224)#256], [pythonUDF0#319], 200
         +- FileScan parquet [categoryId#222,video_id#223,title#224,publishedAt#225,channelId#226,channelTitle#227,trending_date#228,tags#229,view_count#230,likes#231,dislikes#232,comment_count#233,thumbnail_link#234,comments_disabled#235,ratings_disabled#236,description#237,categoryTitle#238] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/gr00stl/Nextcloud/Projects/social-media-sentiment/youtube/d

In [23]:
# save the DataFrame (all input columns plus sentiment_data) as snappy-compressed parquet
# NOTE(review): this path is relative to the working directory, whereas the emotion results
# were written under data/results/ — confirm the intended output location.
df.write.mode("overwrite").option("compression", "snappy").parquet("distilbert-base-uncased-finetuned-sst-2-english.parquet")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-04-01 04:43:31.735560: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 04:43:31.815331: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
2024-04-01 04:43:32.081141: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-01 04:43:32.160526: I tensorflow/core/platform/cpu_featur