#  Analysis on Singapore Data Set

In [0]:
# Load JSON (multiline must be true for nested JSON objects)
df = spark.read.option("multiline", "true").json("dbfs:/FileStore/shared_uploads/krath2928@gmail.com/trending_singapore.json")

# Show schema to understand structure
df.printSchema()

root
 |-- contentDetails: struct (nullable = true)
 |    |-- caption: string (nullable = true)
 |    |-- definition: string (nullable = true)
 |    |-- dimension: string (nullable = true)
 |    |-- duration: string (nullable = true)
 |    |-- licensedContent: boolean (nullable = true)
 |    |-- projection: string (nullable = true)
 |    |-- regionRestriction: struct (nullable = true)
 |    |    |-- blocked: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- etag: string (nullable = true)
 |-- id: string (nullable = true)
 |-- kind: string (nullable = true)
 |-- snippet: struct (nullable = true)
 |    |-- categoryId: string (nullable = true)
 |    |-- channelId: string (nullable = true)
 |    |-- channelTitle: string (nullable = true)
 |    |-- defaultAudioLanguage: string (nullable = true)
 |    |-- defaultLanguage: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- liveBroadcastContent: string (nullable = true)
 |    

In [0]:
from pyspark.sql.functions import col

selected_df = df.select(
    "id",
    "snippet.publishedAt",
    "snippet.channelId",
    "snippet.channelTitle",
    "snippet.title",
    "snippet.description",
    "snippet.categoryId",
    "snippet.tags",
    "snippet.defaultLanguage",
    "snippet.defaultAudioLanguage",
    "statistics.viewCount",
    "statistics.likeCount",
    "statistics.commentCount",
    "contentDetails.duration",
    "contentDetails.caption"
)

selected_df.show(truncate=False)


+-----------+--------------------+------------------------+----------------------------------+----------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import col

casted_df1 = selected_df.withColumn("viewCount", col("viewCount").cast("int")) \
                       .withColumn("likeCount", col("likeCount").cast("int")) \
                       .withColumn("commentCount", col("commentCount").cast("int"))


In [0]:
casted_df1.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
casted_df1.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedAt", "channelTitle",
    "title", "categoryId", "description"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
casted_df1.select("title", "duration").show(10, truncate=False)

+----------------------------------------------------------------------------------------------------+--------+
|title                                                                                               |duration|
+----------------------------------------------------------------------------------------------------+--------+
|BLACKPINK - ‘뛰어(JUMP)’ M/V                                                                        |PT3M14S |
|cute couple 😭🤣 #newmusic #music #funny #love #funnyvideo #prank                                   |PT18S   |
|F4 x MAYDAY五月天 [ 流星雨 Meteor Rain ] 5525版 Official Stage Video                                |PT5M5S  |
|Little doctor playing with dog👩‍⚕️🤭💕✨                                                            |PT26S   |
|Big Brother King For Sister💝☺️                                                                     |PT15S   |
|CZN Burak Pranked Me🇹🇷 #ishowspeed                                                                |PT14S   |
|What's T

# Convert duration (ISO 8601 format) to Total Seconds

In [0]:
from pyspark.sql.functions import col, regexp_extract

# Step 1: Extract raw duration string
df_with_duration = casted_df1.withColumn("duration_raw", col("duration"))

# Step 2: Extract hours, minutes, and seconds using regex
df_parsed = df_with_duration \
    .withColumn("hours", regexp_extract(col("duration_raw"), r'PT(\d+)H', 1).cast("int")) \
    .withColumn("minutes", regexp_extract(col("duration_raw"), r'(?<=PT(\d+H)?)?(\d+)M', 2).cast("int")) \
    .withColumn("seconds", regexp_extract(col("duration_raw"), r'(?<=M)?(\d+)S', 1).cast("int"))

# Step 3: Replace nulls with 0
df_filled = df_parsed.fillna({"hours": 0, "minutes": 0, "seconds": 0})

# Step 4: Calculate total duration in seconds and overwrite `duration` column
final_df = df_filled.withColumn(
    "duration",
    col("hours") * 3600 + col("minutes") * 60 + col("seconds")
)

# Step 5: Drop temporary columns (if you want clean dataframe)
final_df = final_df.drop("duration_raw", "hours", "minutes", "seconds")

#  Now update casted_df to this final version
casted_df1 = final_df


In [0]:
casted_df1.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = false)
 |-- caption: string (nullable = true)



In [0]:
casted_df1.select("title", "duration").show(20, truncate=False)

+----------------------------------------------------------------------------------------------------+--------+
|title                                                                                               |duration|
+----------------------------------------------------------------------------------------------------+--------+
|BLACKPINK - ‘뛰어(JUMP)’ M/V                                                                        |194     |
|cute couple 😭🤣 #newmusic #music #funny #love #funnyvideo #prank                                   |18      |
|F4 x MAYDAY五月天 [ 流星雨 Meteor Rain ] 5525版 Official Stage Video                                |305     |
|Little doctor playing with dog👩‍⚕️🤭💕✨                                                            |26      |
|Big Brother King For Sister💝☺️                                                                     |15      |
|CZN Burak Pranked Me🇹🇷 #ishowspeed                                                                |14      |
|What's T

# in our data distinct categoryid available

In [0]:
df.select("snippet.categoryId").distinct().count()

Out[10]: 11

In [0]:
df.select("snippet.categoryId").distinct().show(truncate=False)

+----------+
|categoryId|
+----------+
|22        |
|28        |
|27        |
|17        |
|23        |
|25        |
|24        |
|1         |
|20        |
|10        |
|2         |
+----------+



# Mapping distinct Categoryid to Category Name

In [0]:
from pyspark.sql.functions import when, col

casted_df1 = casted_df1.withColumn(
    "categoryName",
    when(col("categoryId") == "1", "Film & Animation")
    .when(col("categoryId") == "2", "Autos & Vehicles")
    .when(col("categoryId") == "10", "Music")
    .when(col("categoryId") == "17", "Sports")
    .when(col("categoryId") == "20", "Gaming")
    .when(col("categoryId") == "22", "People & Blogs")
    .when(col("categoryId") == "23", "Comedy")
    .when(col("categoryId") == "24", "Entertainment")
    .when(col("categoryId") == "25", "News & Politics")
    .when(col("categoryId") == "26", "Howto & Style")
    .when(col("categoryId") == "27", "Education")
    .when(col("categoryId") == "28", "Science & Technology")
    .otherwise("Unknown")
)


In [0]:
casted_df1.groupBy("categoryName").count().show(truncate=False)

+--------------------+-----+
|categoryName        |count|
+--------------------+-----+
|Education           |2    |
|Gaming              |2    |
|Entertainment       |17   |
|Science & Technology|2    |
|Sports              |4    |
|Film & Animation    |2    |
|People & Blogs      |8    |
|News & Politics     |2    |
|Autos & Vehicles    |1    |
|Music               |8    |
|Comedy              |2    |
+--------------------+-----+



# convert publishAt col to time stamp

In [0]:
from pyspark.sql.functions import col, to_timestamp, to_date, year, month

# Step 1: Convert to timestamp
casted_df1 = casted_df1.withColumn("publishedAt_ts", to_timestamp(col("publishedAt"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))

# Step 2: Extract only the date part
casted_df1 = casted_df1.withColumn("publishedDate", to_date(col("publishedAt_ts")))

# Step 3: Extract year and month from publishedDate
casted_df1 = casted_df1.withColumn("year", year("publishedDate")) \
                     .withColumn("month", month("publishedDate"))

# Step 4: Drop original publishedAt and timestamp
casted_df1 = casted_df1.drop("publishedAt", "publishedAt_ts")


In [0]:
casted_df1.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedDate", "channelTitle",
    "title", "categoryId", "description", "categoryName"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# in likecount there was a single null valued  so i filled it with '0'

In [0]:
from pyspark.sql.functions import col

casted_df1 = casted_df1.fillna({"likeCount": 0})

In [0]:
from pyspark.sql.functions import col, count, when

# Check null values for all relevant columns
casted_df1.select([
    count(when(col("viewCount").isNull(), 1)).alias("null_viewCount"),
    count(when(col("likeCount").isNull(), 1)).alias("null_likeCount"),
    count(when(col("commentCount").isNull(), 1)).alias("null_commentCount"),
    count(when(col("duration").isNull(), 1)).alias("null_duration"),
    count(when(col("caption").isNull(), 1)).alias("null_caption"),
    count(when(col("categoryName").isNull(), 1)).alias("null_categoryName"),
    count(when(col("publishedDate").isNull(), 1)).alias("null_publishedDate"),
    count(when(col("year").isNull(), 1)).alias("null_year"),
    count(when(col("month").isNull(), 1)).alias("null_month")
]).show()

+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|null_viewCount|null_likeCount|null_commentCount|null_duration|null_caption|null_categoryName|null_publishedDate|null_year|null_month|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|             0|             0|                0|            0|           0|                0|                 0|        0|         0|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+



# Engegment Analysis

# Top Channels by Total Views

In [0]:
top_channels = casted_df1.groupBy("channelTitle") \
    .sum("viewCount") \
    .withColumnRenamed("sum(viewCount)", "totalViews") \
    .orderBy(col("totalViews").desc())

top_channels.show(20, truncate=False)


+-----------------+----------+
|channelTitle     |totalViews|
+-----------------+----------+
|Mini Katana      |94520245  |
|Ben Azelart      |84420253  |
|BLACKPINK        |75930630  |
|IShowSpeed       |61858407  |
|BEN EAGLE        |61073614  |
|Let's Romantic   |53777618  |
|Red Bull         |46427751  |
|Nam Phương       |35077490  |
|MaviGadget       |31441467  |
|Eternal Love     |26856450  |
|Endless Love     |26270841  |
|Dude Perfect     |21957536  |
|JYP Entertainment|21366271  |
|Browney          |20654992  |
|Sun TV           |19711313  |
|CONDA NGUYEN     |19612884  |
|DAZN Football    |18817196  |
|BLACKPINK - Topic|17758514  |
|王家帮           |13249135  |
|Ong Squad        |13040440  |
+-----------------+----------+
only showing top 20 rows



# Top channels by total likes

In [0]:
top_liked_channels = casted_df1.groupBy("channelTitle") \
    .sum("likeCount") \
    .withColumnRenamed("sum(likeCount)", "totalLikes") \
    .orderBy(col("totalLikes").desc())
top_liked_channels.show(10, truncate=False)

+-----------------+----------+
|channelTitle     |totalLikes|
+-----------------+----------+
|BLACKPINK        |4598329   |
|Mini Katana      |2504386   |
|IShowSpeed       |2423145   |
|Ben Azelart      |1795348   |
|Red Bull         |1436906   |
|Browney          |1266350   |
|JYP Entertainment|1259913   |
|Let's Romantic   |1218226   |
|BEN EAGLE        |1198220   |
|Dude Perfect     |929960    |
+-----------------+----------+
only showing top 10 rows



# Most Liked Videos

In [0]:
# Here we can find top 5 most liked videos along with its title and chanelTitle 

In [0]:
most_liked = casted_df1.orderBy(col("likeCount").desc())
most_liked.select("title", "channelTitle", "likeCount").show(5, truncate=False)

+--------------------------------------------+------------+---------+
|title                                       |channelTitle|likeCount|
+--------------------------------------------+------------+---------+
|BLACKPINK - ‘뛰어(JUMP)’ M/V                |BLACKPINK   |4598329  |
|What's The Sharpest homemade Katana?        |Mini Katana |2504386  |
|Which Shoes Will Protect Your Feet?         |Ben Azelart |1795348  |
|Mousetrap Backflip Challenge🐭😱 #ishowspeed|IShowSpeed  |1501394  |
|That Plot Twist 🤯                          |Red Bull    |1436906  |
+--------------------------------------------+------------+---------+
only showing top 5 rows



# Top videos by comment count

In [0]:
most_commented = casted_df1.orderBy(col("commentCount").desc())
most_commented.select("title", "channelTitle", "commentCount").show(5, truncate=False)


+----------------------------------------------------------------------------------------------------+-----------------+------------+
|title                                                                                               |channelTitle     |commentCount|
+----------------------------------------------------------------------------------------------------+-----------------+------------+
|BLACKPINK - ‘뛰어(JUMP)’ M/V                                                                        |BLACKPINK        |378951      |
|TWICE "THIS IS FOR" M/V                                                                             |JYP Entertainment|156785      |
|SUPER JUNIOR 슈퍼주니어 'Express Mode' MV                                                           |SMTOWN           |34147       |
|Mousetrap Backflip Challenge🐭😱 #ishowspeed                                                        |IShowSpeed       |19171       |
|Monica - Lyric Video| COOLIE | Superstar Rajinikanth | Sun Pictures | 

# Like to View ratio (engagement rate)

In [0]:
from pyspark.sql.functions import expr

engagement_df = casted_df1.withColumn("likeViewRatio", expr("likeCount / viewCount"))
engagement_df.orderBy(col("likeViewRatio").desc()).select("title", "channelTitle", "likeViewRatio").show(10, truncate=False)


+--------------------------------------------------------------------------------------+-------------------+--------------------+
|title                                                                                 |channelTitle       |likeViewRatio       |
+--------------------------------------------------------------------------------------+-------------------+--------------------+
|She wears makeup every day to school.#movie #viralvideo #shorts                       |DreamSaga Review   |0.11433539063898798 |
|THIS is the most TERRIFYING sound for parents ​⁠@theweeniefamily                      |Jeenie.Weenie      |0.0890278120899892  |
|Told me dad to record the prettiest thing here 😭😭                                   |Haley Kalil        |0.07728013445717244 |
|How Can He Jump Under Water?                                                          |Browney            |0.07171899013850713 |
|Haley is working her magic #ModernFamily #HaleyDunphy #LukeDunphy #AlexDunphy #Shorts |Peac

# Analysis based on categories

# Top Categories by Likes

In [0]:
from pyspark.sql.functions import sum

casted_df1.groupBy("categoryName","channelTitle") \
    .agg(sum("likeCount").alias("totalLikes")) \
    .orderBy("totalLikes", ascending=False) \
    .show()

+----------------+--------------------+----------+
|    categoryName|        channelTitle|totalLikes|
+----------------+--------------------+----------+
|           Music|           BLACKPINK|   4598329|
|  People & Blogs|         Mini Katana|   2504386|
|          Gaming|          IShowSpeed|   2423145|
|   Entertainment|         Ben Azelart|   1795348|
|          Sports|            Red Bull|   1436906|
|   Entertainment|             Browney|   1266350|
|           Music|   JYP Entertainment|   1259913|
|           Music|     Let's Romantic |   1218226|
|   Entertainment|           BEN EAGLE|   1198220|
|          Sports|        Dude Perfect|    929960|
|           Music|        Endless Love|    841136|
|          Comedy|       Jeenie.Weenie|    784101|
|  People & Blogs|         Haley Kalil|    668005|
|           Music|        Eternal Love|    605711|
|Autos & Vehicles|Engineering Expla...|    599792|
|       Education|          Nam Phương|    582700|
|   Entertainment|   The Land O

# Top Categories by Comments

In [0]:
casted_df1.groupBy("categoryName","channelTitle") \
    .agg(sum("commentCount").alias("totalComments")) \
    .orderBy("totalComments", ascending=False) \
    .show()

+----------------+------------------------+-------------+
|    categoryName|            channelTitle|totalComments|
+----------------+------------------------+-------------+
|           Music|               BLACKPINK|       378951|
|           Music|       JYP Entertainment|       156785|
|           Music|                  SMTOWN|        34147|
|          Gaming|              IShowSpeed|        27223|
|   Entertainment|                  Sun TV|        18019|
|  People & Blogs|             Mini Katana|        17714|
|          Sports|           DAZN Football|        15470|
|           Music|            Eternal Love|         9102|
|           Music|        相信音樂BinMusic|         6947|
|           Music|       BLACKPINK - Topic|         6907|
|Autos & Vehicles|    Engineering Expla...|         5383|
|           Music|            Endless Love|         5246|
|          Sports|            Dude Perfect|         4148|
|   Entertainment|                 Browney|         3893|
|          Sports|

# Average Engagement per Category

In [0]:
# create engagment column on adding likecount and commentcount

In [0]:
casted_df1 = casted_df1.withColumn(
    "engagement",
    col("likeCount") + col("commentCount")
)

In [0]:
from pyspark.sql.functions import avg

casted_df1.groupBy("categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show(truncate=False)

+--------------------+-----------------+
|categoryName        |avgEngagement    |
+--------------------+-----------------+
|Music               |1234307.25       |
|Gaming              |1225184.0        |
|Sports              |799731.25        |
|Autos & Vehicles    |605175.0         |
|People & Blogs      |492900.25        |
|Education           |432505.5         |
|Entertainment       |427637.0588235294|
|Comedy              |393442.0         |
|Film & Animation    |270685.5         |
|Science & Technology|129997.0         |
|News & Politics     |2461.0           |
+--------------------+-----------------+



# Average engagement by video duration

In [0]:
# it says that people are engage or not  in short,long or medium videos

In [0]:
# Step 1: Categorize durations
from pyspark.sql.functions import when

casted_df1 = casted_df1.withColumn(
    "durationCategory",
    when(col("duration") <= 300, "Short")  # ≤ 5 mins
    .when((col("duration") > 300) & (col("duration") <= 1200), "Medium")  # 5–20 mins
    .otherwise("Long")  # > 20 mins
)

# Step 2: Avg engagement per duration category
casted_df1.groupBy("durationCategory") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show()

+----------------+-----------------+
|durationCategory|    avgEngagement|
+----------------+-----------------+
|           Short|626325.3829787234|
|          Medium|         180024.5|
|            Long|           2798.0|
+----------------+-----------------+



# Top performing categories within duration groups

In [0]:
casted_df1.groupBy("durationCategory", "categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("durationCategory", "avgEngagement", ascending=False) \
    .show()

+----------------+--------------------+------------------+
|durationCategory|        categoryName|     avgEngagement|
+----------------+--------------------+------------------+
|           Short|               Music|1398356.4285714286|
|           Short|              Gaming|         1225184.0|
|           Short|              Sports| 974946.3333333334|
|           Short|    Autos & Vehicles|          605175.0|
|           Short|      People & Blogs|         492900.25|
|           Short|           Education|          432505.5|
|           Short|       Entertainment| 427637.0588235294|
|           Short|              Comedy|          393442.0|
|           Short|    Film & Animation|          270685.5|
|           Short|Science & Technology|          129997.0|
|           Short|     News & Politics|            2124.0|
|          Medium|              Sports|          274086.0|
|          Medium|               Music|           85963.0|
|            Long|     News & Politics|            2798.

# Total Watch Time per Category (durationCategory)

In [0]:
casted_df1.groupBy("durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("totalWatchTime_seconds", ascending=False) \
    .show()

+----------------+----------------------+
|durationCategory|totalWatchTime_seconds|
+----------------+----------------------+
|           Short|                  3111|
|            Long|                  1360|
|          Medium|                  1120|
+----------------+----------------------+



# Total Watch Time per Category and Duration Bucket

In [0]:
from pyspark.sql.functions import sum

casted_df1.groupBy("categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory") \
    .show(truncate=False)

+--------------------+----------------+----------------------+
|categoryName        |durationCategory|totalWatchTime_seconds|
+--------------------+----------------+----------------------+
|Autos & Vehicles    |Short           |55                    |
|Comedy              |Short           |45                    |
|Education           |Short           |76                    |
|Entertainment       |Short           |1129                  |
|Film & Animation    |Short           |160                   |
|Gaming              |Short           |66                    |
|Music               |Medium          |305                   |
|Music               |Short           |757                   |
|News & Politics     |Long            |1360                  |
|News & Politics     |Short           |68                    |
|People & Blogs      |Short           |535                   |
|Science & Technology|Short           |34                    |
|Sports              |Medium          |815             

In [0]:
# i have attached the chanelTitle also , from here we can get idea 
# 1) Which channel is dominating which category .
# 2) Which channel do people watch for longer time?
# 3) Which channel's videos, short, medium or long, are being watched more?

In [0]:
from pyspark.sql.functions import sum

casted_df1.groupBy("channelTitle", "categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory", "totalWatchTime_seconds", ascending=False) \
    .show(truncate=False)

+--------------------+--------------------+----------------+----------------------+
|channelTitle        |categoryName        |durationCategory|totalWatchTime_seconds|
+--------------------+--------------------+----------------+----------------------+
|Dude Perfect        |Sports              |Short           |69                    |
|Thogden             |Sports              |Short           |68                    |
|Red Bull            |Sports              |Short           |49                    |
|DAZN Football       |Sports              |Medium          |815                   |
|头脑眼大设计        |Science & Technology|Short           |20                    |
|MaviGadget          |Science & Technology|Short           |14                    |
|王家帮              |People & Blogs      |Short           |142                   |
|汤泉兄弟            |People & Blogs      |Short           |115                   |
|Mini Katana         |People & Blogs      |Short           |92                    |
|Nann

# Top 5 Longest Videos with High Engagement

In [0]:
casted_df1.orderBy(col("duration").desc(), col("engagement").desc()).select("title", "duration", "engagement").show(5, truncate=False)


+------------------------------------------------------------------------------------------------------------------+--------+----------+
|title                                                                                                             |duration|engagement|
+------------------------------------------------------------------------------------------------------------------+--------+----------+
|Are Expensive Eggs More Nutritious? | Talking Point | Full Episode                                                |1360    |2798      |
|Chelsea vs. PSG | FIFA Club World Cup Final Extended Highlights                                                   |815     |274086    |
|F4 x MAYDAY五月天 [ 流星雨 Meteor Rain ] 5525版 Official Stage Video                                              |305     |85963     |
|【歌手2025·纯享】 #李佳薇《天后》展现强大唱功！高亢嗓音诠释爱而不得的复杂情绪 ｜《歌手2025》| SINGER 2025｜MangoTV|295     |22490     |
|Monica - Lyric Video| COOLIE | Superstar Rajinikanth | Sun Pictures | Lokesh | Anirudh |

#  Education and Tech-Focused Insights

In [0]:
education_df = casted_df1.filter(col("categoryName").isin("Education", "Science & Technology"))
education_df.show()

+-----------+--------------------+------------+-------------------------------------+-------------------------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+--------------------+-------------+----+-----+----------+----------------+
|         id|           channelId|channelTitle|                                title|                          description|categoryId|                tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|        categoryName|publishedDate|year|month|engagement|durationCategory|
+-----------+--------------------+------------+-------------------------------------+-------------------------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+--------------------+-------------+----+-----+----------+----------------+
|3gRKBu6xIIA|UCdfHiFFYfos49nJc...|头脑眼大设计|这样的制自制泳池

In [0]:
education_df.select("title", "likeCount", "viewCount", "engagement").show(truncate=False)

+------------------------------------------------------------------------------------------------+---------+---------+----------+
|title                                                                                           |likeCount|viewCount|engagement|
+------------------------------------------------------------------------------------------------+---------+---------+----------+
|这样的制自制泳池，对小朋友来说，真的是太欢乐了 #涨知识#科普                                     |8158     |562809   |8212      |
|Magic Dust Remover Cleaning Gel ✅ Product Link in Bio ( # 1970 )                                |251373   |31441467 |251782    |
|Be nice to everyone [CONDA NGUYEN] #condanguyen #kindness #goodman #help #respect               |280335   |19612884 |280596    |
|* lòng tốt nhưng hơi thiếu thông minh. ^^ [Nam Phương] #namphuong #shorts #funny #comedy #action|582700   |35077490 |584415    |
+------------------------------------------------------------------------------------------------+---------+---------+

In [0]:
education_df.select("channelTitle", "title", "categoryName").show(truncate=False)

+------------+------------------------------------------------------------------------------------------------+--------------------+
|channelTitle|title                                                                                           |categoryName        |
+------------+------------------------------------------------------------------------------------------------+--------------------+
|头脑眼大设计|这样的制自制泳池，对小朋友来说，真的是太欢乐了 #涨知识#科普                                     |Science & Technology|
|MaviGadget  |Magic Dust Remover Cleaning Gel ✅ Product Link in Bio ( # 1970 )                                |Science & Technology|
|CONDA NGUYEN|Be nice to everyone [CONDA NGUYEN] #condanguyen #kindness #goodman #help #respect               |Education           |
|Nam Phương  |* lòng tốt nhưng hơi thiếu thông minh. ^^ [Nam Phương] #namphuong #shorts #funny #comedy #action|Education           |
+------------+-----------------------------------------------------------------------------------------

In [0]:
# i check education content by seeing the keyword like what ever i have mentioned in the below 

In [0]:
from pyspark.sql.functions import col, lower
from functools import reduce

keywords = ["tutorial", "science", "AI", "learn", "how to", "technology", "education", "experiment", "physics", "math", "engineering"]

# Debugging: ensure title is not null
education_df_filtered = education_df.filter(col("title").isNotNull())

# Apply keyword filter in lowercase
education_clean_df = education_df_filtered.filter(
    reduce(lambda a, b: a | b, [lower(col("title")).contains(k) for k in keywords])
)

education_clean_df.show(truncate=False)
education_clean_df.count()

+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+
|id |channelId|channelTitle|title|description|categoryId|tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|categoryName|publishedDate|year|month|engagement|durationCategory|
+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+
+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+

Out[50]: 0

In [0]:
education_df.select("title").show(20, truncate=False)

+------------------------------------------------------------------------------------------------+
|title                                                                                           |
+------------------------------------------------------------------------------------------------+
|这样的制自制泳池，对小朋友来说，真的是太欢乐了 #涨知识#科普                                     |
|Magic Dust Remover Cleaning Gel ✅ Product Link in Bio ( # 1970 )                                |
|Be nice to everyone [CONDA NGUYEN] #condanguyen #kindness #goodman #help #respect               |
|* lòng tốt nhưng hơi thiếu thông minh. ^^ [Nam Phương] #namphuong #shorts #funny #comedy #action|
+------------------------------------------------------------------------------------------------+



# Most engaging education/tech videos

In [0]:
education_df.orderBy("engagement", ascending=False).select("title", "engagement").show(10, truncate=False)

+------------------------------------------------------------------------------------------------+----------+
|title                                                                                           |engagement|
+------------------------------------------------------------------------------------------------+----------+
|* lòng tốt nhưng hơi thiếu thông minh. ^^ [Nam Phương] #namphuong #shorts #funny #comedy #action|584415    |
|Be nice to everyone [CONDA NGUYEN] #condanguyen #kindness #goodman #help #respect               |280596    |
|Magic Dust Remover Cleaning Gel ✅ Product Link in Bio ( # 1970 )                                |251782    |
|这样的制自制泳池，对小朋友来说，真的是太欢乐了 #涨知识#科普                                     |8212      |
+------------------------------------------------------------------------------------------------+----------+



# Keyword-Based Filtering (regardless of category)

In [0]:
from pyspark.sql.functions import lower, col
from functools import reduce

keywords = ["tutorial", "science", "AI", "learn", "how to", "technology", "education", "experiment", "physics", "math", "engineering"]

# keyword-based filter on title and description
filtered_df = casted_df1.filter(
    reduce(lambda a, b: a | b, [lower(col("title")).contains(k) | lower(col("description")).contains(k) for k in keywords])
)

filtered_df.select("categoryName", "title", "description").show(truncate=False)


+----------------+------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
output_path = '/FileStore/tables/final_singapore_trending'
casted_df1.write.mode('overwrite').parquet(output_path)