In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("YouTube Trending Analysis").getOrCreate()

spark

# WORKING ON  JSON FORMAT DATA

In [0]:
df1 = spark.read.format("json").load("dbfs:/FileStore/shared_uploads/krath2928@gmail.com/trending_india.json")
df2 = spark.read.format("json").load("dbfs:/FileStore/shared_uploads/krath2928@gmail.com/trending_singapore.json")
df3 = spark.read.format("json").load("dbfs:/FileStore/shared_uploads/krath2928@gmail.com/trending_usa.json")

# Analysis on India Data Set

In [0]:
# Load JSON (multiline must be true for nested JSON objects)
df = spark.read.option("multiline", "true").json("dbfs:/FileStore/shared_uploads/krath2928@gmail.com/trending_india.json")

# Show schema to understand structure
df.printSchema()


root
 |-- contentDetails: struct (nullable = true)
 |    |-- caption: string (nullable = true)
 |    |-- definition: string (nullable = true)
 |    |-- dimension: string (nullable = true)
 |    |-- duration: string (nullable = true)
 |    |-- licensedContent: boolean (nullable = true)
 |    |-- projection: string (nullable = true)
 |    |-- regionRestriction: struct (nullable = true)
 |    |    |-- allowed: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- etag: string (nullable = true)
 |-- id: string (nullable = true)
 |-- kind: string (nullable = true)
 |-- snippet: struct (nullable = true)
 |    |-- categoryId: string (nullable = true)
 |    |-- channelId: string (nullable = true)
 |    |-- channelTitle: string (nullable = true)
 |    |-- defaultAudioLanguage: string (nullable = true)
 |    |-- defaultLanguage: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- liveBroadcastContent: string (nullable = true)
 |    

In [0]:
from pyspark.sql.functions import col

selected_df = df.select(
    "id",
    "snippet.publishedAt",
    "snippet.channelId",
    "snippet.channelTitle",
    "snippet.title",
    "snippet.description",
    "snippet.categoryId",
    "snippet.tags",
    "snippet.defaultLanguage",
    "snippet.defaultAudioLanguage",
    "statistics.viewCount",
    "statistics.likeCount",
    "statistics.commentCount",
    "contentDetails.duration",
    "contentDetails.caption"
)

selected_df.show(truncate=False)


+-----------+--------------------+------------------------+------------------------+----------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import col

casted_df = selected_df.withColumn("viewCount", col("viewCount").cast("int")) \
                       .withColumn("likeCount", col("likeCount").cast("int")) \
                       .withColumn("commentCount", col("commentCount").cast("int"))


In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
casted_df.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedAt", "channelTitle",
    "title", "categoryId", "description"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
casted_df.select("title", "duration").show(10, truncate=False)

+----------------------------------------------------------------------------------------------------+----------+
|title                                                                                               |duration  |
+----------------------------------------------------------------------------------------------------+----------+
|Thalaivan Thalaivii - Trailer | Vijay Sethupathi, Nithya Menen | Pandiraaj | Santhosh Narayanan     |PT2M47S   |
|Romeo S3 - Full Movie | Palak Tiwari, Thakur Anoop Singh | Pen Movies | New Hindi Movie 2025        |PT2H25M32S|
|Stranger Things 5 | Official Teaser | Netflix                                                       |PT2M47S   |
|Dil Pe Chalai Churiya (Trending Version) | Sonu Nigam | Raju Kalakar, Anjali, Rajan, Rishabh,Deepak |PT4M1S    |
|Playing Real Life SQUID GAME in KOREA | SlayyPop                                                    |PT18M1S   |
|Dhana Mani engagement |அம்மாக்கு சேலை எடுக்குறதுக்குள்ள கடையே கலகலனு் ஆய்டுச்சி😂  | Dre

# Convert duration (ISO 8601 format) to Total Seconds

In [0]:
from pyspark.sql.functions import col, regexp_extract

# Step 1: Extract raw duration string
df_with_duration = casted_df.withColumn("duration_raw", col("duration"))

# Step 2: Extract hours, minutes, and seconds using regex
df_parsed = df_with_duration \
    .withColumn("hours", regexp_extract(col("duration_raw"), r'PT(\d+)H', 1).cast("int")) \
    .withColumn("minutes", regexp_extract(col("duration_raw"), r'(?<=PT(\d+H)?)?(\d+)M', 2).cast("int")) \
    .withColumn("seconds", regexp_extract(col("duration_raw"), r'(?<=M)?(\d+)S', 1).cast("int"))

# Step 3: Replace nulls with 0
df_filled = df_parsed.fillna({"hours": 0, "minutes": 0, "seconds": 0})

# Step 4: Calculate total duration in seconds and overwrite `duration` column
final_df = df_filled.withColumn(
    "duration",
    col("hours") * 3600 + col("minutes") * 60 + col("seconds")
)

# Step 5: Drop temporary columns (if you want clean dataframe)
final_df = final_df.drop("duration_raw", "hours", "minutes", "seconds")

#  Now update casted_df to this final version
casted_df = final_df


In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = false)
 |-- caption: string (nullable = true)



In [0]:
casted_df.select("title", "duration").show(10, truncate=False)

+----------------------------------------------------------------------------------------------------+--------+
|title                                                                                               |duration|
+----------------------------------------------------------------------------------------------------+--------+
|Thalaivan Thalaivii - Trailer | Vijay Sethupathi, Nithya Menen | Pandiraaj | Santhosh Narayanan     |167     |
|Romeo S3 - Full Movie | Palak Tiwari, Thakur Anoop Singh | Pen Movies | New Hindi Movie 2025        |8732    |
|Stranger Things 5 | Official Teaser | Netflix                                                       |167     |
|Dil Pe Chalai Churiya (Trending Version) | Sonu Nigam | Raju Kalakar, Anjali, Rajan, Rishabh,Deepak |241     |
|Playing Real Life SQUID GAME in KOREA | SlayyPop                                                    |1081    |
|Dhana Mani engagement |அம்மாக்கு சேலை எடுக்குறதுக்குள்ள கடையே கலகலனு் ஆய்டுச்சி😂  | Dress Purchase |106

# in our data distinct categoryid available

In [0]:
df.select("snippet.categoryId").distinct().count()

Out[71]: 10

In [0]:
df.select("snippet.categoryId").distinct().show(truncate=False)

+----------+
|categoryId|
+----------+
|22        |
|28        |
|27        |
|17        |
|26        |
|23        |
|24        |
|1         |
|20        |
|10        |
+----------+



# Mapping distinct Categoryid to Category Name

In [0]:
from pyspark.sql.functions import when, col

casted_df = casted_df.withColumn(
    "categoryName",
    when(col("categoryId") == "1", "Film & Animation")
    .when(col("categoryId") == "10", "Music")
    .when(col("categoryId") == "17", "Sports")
    .when(col("categoryId") == "20", "Gaming")
    .when(col("categoryId") == "22", "People & Blogs")
    .when(col("categoryId") == "23", "Comedy")
    .when(col("categoryId") == "24", "Entertainment")
    .when(col("categoryId") == "26", "Howto & Style")
    .when(col("categoryId") == "27", "Education")
    .when(col("categoryId") == "28", "Science & Technology")
    .otherwise("Unknown")
)


In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = false)
 |-- caption: string (nullable = true)
 |-- categoryName: string (nullable = false)



In [0]:
casted_df.groupBy("categoryName").count().show(truncate=False)

+--------------------+-----+
|categoryName        |count|
+--------------------+-----+
|Education           |1    |
|Gaming              |2    |
|Entertainment       |22   |
|Science & Technology|2    |
|Sports              |1    |
|Howto & Style       |1    |
|Film & Animation    |3    |
|People & Blogs      |8    |
|Music               |8    |
|Comedy              |2    |
+--------------------+-----+



# convert publishAt col to time stamp

In [0]:
from pyspark.sql.functions import col, to_timestamp, to_date, year, month

# Step 1: Convert to timestamp
casted_df = casted_df.withColumn("publishedAt_ts", to_timestamp(col("publishedAt"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))

# Step 2: Extract only the date part
casted_df = casted_df.withColumn("publishedDate", to_date(col("publishedAt_ts")))

# Step 3: Extract year and month from publishedDate
casted_df = casted_df.withColumn("year", year("publishedDate")) \
                     .withColumn("month", month("publishedDate"))

# Step 4: Drop original publishedAt and timestamp
casted_df = casted_df.drop("publishedAt", "publishedAt_ts")


In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = false)
 |-- caption: string (nullable = true)
 |-- categoryName: string (nullable = false)
 |-- publishedDate: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [0]:
casted_df.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedDate", "channelTitle",
    "title", "categoryId", "description", "categoryName"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# in likecount there was a single null valued  so i filled it with '0' 

In [0]:
from pyspark.sql.functions import col

casted_df = casted_df.fillna({"likeCount": 0})



In [0]:
from pyspark.sql.functions import col, count, when

# Check null values for all relevant columns
casted_df.select([
    count(when(col("viewCount").isNull(), 1)).alias("null_viewCount"),
    count(when(col("likeCount").isNull(), 1)).alias("null_likeCount"),
    count(when(col("commentCount").isNull(), 1)).alias("null_commentCount"),
    count(when(col("duration").isNull(), 1)).alias("null_duration"),
    count(when(col("caption").isNull(), 1)).alias("null_caption"),
    count(when(col("categoryName").isNull(), 1)).alias("null_categoryName"),
    count(when(col("publishedDate").isNull(), 1)).alias("null_publishedDate"),
    count(when(col("year").isNull(), 1)).alias("null_year"),
    count(when(col("month").isNull(), 1)).alias("null_month")
]).show()


+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|null_viewCount|null_likeCount|null_commentCount|null_duration|null_caption|null_categoryName|null_publishedDate|null_year|null_month|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|             0|             0|                0|            0|           0|                0|                 0|        0|         0|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+



# Engegment Analysis

# Top Channels by Total Views

In [0]:
top_channels = casted_df.groupBy("channelTitle") \
    .sum("viewCount") \
    .withColumnRenamed("sum(viewCount)", "totalViews") \
    .orderBy(col("totalViews").desc())

top_channels.show(20, truncate=False)


+---------------------------+----------+
|channelTitle               |totalViews|
+---------------------------+----------+
|T-Series                   |34221562  |
|Zee Music Company          |17185883  |
|Netflix                    |14269445  |
|Sourav Joshi Vlogs         |9017936   |
|Warner Bros.               |7245936   |
|Vijay Television           |5838381   |
|Think Music India          |4200400   |
|Free Fire India Official   |3811435   |
|ETV Dhee                   |3137692   |
|Crazy XYZ                  |2771269   |
|Nischay Malhan             |2350017   |
|SlayyPop                   |2324607   |
|Wanderers Hub              |2315667   |
|MR. INDIAN HACKER          |2252999   |
|LOL (Life of Limbachiyaa’s)|1962844   |
|Pen Movies                 |1848605   |
|Saba Ibrahim               |1271272   |
|Thugesh                    |1237751   |
|Sony Sports Network        |1133050   |
|Sun TV                     |973087    |
+---------------------------+----------+
only showing top

# Top channels by total likes

In [0]:
top_liked_channels = casted_df.groupBy("channelTitle") \
    .sum("likeCount") \
    .withColumnRenamed("sum(likeCount)", "totalLikes") \
    .orderBy(col("totalLikes").desc())
top_liked_channels.show(10, truncate=False)


+------------------------+----------+
|channelTitle            |totalLikes|
+------------------------+----------+
|T-Series                |1270614   |
|Netflix                 |668886    |
|Sourav Joshi Vlogs      |427552    |
|Warner Bros.            |270301    |
|MR. INDIAN HACKER       |141791    |
|Think Music India       |139778    |
|Crazy XYZ               |136852    |
|Free Fire India Official|124167    |
|Nischay Malhan          |110810    |
|Vijay Television        |98911     |
+------------------------+----------+
only showing top 10 rows



# Most Liked Videos

In [0]:
# Here we can find top 5 most liked videos along with its title and chanelTitle 

In [0]:
most_liked = casted_df.orderBy(col("likeCount").desc())
most_liked.select("title", "channelTitle", "likeCount").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------+------------------+---------+
|title                                                                                              |channelTitle      |likeCount|
+---------------------------------------------------------------------------------------------------+------------------+---------+
|Stranger Things 5 | Official Teaser | Netflix                                                      |Netflix           |668886   |
|Dil Pe Chalai Churiya (Trending Version) | Sonu Nigam | Raju Kalakar, Anjali, Rajan, Rishabh,Deepak|T-Series          |603633   |
|6 AM (Official Video): YO YO HONEY SINGH | HEERA SOHAL | GLORY | BHUSHAN KUMAR                     |T-Series          |531193   |
|Mortal Kombat II | Official Trailer                                                                |Warner Bros.      |270301   |
|Shadi Ki Taiyari shuru Hogyi 😍                                                    

# Top videos by comment count

In [0]:
most_commented = casted_df.orderBy(col("commentCount").desc())
most_commented.select("title", "channelTitle", "commentCount").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------+------------+------------+
|title                                                                                              |channelTitle|commentCount|
+---------------------------------------------------------------------------------------------------+------------+------------+
|Stranger Things 5 | Official Teaser | Netflix                                                      |Netflix     |38411       |
|6 AM (Official Video): YO YO HONEY SINGH | HEERA SOHAL | GLORY | BHUSHAN KUMAR                     |T-Series    |33356       |
|Dil Pe Chalai Churiya (Trending Version) | Sonu Nigam | Raju Kalakar, Anjali, Rajan, Rishabh,Deepak|T-Series    |30240       |
|Mortal Kombat II | Official Trailer                                                                |Warner Bros.|21949       |
|SON OF SARDAAR 2 | THE PO PO SONG | Ajay Devgn | Mrunal Thakur | Guru Randhawa | Tanishk Bagchi    |T-S

# Like to View ratio (engagement rate)

In [0]:
from pyspark.sql.functions import expr

engagement_df = casted_df.withColumn("likeViewRatio", expr("likeCount / viewCount"))
engagement_df.orderBy(col("likeViewRatio").desc()).select("title", "channelTitle", "likeViewRatio").show(10, truncate=False)


+----------------------------------------------------------------------------------------------------+-------------------+--------------------+
|title                                                                                               |channelTitle       |likeViewRatio       |
+----------------------------------------------------------------------------------------------------+-------------------+--------------------+
|Dino James - Still Here ft. Katya Krishnan (Official Music Video)                                   |Dino James         |0.14690939597315436 |
|#Video | जल ढ़ारे चल | Khesari Lal Yadav, Srishti Bharti | Jal Dhare Chal | Bhojpuri Bolbam Song 2025|Khesari Music World|0.1374067896389325  |
|Mr. Truth | Episode 11 | Dreamz Unlimited                                                           |Dreamz Unlimited   |0.12197487036626153 |
|Sabba : Village Flow (Official Video) | Latest Punjabi Songs 2025 | New Punjabi Songs 2025          |Speed Records      |0.079081122555

# Analysis based on categories

# Top Categories by Likes

In [0]:
from pyspark.sql.functions import sum

casted_df.groupBy("categoryName","channelTitle") \
    .agg(sum("likeCount").alias("totalLikes")) \
    .orderBy("totalLikes", ascending=False) \
    .show()



+--------------------+--------------------+----------+
|        categoryName|        channelTitle|totalLikes|
+--------------------+--------------------+----------+
|               Music|            T-Series|   1270614|
|       Entertainment|             Netflix|    668886|
|      People & Blogs|  Sourav Joshi Vlogs|    427552|
|    Film & Animation|        Warner Bros.|    270301|
|Science & Technology|   MR. INDIAN HACKER|    141791|
|    Film & Animation|   Think Music India|    139778|
|Science & Technology|           Crazy XYZ|    136852|
|              Gaming|Free Fire India O...|    124167|
|           Education|      Nischay Malhan|    110810|
|       Entertainment|    Vijay Television|     98911|
|       Entertainment|            ETV Dhee|     96560|
|       Entertainment|LOL (Life of Limb...|     92515|
|      People & Blogs|       Wanderers Hub|     82931|
|       Entertainment|   Zee Music Company|     80213|
|              Gaming|            SlayyPop|     74447|
|       En

# Top Categories by Comments

In [0]:
casted_df.groupBy("categoryName","channelTitle") \
    .agg(sum("commentCount").alias("totalComments")) \
    .orderBy("totalComments", ascending=False) \
    .show()

+--------------------+--------------------+-------------+
|        categoryName|        channelTitle|totalComments|
+--------------------+--------------------+-------------+
|               Music|            T-Series|        77471|
|       Entertainment|             Netflix|        38411|
|    Film & Animation|        Warner Bros.|        21949|
|      People & Blogs|  Sourav Joshi Vlogs|        10831|
|Science & Technology|   MR. INDIAN HACKER|         9984|
|              Gaming|Free Fire India O...|         6181|
|Science & Technology|           Crazy XYZ|         5680|
|               Music| Khesari Music World|         4842|
|       Entertainment|LOL (Life of Limb...|         4215|
|       Entertainment|             Thugesh|         3983|
|       Entertainment|   Zee Music Company|         3855|
|               Music|          Dino James|         3451|
|           Education|      Nischay Malhan|         2945|
|    Film & Animation|   Think Music India|         2773|
|      People 

# Average Engagement per Category

In [0]:
# create engagment column on adding likecount and commentcount

In [0]:
casted_df = casted_df.withColumn(
    "engagement",
    col("likeCount") + col("commentCount")
)

In [0]:
from pyspark.sql.functions import avg

casted_df.groupBy("categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show(truncate=False)


+--------------------+-----------------+
|categoryName        |avgEngagement    |
+--------------------+-----------------+
|Music               |190660.625       |
|Film & Animation    |156408.0         |
|Science & Technology|147153.5         |
|Education           |113755.0         |
|Gaming              |103527.5         |
|People & Blogs      |77392.625        |
|Entertainment       |58705.40909090909|
|Comedy              |22948.5          |
|Sports              |15729.0          |
|Howto & Style       |12763.0          |
+--------------------+-----------------+



# Average engagement by video duration

In [0]:
# it says that people are engage or not  in short,long or medium videos

In [0]:
# Step 1: Categorize durations
from pyspark.sql.functions import when

casted_df = casted_df.withColumn(
    "durationCategory",
    when(col("duration") <= 300, "Short")  # ≤ 5 mins
    .when((col("duration") > 300) & (col("duration") <= 1200), "Medium")  # 5–20 mins
    .otherwise("Long")  # > 20 mins
)

# Step 2: Avg engagement per duration category
casted_df.groupBy("durationCategory") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show()


+----------------+-----------------+
|durationCategory|    avgEngagement|
+----------------+-----------------+
|           Short|169278.9411764706|
|          Medium|          70355.0|
|            Long|34779.58823529412|
+----------------+-----------------+



# Top performing categories within duration groups

In [0]:
casted_df.groupBy("durationCategory", "categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("durationCategory", "avgEngagement", ascending=False) \
    .show()


+----------------+--------------------+------------------+
|durationCategory|        categoryName|     avgEngagement|
+----------------+--------------------+------------------+
|           Short|               Music|217805.42857142858|
|           Short|    Film & Animation|          217400.5|
|           Short|       Entertainment|130640.14285714286|
|           Short|      People & Blogs|            3822.0|
|          Medium|      People & Blogs|152518.33333333334|
|          Medium|Science & Technology|          151775.0|
|          Medium|           Education|          113755.0|
|          Medium|              Gaming|          103527.5|
|          Medium|              Comedy|           39837.0|
|          Medium|       Entertainment|21201.833333333332|
|          Medium|              Sports|           15729.0|
|          Medium|       Howto & Style|           12763.0|
|            Long|Science & Technology|          142532.0|
|            Long|      People & Blogs|           39441.

# Total Watch Time per Category (durationCategory)

In [0]:
casted_df.groupBy("durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("totalWatchTime_seconds", ascending=False) \
    .show()


+----------------+----------------------+
|durationCategory|totalWatchTime_seconds|
+----------------+----------------------+
|            Long|                 38208|
|          Medium|                 12478|
|           Short|                  2396|
+----------------+----------------------+



# Total Watch Time per Category and Duration Bucket

In [0]:
from pyspark.sql.functions import sum

casted_df.groupBy("categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory") \
    .show(truncate=False)


+--------------------+----------------+----------------------+
|categoryName        |durationCategory|totalWatchTime_seconds|
+--------------------+----------------+----------------------+
|Comedy              |Long            |3498                  |
|Comedy              |Medium          |755                   |
|Education           |Medium          |640                   |
|Entertainment       |Long            |14918                 |
|Entertainment       |Medium          |4455                  |
|Entertainment       |Short           |668                   |
|Film & Animation    |Long            |8732                  |
|Film & Animation    |Short           |320                   |
|Gaming              |Medium          |1604                  |
|Howto & Style       |Medium          |734                   |
|Music               |Long            |3416                  |
|Music               |Short           |1321                  |
|People & Blogs      |Long            |6034            

In [0]:
# i have attached the chanelTitle also , from here we can get idea 
# 1) Which channel is dominating which category .
# 2) Which channel do people watch for longer time?
# 3) Which channel's videos, short, medium or long, are being watched more?

In [0]:
from pyspark.sql.functions import sum

casted_df.groupBy("channelTitle", "categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory", "totalWatchTime_seconds", ascending=False) \
    .show(truncate=False)


+------------------------+--------------------+----------------+----------------------+
|channelTitle            |categoryName        |durationCategory|totalWatchTime_seconds|
+------------------------+--------------------+----------------+----------------------+
|Sony Sports Network     |Sports              |Medium          |997                   |
|MR. INDIAN HACKER       |Science & Technology|Medium          |887                   |
|Crazy XYZ               |Science & Technology|Long            |1610                  |
|Telly MEK               |People & Blogs      |Short           |87                    |
|Sourav Joshi Vlogs      |People & Blogs      |Medium          |1341                  |
|RasigargalinRasigan     |People & Blogs      |Medium          |1065                  |
|Sushma Kiron            |People & Blogs      |Long            |1795                  |
|Breakthrough            |People & Blogs      |Long            |1459                  |
|Wanderers Hub           |People

# Top 5 Longest Videos with High Engagement

In [0]:
casted_df.orderBy(col("duration").desc(), col("engagement").desc()).select("title", "duration", "engagement").show(5, truncate=False)


+----------------------------------------------------------------------------------------------------+--------+----------+
|title                                                                                               |duration|engagement|
+----------------------------------------------------------------------------------------------------+--------+----------+
|Romeo S3 - Full Movie | Palak Tiwari, Thakur Anoop Singh | Pen Movies | New Hindi Movie 2025        |8732    |34423     |
|किसको इशारों में कहा Jetha ने अकलमंद? | Taarak Mehta Ka Ooltah Chashmah | Full Movie                |3498    |6060      |
|ஆடி முதல் வெள்ளி அம்மன் பாடல் |ஆடி வெள்ளி அம்மனுக்கு | Aadi Velli Ammanukku | Aadi Masam Amman Songs|3416    |647       |
|Mahalakshmi | Flowers TV | EP # 157 + 158                                                           |2515    |4961      |
|Dhee 20 | Re Release Special | 9th July 2025 | Regena Cassandrra ,Vijay Binni | Full Episode | ETV  |2022    |54832     |
+---------------

#  Education and Tech-Focused Insights

In [0]:
education_df = casted_df.filter(col("categoryName").isin("Education", "Science & Technology"))
education_df.show()

+-----------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+--------------------+-------------+----+-----+----------+----------------+
|         id|           channelId|     channelTitle|               title|         description|categoryId|                tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|        categoryName|publishedDate|year|month|engagement|durationCategory|
+-----------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+--------------------+-------------+----+-----+----------+----------------+
|DEtNELkJRJg|UCSiDGb0MnHFGjs4E...|MR. INDIAN HACKER|Finally We Found ...|Guy's iss video m...|        28|[treasure challen...|          

In [0]:
education_df.select("title", "likeCount", "viewCount", "engagement").show(truncate=False)

+----------------------------------------------------------------------------------+---------+---------+----------+
|title                                                                             |likeCount|viewCount|engagement|
+----------------------------------------------------------------------------------+---------+---------+----------+
|Finally We Found This ? 🤩 सच में खजाना मिल गया | Treasure Hunt Challenge Part - 4|141791   |2252999  |151775    |
|Super Chor Bazaar Challenge | Winner Will Keep Everything🤑                       |136852   |2771269  |142532    |
|Next Level Brain Rot😂 But Shhhhh....🙊                                           |110810   |2350017  |113755    |
+----------------------------------------------------------------------------------+---------+---------+----------+



In [0]:
education_df.select("channelTitle", "title", "categoryName").show(truncate=False)


+-----------------+----------------------------------------------------------------------------------+--------------------+
|channelTitle     |title                                                                             |categoryName        |
+-----------------+----------------------------------------------------------------------------------+--------------------+
|MR. INDIAN HACKER|Finally We Found This ? 🤩 सच में खजाना मिल गया | Treasure Hunt Challenge Part - 4|Science & Technology|
|Crazy XYZ        |Super Chor Bazaar Challenge | Winner Will Keep Everything🤑                       |Science & Technology|
|Nischay Malhan   |Next Level Brain Rot😂 But Shhhhh....🙊                                           |Education           |
+-----------------+----------------------------------------------------------------------------------+--------------------+



In [0]:
# i check education content by seeing the keyword like what ever i have mentioned in the below 

In [0]:
from pyspark.sql.functions import col, lower
from functools import reduce

keywords = ["tutorial", "science", "learn", "how to", "technology", "education", "experiment", "physics", "math", "engineering"]

# Debugging: ensure title is not null
education_df_filtered = education_df.filter(col("title").isNotNull())

# Apply keyword filter in lowercase
education_clean_df = education_df_filtered.filter(
    reduce(lambda a, b: a | b, [lower(col("title")).contains(k) for k in keywords])
)

education_clean_df.show(truncate=False)
education_clean_df.count()


+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+
|id |channelId|channelTitle|title|description|categoryId|tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|categoryName|publishedDate|year|month|engagement|durationCategory|
+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+
+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+

Out[105]: 0

In [0]:
education_df.select("title").show(20, truncate=False)



+----------------------------------------------------------------------------------+
|title                                                                             |
+----------------------------------------------------------------------------------+
|Finally We Found This ? 🤩 सच में खजाना मिल गया | Treasure Hunt Challenge Part - 4|
|Super Chor Bazaar Challenge | Winner Will Keep Everything🤑                       |
|Next Level Brain Rot😂 But Shhhhh....🙊                                           |
+----------------------------------------------------------------------------------+



# Most engaging education/tech videos

In [0]:
education_df.orderBy("engagement", ascending=False).select("title", "engagement").show(10, truncate=False)


+----------------------------------------------------------------------------------+----------+
|title                                                                             |engagement|
+----------------------------------------------------------------------------------+----------+
|Finally We Found This ? 🤩 सच में खजाना मिल गया | Treasure Hunt Challenge Part - 4|151775    |
|Super Chor Bazaar Challenge | Winner Will Keep Everything🤑                       |142532    |
|Next Level Brain Rot😂 But Shhhhh....🙊                                           |113755    |
+----------------------------------------------------------------------------------+----------+



# Keyword-Based Filtering (regardless of category)

In [0]:
from pyspark.sql.functions import lower, col
from functools import reduce

keywords = ["tutorial", "science", "learn", "how to", "technology", "education", "experiment", "physics", "math", "engineering"]

# keyword-based filter on title and description
filtered_df = casted_df.filter(
    reduce(lambda a, b: a | b, [lower(col("title")).contains(k) | lower(col("description")).contains(k) for k in keywords])
)

filtered_df.select("categoryName", "title", "description").show(truncate=False)


+-------------+-----------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
output_path = '/FileStore/tables/final_india_trending'
casted_df.write.mode('overwrite').parquet(output_path)
