# Analysis on India Data Set

## Read from Delta table

In [0]:
from pyspark.sql.functions import col

# Load table from mongo_youtube_trends schema
df = spark.table("mongo_youtube_trends.india_trending") 
# Show schema to understand structure
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- _fivetran_synced: timestamp (nullable = true)
 |-- data: string (nullable = true)
 |-- _fivetran_deleted: boolean (nullable = true)



In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, BooleanType, TimestampType
from pyspark.sql.types import *

# Define schema based on DBFS JSON
schema = StructType([
    StructField("contentDetails", StructType([
        StructField("caption", StringType(), True),
        StructField("definition", StringType(), True),
        StructField("dimension", StringType(), True),
        StructField("duration", StringType(), True),
        StructField("licensedContent", BooleanType(), True),
        StructField("projection", StringType(), True),
        StructField("regionRestriction", StructType([
            StructField("allowed", ArrayType(StringType()), True)
        ]), True)
    ]), True),

    StructField("etag", StringType(), True),
    StructField("id", StringType(), True),
    StructField("kind", StringType(), True),

    StructField("snippet", StructType([
        StructField("categoryId", StringType(), True),
        StructField("channelId", StringType(), True),
        StructField("channelTitle", StringType(), True),
        StructField("defaultAudioLanguage", StringType(), True),
        StructField("defaultLanguage", StringType(), True),
        StructField("description", StringType(), True),
        StructField("liveBroadcastContent", StringType(), True),
        StructField("localized", StructType([
            StructField("description", StringType(), True),
            StructField("title", StringType(), True)
        ]), True),
        StructField("publishedAt", StringType(), True),
        StructField("tags", ArrayType(StringType()), True),
        StructField("thumbnails", StructType([
            StructField("default", StructType([
                StructField("height", LongType(), True),
                StructField("url", StringType(), True),
                StructField("width", LongType(), True)
            ]), True),
            StructField("high", StructType([
                StructField("height", LongType(), True),
                StructField("url", StringType(), True),
                StructField("width", LongType(), True)
            ]), True),
            StructField("maxres", StructType([
                StructField("height", LongType(), True),
                StructField("url", StringType(), True),
                StructField("width", LongType(), True)
            ]), True),
            StructField("medium", StructType([
                StructField("height", LongType(), True),
                StructField("url", StringType(), True),
                StructField("width", LongType(), True)
            ]), True),
            StructField("standard", StructType([
                StructField("height", LongType(), True),
                StructField("url", StringType(), True),
                StructField("width", LongType(), True)
            ]), True)
        ]), True),
        StructField("title", StringType(), True)
    ]), True),

    StructField("statistics", StructType([
        StructField("commentCount", StringType(), True),
        StructField("favoriteCount", StringType(), True),
        StructField("likeCount", StringType(), True),
        StructField("viewCount", StringType(), True)
    ]), True),
])



In [0]:
from pyspark.sql.functions import from_json, col

# Parse JSON string into struct
parsed_df = df.withColumn("json", from_json(col("data"), schema))

parsed_df.printSchema()


root
 |-- _id: string (nullable = true)
 |-- _fivetran_synced: timestamp (nullable = true)
 |-- data: string (nullable = true)
 |-- _fivetran_deleted: boolean (nullable = true)
 |-- json: struct (nullable = true)
 |    |-- contentDetails: struct (nullable = true)
 |    |    |-- caption: string (nullable = true)
 |    |    |-- definition: string (nullable = true)
 |    |    |-- dimension: string (nullable = true)
 |    |    |-- duration: string (nullable = true)
 |    |    |-- licensedContent: boolean (nullable = true)
 |    |    |-- projection: string (nullable = true)
 |    |    |-- regionRestriction: struct (nullable = true)
 |    |    |    |-- allowed: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |-- etag: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- kind: string (nullable = true)
 |    |-- snippet: struct (nullable = true)
 |    |    |-- categoryId: string (nullable = true)
 |    |    |-- channelId: string

In [0]:
from pyspark.sql.functions import col

# Select required fields
selected_df = parsed_df.select(
    col("json.id").alias("id"),
    col("json.snippet.publishedAt").alias("publishedAt"),
    col("json.snippet.channelId").alias("channelId"),
    col("json.snippet.channelTitle").alias("channelTitle"),
    col("json.snippet.title").alias("title"),
    col("json.snippet.description").alias("description"),
    col("json.snippet.categoryId").alias("categoryId"),
    col("json.snippet.tags").alias("tags"),
    col("json.snippet.defaultLanguage").alias("defaultLanguage"),
    col("json.snippet.defaultAudioLanguage").alias("defaultAudioLanguage"),
    col("json.statistics.viewCount").alias("viewCount"),
    col("json.statistics.likeCount").alias("likeCount"),
    col("json.statistics.commentCount").alias("commentCount"),
    col("json.contentDetails.duration").alias("duration"),
    col("json.contentDetails.caption").alias("caption")
)

selected_df.show(truncate=False)


+-----------+--------------------+------------------------+-------------------------------------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
selected_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: string (nullable = true)
 |-- likeCount: string (nullable = true)
 |-- commentCount: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
from pyspark.sql.functions import col, when

casted_df = selected_df.withColumn("viewCount", when(col("viewCount") != "", col("viewCount").cast("int")).otherwise(None)) \
                       .withColumn("likeCount", when(col("likeCount") != "", col("likeCount").cast("int")).otherwise(None)) \
                       .withColumn("commentCount", when(col("commentCount") != "", col("commentCount").cast("int")).otherwise(None))



In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
casted_df.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedAt", "channelTitle",
    "title", "categoryId", "description"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
casted_df.select("title", "duration").show(10, truncate=False)

+---------------------------------------------------------------------------------------------------+---------+
|title                                                                                              |duration |
+---------------------------------------------------------------------------------------------------+---------+
|Andhera Season 1 - Official Trailer | Prajakta Koli, Surveen Chawla, Priya Bapat, Karanvir Malhotra|PT2M21S  |
|GHAATI Trailer (Telugu) | Anushka Shetty | Vikram Prabhu | Krish Jagarlamudi | UV Creations        |PT2M22S  |
|ವರಮಹಾಲಕ್ಷ್ಮಿ ವ್ರತ ವಿಶೇಷ | ಲಕ್ಷ್ಮಿ ದೇವಿ ಹಾಡುಗಳು | Vara Lakshmi Devi Songs | Kannada Bhakthi Songs   |PT1H2M29S|
|HIMLANDS - THE RISE OF A NEW ENTITY [S-6 part 34]                                                  |PT1H9M22S|
|JATADHARA Official Teaser | Sudheer Babu | Sonakshi Sinha | Prerna Arora |Telugu-Hindi| Coming Soon|PT1M14S  |
|Raksha Bandhan Songs - Behna Pyaari Behna | Chunky Pandey | Sadhana Sargam | Rakhi Songs 2025      |PT8

In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



# Convert duration (ISO 8601 format) to Total Seconds

In [0]:
from pyspark.sql.functions import regexp_extract, col, when

# Regex extract hours, minutes, seconds from ISO 8601 duration format
casted_df = casted_df \
    .withColumn("hours", when(regexp_extract(col("duration"), "PT(\\d+)H", 1) != "", regexp_extract(col("duration"), "PT(\\d+)H", 1).cast("int")).otherwise(0)) \
    .withColumn("minutes", when(regexp_extract(col("duration"), "(\\d+)M", 1) != "", regexp_extract(col("duration"), "(\\d+)M", 1).cast("int")).otherwise(0)) \
    .withColumn("seconds", when(regexp_extract(col("duration"), "(\\d+)S", 1) != "", regexp_extract(col("duration"), "(\\d+)S", 1).cast("int")).otherwise(0))

# Total seconds calculate 
casted_df = casted_df.withColumn(
    "duration",
    (col("hours")*3600 + col("minutes")*60 + col("seconds"))
)

# Temporary columns drop 
casted_df = casted_df.drop("hours", "minutes", "seconds")


In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
casted_df.select("title", "duration").show(10, truncate=False)


+---------------------------------------------------------------------------------------------------+--------+
|title                                                                                              |duration|
+---------------------------------------------------------------------------------------------------+--------+
|Andhera Season 1 - Official Trailer | Prajakta Koli, Surveen Chawla, Priya Bapat, Karanvir Malhotra|141     |
|GHAATI Trailer (Telugu) | Anushka Shetty | Vikram Prabhu | Krish Jagarlamudi | UV Creations        |142     |
|ವರಮಹಾಲಕ್ಷ್ಮಿ ವ್ರತ ವಿಶೇಷ | ಲಕ್ಷ್ಮಿ ದೇವಿ ಹಾಡುಗಳು | Vara Lakshmi Devi Songs | Kannada Bhakthi Songs   |3749    |
|HIMLANDS - THE RISE OF A NEW ENTITY [S-6 part 34]                                                  |4162    |
|JATADHARA Official Teaser | Sudheer Babu | Sonakshi Sinha | Prerna Arora |Telugu-Hindi| Coming Soon|74      |
|Raksha Bandhan Songs - Behna Pyaari Behna | Chunky Pandey | Sadhana Sargam | Rakhi Songs 2025      |502     |
|

# in our data distinct categoryid available

In [0]:
parsed_df.select("json.snippet.categoryId").distinct().count()

5

In [0]:
parsed_df.select("json.snippet.categoryId").distinct().show(truncate=False)

+----------+
|categoryId|
+----------+
|22        |
|10        |
|24        |
|1         |
|20        |
+----------+



# Mapping distinct Categoryid to Category Name

In [0]:
from pyspark.sql.functions import when, col

casted_df = casted_df.withColumn(
    "categoryName",
    when(col("categoryId") == "1", "Film & Animation")
    .when(col("categoryId") == "10", "Music")
    .when(col("categoryId") == "20", "Gaming")
    .when(col("categoryId") == "22", "People & Blogs")
    .when(col("categoryId") == "24", "Entertainment")
    .otherwise("Unknown")
)


In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- caption: string (nullable = true)
 |-- categoryName: string (nullable = false)



In [0]:
casted_df.groupBy("categoryName").count().show(truncate=False)

+----------------+-----+
|categoryName    |count|
+----------------+-----+
|People & Blogs  |3    |
|Film & Animation|4    |
|Gaming          |13   |
|Entertainment   |12   |
|Music           |18   |
+----------------+-----+



# convert publishAt col to time stamp

In [0]:
from pyspark.sql.functions import col, to_timestamp, to_date, year, month

# Step 1: Convert to timestamp
casted_df = casted_df.withColumn("publishedAt_ts", to_timestamp(col("publishedAt"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))

# Step 2: Extract only the date part
casted_df = casted_df.withColumn("publishedDate", to_date(col("publishedAt_ts")))

# Step 3: Extract year and month from publishedDate
casted_df = casted_df.withColumn("year", year("publishedDate")) \
                     .withColumn("month", month("publishedDate"))

# Step 4: Drop original publishedAt and timestamp
casted_df = casted_df.drop("publishedAt", "publishedAt_ts")


In [0]:
casted_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- caption: string (nullable = true)
 |-- categoryName: string (nullable = false)
 |-- publishedDate: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [0]:
casted_df.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedDate", "channelTitle",
    "title", "categoryId", "description", "categoryName"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# in likecount and comment count there was a single null valued  so i filled it with '0' 

In [0]:
from pyspark.sql.functions import col

casted_df = casted_df.fillna({"likeCount": 0})
casted_df = casted_df.fillna({"commentCount": 0})



In [0]:
from pyspark.sql.functions import col, count, when

# Check null values for all relevant columns
casted_df.select([
    count(when(col("viewCount").isNull(), 1)).alias("null_viewCount"),
    count(when(col("likeCount").isNull(), 1)).alias("null_likeCount"),
    count(when(col("commentCount").isNull(), 1)).alias("null_commentCount"),
    count(when(col("duration").isNull(), 1)).alias("null_duration"),
    count(when(col("caption").isNull(), 1)).alias("null_caption"),
    count(when(col("categoryName").isNull(), 1)).alias("null_categoryName"),
    count(when(col("publishedDate").isNull(), 1)).alias("null_publishedDate"),
    count(when(col("year").isNull(), 1)).alias("null_year"),
    count(when(col("month").isNull(), 1)).alias("null_month")
]).show()


+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|null_viewCount|null_likeCount|null_commentCount|null_duration|null_caption|null_categoryName|null_publishedDate|null_year|null_month|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|             0|             0|                0|            0|           0|                0|                 0|        0|         0|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+



# Engegment Analysis

# Top Channels by Total Views

In [0]:
top_channels = casted_df.groupBy("channelTitle") \
    .sum("viewCount") \
    .withColumnRenamed("sum(viewCount)", "totalViews") \
    .orderBy(col("totalViews").desc())

top_channels.show(20, truncate=False)


+---------------------------+----------+
|channelTitle               |totalViews|
+---------------------------+----------+
|T-Series                   |20984585  |
|Universal Music India      |12361086  |
|YRF                        |9209754   |
|Zee Studios                |5489968   |
|Purav Jha                  |4492281   |
|UV Creations               |3622582   |
|Prime Video India          |2932388   |
|MrBeast Gaming             |2807057   |
|Five Star Creations        |2070010   |
|Tropical 3D Animals        |1880896   |
|iQOO ESPORTS               |1630121   |
|T-Series Tamil             |1250386   |
|Kumar Gaming               |1202528   |
|Teodorissimo - FunAnimation|1201439   |
|Techno Gamerz              |1187168   |
|Tonde Gamer                |1041139   |
|T-Series Bhakti Sagar      |931273    |
|Zee Music Company          |927282    |
|Husky Music                |889524    |
|Tips Official              |768170    |
+---------------------------+----------+
only showing top

# Top channels by total likes

In [0]:
top_liked_channels = casted_df.groupBy("channelTitle") \
    .sum("likeCount") \
    .withColumnRenamed("sum(likeCount)", "totalLikes") \
    .orderBy(col("totalLikes").desc())
top_liked_channels.show(10, truncate=False)


+---------------------+----------+
|channelTitle         |totalLikes|
+---------------------+----------+
|T-Series             |836570    |
|Purav Jha            |496748    |
|YRF                  |155697    |
|MrBeast Gaming       |120288    |
|UV Creations         |105708    |
|Techno Gamerz        |77413     |
|YesSmartyPie         |75934     |
|Five Star Creations  |64098     |
|Universal Music India|59015     |
|Tonde Gamer          |51926     |
+---------------------+----------+
only showing top 10 rows


# Most Liked Videos

In [0]:
# Here we can find top 5 most liked videos along with its title and chanelTitle 

In [0]:
most_liked = casted_df.orderBy(col("likeCount").desc())
most_liked.select("title", "channelTitle", "likeCount").show(5, truncate=False)


+----------------------------------------------------------------------------------------------------+--------------+---------+
|title                                                                                               |channelTitle  |likeCount|
+----------------------------------------------------------------------------------------------------+--------------+---------+
|ONE THOUSAND MILES (Official Video): Yo Yo Honey Singh | Mandy Takhar | Desi Kalakaar |Bhushan Kumar|T-Series      |836570   |
|Bhaiyaara | Saiyaara Trailer Spoof | Purav Jha                                                      |Purav Jha     |496748   |
|Janaab-e-Aali Song Teaser | WAR 2 | Hrithik Roshan, NTR | Pritam, Sachet Tandon, Saaj Bhatt, Amitabh|YRF           |155697   |
|$1,000 Every Minute You Survive                                                                     |MrBeast Gaming|120288   |
|GHAATI Trailer (Telugu) | Anushka Shetty | Vikram Prabhu | Krish Jagarlamudi | UV Creations         |UV

# Top videos by comment count

In [0]:
most_commented = casted_df.orderBy(col("commentCount").desc())
most_commented.select("title", "channelTitle", "commentCount").show(5, truncate=False)


+----------------------------------------------------------------------------------------------------+---------------------+------------+
|title                                                                                               |channelTitle         |commentCount|
+----------------------------------------------------------------------------------------------------+---------------------+------------+
|ONE THOUSAND MILES (Official Video): Yo Yo Honey Singh | Mandy Takhar | Desi Kalakaar |Bhushan Kumar|T-Series             |54713       |
|Bhaiyaara | Saiyaara Trailer Spoof | Purav Jha                                                      |Purav Jha            |19932       |
|MY PARENTS ARE VERY BAD                                                                             |Techno Gamerz        |8715        |
|Janaab-e-Aali Song Teaser | WAR 2 | Hrithik Roshan, NTR | Pritam, Sachet Tandon, Saaj Bhatt, Amitabh|YRF                  |8095        |
|Bheegi Saree - Param Sundari | Si

# Like to View ratio (engagement rate)

In [0]:
from pyspark.sql.functions import expr

engagement_df = casted_df.withColumn("likeViewRatio", expr("likeCount / viewCount"))
engagement_df.orderBy(col("likeViewRatio").desc()).select("title", "channelTitle", "likeViewRatio").show(10, truncate=False)


+----------------------------------------------------------------------------------------------------+------------------+--------------------+
|title                                                                                               |channelTitle      |likeViewRatio       |
+----------------------------------------------------------------------------------------------------+------------------+--------------------+
|Odum Kuthira Chaadum Kuthira |Official Trailer | Fahadh Faasil, Kalyani | Althaf Salim | Ashiq Usman|Muzik247          |0.930110658124636   |
|Bhaiyaara | Saiyaara Trailer Spoof | Purav Jha                                                      |Purav Jha         |0.11057812278439394 |
|HIMLANDS - THE RISE OF A NEW ENTITY [S-6 part 34]                                                   |YesSmartyPie      |0.1048703518281946  |
|JANGKHRITHAINI THWISAM 2 || Official Trailer || RD Motion Picture Presents                          |RD Motion Pictures|0.09635438162058055 |

# Analysis based on categories

# Top Categories by Likes

In [0]:
from pyspark.sql.functions import sum

casted_df.groupBy("categoryName","channelTitle") \
    .agg(sum("likeCount").alias("totalLikes")) \
    .orderBy("totalLikes", ascending=False) \
    .show()



+----------------+--------------------+----------+
|    categoryName|        channelTitle|totalLikes|
+----------------+--------------------+----------+
|           Music|            T-Series|    836570|
|   Entertainment|           Purav Jha|    496748|
|           Music|                 YRF|    155697|
|          Gaming|      MrBeast Gaming|    120288|
|   Entertainment|        UV Creations|    105708|
|          Gaming|       Techno Gamerz|     77413|
|          Gaming|        YesSmartyPie|     75934|
|Film & Animation|Five Star Creations |     64098|
|           Music|Universal Music I...|     59015|
|          Gaming|         Tonde Gamer|     51926|
|           Music|      T-Series Tamil|     50402|
|          Gaming|        iQOO ESPORTS|     43933|
|   Entertainment|Teodorissimo - Fu...|     43708|
|  People & Blogs| Tropical 3D Animals|     40057|
|          Gaming|       LoLzZz Gaming|     37440|
|           Music|         Husky Music|     30570|
|   Entertainment|            U

# Top Categories by Comments

In [0]:
casted_df.groupBy("categoryName","channelTitle") \
    .agg(sum("commentCount").alias("totalComments")) \
    .orderBy("totalComments", ascending=False) \
    .show()

+----------------+--------------------+-------------+
|    categoryName|        channelTitle|totalComments|
+----------------+--------------------+-------------+
|           Music|            T-Series|        54713|
|   Entertainment|           Purav Jha|        19932|
|          Gaming|       Techno Gamerz|         8715|
|           Music|                 YRF|         8095|
|           Music|Universal Music I...|         7638|
|          Gaming|        YesSmartyPie|         5013|
|   Entertainment|        UV Creations|         3711|
|          Gaming|         Tonde Gamer|         3042|
|          Gaming|      MrBeast Gaming|         2454|
|          Gaming|     SenpaiUnlimited|         2193|
|           Music|      T-Series Tamil|         2056|
|           Music|         Husky Music|         1699|
|Film & Animation|Five Star Creations |         1260|
|Film & Animation|     Sparrow Cinemas|         1231|
|   Entertainment|            Udaya TV|         1212|
|  People & Blogs|         L

# Average Engagement per Category

In [0]:
# create engagment column on adding likecount and commentcount

In [0]:
casted_df = casted_df.withColumn(
    "engagement",
    col("likeCount") + col("commentCount")
)

In [0]:
from pyspark.sql.functions import avg

casted_df.groupBy("categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show(truncate=False)


+----------------+------------------+
|categoryName    |avgEngagement     |
+----------------+------------------+
|Music           |69852.55555555556 |
|Entertainment   |65544.16666666667 |
|Gaming          |36856.53846153846 |
|Film & Animation|29056.75          |
|People & Blogs  |20538.666666666668|
+----------------+------------------+



# Average engagement by video duration

In [0]:
# it says that people are engage or not  in short,long or medium videos

In [0]:
# Step 1: Categorize durations
from pyspark.sql.functions import when

casted_df = casted_df.withColumn(
    "durationCategory",
    when(col("duration") <= 300, "Short")  # ≤ 5 mins
    .when((col("duration") > 300) & (col("duration") <= 1200), "Medium")  # 5–20 mins
    .otherwise("Long")  # > 20 mins
)

# Step 2: Avg engagement per duration category
casted_df.groupBy("durationCategory") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show()



+----------------+------------------+
|durationCategory|     avgEngagement|
+----------------+------------------+
|          Medium|          165575.5|
|           Short|28264.333333333332|
|            Long|        22922.1875|
+----------------+------------------+



# Top performing categories within duration groups

In [0]:
casted_df.groupBy("durationCategory", "categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("durationCategory", "avgEngagement", ascending=False) \
    .show()


+----------------+----------------+------------------+
|durationCategory|    categoryName|     avgEngagement|
+----------------+----------------+------------------+
|           Short|           Music|32671.090909090908|
|           Short|Film & Animation|          29056.75|
|           Short|   Entertainment| 22526.11111111111|
|          Medium|           Music| 298086.6666666667|
|          Medium|   Entertainment|194598.33333333334|
|          Medium|          Gaming|45525.666666666664|
|          Medium|  People & Blogs|           41123.0|
|            Long|          Gaming|           34255.8|
|            Long|  People & Blogs|           10246.5|
|            Long|           Music|             926.0|
+----------------+----------------+------------------+



# Total Watch Time per Category (durationCategory)

In [0]:
casted_df.groupBy("durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("totalWatchTime_seconds", ascending=False) \
    .show()


+----------------+----------------------+
|durationCategory|totalWatchTime_seconds|
+----------------+----------------------+
|            Long|                102896|
|          Medium|                  5524|
|           Short|                  4007|
+----------------+----------------------+



# Total Watch Time per Category and Duration Bucket

In [0]:
from pyspark.sql.functions import sum

casted_df.groupBy("categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory") \
    .show(truncate=False)

+----------------+----------------+----------------------+
|categoryName    |durationCategory|totalWatchTime_seconds|
+----------------+----------------+----------------------+
|Entertainment   |Medium          |1497                  |
|Entertainment   |Short           |1260                  |
|Film & Animation|Short           |551                   |
|Gaming          |Long            |87699                 |
|Gaming          |Medium          |2250                  |
|Music           |Long            |11654                 |
|Music           |Medium          |1404                  |
|Music           |Short           |2196                  |
|People & Blogs  |Long            |3543                  |
|People & Blogs  |Medium          |373                   |
+----------------+----------------+----------------------+



In [0]:
# i have attached the chanelTitle also , from here we can get idea 
# 1) Which channel is dominating which category .
# 2) Which channel do people watch for longer time?
# 3) Which channel's videos, short, medium or long, are being watched more?

In [0]:
from pyspark.sql.functions import sum

casted_df.groupBy("channelTitle", "categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory", "totalWatchTime_seconds", ascending=False) \
    .show(truncate=False)

+-------------------------------------------+--------------+----------------+----------------------+
|channelTitle                               |categoryName  |durationCategory|totalWatchTime_seconds|
+-------------------------------------------+--------------+----------------+----------------------+
|Tropical 3D Animals                        |People & Blogs|Medium          |373                   |
|Podcast Tamilan                            |People & Blogs|Long            |2326                  |
|Lovely Boss                                |People & Blogs|Long            |1217                  |
|Zee Music Company                          |Music         |Short           |298                   |
|T-Series Bhakti Sagar                      |Music         |Short           |260                   |
|Bollywood Fever                            |Music         |Short           |253                   |
|Sannu Kumar                                |Music         |Short           |245           

# Top 5 Longest Videos with High Engagement

In [0]:
casted_df.orderBy(col("duration").desc(), col("engagement").desc()).select("title", "duration", "engagement").show(5, truncate=False)

+--------------------------------------------------------------------------------------+--------+----------+
|title                                                                                 |duration|engagement|
+--------------------------------------------------------------------------------------+--------+----------+
|KYA AAJ #2 HOGA? | CONQUEROR RANK PUSH | BGMI LIVE                                    |33046   |37476     |
|[HINDI] iQOO Battleground Series 2025 | Grand Finale - Day 2 #QuestForGlory #iQOONeo10|20245   |44158     |
|3.9 UPDATE IS HERE IN BGMI SOLO VS SQUAD🔥 #bgmi #bgmilive #shortsfeed #shortslive    |14863   |4174      |
|Choo Choo Charles LIVE | Kumar Gaming                                                 |8296    |4165      |
|HIMLANDS - THE RISE OF A NEW ENTITY [S-6 part 34]                                     |4162    |80947     |
+--------------------------------------------------------------------------------------+--------+----------+
only showing top 5 r

In [0]:
casted_df.write.mode("overwrite").saveAsTable("processed_data.india_trending_processed")
