In [0]:
# Load the final trending-video dataset (Parquet) for each of the three countries.
india_df = spark.read.parquet("/FileStore/tables/final_india_trending")
singapore_df = spark.read.parquet("/FileStore/tables/final_singapore_trending")
usa_df = spark.read.parquet("/FileStore/tables/final_usa_trending")



# Schema Check + Sampling

In [0]:
# Inspect the India dataset: print its schema, then its first five rows.
india_df.printSchema()
india_df.show(5)
# Uncomment to sample the other two countries as well:
# singapore_df.show(5)
# usa_df.show(5)

In [0]:
# Print the column names, types and nullability of the India dataset.
india_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- caption: string (nullable = true)
 |-- categoryName: string (nullable = true)
 |-- publishedDate: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- engagement: integer (nullable = true)
 |-- durationCategory: string (nullable = true)



In [0]:
# Preview the first five rows of the India dataset.
india_df.show(5)

+-----------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+----------------+-------------+----+-----+----------+----------------+
|         id|           channelId|     channelTitle|               title|         description|categoryId|                tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|    categoryName|publishedDate|year|month|engagement|durationCategory|
+-----------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+----------------+-------------+----+-----+----------+----------------+
|nyURE5vmj2I|UCLbdVvreihwZRL6k...|Think Music India|Thalaivan Thalaiv...|Here's the Offici...|         1|[thalaivan thalai...|             en|      

# Most Popular Categories (By Views) per Country

In [0]:
# NOTE: this import shadows Python's built-in `sum` for the rest of the
# notebook. It is kept as a bare name because later cells call `sum(...)`
# directly and expect the Spark aggregate.
from pyspark.sql.functions import sum


def top_categories_by_views(df):
    """Return the total view count per category, largest first.

    Args:
        df: Spark DataFrame with `categoryName` and `viewCount` columns.

    Returns:
        Spark DataFrame with the columns `categoryName` and `totalViews`,
        ordered by `totalViews` in descending order.
    """
    return (
        df.groupBy("categoryName")
        .agg(sum("viewCount").alias("totalViews"))
        .orderBy("totalViews", ascending=False)
    )


# Same ranking for every country; one helper instead of three pasted chains.
india_top_categories = top_categories_by_views(india_df)
singapore_top_categories = top_categories_by_views(singapore_df)
usa_top_categories = top_categories_by_views(usa_df)


In [0]:
# Display the USA category ranking.
usa_top_categories.show()

+--------------------+----------+
|        categoryName|totalViews|
+--------------------+----------+
|       Entertainment|  47350484|
|    Film & Animation|  12422167|
|              Gaming|   8804699|
|               Music|   4055819|
|              Sports|   3397192|
|Science & Technology|   1771137|
|      People & Blogs|   1413995|
|              Comedy|    360781|
+--------------------+----------+



# Average Engagement Metrics per Category

In [0]:
# Goal: find out which category gets the most engagement in each country.

In [0]:
from pyspark.sql.functions import avg

# Per-category averages of views, likes and comments for the USA dataset.
# Each (source column, output alias) pair becomes one aggregate expression.
usa_avg = usa_df.groupBy("categoryName").agg(
    *(
        avg(metric).alias(label)
        for metric, label in (
            ("viewCount", "avg_views"),
            ("likeCount", "avg_likes"),
            ("commentCount", "avg_comments"),
        )
    )
)
usa_avg.show()


+--------------------+------------------+------------------+------------------+
|        categoryName|         avg_views|         avg_likes|      avg_comments|
+--------------------+------------------+------------------+------------------+
|              Gaming| 978299.8888888889|48705.666666666664| 4206.111111111111|
|       Entertainment|2630582.4444444445|111615.77777777778|10010.944444444445|
|Science & Technology|          590379.0|           23137.0|            1630.0|
|              Sports|          679438.4|            9854.2|             874.6|
|    Film & Animation|4140722.3333333335|132544.66666666666|           11176.0|
|      People & Blogs|         353498.75|          20409.75|           1512.75|
|               Music| 579402.7142857143| 57134.57142857143| 5456.571428571428|
|              Comedy|          360781.0|           17293.0|            1164.0|
+--------------------+------------------+------------------+------------------+



In [0]:
from pyspark.sql.functions import avg

# Per-category averages of views, likes and comments for the India dataset.
# Each (source column, output alias) pair becomes one aggregate expression.
india_avg = india_df.groupBy("categoryName").agg(
    *(
        avg(metric).alias(label)
        for metric, label in (
            ("viewCount", "avg_views"),
            ("likeCount", "avg_likes"),
            ("commentCount", "avg_comments"),
        )
    )
)
india_avg.show()

+--------------------+------------------+---------+-----------------+
|        categoryName|         avg_views|avg_likes|     avg_comments|
+--------------------+------------------+---------+-----------------+
|           Education|         2350017.0| 110810.0|           2945.0|
|              Gaming|         3068021.0|  99307.0|           4220.5|
|       Entertainment|2250641.6818181816|  55871.0|2834.409090909091|
|Science & Technology|         2512134.0| 139321.5|           7832.0|
|              Sports|         1133050.0|  15467.0|            262.0|
|       Howto & Style|          536617.0|  11772.0|            991.0|
|    Film & Animation|         4431647.0| 147836.0|           8572.0|
|      People & Blogs|        1805826.75|75211.625|           2181.0|
|               Music|       4535913.125|179635.75|        11024.875|
|              Comedy|          805324.5|  22510.0|            438.5|
+--------------------+------------------+---------+-----------------+



In [0]:
from pyspark.sql.functions import avg

# Per-category averages of views, likes and comments for the Singapore dataset.
# Each (source column, output alias) pair becomes one aggregate expression.
singapore_avg = singapore_df.groupBy("categoryName").agg(
    *(
        avg(metric).alias(label)
        for metric, label in (
            ("viewCount", "avg_views"),
            ("likeCount", "avg_likes"),
            ("commentCount", "avg_comments"),
        )
    )
)
singapore_avg.show()

+--------------------+--------------------+-----------------+------------------+
|        categoryName|           avg_views|        avg_likes|      avg_comments|
+--------------------+--------------------+-----------------+------------------+
|           Education|         2.7345187E7|         431517.5|             988.0|
|              Gaming|        3.09292035E7|        1211572.5|           13611.5|
|       Entertainment|1.4902018705882354E7|425525.4117647059|2111.6470588235293|
|Science & Technology|         1.6002138E7|         129765.5|             231.5|
|              Sports|        2.48394705E7|        793180.25|            6551.0|
|    Film & Animation|           2852714.5|         270477.0|             208.5|
|      People & Blogs|       1.739849525E7|        490016.75|            2883.5|
|     News & Politics|            157293.5|           2209.5|             251.5|
|    Autos & Vehicles|         1.0391809E7|         599792.0|            5383.0|
|               Music|      

# Top Channels in Each Country

In [0]:
# Ten Singapore channels with the highest combined view count.
# `sum` is the Spark aggregate imported in an earlier cell, not the built-in.
(
    singapore_df.groupBy("channelTitle")
    .agg(sum("viewCount").alias("totalViews"))
    .orderBy("totalViews", ascending=False)
    .show(10)
)


+---------------+----------+
|   channelTitle|totalViews|
+---------------+----------+
|    Mini Katana|  94520245|
|    Ben Azelart|  84420253|
|      BLACKPINK|  75930630|
|     IShowSpeed|  61858407|
|      BEN EAGLE|  61073614|
|Let's Romantic |  53777618|
|       Red Bull|  46427751|
|     Nam Phương|  35077490|
|     MaviGadget|  31441467|
|   Eternal Love|  26856450|
+---------------+----------+
only showing top 10 rows



In [0]:
# Ten USA channels with the highest combined view count.
# `sum` is the Spark aggregate imported in an earlier cell, not the built-in.
(
    usa_df.groupBy("channelTitle")
    .agg(sum("viewCount").alias("totalViews"))
    .orderBy("totalViews", ascending=False)
    .show(10)
)


+--------------------+----------+
|        channelTitle|totalViews|
+--------------------+----------+
|             Netflix|  14269445|
|        Warner Bros.|   7245936|
|      Call Her Daddy|   6892201|
|              Disney|   6385779|
|        EA SPORTS FC|   5607960|
|               Pixar|   4591295|
|         FX Networks|   4095561|
|                 IGN|   3921003|
|              Vsauce|   3100641|
|The Late Show wit...|   2623291|
+--------------------+----------+
only showing top 10 rows



In [0]:
# Ten India channels with the highest combined view count.
# `sum` is the Spark aggregate imported in an earlier cell, not the built-in.
(
    india_df.groupBy("channelTitle")
    .agg(sum("viewCount").alias("totalViews"))
    .orderBy("totalViews", ascending=False)
    .show(10)
)


+--------------------+----------+
|        channelTitle|totalViews|
+--------------------+----------+
|            T-Series|  34221562|
|   Zee Music Company|  17185883|
|             Netflix|  14269445|
|  Sourav Joshi Vlogs|   9017936|
|        Warner Bros.|   7245936|
|    Vijay Television|   5838381|
|   Think Music India|   4200400|
|Free Fire India O...|   3811435|
|            ETV Dhee|   3137692|
|           Crazy XYZ|   2771269|
+--------------------+----------+
only showing top 10 rows



# Combine Insights

In [0]:
# Category-wise comparison of total views across all three countries in one table.

In [0]:
from pyspark.sql.functions import sum

# Total views per category, one frame per country. Each value column is named
# after its country so the columns stay distinct once the frames are joined.
india_cat = (
    india_df.groupBy("categoryName")
    .agg(sum("viewCount").alias("india_views"))
)
usa_cat = (
    usa_df.groupBy("categoryName")
    .agg(sum("viewCount").alias("usa_views"))
)
sing_cat = (
    singapore_df.groupBy("categoryName")
    .agg(sum("viewCount").alias("singapore_views"))
)

# Outer joins keep categories that are absent from some countries; the
# missing view totals are then filled with 0.
combined = (
    india_cat
    .join(usa_cat, on="categoryName", how="outer")
    .join(sing_cat, on="categoryName", how="outer")
    .fillna(0)
)
combined.show()


+--------------------+-----------+---------+---------------+
|        categoryName|india_views|usa_views|singapore_views|
+--------------------+-----------+---------+---------------+
|    Autos & Vehicles|          0|        0|       10391809|
|              Comedy|    1610649|   360781|       10751977|
|           Education|    2350017|        0|       54690374|
|       Entertainment|   49514117| 47350484|      253334318|
|    Film & Animation|   13294941| 12422167|        5705429|
|              Gaming|    6136042|  8804699|       61858407|
|       Howto & Style|     536617|        0|              0|
|               Music|   36287305|  4055819|      238661748|
|     News & Politics|          0|        0|         314587|
|      People & Blogs|   14446614|  1413995|      139187962|
|Science & Technology|    5024268|  1771137|       32004276|
|              Sports|    1133050|  3397192|       99357882|
+--------------------+-----------+---------+---------------+



# Upload Frequency per Day (Digital Culture Insight)

In [0]:
from pyspark.sql.functions import to_date

# Number of India trending videos per publish date, ordered by date.
india_daily = (
    india_df.withColumn("upload_date", to_date("publishedDate"))
    .groupBy("upload_date")
    .count()
    .orderBy("upload_date")
)

india_daily.show()


+-----------+-----+
|upload_date|count|
+-----------+-----+
| 2025-07-14|    1|
| 2025-07-15|    4|
| 2025-07-16|   11|
| 2025-07-17|   27|
| 2025-07-18|    7|
+-----------+-----+



In [0]:
from pyspark.sql.functions import to_date, count, col

# Step 0: attach 'upload_date' to india_df. The name is rebound, so the new
# column is also present for any later cell that uses india_df.
india_df = india_df.withColumn("upload_date", to_date("publishedDate"))

# Step 1: count videos per (upload_date, categoryName, channelTitle).
# Step 2: order by date ascending, then by video count descending.
category_channel_daily_upload = (
    india_df.groupBy("upload_date", "categoryName", "channelTitle")
    .agg(count("*").alias("video_count"))
    .orderBy(col("upload_date").asc(), col("video_count").desc())
)

category_channel_daily_upload.show(truncate=False)



+-----------+--------------------+---------------------------+-----------+
|upload_date|categoryName        |channelTitle               |video_count|
+-----------+--------------------+---------------------------+-----------+
|2025-07-14 |Music               |T-Series                   |1          |
|2025-07-15 |Music               |T-Series                   |1          |
|2025-07-15 |Education           |Nischay Malhan             |1          |
|2025-07-15 |Entertainment       |Zee Music Company          |1          |
|2025-07-15 |Comedy              |Mic Set                    |1          |
|2025-07-16 |Entertainment       |Thugesh                    |1          |
|2025-07-16 |Science & Technology|Crazy XYZ                  |1          |
|2025-07-16 |People & Blogs      |Wanderers Hub              |1          |
|2025-07-16 |Gaming              |SlayyPop                   |1          |
|2025-07-16 |Entertainment       |LOL (Life of Limbachiyaa’s)|1          |
|2025-07-16 |Music       