In [0]:
# Load the processed trending-video tables, one DataFrame per country.
india_df, singapore_df, usa_df = (
    spark.read.table(table_name)
    for table_name in (
        "processed_data.india_trending_processed",
        "processed_data.singapore_trending_processed",
        "processed_data.usa_trending_processed",
    )
)





# Schema Check + Sampling

In [0]:
# Preview the first 5 India rows with full (untruncated) cell values.
india_df.show(n=5, truncate=False)


+-----------+------------------------+-------------------------------------------+---------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Print the column names, types and nullability of the India table.
india_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- caption: string (nullable = true)
 |-- categoryName: string (nullable = true)
 |-- publishedDate: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- engagement: integer (nullable = true)
 |-- durationCategory: string (nullable = true)



In [0]:
# Preview the first 5 India rows with the default truncation of long values.
india_df.show(n=5)

+-----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+----------------+-------------+----+-----+----------+----------------+
|         id|           channelId|        channelTitle|               title|         description|categoryId|                tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|    categoryName|publishedDate|year|month|engagement|durationCategory|
+-----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+----------------+-------------+----+-----+----------+----------------+
|ikasQ17KRiA|UC4zWG9LccdWGUlF7...|   Prime Video India|Andhera Season 1 ...|Prime Video India...|        24|[andhera, andhera...|          

# Most Popular Categories (By Views) per Country

In [0]:
# NOTE: this import shadows Python's builtin sum(); it is kept because later
# cells in this notebook call sum("viewCount") unqualified.
from pyspark.sql.functions import sum
from pyspark.sql import functions as F


def top_categories_by_views(df):
    """Return total views per category, most-viewed category first.

    Args:
        df: A DataFrame with ``categoryName`` and ``viewCount`` columns.

    Returns:
        A DataFrame with columns ``categoryName`` and ``totalViews``,
        ordered by ``totalViews`` descending.
    """
    return (
        df.groupBy("categoryName")
        # Qualified F.sum avoids any confusion with the builtin sum().
        .agg(F.sum("viewCount").alias("totalViews"))
        .orderBy("totalViews", ascending=False)
    )


# Same aggregation applied to each country's DataFrame.
india_top_categories = top_categories_by_views(india_df)
singapore_top_categories = top_categories_by_views(singapore_df)
usa_top_categories = top_categories_by_views(usa_df)


In [0]:
# Display the USA categories ranked by total views.
usa_top_categories.show()

+----------------+----------+
|    categoryName|totalViews|
+----------------+----------+
|          Gaming|  12609380|
|           Music|   9953226|
|   Entertainment|   2880310|
|  People & Blogs|   1399478|
|Film & Animation|   1231124|
+----------------+----------+



# Average Engagement Metrics per Category

In [0]:
# Goal: find out which category has the most engagement in each country.

In [0]:
from pyspark.sql.functions import avg

# Mean views, likes and comments per category for the USA data.
usa_avg = usa_df.groupBy("categoryName").agg(
    *(
        avg(source_col).alias(label)
        for source_col, label in (
            ("viewCount", "avg_views"),
            ("likeCount", "avg_likes"),
            ("commentCount", "avg_comments"),
        )
    )
)
usa_avg.show()


+----------------+------------------+------------------+------------------+
|    categoryName|         avg_views|         avg_likes|      avg_comments|
+----------------+------------------+------------------+------------------+
|  People & Blogs| 466492.6666666667|22476.333333333332|1412.3333333333333|
|Film & Animation| 410374.6666666667|18770.333333333332|1576.3333333333333|
|          Gaming| 700521.1111111111|25034.722222222223|2093.6111111111113|
|   Entertainment|          576062.0|           11993.2|            1399.4|
|           Music|473963.14285714284|17531.761904761905| 898.8571428571429|
+----------------+------------------+------------------+------------------+



In [0]:
from pyspark.sql.functions import avg

# Mean views, likes and comments per category for the India data.
india_avg = india_df.groupBy("categoryName").agg(
    *(
        avg(source_col).alias(label)
        for source_col, label in (
            ("viewCount", "avg_views"),
            ("likeCount", "avg_likes"),
            ("commentCount", "avg_comments"),
        )
    )
)
india_avg.show()

+----------------+------------------+------------------+------------------+
|    categoryName|         avg_views|         avg_likes|      avg_comments|
+----------------+------------------+------------------+------------------+
|  People & Blogs|          747044.0|           19489.0|1049.6666666666667|
|Film & Animation|        2093308.75|           28217.5|            839.25|
|          Gaming| 893735.6153846154|35047.769230769234|1808.7692307692307|
|   Entertainment|        1202373.25|           63104.5|2439.6666666666665|
|           Music|2807198.6666666665| 65657.16666666667| 4195.388888888889|
+----------------+------------------+------------------+------------------+



In [0]:
from pyspark.sql.functions import avg

# Mean views, likes and comments per category for the Singapore data.
singapore_avg = singapore_df.groupBy("categoryName").agg(
    *(
        avg(source_col).alias(label)
        for source_col, label in (
            ("viewCount", "avg_views"),
            ("likeCount", "avg_likes"),
            ("commentCount", "avg_comments"),
        )
    )
)
singapore_avg.show()

+----------------+------------------+------------------+------------------+
|    categoryName|         avg_views|         avg_likes|      avg_comments|
+----------------+------------------+------------------+------------------+
|  People & Blogs|        1037603.75|           49356.5|           3451.75|
|Film & Animation| 8719190.333333334|           78767.0|4019.6666666666665|
|          Gaming|         1993430.0|41435.933333333334|            1751.2|
|          Sports|         5712448.0|          105317.0|            4694.0|
|   Entertainment| 7730351.428571428|          221408.0|           11092.0|
|           Music|3716897.9411764704|158684.17647058822| 6118.941176470588|
+----------------+------------------+------------------+------------------+



# Top Channels in Each Country

In [0]:
# Top 10 Singapore channels by summed view count.
(
    singapore_df
    .groupBy("channelTitle")
    .agg(sum("viewCount").alias("totalViews"))
    .orderBy("totalViews", ascending=False)
    .show(10)
)


+-------------------+----------+
|       channelTitle|totalViews|
+-------------------+----------+
|       Excel Movies|  25329114|
|             Sun TV|  21923784|
|             Avatar|  20183637|
|       Warner Bros.|  15850611|
|   Sony Music South|  13191803|
|        Brawl Stars|  12932859|
|Junglee Music Tamil|  11421274|
|                YRF|   9209754|
|          BLACKPINK|   8496334|
| Universal Pictures|   8435444|
+-------------------+----------+
only showing top 10 rows


In [0]:
# Top 10 USA channels by summed view count.
(
    usa_df
    .groupBy("channelTitle")
    .agg(sum("viewCount").alias("totalViews"))
    .orderBy("totalViews", ascending=False)
    .show(10)
)


+---------------+----------+
|   channelTitle|totalViews|
+---------------+----------+
|       T-Series|   5067756|
| MrBeast Gaming|   2807057|
|     KreekCraft|   2473405|
|        Peacock|   2057632|
| Asmongold TV  |   1387047|
|    MoreSidemen|   1213482|
|Epitaph Records|   1134819|
|     CaylusBlox|    997594|
|     DOM Studio|    838822|
|         Foltyn|    798010|
+---------------+----------+
only showing top 10 rows


In [0]:
# Top 10 India channels by summed view count.
(
    india_df
    .groupBy("channelTitle")
    .agg(sum("viewCount").alias("totalViews"))
    .orderBy("totalViews", ascending=False)
    .show(10)
)


+--------------------+----------+
|        channelTitle|totalViews|
+--------------------+----------+
|            T-Series|  20984585|
|Universal Music I...|  12361086|
|                 YRF|   9209754|
|         Zee Studios|   5489968|
|           Purav Jha|   4492281|
|        UV Creations|   3622582|
|   Prime Video India|   2932388|
|      MrBeast Gaming|   2807057|
|Five Star Creations |   2070010|
| Tropical 3D Animals|   1880896|
+--------------------+----------+
only showing top 10 rows


# Combine Insights

In [0]:
# Category-wise comparison across all countries in one table

In [0]:
from pyspark.sql.functions import sum

# Total views per category, computed separately for each country.
india_cat = (
    india_df
    .groupBy("categoryName")
    .agg(sum("viewCount").alias("india_views"))
)
usa_cat = (
    usa_df
    .groupBy("categoryName")
    .agg(sum("viewCount").alias("usa_views"))
)
sing_cat = (
    singapore_df
    .groupBy("categoryName")
    .agg(sum("viewCount").alias("singapore_views"))
)

# Full outer joins keep categories that are absent in some countries;
# the resulting missing totals are replaced with 0.
combined = (
    india_cat
    .join(usa_cat, on="categoryName", how="outer")
    .join(sing_cat, on="categoryName", how="outer")
    .na.fill(0)
)
combined.show()


+----------------+-----------+---------+---------------+
|    categoryName|india_views|usa_views|singapore_views|
+----------------+-----------+---------+---------------+
|  People & Blogs|    2241132|  1399478|        4150415|
|Film & Animation|    8373235|  1231124|       52315142|
|          Gaming|   11618563| 12609380|       29901450|
|   Entertainment|   14428479|  2880310|       54112460|
|           Music|   50529576|  9953226|       63187265|
|          Sports|          0|        0|        5712448|
+----------------+-----------+---------+---------------+



# Upload Frequency per Day (Digital Culture Insight)

In [0]:
from pyspark.sql.functions import to_date

# Number of India videos per publish date, in chronological order.
india_daily = (
    india_df
    .withColumn("upload_date", to_date("publishedDate"))
    .groupBy("upload_date")
    .count()
    .orderBy("upload_date")
)

india_daily.show()


+-----------+-----+
|upload_date|count|
+-----------+-----+
| 2025-08-03|    1|
| 2025-08-06|    1|
| 2025-08-07|    9|
| 2025-08-08|   24|
| 2025-08-09|   13|
| 2025-08-10|    2|
+-----------+-----+



In [0]:
from pyspark.sql.functions import to_date, count, col

# Rebind india_df so that it carries an 'upload_date' column from here on.
india_df = india_df.withColumn("upload_date", to_date("publishedDate"))

# Count videos per (upload date, category, channel); sort by date ascending,
# then by video count descending within each date.
category_channel_daily_upload = (
    india_df
    .groupBy("upload_date", "categoryName", "channelTitle")
    .agg(count("*").alias("video_count"))
    .orderBy(col("upload_date").asc(), col("video_count").desc())
)

category_channel_daily_upload.show(truncate=False)



+-----------+----------------+-------------------------------------------+-----------+
|upload_date|categoryName    |channelTitle                               |video_count|
+-----------+----------------+-------------------------------------------+-----------+
|2025-08-03 |Music           |Anu Dubey Entertainment                    |1          |
|2025-08-06 |Entertainment   |UV Creations                               |1          |
|2025-08-07 |Music           |Think Music Telugu                         |1          |
|2025-08-07 |Music           |ಕನ್ನಡ ಭಕ್ತಿ ಸಾಂಗ್ಸ್ - Kannada Bhakthi Songs|1          |
|2025-08-07 |Music           |SRE BAKTHI                                 |1          |
|2025-08-07 |Entertainment   |Udaya TV                                   |1          |
|2025-08-07 |Entertainment   |Purav Jha                                  |1          |
|2025-08-07 |Music           |YRF                                        |1          |
|2025-08-07 |Music           |T-Series Tami