# Analysis on USA Data Set

## Read from Delta table

In [0]:
# Load the raw USA trending table from the mongo_youtube_trends schema.
df = spark.table("mongo_youtube_trends.usa_trending") 
# Print the schema to see which columns the raw table exposes.
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- _fivetran_synced: timestamp (nullable = true)
 |-- data: string (nullable = true)
 |-- _fivetran_deleted: boolean (nullable = true)



In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, BooleanType, TimestampType
from pyspark.sql.types import *

# Schema of the JSON document stored in the `data` column (based on the DBFS JSON).
# Every field is declared nullable.

# All thumbnail sizes share the same height / url / width layout.
thumbnail_type = StructType([
    StructField("height", LongType(), True),
    StructField("url", StringType(), True),
    StructField("width", LongType(), True),
])

content_details_type = StructType([
    StructField("caption", StringType(), True),
    StructField("definition", StringType(), True),
    StructField("dimension", StringType(), True),
    StructField("duration", StringType(), True),
    StructField("licensedContent", BooleanType(), True),
    StructField("projection", StringType(), True),
    StructField(
        "regionRestriction",
        StructType([StructField("allowed", ArrayType(StringType()), True)]),
        True,
    ),
])

localized_type = StructType([
    StructField("description", StringType(), True),
    StructField("title", StringType(), True),
])

thumbnails_type = StructType([
    StructField(size_name, thumbnail_type, True)
    for size_name in ("default", "high", "maxres", "medium", "standard")
])

snippet_type = StructType([
    StructField("categoryId", StringType(), True),
    StructField("channelId", StringType(), True),
    StructField("channelTitle", StringType(), True),
    StructField("defaultAudioLanguage", StringType(), True),
    StructField("defaultLanguage", StringType(), True),
    StructField("description", StringType(), True),
    StructField("liveBroadcastContent", StringType(), True),
    StructField("localized", localized_type, True),
    StructField("publishedAt", StringType(), True),
    StructField("tags", ArrayType(StringType()), True),
    StructField("thumbnails", thumbnails_type, True),
    StructField("title", StringType(), True),
])

# The counters are declared as strings here; they are cast to numbers later.
statistics_type = StructType([
    StructField("commentCount", StringType(), True),
    StructField("favoriteCount", StringType(), True),
    StructField("likeCount", StringType(), True),
    StructField("viewCount", StringType(), True),
])

schema = StructType([
    StructField("contentDetails", content_details_type, True),
    StructField("etag", StringType(), True),
    StructField("id", StringType(), True),
    StructField("kind", StringType(), True),
    StructField("snippet", snippet_type, True),
    StructField("statistics", statistics_type, True),
])



In [0]:
from pyspark.sql.functions import from_json, col

# Parse the raw JSON string in `data` into a typed struct column named `json`.
# Fivetran flags documents that were deleted in the source with
# `_fivetran_deleted = true`; those rows are dropped first so removed documents
# do not leak into the analysis. Rows whose flag is null are kept
# (eqNullSafe(True) is false for null, so the negation keeps them).
parsed_df = (
    df
    .filter(~col("_fivetran_deleted").eqNullSafe(True))
    .withColumn("json", from_json(col("data"), schema))
)

parsed_df.printSchema()


root
 |-- _id: string (nullable = true)
 |-- _fivetran_synced: timestamp (nullable = true)
 |-- data: string (nullable = true)
 |-- _fivetran_deleted: boolean (nullable = true)
 |-- json: struct (nullable = true)
 |    |-- contentDetails: struct (nullable = true)
 |    |    |-- caption: string (nullable = true)
 |    |    |-- definition: string (nullable = true)
 |    |    |-- dimension: string (nullable = true)
 |    |    |-- duration: string (nullable = true)
 |    |    |-- licensedContent: boolean (nullable = true)
 |    |    |-- projection: string (nullable = true)
 |    |    |-- regionRestriction: struct (nullable = true)
 |    |    |    |-- allowed: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |-- etag: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- kind: string (nullable = true)
 |    |-- snippet: struct (nullable = true)
 |    |    |-- categoryId: string (nullable = true)
 |    |    |-- channelId: string

In [0]:
from pyspark.sql.functions import col

# Flatten the nested `json` struct into top-level columns.
# Keys are the output column names, values are the dotted paths they are read
# from; the insertion order below is the output column order.
field_paths = {
    "id": "json.id",
    "publishedAt": "json.snippet.publishedAt",
    "channelId": "json.snippet.channelId",
    "channelTitle": "json.snippet.channelTitle",
    "title": "json.snippet.title",
    "description": "json.snippet.description",
    "categoryId": "json.snippet.categoryId",
    "tags": "json.snippet.tags",
    "defaultLanguage": "json.snippet.defaultLanguage",
    "defaultAudioLanguage": "json.snippet.defaultAudioLanguage",
    "viewCount": "json.statistics.viewCount",
    "likeCount": "json.statistics.likeCount",
    "commentCount": "json.statistics.commentCount",
    "duration": "json.contentDetails.duration",
    "caption": "json.contentDetails.caption",
}

selected_df = parsed_df.select(
    [col(path).alias(name) for name, path in field_paths.items()]
)

selected_df.show(truncate=False)


+-----------+--------------------+------------------------+------------------------+--------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Print the schema of the flattened frame to check column names and types.
selected_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: string (nullable = true)
 |-- likeCount: string (nullable = true)
 |-- commentCount: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
from pyspark.sql.functions import col, when

# Convert the three counter columns from string to integer.
# A value is cast only when it is not the empty string; everything else
# (including null) becomes null.
# NOTE(review): "int" is a 32-bit type, so a count above 2,147,483,647 would
# not fit — consider "long" if such values can occur.
casted_df2 = selected_df
for count_column in ("viewCount", "likeCount", "commentCount"):
    casted_df2 = casted_df2.withColumn(
        count_column,
        when(col(count_column) != "", col(count_column).cast("int")).otherwise(None),
    )



In [0]:
casted_df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
# Preview three records in vertical layout so long text values stay readable.
preview_columns = [
    "viewCount",
    "likeCount",
    "commentCount",
    "duration",
    "publishedAt",
    "channelTitle",
    "title",
    "categoryId",
    "description",
]
casted_df2.select(preview_columns).show(n=3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Show title and duration for the first 10 rows (duration is still the raw string here).
casted_df2.select("title", "duration").show(10, truncate=False)

+--------------------------------------------------------------------------------------+--------+
|title                                                                                 |duration|
+--------------------------------------------------------------------------------------+--------+
|GTA5 - Car Roulette Returns and Animal Safari... ENHANCED!! (Funny Moments)           |PT31M3S |
|Pretending To Be A DRAGON CANNELLONI In Steal A Brainrot..                            |PT14M59S|
|Gunna - at my purest (feat. Offset) [Official Visualizer]                             |PT3M14S |
|La Lotería                                                                            |PT2M54S |
|Trump saves gamers                                                                    |PT27M31S|
|I Build the FASTEST PLANE in Roblox!                                                  |PT42M6S |
|BabyChiefDoit - Mr. President (Official Music Video)                                  |PT2M15S |
|Kick The Dust Up   

# Convert duration (ISO 8601 format) to Total Seconds

In [0]:
from pyspark.sql.functions import regexp_extract, col, when

# Convert the ISO 8601 duration string (e.g. "PT31M3S") into total seconds.
#
# One anchored pattern covers the whole value, with optional day, hour,
# minute and second parts (capture groups 1-4). This also handles the
# day-long form "P#DT#H#M#S", where the hours no longer directly follow "PT"
# and a day component is present.
iso_duration_pattern = r"^P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$"


def _duration_component(group_index):
    """Return capture group `group_index` of `duration` as an int column.

    The component is 0 when the group is absent, the value does not match the
    pattern, or the duration is null.
    """
    raw_value = regexp_extract(col("duration"), iso_duration_pattern, group_index)
    return when(raw_value != "", raw_value.cast("int")).otherwise(0)


# Only convert while `duration` is still the raw string. Re-running this cell
# on the already-converted integer column would otherwise find no match and
# overwrite every value with 0.
if dict(casted_df2.dtypes)["duration"] == "string":
    casted_df2 = casted_df2.withColumn(
        "duration",
        _duration_component(1) * 86400   # days
        + _duration_component(2) * 3600  # hours
        + _duration_component(3) * 60    # minutes
        + _duration_component(4),        # seconds
    )


In [0]:
# Show title and duration for the first 10 rows.
casted_df2.select("title", "duration").show(10, truncate=False)

+--------------------------------------------------------------------------------------+--------+
|title                                                                                 |duration|
+--------------------------------------------------------------------------------------+--------+
|GTA5 - Car Roulette Returns and Animal Safari... ENHANCED!! (Funny Moments)           |1863    |
|Pretending To Be A DRAGON CANNELLONI In Steal A Brainrot..                            |899     |
|Gunna - at my purest (feat. Offset) [Official Visualizer]                             |194     |
|La Lotería                                                                            |174     |
|Trump saves gamers                                                                    |1651    |
|I Build the FASTEST PLANE in Roblox!                                                  |2526    |
|BabyChiefDoit - Mr. President (Official Music Video)                                  |135     |
|Kick The Dust Up   

# Distinct category IDs available in our data

In [0]:
# Count the distinct categoryId values present in the parsed data.
parsed_df.select("json.snippet.categoryId").distinct().count()

5

In [0]:
# List the distinct categoryId values present in the parsed data.
parsed_df.select("json.snippet.categoryId").distinct().show(truncate=False)

+----------+
|categoryId|
+----------+
|22        |
|10        |
|24        |
|1         |
|20        |
+----------+



# Mapping distinct category IDs to category names

In [0]:
from pyspark.sql.functions import when, col

# Map YouTube category ids to readable names.
# The table covers the standard assignable video categories rather than only
# the ids present in the current snapshot, so a later load containing e.g.
# Sports or Education videos is labelled correctly instead of "Unknown".
category_names = {
    "1": "Film & Animation",
    "2": "Autos & Vehicles",
    "10": "Music",
    "15": "Pets & Animals",
    "17": "Sports",
    "19": "Travel & Events",
    "20": "Gaming",
    "22": "People & Blogs",
    "23": "Comedy",
    "24": "Entertainment",
    "25": "News & Politics",
    "26": "Howto & Style",
    "27": "Education",
    "28": "Science & Technology",
    "29": "Nonprofits & Activism",
}

# Build one when(...).when(...) chain from the table.
category_name_expr = None
for category_id, category_label in category_names.items():
    is_category = col("categoryId") == category_id
    category_name_expr = (
        when(is_category, category_label)
        if category_name_expr is None
        else category_name_expr.when(is_category, category_label)
    )

# Ids missing from the table (and null ids) fall back to "Unknown".
casted_df2 = casted_df2.withColumn(
    "categoryName",
    category_name_expr.otherwise("Unknown"),
)


In [0]:
# Number of rows per category name.
casted_df2.groupBy("categoryName").count().show(truncate=False)

+----------------+-----+
|categoryName    |count|
+----------------+-----+
|People & Blogs  |3    |
|Film & Animation|3    |
|Gaming          |18   |
|Entertainment   |5    |
|Music           |21   |
+----------------+-----+



# Convert the publishedAt column to a timestamp

In [0]:
from pyspark.sql.functions import col, to_timestamp, to_date, year, month

# Derive calendar fields from the publishedAt string, then discard the raw value.
# The format pattern matches the trailing 'Z' as a literal character.
published_ts = to_timestamp(col("publishedAt"), "yyyy-MM-dd'T'HH:mm:ss'Z'")

casted_df2 = (
    casted_df2
    .withColumn("publishedAt_ts", published_ts)                    # parsed timestamp
    .withColumn("publishedDate", to_date(col("publishedAt_ts")))   # date part only
    .withColumn("year", year("publishedDate"))
    .withColumn("month", month("publishedDate"))
    .drop("publishedAt", "publishedAt_ts")                         # keep only the derived fields
)


In [0]:
# Preview three processed records in vertical layout.
processed_preview_columns = [
    "viewCount",
    "likeCount",
    "commentCount",
    "duration",
    "publishedDate",
    "channelTitle",
    "title",
    "categoryId",
    "description",
    "categoryName",
]
casted_df2.select(processed_preview_columns).show(n=3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import col, count, when

# Count null values per column of interest; each counter is named "null_<column>".
# NOTE(review): duration and categoryName are filled with fallback values
# (0 / "Unknown") where they are derived, so their counters are expected to be 0.
checked_columns = [
    "viewCount",
    "likeCount",
    "commentCount",
    "duration",
    "caption",
    "categoryName",
    "publishedDate",
    "year",
    "month",
]
casted_df2.select(
    [count(when(col(name).isNull(), 1)).alias(f"null_{name}") for name in checked_columns]
).show()

+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|null_viewCount|null_likeCount|null_commentCount|null_duration|null_caption|null_categoryName|null_publishedDate|null_year|null_month|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|             0|             0|                0|            0|           0|                0|                 0|        0|         0|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+



# Engagement Analysis

# Top Channels by Total Views

In [0]:
# Total views per channel, largest total first.
views_by_channel = casted_df2.groupBy("channelTitle").sum("viewCount")
top_channels = (
    views_by_channel
    .withColumnRenamed("sum(viewCount)", "totalViews")
    .orderBy("totalViews", ascending=False)
)

top_channels.show(n=20, truncate=False)


+----------------+----------+
|channelTitle    |totalViews|
+----------------+----------+
|T-Series        |5067756   |
|MrBeast Gaming  |2807057   |
|KreekCraft      |2473405   |
|Peacock         |2057632   |
|Asmongold TV    |1387047   |
|MoreSidemen     |1213482   |
|Epitaph Records |1134819   |
|CaylusBlox      |997594    |
|DOM Studio      |838822    |
|Foltyn          |798010    |
|VanossGaming    |770446    |
|ElAlfaElJefeTV  |654329    |
|penguinz0       |595312    |
|rekrap2         |571670    |
|WolfeyVGC       |434901    |
|Gunna           |419239    |
|Latto           |402561    |
|Hurricane Wisdom|393519    |
|Doodle and Arkey|360495    |
|MovieGasm‍․com  |359160    |
+----------------+----------+
only showing top 20 rows


# Top channels by total likes

In [0]:
# Total likes per channel, largest total first.
likes_by_channel = casted_df2.groupBy("channelTitle").sum("likeCount")
top_liked_channels = (
    likes_by_channel
    .withColumnRenamed("sum(likeCount)", "totalLikes")
    .orderBy("totalLikes", ascending=False)
)
top_liked_channels.show(n=10, truncate=False)

+---------------+----------+
|channelTitle   |totalLikes|
+---------------+----------+
|T-Series       |152032    |
|MrBeast Gaming |120301    |
|Asmongold TV   |69422     |
|MoreSidemen    |65014     |
|DOM Studio     |53885     |
|VanossGaming   |46034     |
|KreekCraft     |42803     |
|Epitaph Records|38559     |
|penguinz0      |34352     |
|ElAlfaElJefeTV |33059     |
+---------------+----------+
only showing top 10 rows


# Most Liked Videos

In [0]:
# Here we find the top 5 most-liked videos along with their title and channelTitle

In [0]:
# Videos ordered by like count, highest first; show the top five.
most_liked = casted_df2.orderBy("likeCount", ascending=False)
(
    most_liked
    .select("title", "channelTitle", "categoryName", "likeCount")
    .show(n=5, truncate=False)
)

+--------------------------------------------------------------------------------------------------+--------------+----------------+---------+
|title                                                                                             |channelTitle  |categoryName    |likeCount|
+--------------------------------------------------------------------------------------------------+--------------+----------------+---------+
|Oh Mama! TETEMA (Song) | Rayvanny, Nora Fatehi | Shreya Ghoshal, Vishal Mishra, Sanjoy | Bhushan K|T-Series      |Music           |152032   |
|$1,000 Every Minute You Survive                                                                   |MrBeast Gaming|Gaming          |120301   |
|Trump saves gamers                                                                                |Asmongold TV  |Gaming          |69422    |
|SIDEMEN AMONG US MEGA CHAOS MODE                                                                  |MoreSidemen   |People & Blogs  |65014    |

# Top videos by comment count

In [0]:
# Videos ordered by comment count, highest first; show the top five.
most_commented = casted_df2.orderBy("commentCount", ascending=False)
(
    most_commented
    .select("title", "channelTitle", "commentCount")
    .show(n=5, truncate=False)
)

+--------------------------------------------------------------------------------------------------+--------------+------------+
|title                                                                                             |channelTitle  |commentCount|
+--------------------------------------------------------------------------------------------------+--------------+------------+
|Trump saves gamers                                                                                |Asmongold TV  |12483       |
|Oh Mama! TETEMA (Song) | Rayvanny, Nora Fatehi | Shreya Ghoshal, Vishal Mishra, Sanjoy | Bhushan K|T-Series      |6419        |
|Nintendo is Bullying Again                                                                        |penguinz0     |4722        |
|ROBLOX OP ADMIN IN STEAL A BRAINROT                                                               |Flamingo      |4662        |
|skibidi toilet multiverse 047 (part 2) Trailer                                                  

# Like to View ratio (engagement rate)

In [0]:
from pyspark.sql.functions import expr

# Likes per view for each video.
# The ratio is computed only when viewCount is positive and is left null
# otherwise, so a zero view count cannot raise a division-by-zero error when
# ANSI SQL mode is enabled. Null ratios sort last in the descending order below.
engagement_df = casted_df2.withColumn(
    "likeViewRatio",
    when(col("viewCount") > 0, col("likeCount") / col("viewCount")),
)

(
    engagement_df
    .orderBy(col("likeViewRatio").desc())
    .select("title", "channelTitle", "likeViewRatio")
    .show(10, truncate=False)
)

+---------------------------------------------------------------------------+-------------------+--------------------+
|title                                                                      |channelTitle       |likeViewRatio       |
+---------------------------------------------------------------------------+-------------------+--------------------+
|**ONE PIECE** Season 2 Grand Line Trailer has us hyped!                    |Sorta Stupid       |0.12503149407911313 |
|Specter                                                                    |Bad Omens - Topic  |0.10022150624244865 |
|ROBLOX OP ADMIN IN STEAL A BRAINROT                                        |Flamingo           |0.08713840980954661 |
|BabyChiefDoit - Mr. President (Official Music Video)                       |BabyChiefDoit      |0.07828743037552298 |
|Hell At Night                                                              |BigXthaPlug - Topic|0.07041394106801913 |
|Latto - Chicken Grease (Official Video)        

# Analysis based on categories

# Top Categories by Likes

In [0]:
from pyspark.sql.functions import sum

# Total likes per (category, channel) pair, highest total first.
likes_by_category_channel = casted_df2.groupBy("categoryName", "channelTitle").agg(
    sum("likeCount").alias("totalLikes")
)
likes_by_category_channel.orderBy(col("totalLikes").desc()).show()

+----------------+------------------+----------+
|    categoryName|      channelTitle|totalLikes|
+----------------+------------------+----------+
|           Music|          T-Series|    152032|
|          Gaming|    MrBeast Gaming|    120301|
|          Gaming|    Asmongold TV  |     69422|
|  People & Blogs|       MoreSidemen|     65014|
|Film & Animation|        DOM Studio|     53885|
|          Gaming|      VanossGaming|     46034|
|          Gaming|        KreekCraft|     42803|
|           Music|   Epitaph Records|     38559|
|   Entertainment|         penguinz0|     34352|
|           Music|    ElAlfaElJefeTV|     33059|
|          Gaming|           rekrap2|     31916|
|           Music|             Latto|     27956|
|          Gaming|         WolfeyVGC|     24647|
|   Entertainment|           Peacock|     21141|
|           Music|  Hurricane Wisdom|     20839|
|           Music| Bad Omens - Topic|     19908|
|          Gaming|The Game Theorists|     19272|
|          Gaming|  

# Top Categories by Comments

In [0]:
# Total comments per (category, channel) pair, highest total first.
comments_by_category_channel = casted_df2.groupBy("categoryName", "channelTitle").agg(
    sum("commentCount").alias("totalComments")
)
comments_by_category_channel.orderBy(col("totalComments").desc()).show()

+----------------+------------------+-------------+
|    categoryName|      channelTitle|totalComments|
+----------------+------------------+-------------+
|          Gaming|    Asmongold TV  |        12483|
|           Music|          T-Series|         6419|
|   Entertainment|         penguinz0|         4722|
|          Gaming|          Flamingo|         4662|
|Film & Animation|        DOM Studio|         4392|
|  People & Blogs|       MoreSidemen|         3880|
|           Music|    ElAlfaElJefeTV|         3172|
|           Music|   Epitaph Records|         2614|
|          Gaming|            Foltyn|         2553|
|          Gaming|    MrBeast Gaming|         2454|
|          Gaming|           rekrap2|         2451|
|   Entertainment|           Peacock|         1941|
|          Gaming|            Aphmau|         1849|
|          Gaming|         ItsFunneh|         1720|
|          Gaming|      VanossGaming|         1700|
|           Music|             Latto|         1543|
|          G

# Average Engagement per Category

In [0]:
# Create an engagement column by adding likeCount and commentCount

In [0]:
# Engagement of a video = its likes plus its comments.
# NOTE(review): the sum is null whenever either count is null — confirm that
# is the intended treatment for such rows.
engagement_expr = col("likeCount") + col("commentCount")
casted_df2 = casted_df2.withColumn("engagement", engagement_expr)

In [0]:
from pyspark.sql.functions import avg

# Mean engagement per category, highest mean first.
avg_engagement_by_category = casted_df2.groupBy("categoryName").agg(
    avg("engagement").alias("avgEngagement")
)
avg_engagement_by_category.orderBy(col("avgEngagement").desc()).show(truncate=False)

+----------------+------------------+
|categoryName    |avgEngagement     |
+----------------+------------------+
|Gaming          |27128.333333333332|
|People & Blogs  |23888.666666666668|
|Film & Animation|20346.666666666668|
|Music           |18430.619047619046|
|Entertainment   |13392.6           |
+----------------+------------------+



# Average engagement by video duration

In [0]:
# This shows whether people engage more with short, medium, or long videos

In [0]:
# Step 1: Categorize durations
from pyspark.sql.functions import when

# Bucket each video by its length in seconds. The branches are evaluated in
# order, so the second one only sees durations above 300.
duration_seconds = col("duration")
duration_bucket = (
    when(duration_seconds <= 300, "Short")      # up to 5 minutes
    .when(duration_seconds <= 1200, "Medium")   # over 5 and up to 20 minutes
    .otherwise("Long")                          # everything else
)
casted_df2 = casted_df2.withColumn("durationCategory", duration_bucket)

# Mean engagement for each duration bucket, highest mean first.
(
    casted_df2
    .groupBy("durationCategory")
    .agg(avg("engagement").alias("avgEngagement"))
    .orderBy(col("avgEngagement").desc())
    .show()
)


+----------------+------------------+
|durationCategory|     avgEngagement|
+----------------+------------------+
|            Long|           33062.9|
|          Medium| 24095.35714285714|
|           Short|15656.076923076924|
+----------------+------------------+



# Top performing categories within duration groups

In [0]:
# Mean engagement per (duration bucket, category); both sort keys descending.
avg_engagement_by_bucket = casted_df2.groupBy("durationCategory", "categoryName").agg(
    avg("engagement").alias("avgEngagement")
)
avg_engagement_by_bucket.orderBy(
    col("durationCategory").desc(),
    col("avgEngagement").desc(),
).show()

+----------------+----------------+------------------+
|durationCategory|    categoryName|     avgEngagement|
+----------------+----------------+------------------+
|           Short|           Music|          18950.75|
|           Short|   Entertainment| 8496.666666666666|
|           Short|          Gaming|            1473.0|
|           Short|Film & Animation|            1002.0|
|           Short|  People & Blogs|              78.0|
|          Medium|Film & Animation|           30019.0|
|          Medium|          Gaming|          28137.75|
|          Medium|   Entertainment|           20736.5|
|          Medium|           Music|            8028.0|
|          Medium|  People & Blogs|            2694.0|
|            Long|  People & Blogs|           68894.0|
|            Long|          Gaming|29081.666666666668|
+----------------+----------------+------------------+



# Total Watch Time per Category (durationCategory)

In [0]:
# Sum of the `duration` column (seconds) per duration bucket, largest first.
# NOTE(review): this adds up how long the videos are; view counts are not
# involved, so confirm "totalWatchTime_seconds" is the intended label.
duration_by_bucket = casted_df2.groupBy("durationCategory").agg(
    sum("duration").alias("totalWatchTime_seconds")
)
duration_by_bucket.orderBy(col("totalWatchTime_seconds").desc()).show()

+----------------+----------------------+
|durationCategory|totalWatchTime_seconds|
+----------------+----------------------+
|            Long|                 53131|
|          Medium|                 10862|
|           Short|                  4395|
+----------------+----------------------+



# Total Watch Time per Category and Duration Bucket

In [0]:
from pyspark.sql.functions import sum

# Sum of the `duration` column (seconds) per (category, duration bucket),
# listed alphabetically by category and then by bucket.
duration_by_category_bucket = casted_df2.groupBy("categoryName", "durationCategory").agg(
    sum("duration").alias("totalWatchTime_seconds")
)
duration_by_category_bucket.orderBy(
    col("categoryName").asc(),
    col("durationCategory").asc(),
).show(truncate=False)

+----------------+----------------+----------------------+
|categoryName    |durationCategory|totalWatchTime_seconds|
+----------------+----------------+----------------------+
|Entertainment   |Medium          |1411                  |
|Entertainment   |Short           |339                   |
|Film & Animation|Medium          |989                   |
|Film & Animation|Short           |143                   |
|Gaming          |Long            |47567                 |
|Gaming          |Medium          |7041                  |
|Gaming          |Short           |79                    |
|Music           |Medium          |704                   |
|Music           |Short           |3774                  |
|People & Blogs  |Long            |5564                  |
|People & Blogs  |Medium          |717                   |
|People & Blogs  |Short           |60                    |
+----------------+----------------+----------------------+



In [0]:
# I have also included channelTitle; from this we can get an idea of:
# 1) Which channel is dominating which category.
# 2) Which channels people watch for a longer time.
# 3) Which of a channel's videos (short, medium, or long) are being watched more.

In [0]:
from pyspark.sql.functions import sum

# Sum of the `duration` column (seconds) per (channel, category, duration bucket).
# All three sort keys are descending.
duration_by_channel = casted_df2.groupBy(
    "channelTitle", "categoryName", "durationCategory"
).agg(sum("duration").alias("totalWatchTime_seconds"))

duration_by_channel.orderBy(
    col("categoryName").desc(),
    col("durationCategory").desc(),
    col("totalWatchTime_seconds").desc(),
).show(truncate=False)

+----------------------------------------+--------------+----------------+----------------------+
|channelTitle                            |categoryName  |durationCategory|totalWatchTime_seconds|
+----------------------------------------+--------------+----------------+----------------------+
|Starlight Cinema                        |People & Blogs|Short           |60                    |
|gara                                    |People & Blogs|Medium          |717                   |
|MoreSidemen                             |People & Blogs|Long            |5564                  |
|Bad Omens - Topic                       |Music         |Short           |275                   |
|SeanRiiVEVO                             |Music         |Short           |241                   |
|Jose Torres El Rey De Alto Mando - Topic|Music         |Short           |219                   |
|Edén Muñoz - Topic                      |Music         |Short           |209                   |
|T-Series           

# Top 5 Longest Videos with High Engagement

In [0]:
# Longest videos first; engagement (descending) breaks ties between equal durations.
longest_first = casted_df2.orderBy(col("duration").desc(), col("engagement").desc())
longest_first.select("title", "duration", "engagement").show(n=5, truncate=False)


+-----------------------------------------------+--------+----------+
|title                                          |duration|engagement|
+-----------------------------------------------+--------+----------+
|grow a garden admin abuse (w/flamingo)         |19510   |43303     |
|$1,000,000 EWC WARZONE GRAND FINALS WATCH PARTY|11511   |1946      |
|Liminal Exit…                                  |5959    |9610      |
|SIDEMEN AMONG US MEGA CHAOS MODE               |5564    |68894     |
|I Build the FASTEST PLANE in Roblox!           |2526    |12117     |
+-----------------------------------------------+--------+----------+
only showing top 5 rows


#  Education and Tech-Focused Insights

In [0]:
# Education-related content is identified by searching for the keywords listed below

# Keyword-Based Filtering (regardless of category)

In [0]:
from pyspark.sql.functions import lower, col
from functools import reduce

# Keywords that flag a video as education / tech related.
keywords = ["tutorial", "science", "learn", "how to", "technology", "education", "experiment", "ChatGPT", "OpenAI ", "agent"]

# Title and description are lower-cased before matching, so each keyword must
# be lower-cased (and stripped of stray whitespace) as well; otherwise
# mixed-case entries such as "ChatGPT" can never match.
normalized_keywords = [keyword.strip().lower() for keyword in keywords]

title_lc = lower(col("title"))
description_lc = lower(col("description"))

# Keep a row when any keyword occurs in its title or its description.
filtered_df = casted_df2.filter(
    reduce(
        lambda a, b: a | b,
        [title_lc.contains(k) | description_lc.contains(k) for k in normalized_keywords],
    )
)

filtered_df.select("categoryName", "title", "description").show(truncate=False)

+------------+-----------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Persist the processed frame as the table processed_data.usa_trending_processed,
# using "overwrite" mode for the write.
casted_df2.write.mode("overwrite").saveAsTable("processed_data.usa_trending_processed")

