# Analysis on USA Data Set

In [0]:
# Load JSON (multiline must be true for nested JSON objects)
df = spark.read.option("multiline", "true").json("dbfs:/FileStore/shared_uploads/krath2928@gmail.com/trending_usa.json")

# Show schema to understand structure
df.printSchema()

root
 |-- contentDetails: struct (nullable = true)
 |    |-- caption: string (nullable = true)
 |    |-- definition: string (nullable = true)
 |    |-- dimension: string (nullable = true)
 |    |-- duration: string (nullable = true)
 |    |-- licensedContent: boolean (nullable = true)
 |    |-- projection: string (nullable = true)
 |    |-- regionRestriction: struct (nullable = true)
 |    |    |-- allowed: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- blocked: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |-- etag: string (nullable = true)
 |-- id: string (nullable = true)
 |-- kind: string (nullable = true)
 |-- snippet: struct (nullable = true)
 |    |-- categoryId: string (nullable = true)
 |    |-- channelId: string (nullable = true)
 |    |-- channelTitle: string (nullable = true)
 |    |-- defaultAudioLanguage: string (nullable = true)
 |    |-- defaultLanguage: string (nullable = true)
 |   

In [0]:
from pyspark.sql.functions import col

selected_df = df.select(
    "id",
    "snippet.publishedAt",
    "snippet.channelId",
    "snippet.channelTitle",
    "snippet.title",
    "snippet.description",
    "snippet.categoryId",
    "snippet.tags",
    "snippet.defaultLanguage",
    "snippet.defaultAudioLanguage",
    "statistics.viewCount",
    "statistics.likeCount",
    "statistics.commentCount",
    "contentDetails.duration",
    "contentDetails.caption"
)

selected_df.show(truncate=False)


+-----------+--------------------+------------------------+----------------------------------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import col

casted_df2 = selected_df.withColumn("viewCount", col("viewCount").cast("int")) \
                       .withColumn("likeCount", col("likeCount").cast("int")) \
                       .withColumn("commentCount", col("commentCount").cast("int"))

In [0]:
casted_df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- defaultLanguage: string (nullable = true)
 |-- defaultAudioLanguage: string (nullable = true)
 |-- viewCount: integer (nullable = true)
 |-- likeCount: integer (nullable = true)
 |-- commentCount: integer (nullable = true)
 |-- duration: string (nullable = true)
 |-- caption: string (nullable = true)



In [0]:
casted_df2.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedAt", "channelTitle",
    "title", "categoryId", "description"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
casted_df2.select("title", "duration").show(10, truncate=False)

+--------------------------------------------------------------------------+----------+
|title                                                                     |duration  |
+--------------------------------------------------------------------------+----------+
|Huda: Love Island Tell All (Full Episode)                                 |PT1H54M19S|
|Stephen Colbert Announces The Cancellation Of “The Late Show”             |PT2M27S   |
|Mortal Kombat II | Official Trailer                                       |PT2M33S   |
|Monster High Fright Song ft. KATSEYE | Official Music Video | Monster High|PT2M44S   |
|Tron: Ares | Official Trailer                                             |PT2M25S   |
|Stranger Things 5 | Official Teaser | Netflix                             |PT2M47S   |
|Alex Warren - Eternity (Official Audio)                                   |PT3M10S   |
|Amaya: Love Island Tell All (Full Episode)                                |PT1H13M43S|
|The Open Championship 2025 high

# Convert duration (ISO 8601 format) to Total Seconds

In [0]:
from pyspark.sql.functions import col, regexp_extract

# Step 1: Extract raw duration string
df_with_duration = casted_df2.withColumn("duration_raw", col("duration"))

# Step 2: Extract hours, minutes, and seconds using regex
df_parsed = df_with_duration \
    .withColumn("hours", regexp_extract(col("duration_raw"), r'PT(\d+)H', 1).cast("int")) \
    .withColumn("minutes", regexp_extract(col("duration_raw"), r'(?<=PT(\d+H)?)?(\d+)M', 2).cast("int")) \
    .withColumn("seconds", regexp_extract(col("duration_raw"), r'(?<=M)?(\d+)S', 1).cast("int"))

# Step 3: Replace nulls with 0
df_filled = df_parsed.fillna({"hours": 0, "minutes": 0, "seconds": 0})

# Step 4: Calculate total duration in seconds and overwrite `duration` column
final_df = df_filled.withColumn(
    "duration",
    col("hours") * 3600 + col("minutes") * 60 + col("seconds")
)

# Step 5: Drop temporary columns (if you want clean dataframe)
final_df = final_df.drop("duration_raw", "hours", "minutes", "seconds")

#  Now update casted_df to this final version
casted_df2 = final_df


In [0]:
casted_df2.select("title", "duration").show(10, truncate=False)

+--------------------------------------------------------------------------+--------+
|title                                                                     |duration|
+--------------------------------------------------------------------------+--------+
|Huda: Love Island Tell All (Full Episode)                                 |6859    |
|Stephen Colbert Announces The Cancellation Of “The Late Show”             |147     |
|Mortal Kombat II | Official Trailer                                       |153     |
|Monster High Fright Song ft. KATSEYE | Official Music Video | Monster High|164     |
|Tron: Ares | Official Trailer                                             |145     |
|Stranger Things 5 | Official Teaser | Netflix                             |167     |
|Alex Warren - Eternity (Official Audio)                                   |190     |
|Amaya: Love Island Tell All (Full Episode)                                |4423    |
|The Open Championship 2025 highlights: Late First Rou

# in our data distinct categoryid available

In [0]:
df.select("snippet.categoryId").distinct().count()

Out[9]: 8

In [0]:
df.select("snippet.categoryId").distinct().show(truncate=False)

+----------+
|categoryId|
+----------+
|22        |
|28        |
|17        |
|23        |
|24        |
|1         |
|20        |
|10        |
+----------+



# Mapping distinct Categoryid to Category Name

In [0]:
from pyspark.sql.functions import when, col

casted_df2 = casted_df2.withColumn(
    "categoryName",
    when(col("categoryId") == "1", "Film & Animation")
    .when(col("categoryId") == "10", "Music")
    .when(col("categoryId") == "17", "Sports")
    .when(col("categoryId") == "20", "Gaming")
    .when(col("categoryId") == "22", "People & Blogs")
    .when(col("categoryId") == "23", "Comedy")
    .when(col("categoryId") == "24", "Entertainment")
    .when(col("categoryId") == "28", "Science & Technology")
    .otherwise("Unknown")
)


In [0]:
casted_df2.groupBy("categoryName").count().show(truncate=False)

+--------------------+-----+
|categoryName        |count|
+--------------------+-----+
|Gaming              |9    |
|Entertainment       |18   |
|Science & Technology|3    |
|Sports              |5    |
|Film & Animation    |3    |
|People & Blogs      |4    |
|Music               |7    |
|Comedy              |1    |
+--------------------+-----+



# convert publishAt col to time stamp

In [0]:
from pyspark.sql.functions import col, to_timestamp, to_date, year, month

# Step 1: Convert to timestamp
casted_df2 = casted_df2.withColumn("publishedAt_ts", to_timestamp(col("publishedAt"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))

# Step 2: Extract only the date part
casted_df2 = casted_df2.withColumn("publishedDate", to_date(col("publishedAt_ts")))

# Step 3: Extract year and month from publishedDate
casted_df2 = casted_df2.withColumn("year", year("publishedDate")) \
                     .withColumn("month", month("publishedDate"))

# Step 4: Drop original publishedAt and timestamp
casted_df2 = casted_df2.drop("publishedAt", "publishedAt_ts")

In [0]:
casted_df2.select(
    "viewCount", "likeCount", "commentCount",
    "duration", "publishedDate", "channelTitle",
    "title", "categoryId", "description", "categoryName"
).show(3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import col, count, when

# Check null values for all relevant columns
casted_df2.select([
    count(when(col("viewCount").isNull(), 1)).alias("null_viewCount"),
    count(when(col("likeCount").isNull(), 1)).alias("null_likeCount"),
    count(when(col("commentCount").isNull(), 1)).alias("null_commentCount"),
    count(when(col("duration").isNull(), 1)).alias("null_duration"),
    count(when(col("caption").isNull(), 1)).alias("null_caption"),
    count(when(col("categoryName").isNull(), 1)).alias("null_categoryName"),
    count(when(col("publishedDate").isNull(), 1)).alias("null_publishedDate"),
    count(when(col("year").isNull(), 1)).alias("null_year"),
    count(when(col("month").isNull(), 1)).alias("null_month")
]).show()

+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|null_viewCount|null_likeCount|null_commentCount|null_duration|null_caption|null_categoryName|null_publishedDate|null_year|null_month|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+
|             0|             0|                0|            0|           0|                0|                 0|        0|         0|
+--------------+--------------+-----------------+-------------+------------+-----------------+------------------+---------+----------+



# Engegment Analysis

# Top Channels by Total Views

In [0]:
top_channels = casted_df2.groupBy("channelTitle") \
    .sum("viewCount") \
    .withColumnRenamed("sum(viewCount)", "totalViews") \
    .orderBy(col("totalViews").desc())

top_channels.show(20, truncate=False)


+----------------------------------+----------+
|channelTitle                      |totalViews|
+----------------------------------+----------+
|Netflix                           |14269445  |
|Warner Bros.                      |7245936   |
|Call Her Daddy                    |6892201   |
|Disney                            |6385779   |
|EA SPORTS FC                      |5607960   |
|Pixar                             |4591295   |
|FX Networks                       |4095561   |
|IGN                               |3921003   |
|Vsauce                            |3100641   |
|The Late Show with Stephen Colbert|2623291   |
|MLB                               |1910577   |
|Ryan Trahan                       |1575509   |
|Neutro Shorty                     |1340872   |
|MileyCyrusVEVO                    |969788    |
|Monster High                      |865237    |
|SMii7Yplus                        |851355    |
|theneedledrop                     |809080    |
|Jake Webber                       |7615

# Top channels by total likes

In [0]:
top_liked_channels = casted_df2.groupBy("channelTitle") \
    .sum("likeCount") \
    .withColumnRenamed("sum(likeCount)", "totalLikes") \
    .orderBy(col("totalLikes").desc())
top_liked_channels.show(10, truncate=False)

+----------------------------------+----------+
|channelTitle                      |totalLikes|
+----------------------------------+----------+
|Netflix                           |668886    |
|Vsauce                            |370710    |
|Warner Bros.                      |270309    |
|EA SPORTS FC                      |238643    |
|Monster High                      |182106    |
|Call Her Daddy                    |181429    |
|Neutro Shorty                     |177218    |
|The Late Show with Stephen Colbert|125258    |
|IGN                               |111672    |
|Disney                            |92490     |
+----------------------------------+----------+
only showing top 10 rows



# Most Liked Videos

In [0]:
# Here we can find top 5 most liked videos along with its title and chanelTitle 

In [0]:
most_liked = casted_df2.orderBy(col("likeCount").desc())
most_liked.select("title", "channelTitle", "categoryName", "likeCount").show(5, truncate=False)

+--------------------------------------------------------------------------+------------+----------------+---------+
|title                                                                     |channelTitle|categoryName    |likeCount|
+--------------------------------------------------------------------------+------------+----------------+---------+
|Stranger Things 5 | Official Teaser | Netflix                             |Netflix     |Entertainment   |668886   |
|All The Ghosts You Will Be                                                |Vsauce      |Entertainment   |370710   |
|Mortal Kombat II | Official Trailer                                       |Warner Bros.|Film & Animation|270309   |
|EA SPORTS FC 26 | Official Reveal Trailer                                 |EA SPORTS FC|Gaming          |238643   |
|Monster High Fright Song ft. KATSEYE | Official Music Video | Monster High|Monster High|Entertainment   |182106   |
+---------------------------------------------------------------

# Top videos by comment count

In [0]:
most_commented = casted_df2.orderBy(col("commentCount").desc())
most_commented.select("title", "channelTitle", "commentCount").show(5, truncate=False)

+-------------------------------------------------------------+----------------------------------+------------+
|title                                                        |channelTitle                      |commentCount|
+-------------------------------------------------------------+----------------------------------+------------+
|Stranger Things 5 | Official Teaser | Netflix                |Netflix                           |38411       |
|Stephen Colbert Announces The Cancellation Of “The Late Show”|The Late Show with Stephen Colbert|37158       |
|All The Ghosts You Will Be                                   |Vsauce                            |36579       |
|Mortal Kombat II | Official Trailer                          |Warner Bros.                      |21950       |
|EA SPORTS FC 26 | Official Reveal Trailer                    |EA SPORTS FC                      |21063       |
+-------------------------------------------------------------+----------------------------------+------

# Like to View ratio (engagement rate)

In [0]:
from pyspark.sql.functions import expr

engagement_df = casted_df2.withColumn("likeViewRatio", expr("likeCount / viewCount"))
engagement_df.orderBy(col("likeViewRatio").desc()).select("title", "channelTitle", "likeViewRatio").show(10, truncate=False)

+----------------------------------------------------------------------------+-------------------+-------------------+
|title                                                                       |channelTitle       |likeViewRatio      |
+----------------------------------------------------------------------------+-------------------+-------------------+
|Monster High Fright Song ft. KATSEYE | Official Music Video | Monster High  |Monster High       |0.2104695014198422 |
|Yeat & BNYX® - IM YEAT (Real Lyfe Shit)                                     |Yeat Music         |0.15190918869084205|
|Neutro Shorty - PRÉNDELO PINGÜINO (Video Oficial)                           |Neutro Shorty      |0.1321662321235733 |
|Eddsworld - Tales Of Terror                                                 |Eddsworld          |0.12327502495999562|
|All The Ghosts You Will Be                                                  |Vsauce             |0.11955914922108042|
|$uicideboy$ Want to "Save 100,000 Souls" In The

# Analysis based on categories

# Top Categories by Likes

In [0]:
from pyspark.sql.functions import sum

casted_df2.groupBy("categoryName","channelTitle") \
    .agg(sum("likeCount").alias("totalLikes")) \
    .orderBy("totalLikes", ascending=False) \
    .show()

+--------------------+--------------------+----------+
|        categoryName|        channelTitle|totalLikes|
+--------------------+--------------------+----------+
|       Entertainment|             Netflix|    668886|
|       Entertainment|              Vsauce|    370710|
|    Film & Animation|        Warner Bros.|    270309|
|              Gaming|        EA SPORTS FC|    238643|
|       Entertainment|        Monster High|    182106|
|       Entertainment|      Call Her Daddy|    181429|
|               Music|       Neutro Shorty|    177218|
|       Entertainment|The Late Show wit...|    125258|
|       Entertainment|                 IGN|    111672|
|       Entertainment|              Disney|     92490|
|    Film & Animation|           Eddsworld|     72108|
|       Entertainment|         Ryan Trahan|     69377|
|               Music|      MileyCyrusVEVO|     62608|
|              Gaming|          SMii7Yplus|     56201|
|    Film & Animation|               Pixar|     55217|
|         

# Top Categories by Comments

In [0]:
casted_df2.groupBy("categoryName","channelTitle") \
    .agg(sum("commentCount").alias("totalComments")) \
    .orderBy("totalComments", ascending=False) \
    .show()

+----------------+--------------------+-------------+
|    categoryName|        channelTitle|totalComments|
+----------------+--------------------+-------------+
|   Entertainment|             Netflix|        38411|
|   Entertainment|The Late Show wit...|        37158|
|   Entertainment|              Vsauce|        36579|
|   Entertainment|      Call Her Daddy|        23290|
|Film & Animation|        Warner Bros.|        21950|
|          Gaming|        EA SPORTS FC|        21063|
|           Music|       Neutro Shorty|        17694|
|   Entertainment|                 IGN|         9675|
|   Entertainment|         Ryan Trahan|         8885|
|   Entertainment|              Disney|         8672|
|           Music|       theneedledrop|         6501|
|          Gaming|                 3FS|         6163|
|Film & Animation|           Eddsworld|         5958|
|           Music|      MileyCyrusVEVO|         5909|
|Film & Animation|               Pixar|         5620|
|  People & Blogs|        St

# Average Engagement per Category

In [0]:
# create engagment column on adding likecount and commentcount

In [0]:
casted_df2 = casted_df2.withColumn(
    "engagement",
    col("likeCount") + col("commentCount")
)

In [0]:
from pyspark.sql.functions import avg

casted_df2.groupBy("categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show(truncate=False)

+--------------------+------------------+
|categoryName        |avgEngagement     |
+--------------------+------------------+
|Film & Animation    |143720.66666666666|
|Entertainment       |121626.72222222222|
|Music               |62591.142857142855|
|Gaming              |52911.77777777778 |
|Science & Technology|24767.0           |
|People & Blogs      |21922.5           |
|Comedy              |18457.0           |
|Sports              |10728.8           |
+--------------------+------------------+



# Average engagement by video duration

In [0]:
# it says that people are engage or not  in short,long or medium videos

In [0]:
# Step 1: Categorize durations
from pyspark.sql.functions import when

casted_df2 = casted_df2.withColumn(
    "durationCategory",
    when(col("duration") <= 300, "Short")  # ≤ 5 mins
    .when((col("duration") > 300) & (col("duration") <= 1200), "Medium")  # 5–20 mins
    .otherwise("Long")  # > 20 mins
)

# Step 2: Avg engagement per duration category
casted_df2.groupBy("durationCategory") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("avgEngagement", ascending=False) \
    .show()


+----------------+------------------+
|durationCategory|     avgEngagement|
+----------------+------------------+
|           Short|       142023.9375|
|            Long|54736.142857142855|
|          Medium|26695.153846153848|
+----------------+------------------+



# Top performing categories within duration groups

In [0]:
casted_df2.groupBy("durationCategory", "categoryName") \
    .agg(avg("engagement").alias("avgEngagement")) \
    .orderBy("durationCategory", "avgEngagement", ascending=False) \
    .show()

+----------------+--------------------+------------------+
|durationCategory|        categoryName|     avgEngagement|
+----------------+--------------------+------------------+
|           Short|       Entertainment|189970.57142857142|
|           Short|    Film & Animation|          176548.0|
|           Short|              Gaming|          136535.5|
|           Short|               Music|          73199.25|
|           Short|      People & Blogs|           23625.0|
|          Medium|    Film & Animation|           78066.0|
|          Medium|               Music|           44543.0|
|          Medium|Science & Technology|           27991.0|
|          Medium|       Entertainment|27819.666666666668|
|          Medium|      People & Blogs|           14337.0|
|          Medium|              Gaming|            6864.0|
|          Medium|              Sports| 6414.333333333333|
|            Long|       Entertainment|           97003.5|
|            Long|               Music|           56255.

# Total Watch Time per Category (durationCategory)

In [0]:
casted_df2.groupBy("durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("totalWatchTime_seconds", ascending=False) \
    .show()

+----------------+----------------------+
|durationCategory|totalWatchTime_seconds|
+----------------+----------------------+
|            Long|                 46976|
|          Medium|                 10043|
|           Short|                  2420|
+----------------+----------------------+



# Total Watch Time per Category and Duration Bucket

In [0]:
from pyspark.sql.functions import sum

casted_df2.groupBy("categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory") \
    .show(truncate=False)

+--------------------+----------------+----------------------+
|categoryName        |durationCategory|totalWatchTime_seconds|
+--------------------+----------------+----------------------+
|Comedy              |Long            |2669                  |
|Entertainment       |Long            |23506                 |
|Entertainment       |Medium          |2036                  |
|Entertainment       |Short           |1015                  |
|Film & Animation    |Medium          |329                   |
|Film & Animation    |Short           |274                   |
|Gaming              |Long            |12392                 |
|Gaming              |Medium          |681                   |
|Gaming              |Short           |203                   |
|Music               |Long            |1283                  |
|Music               |Medium          |1526                  |
|Music               |Short           |767                   |
|People & Blogs      |Long            |2764            

In [0]:
# i have attached the chanelTitle also , from here we can get idea 
# 1) Which channel is dominating which category .
# 2) Which channel do people watch for longer time?
# 3) Which channel's videos, short, medium or long, are being watched more?

In [0]:
from pyspark.sql.functions import sum

casted_df2.groupBy("channelTitle", "categoryName", "durationCategory") \
    .agg(sum("duration").alias("totalWatchTime_seconds")) \
    .orderBy("categoryName", "durationCategory", "totalWatchTime_seconds", ascending=False) \
    .show(truncate=False)

+----------------------+--------------------+----------------+----------------------+
|channelTitle          |categoryName        |durationCategory|totalWatchTime_seconds|
+----------------------+--------------------+----------------+----------------------+
|Golf Channel          |Sports              |Medium          |1187                  |
|Fanatiz               |Sports              |Medium          |733                   |
|The R&A               |Sports              |Medium          |611                   |
|Baseball Doesn't Exist|Sports              |Long            |1574                  |
|MLB                   |Sports              |Long            |1258                  |
|Tom Stanton           |Science & Technology|Medium          |1076                  |
|StarTalk              |Science & Technology|Medium          |685                   |
|OpenAI                |Science & Technology|Long            |1530                  |
|Chuckyy               |People & Blogs      |Short    

# Top 5 Longest Videos with High Engagement

In [0]:
casted_df2.orderBy(col("duration").desc(), col("engagement").desc()).select("title", "duration", "engagement").show(5, truncate=False)


+------------------------------------------+--------+----------+
|title                                     |duration|engagement|
+------------------------------------------+--------+----------+
|Huda: Love Island Tell All (Full Episode) |6859    |138371    |
|Amaya: Love Island Tell All (Full Episode)|4423    |66348     |
|We went to Italy!                         |3430    |38054     |
|MINECRAFT WITH TOMAR                      |2864    |18335     |
|AI Bigfoot VLOGS are HYSTERICAL           |2669    |18457     |
+------------------------------------------+--------+----------+
only showing top 5 rows



#  Education and Tech-Focused Insights

In [0]:
education_df = casted_df2.filter(col("categoryName").isin("Education", "Science & Technology"))
education_df.show()

+-----------+--------------------+------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+--------------------+-------------+----+-----+----------+----------------+
|         id|           channelId|channelTitle|               title|         description|categoryId|                tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|        categoryName|publishedDate|year|month|engagement|durationCategory|
+-----------+--------------------+------------+--------------------+--------------------+----------+--------------------+---------------+--------------------+---------+---------+------------+--------+-------+--------------------+-------------+----+-----+----------+----------------+
|1jn_RpbPbEc|UCXZCJLdBC09xxGZ6...|      OpenAI|Introduction to C...|Sam Altman, Casey...|        28|                null|             en|              

In [0]:
education_df.select("title", "likeCount", "viewCount", "engagement").show(truncate=False)

+--------------------------------------+---------+---------+----------+
|title                                 |likeCount|viewCount|engagement|
+--------------------------------------+---------+---------+----------+
|Introduction to ChatGPT agent         |16790    |652252   |18319     |
|Why Earth’s Rotation Speed is Changing|14177    |429063   |15563     |
|Building a Stirling Engine Bike       |38444    |689822   |40419     |
+--------------------------------------+---------+---------+----------+



In [0]:
education_df.select("channelTitle", "title", "categoryName").show(truncate=False)

+------------+--------------------------------------+--------------------+
|channelTitle|title                                 |categoryName        |
+------------+--------------------------------------+--------------------+
|OpenAI      |Introduction to ChatGPT agent         |Science & Technology|
|StarTalk    |Why Earth’s Rotation Speed is Changing|Science & Technology|
|Tom Stanton |Building a Stirling Engine Bike       |Science & Technology|
+------------+--------------------------------------+--------------------+



In [0]:
# i check education content by seeing the keyword like what ever i have mentioned in the below 

In [0]:
from pyspark.sql.functions import col, lower
from functools import reduce

keywords = ["tutorial", "science", "ChatGPT", "learn", "how to", "technology", "education", "experiment", "physics", "math", "engineering"]

# Debugging: ensure title is not null
education_df_filtered = education_df.filter(col("title").isNotNull())

# Apply keyword filter in lowercase
education_clean_df = education_df_filtered.filter(
    reduce(lambda a, b: a | b, [lower(col("title")).contains(k) for k in keywords])
)

education_clean_df.show(truncate=False)
education_clean_df.count()

+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+
|id |channelId|channelTitle|title|description|categoryId|tags|defaultLanguage|defaultAudioLanguage|viewCount|likeCount|commentCount|duration|caption|categoryName|publishedDate|year|month|engagement|durationCategory|
+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+
+---+---------+------------+-----+-----------+----------+----+---------------+--------------------+---------+---------+------------+--------+-------+------------+-------------+----+-----+----------+----------------+

Out[40]: 0

In [0]:
education_df.select("title").show(20, truncate=False)


+--------------------------------------+
|title                                 |
+--------------------------------------+
|Introduction to ChatGPT agent         |
|Why Earth’s Rotation Speed is Changing|
|Building a Stirling Engine Bike       |
+--------------------------------------+



# Most engaging education/tech videos

In [0]:
education_df.orderBy("engagement", ascending=False).select("title", "engagement").show(10, truncate=False)


+--------------------------------------+----------+
|title                                 |engagement|
+--------------------------------------+----------+
|Building a Stirling Engine Bike       |40419     |
|Introduction to ChatGPT agent         |18319     |
|Why Earth’s Rotation Speed is Changing|15563     |
+--------------------------------------+----------+



# Keyword-Based Filtering (regardless of category)

In [0]:
from pyspark.sql.functions import lower, col
from functools import reduce

keywords = ["tutorial", "science", "learn", "how to", "technology", "education", "experiment", "ChatGPT", "OpenAI ", "agent"]

# keyword-based filter on title and description
filtered_df = casted_df2.filter(
    reduce(lambda a, b: a | b, [lower(col("title")).contains(k) | lower(col("description")).contains(k) for k in keywords])
)

filtered_df.select("categoryName", "title", "description").show(truncate=False)

+--------------------+----------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
output_path = '/FileStore/tables/final_usa_trending'
casted_df2.write.mode('overwrite').parquet(output_path)
