In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [16]:
spark = SparkSession.builder.getOrCreate()

In [17]:
# Lendo o arquivo 'videos-preparados.snappy.parquet' no dataframe 'df_video'
# Reading the file 'videos-preparados.snappy.parquet' into the dataframe 'df_video'
df_video = spark.read.option('header', 'true').parquet('videos-preparados-parquet')

In [18]:
df_video.show(5)
df_video.printSchema()

+--------------------+-----------+------------+-------+-----+--------+-------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|Keyword|Likes|Comments|  Views|Interaction|Year|Month|Keyword Index|            Features|     Features Normal|        Features PCA|
+--------------------+-----------+------------+-------+-----+--------+-------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|    8|         24.0|[3407.0,135612.0,...|[2.07229197864298...|[-0.5379150448038...|
|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech|76779|    4306|1758063|    1839148|2022|    8|         24.0|[76779.0,1758063....|[0.00466873762089...|[-0.5379209409038...|
|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech|63825|    3338|1564007|    1631170|2022|    8|    

In [19]:
# Calculando a quantidade de registros para cada valor único da coluna "Keyword"
# Counting the number of records for each unique value in the “Keyword” column
df_video.agg(count_distinct('Keyword').alias('Keyword count')).show()
df_video.groupBy('Keyword').count().show(41)

+-------------+
|Keyword count|
+-------------+
|           41|
+-------------+

+----------------+-----+
|         Keyword|count|
+----------------+-----+
|computer science|   48|
|            lofi|   41|
|         finance|   39|
|             cnn|   50|
|           apple|   42|
|            news|   39|
|         mukbang|   45|
|       education|   24|
|       interview|   50|
|          crypto|   50|
|   mathchemistry|   15|
|            food|   48|
|    data science|   50|
|        trolling|   50|
|        tutorial|   50|
|      literature|   46|
|             sat|   49|
|         history|   50|
|           cubes|   50|
|           music|   46|
|         biology|   47|
|           chess|   47|
|        reaction|   50|
|          movies|   45|
|          how-to|   48|
|          sports|   49|
|             bed|   44|
|          marvel|   50|
|game development|   50|
|machine learning|   50|
|          gaming|   43|
|            xbox|   49|
|         mrbeast|   50|
|        business| 

In [20]:
# Calculando a média da coluna "Interaction" para cada valor único da coluna 'Keyword'
# Calculating the mean of the “Interaction” column for each unique value in the “Keyword” column
video_interaction_avg = df_video.groupBy('Keyword').agg(
    format_number(avg('Interaction'), 0).alias('Interaction Average')
)
video_interaction_avg.show(41)

+----------------+-------------------+
|         Keyword|Interaction Average|
+----------------+-------------------+
|computer science|          1,226,793|
|            lofi|          4,065,808|
|         finance|            708,543|
|             cnn|            570,651|
|           apple|         10,873,628|
|            news|            251,689|
|         mukbang|         11,053,630|
|       education|          2,750,839|
|       interview|          3,044,867|
|          crypto|            413,676|
|   mathchemistry|          3,427,343|
|            food|          5,352,944|
|    data science|            562,465|
|        trolling|          1,484,585|
|        tutorial|          6,936,688|
|      literature|            881,726|
|             sat|          1,098,927|
|         history|         15,340,731|
|           cubes|         15,348,634|
|           music|         29,691,370|
|         biology|          4,192,382|
|           chess|          1,360,809|
|        reaction|       

In [21]:
# Calculando o valor máximo da coluna "Interaction" para cada valor único da coluna 'Keyword'
# Nomeando de 'Rank Interactions', em seguida ordenando pela nova coluna em ordem decrescente
# Computing the maximum value of the “Interaction” column for each unique “Keyword”
# Naming the result “Rank Interactions”, then sorting by this new column in descending order
video_interaction_max = df_video.groupBy('Keyword').agg(
    (max('Interaction')).alias('Rank Interactions'))\
    .orderBy(desc('Rank Interactions'))

video_interaction_max.show(41)

+----------------+-----------------+
|         Keyword|Rank Interactions|
+----------------+-----------------+
|          google|       4051300647|
|         animals|       1593623628|
|           music|        922551152|
|             bed|        532691631|
|         history|        440187490|
|           apple|        429916936|
|         mrbeast|        300397699|
|        business|        210025196|
|           cubes|        170925917|
|          sports|        106924567|
|         mukbang|         87433858|
|            lofi|         86445177|
|        tutorial|         69616442|
|          movies|         65253870|
|          marvel|         56247330|
|          how-to|         53053975|
|            food|         48754479|
|         physics|         43463298|
|            asmr|         34411125|
|        nintendo|         32268486|
|        reaction|         26947493|
|         biology|         24332656|
|       minecraft|         23463711|
|       interview|         22992885|
|

In [22]:
# Calculando a média e a variância da coluna 'Views' para cada valor único da coluna 'Keyword'
# Calculating the mean and variance of the “Views” column for each unique value in the “Keyword” column
video_views_avg = df_video.groupBy('Keyword').agg(
    format_number(avg('Views'), 0).alias('Views Average'))
video_views_var = video_views_avg = df_video.groupBy('Keyword').agg(
    format_number(variance('Views'), 0).alias('Views Variance'))

video_views_avg.show(truncate=False)
video_views_var.show(truncate=False)

+----------------+----------------------+
|Keyword         |Views Variance        |
+----------------+----------------------+
|computer science|2,812,198,681,651     |
|lofi            |180,410,459,661,061   |
|finance         |3,304,483,175,097     |
|cnn             |156,342,361,847       |
|apple           |4,299,927,977,442,588 |
|news            |106,751,257,667       |
|mukbang         |558,607,323,897,318   |
|education       |18,335,722,493,392    |
|interview       |18,192,209,960,343    |
|crypto          |3,513,691,634,369     |
|mathchemistry   |24,914,670,652,568    |
|food            |73,263,741,281,548    |
|data science    |547,933,652,535       |
|trolling        |6,932,651,793,973     |
|tutorial        |136,962,659,686,446   |
|literature      |938,052,188,421       |
|sat             |8,285,094,966,049     |
|history         |4,171,087,144,374,509 |
|cubes           |838,407,158,312,573   |
|music           |19,247,971,071,879,404|
+----------------+----------------

In [23]:
# Calculando a média, o valor mínimo e o valor máximo de 'Views' para cada valor único da coluna 'Keyword', sem casas decimais
# Calculating the mean, minimum, and maximum of “Views” for each unique “Keyword”, without decimal places
video_views_min = video_views_avg = df_video.groupBy('Keyword').agg(
    format_number(min('Views'), 0).alias('Views Min'))
video_views_max = video_views_avg = df_video.groupBy('Keyword').agg(
    format_number(max('Views'), 0).alias('Views Max'))

video_views_min.show()
video_views_max.show()

+----------------+---------+
|         Keyword|Views Min|
+----------------+---------+
|computer science|   16,115|
|            lofi|    6,817|
|         finance|    1,195|
|             cnn|   51,269|
|           apple|    1,954|
|            news|   10,642|
|         mukbang|    3,618|
|       education|    6,611|
|       interview|    2,587|
|          crypto|    1,599|
|   mathchemistry|       25|
|            food|   47,430|
|    data science|      911|
|        trolling|    5,388|
|        tutorial|   19,323|
|      literature|    2,847|
|             sat|    7,163|
|         history|    6,640|
|           cubes|   10,146|
|           music|    2,944|
+----------------+---------+
only showing top 20 rows
+----------------+-----------+
|         Keyword|  Views Max|
+----------------+-----------+
|computer science|  7,004,107|
|            lofi| 84,747,957|
|         finance|  9,450,554|
|             cnn|  1,889,320|
|           apple|425,478,119|
|            news|  1,465,011|


In [24]:
# Mostrando o primeiro e o último 'Published At' para cada valor único da coluna 'Keyword'
# Showing the first and last “Published At” values for each unique “Keyword”
df_video.groupBy('Keyword').agg(
    min('Published At').alias('First Published At'),
    max('Published At').alias('Last Published At')
).show(truncate=False)

+----------------+------------------+-----------------+
|Keyword         |First Published At|Last Published At|
+----------------+------------------+-----------------+
|computer science|2009-08-20        |2022-08-12       |
|lofi            |2019-12-08        |2022-08-24       |
|finance         |2012-11-27        |2022-08-24       |
|cnn             |2022-07-14        |2022-08-24       |
|apple           |2016-11-02        |2022-08-24       |
|news            |2022-08-18        |2022-08-24       |
|mukbang         |2020-02-29        |2022-08-24       |
|education       |2008-07-25        |2022-08-24       |
|interview       |2016-01-05        |2022-08-24       |
|crypto          |2022-03-11        |2022-08-24       |
|mathchemistry   |2013-04-15        |2022-05-03       |
|food            |2017-05-31        |2022-08-24       |
|data science    |2018-06-23        |2022-08-24       |
|trolling        |2020-06-14        |2022-08-24       |
|tutorial        |2017-02-01        |2022-08-23 

In [25]:
# Contando todos os 'title' de forma normal e todos os únicos e verifique se há diferença
# Counting all ‘title’ entries normally and counting the distinct ones, then checking for any difference
df_video.agg(count('Title').alias('Title Count')).show()
df_video.agg(count_distinct('Title').alias('Unique Title Count')).show()

+-----------+
|Title Count|
+-----------+
|       1881|
+-----------+

+------------------+
|Unique Title Count|
+------------------+
|              1854|
+------------------+



In [26]:
# Mostrando a quantidade de registros ordenados por ano em ordem ascendente
# Showing the record count per year, sorted in ascending order
df_video.groupBy('Year').count().orderBy('Year').show()

+----+-----+
|Year|count|
+----+-----+
|2007|    2|
|2008|    1|
|2009|    9|
|2010|    6|
|2011|    4|
|2012|   12|
|2013|    6|
|2014|   10|
|2015|   15|
|2016|   34|
|2017|   47|
|2018|   57|
|2019|   87|
|2020|  159|
|2021|  230|
|2022| 1202|
+----+-----+



In [27]:
# Mostrando a quantidade de registros ordenados por ano e mês em ordem ascendente
# Showing the record count grouped by year and month, sorted in ascending order
df_video.groupBy('Year', 'Month').count().orderBy('Year', 'Month').show()

+----+-----+-----+
|Year|Month|count|
+----+-----+-----+
|2007|    7|    1|
|2007|   12|    1|
|2008|    7|    1|
|2009|    2|    2|
|2009|    6|    2|
|2009|    7|    1|
|2009|    8|    1|
|2009|   10|    1|
|2009|   12|    2|
|2010|    3|    1|
|2010|    5|    2|
|2010|    6|    1|
|2010|    9|    1|
|2010|   10|    1|
|2011|    2|    1|
|2011|    5|    1|
|2011|    9|    1|
|2011|   10|    1|
|2012|    1|    1|
|2012|    2|    3|
+----+-----+-----+
only showing top 20 rows


In [28]:
# Calculando a média acumulativa de ‘Likes’ para cada ‘Keyword’ ao longo dos anos.
# Calculating the cumulative average of “Likes” for each “Keyword” over the years
window_likes = Window.partitionBy('Keyword').orderBy('Year').rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_video_c_avg = df_video.withColumn(
    'Likes Cumulative Average',
    avg('Likes').over(window_likes)
)

df_video_c_avg.select('Title', 'Keyword', 'Year', 'Likes', 'Likes Cumulative Average').show(truncate=False)

+-------------------------------------------------------------------------+-------+----+--------+------------------------+
|Title                                                                    |Keyword|Year|Likes   |Likes Cumulative Average|
+-------------------------------------------------------------------------+-------+----+--------+------------------------+
|The Animals - House of the Rising Sun (1964) HQ/Widescreen ♫ 58 YEARS AGO|animals|2009|1357197 |1357197.0               |
|Nickelback-Animals                                                       |animals|2010|68133   |712665.0                |
|Neon Trees - Animal                                                      |animals|2010|338601  |587977.0                |
|Martin Garrix - Animals (Official Video)                                 |animals|2013|11025176|3197276.75              |
|Maroon 5 - Animals (Official Music Video)                                |animals|2014|5743875 |3706596.4               |
|Maroon 5 - Anim