In [1]:
# Instala PySpark
!pip install -q pyspark


In [2]:
# Import SparkSession and create (or reuse) the session used by this notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Modulo Avancado de Agregacoes")
    .getOrCreate()
)


In [3]:
# Upload the dataset file into the Colab runtime's working directory.
from google.colab import files

# `uploaded` keeps whatever files.upload() returns for the uploaded file(s).
uploaded = files.upload()


Saving videos-preparados.snappy.parquet to videos-preparados.snappy.parquet


In [4]:
# Load the prepared videos dataset from the uploaded Parquet file and preview it.
spark_reader = spark.read
df_video = spark_reader.parquet("videos-preparados.snappy.parquet")
df_video.show()


+--------------------+-----------+------------+----------------+------+--------+---------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|         Keyword| Likes|Comments|    Views|Interaction|Year|Month|Keyword Index|        Features PCA|     Features Normal|            Features|
+--------------------+-----------+------------+----------------+------+--------+---------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|         mukbang|378858|   18860| 17975269|   18372987|2020|    4|         30.0|[0.6985786560867407]|[0.02303716158264...|[378858.0,1.79752...|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|            news|  6379|    4853|   808787|     820019|2022|    8|         37.0|[0.8936407990235931]|[3.87946679100418...|[6379.0,808787.0,...|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-2

In [5]:
# Row count for each distinct "Keyword".
keyword_counts = df_video.groupBy("Keyword").count()
keyword_counts.show()


+----------------+-----+
|         Keyword|count|
+----------------+-----+
|computer science|   48|
|            lofi|   40|
|         finance|   39|
|             cnn|   50|
|           apple|   42|
|            news|   39|
|         mukbang|   45|
|       education|   24|
|       interview|   50|
|          crypto|   50|
|   mathchemistry|   15|
|            food|   48|
|    data science|   50|
|        trolling|   50|
|        tutorial|   50|
|      literature|   46|
|             sat|   49|
|         history|   49|
|           cubes|   49|
|           music|   46|
+----------------+-----+
only showing top 20 rows



In [6]:
from pyspark.sql.functions import avg

# Mean of the "Interaction" column for each keyword.
avg_interaction_by_keyword = df_video.groupBy("Keyword").agg(
    avg("Interaction").alias("Avg Interaction")
)
avg_interaction_by_keyword.show()


+----------------+--------------------+
|         Keyword|     Avg Interaction|
+----------------+--------------------+
|computer science|  1226793.0208333333|
|            lofi|         4167085.875|
|         finance|   708542.9487179487|
|             cnn|           570650.86|
|           apple|1.0873628214285715E7|
|            news|  251688.71794871794|
|         mukbang|1.1053630377777778E7|
|       education|         2750838.625|
|       interview|          3044867.04|
|          crypto|            413676.2|
|   mathchemistry|  3427342.7333333334|
|            food|   5352944.104166667|
|    data science|           562465.28|
|        trolling|          1484584.88|
|        tutorial|           6936688.3|
|      literature|            881726.5|
|             sat|           1098927.0|
|         history| 1.565269257142857E7|
|           cubes|1.5043961224489795E7|
|           music|2.9691370304347824E7|
+----------------+--------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import max as max_

# Largest "Interaction" value per keyword, sorted from highest to lowest.
max_interaction = max_("Interaction").alias("Rank Interactions")
(
    df_video.groupBy("Keyword")
    .agg(max_interaction)
    .orderBy("Rank Interactions", ascending=False)
    .show()
)


+--------+-----------------+
| Keyword|Rank Interactions|
+--------+-----------------+
| animals|       1593623628|
|   music|        922551152|
|     bed|        532691631|
| history|        440187490|
|   apple|        429916936|
| mrbeast|        300397699|
|  google|        239385460|
|business|        210025196|
|   cubes|        170925917|
|  sports|        106924567|
| mukbang|         87433858|
|    lofi|         86445177|
|tutorial|         69616442|
|  movies|         65253870|
|  marvel|         56247330|
|  how-to|         53053975|
|    food|         48754479|
| physics|         43463298|
|    asmr|         34411125|
|nintendo|         32268486|
+--------+-----------------+
only showing top 20 rows



In [8]:
from pyspark.sql.functions import variance

# Mean and variance of "Views" for each keyword.
views_stats = [
    avg("Views").alias("Avg Views"),
    variance("Views").alias("Var Views"),
]
df_video.groupBy("Keyword").agg(*views_stats).show()


+----------------+--------------------+--------------------+
|         Keyword|           Avg Views|           Var Views|
+----------------+--------------------+--------------------+
|computer science|  1191958.7083333333| 2.81219868165102E12|
|            lofi|           4089363.0|1.846209641476677...|
|         finance|   694223.4358974359|3.304483175097042...|
|             cnn|           554240.38|1.563423618468118...|
|           apple|1.0746930452380951E7|4.299927977442589E15|
|            news|   247492.1794871795|1.067512576672564...|
|         mukbang|1.0904772355555555E7|5.586073238973179...|
|       education|  2684432.8333333335|1.833572249339214...|
|       interview|          2966111.86|1.819220996034335E13|
|          crypto|           404608.22|3.513691634369074E12|
|   mathchemistry|  3328125.2666666666|2.491467065256849...|
|            food|          5252406.25|7.326374128154842E13|
|    data science|           544771.98|5.479336525349994...|
|        trolling|      

In [9]:
# Import Spark's min/max/round under trailing-underscore aliases so that none of
# them shadows Python's built-in min(), max() or round() in later cells.
from pyspark.sql.functions import min as min_, max as max_, round as round_

# Per-keyword statistics of "Views": mean rounded to 0 decimal places,
# minimum and maximum.
df_video.groupBy("Keyword") \
    .agg(
        round_(avg("Views"), 0).alias("Avg Views"),
        min_("Views").alias("Min Views"),
        max_("Views").alias("Max Views")
    ).show()


+----------------+-----------+---------+---------+
|         Keyword|  Avg Views|Min Views|Max Views|
+----------------+-----------+---------+---------+
|computer science|  1191959.0|    16115|  7004107|
|            lofi|  4089363.0|     6817| 84747957|
|         finance|   694223.0|     1195|  9450554|
|             cnn|   554240.0|    51269|  1889320|
|           apple| 1.074693E7|     1954|425478119|
|            news|   247492.0|    10642|  1465011|
|         mukbang|1.0904772E7|     3618| 86169225|
|       education|  2684433.0|     6611| 17103736|
|       interview|  2966112.0|     2587| 22529756|
|          crypto|   404608.0|     1599| 11805668|
|   mathchemistry|  3328125.0|       25| 18496859|
|            food|  5252406.0|    47430| 48018833|
|    data science|   544772.0|      911|  3069097|
|        trolling|  1420141.0|     5388| 14286302|
|        tutorial|  6761032.0|    19323| 68512549|
|      literature|   863021.0|     2847|  4231789|
|             sat|  1065869.0| 

In [15]:
from pyspark.sql.functions import min as min_, max as max_

# Smallest and largest "Published At" value per keyword, displayed untruncated.
publication_range = df_video.groupBy("Keyword").agg(
    min_("Published At").alias("First Published"),
    max_("Published At").alias("Last Published"),
)
publication_range.show(truncate=False)


+----------------+---------------+--------------+
|Keyword         |First Published|Last Published|
+----------------+---------------+--------------+
|computer science|2009-08-20     |2022-08-12    |
|lofi            |2019-12-08     |2022-08-24    |
|finance         |2012-11-27     |2022-08-24    |
|cnn             |2022-07-14     |2022-08-24    |
|apple           |2016-11-02     |2022-08-24    |
|news            |2022-08-18     |2022-08-24    |
|mukbang         |2020-02-29     |2022-08-24    |
|education       |2008-07-25     |2022-08-24    |
|interview       |2016-01-05     |2022-08-24    |
|crypto          |2022-03-11     |2022-08-24    |
|mathchemistry   |2013-04-15     |2022-05-03    |
|food            |2017-05-31     |2022-08-24    |
|data science    |2018-06-23     |2022-08-24    |
|trolling        |2020-06-14     |2022-08-24    |
|tutorial        |2017-02-01     |2022-08-23    |
|literature      |2010-05-18     |2022-03-01    |
|sat             |2011-10-07     |2022-08-24    |


In [11]:
from pyspark.sql.functions import countDistinct

# Compare the total number of rows with the number of distinct titles.
# The column is referenced as "Title", spelled exactly as in the DataFrame
# schema, so the lookup does not rely on case-insensitive column resolution.
total_titles = df_video.count()
unique_titles = df_video.select(countDistinct("Title")).collect()[0][0]

print(f"Total de titles: {total_titles}")
print(f"Total de titles únicos: {unique_titles}")
print("Há diferença?", total_titles != unique_titles)


Total de titles: 1869
Total de titles únicos: 1854
Há diferença? True


In [12]:
# Row count per "Year", in ascending year order (orderBy sorts ascending by default).
videos_per_year = df_video.groupBy("Year").count()
videos_per_year.orderBy("Year").show()


+----+-----+
|Year|count|
+----+-----+
|2007|    2|
|2008|    1|
|2009|    9|
|2010|    6|
|2011|    4|
|2012|   12|
|2013|    6|
|2014|   10|
|2015|   15|
|2016|   34|
|2017|   47|
|2018|   57|
|2019|   86|
|2020|  158|
|2021|  229|
|2022| 1193|
+----+-----+



In [13]:
# Row count per ("Year", "Month") pair, sorted by year and then by month.
(
    df_video.groupBy("Year", "Month")
    .count()
    .orderBy("Year", "Month")
    .show()
)


+----+-----+-----+
|Year|Month|count|
+----+-----+-----+
|2007|    7|    1|
|2007|   12|    1|
|2008|    7|    1|
|2009|    2|    2|
|2009|    6|    2|
|2009|    7|    1|
|2009|    8|    1|
|2009|   10|    1|
|2009|   12|    2|
|2010|    3|    1|
|2010|    5|    2|
|2010|    6|    1|
|2010|    9|    1|
|2010|   10|    1|
|2011|    2|    1|
|2011|    5|    1|
|2011|    9|    1|
|2011|   10|    1|
|2012|    1|    1|
|2012|    2|    3|
+----+-----+-----+
only showing top 20 rows



In [14]:
from pyspark.sql.window import Window
from pyspark.sql.functions import mean

# Running window per keyword: every row from the start of the partition up to
# and including the current row.
# Ordering by "Year" alone leaves many ties (several rows share a year), and the
# relative order of tied rows is not guaranteed, so the running mean could
# differ between runs. The extra sort keys refine the order inside each year
# while keeping "Year" as the primary key, making the result reproducible.
# NOTE(review): assumes "Video ID" distinguishes rows within a keyword — confirm.
window_spec = (
    Window.partitionBy("Keyword")
    .orderBy("Year", "Month", "Published At", "Video ID")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Add the cumulative mean of "Likes" and preview the relevant columns.
df_video = df_video.withColumn("Cumulative Avg Likes", mean("Likes").over(window_spec))
df_video.select("Keyword", "Year", "Likes", "Cumulative Avg Likes").show()


+-------+----+--------+--------------------+
|Keyword|Year|   Likes|Cumulative Avg Likes|
+-------+----+--------+--------------------+
|animals|2009| 1357197|           1357197.0|
|animals|2010|   68133|            712665.0|
|animals|2010|  338601|            587977.0|
|animals|2013|11025176|          3197276.75|
|animals|2014| 1019385|           2761698.4|
|animals|2014| 5743875|  3258727.8333333335|
|animals|2019| 1103713|  2950868.5714285714|
|animals|2020|   74617|         2591337.125|
|animals|2020|    9313|  2304445.5555555555|
|animals|2020|  164337|           2090434.7|
|animals|2020|   94089|  1908948.7272727273|
|animals|2020|   21946|           1751698.5|
|animals|2020|   28863|  1619172.6923076923|
|animals|2020|   73362|   1508757.642857143|
|animals|2020|  282754|  1427024.0666666667|
|animals|2020| 6177588|        1723934.3125|
|animals|2021|   11323|  1623192.4705882352|
|animals|2021|   27102|  1534520.7777777778|
|animals|2021|  120500|  1460098.6315789474|
|animals|2