In [2]:
!pip install pyspark



In [28]:
# Importa bibliotecas necessárias do PySpark
import pyspark
from pyspark.sql import SparkSession  # Para criar a sessão Spark
from pyspark.sql.functions import *   # Para funções SQL como avg, max, min, count, format_number etc.
from pyspark.sql.window import Window # Para criar janelas para funções de janela

In [4]:
# Cria ou obtém uma sessão Spark existente
spark = SparkSession.builder.getOrCreate()

In [5]:
# Lê o arquivo Parquet com dados preparados dos vídeos, com cabeçalho
df_video = spark.read.option('header', True).parquet('/content/drive/MyDrive/aulas/videos-preparados-parquet')
# Exibe as primeiras linhas do DataFrame
df_video.show()

+--------------------+-----------+------------+----------------+------+--------+---------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|         Keyword| Likes|Comments|    Views|Interaction|Year|Month|Keyword Index|        Features PCA|            Features|     Features Normal|
+--------------------+-----------+------------+----------------+------+--------+---------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|         mukbang|378858|   18860| 17975269|   18372987|2020|    4|         30.0|[-1.7977902050831...|[378858.0,1.79752...|[0.02303716158264...|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|            news|  6379|    4853|   808787|     820019|2022|    8|         37.0|[-808813.5260760847]|[6379.0,808787.0,...|[3.87946679100418...|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-2

In [6]:
# Agrupa os dados pelo campo 'Keyword' e conta a quantidade de ocorrências, ordenando pela contagem em ordem decrescente
df_video.groupBy('Keyword').count().orderBy(desc('count')).show()

+----------------+-----+
|         Keyword|count|
+----------------+-----+
|             cnn|   50|
|       interview|   50|
|          crypto|   50|
|    data science|   50|
|        trolling|   50|
|        tutorial|   50|
|          marvel|   50|
|game development|   50|
|         mrbeast|   50|
|         physics|   50|
|             sat|   49|
|         history|   49|
|           cubes|   49|
|        reaction|   49|
|          sports|   49|
|            asmr|   49|
|computer science|   48|
|            food|   48|
|          how-to|   48|
|machine learning|   48|
+----------------+-----+
only showing top 20 rows



In [7]:
# Agrupa por 'Keyword' e calcula a média da coluna 'Interaction', formatando com 2 casas decimais,
# e ordena pela média de interação em ordem decrescente
df_video.groupBy('Keyword') \
        .agg(format_number(avg('Interaction'), 2).alias('Interaction Mean')) \
        .orderBy(desc('Interaction Mean')) \
        .show()

+----------------+----------------+
|         Keyword|Interaction Mean|
+----------------+----------------+
|         animals|   95,506,608.53|
|      literature|      881,726.50|
|          sports|    8,695,551.63|
|game development|      752,243.56|
|         finance|      708,542.95|
|          how-to|    7,975,134.50|
|        business|    7,310,180.02|
|         mrbeast|   68,965,862.82|
|        tutorial|    6,936,688.30|
|          marvel|    6,834,159.44|
|             cnn|      570,650.86|
|    data science|      562,465.28|
|             bed|   54,382,091.75|
|          gaming|      520,080.38|
|            food|    5,352,944.10|
|          crypto|      413,676.20|
|          movies|    4,897,436.32|
|         biology|    4,192,382.06|
|            lofi|    4,167,085.88|
|         physics|    3,795,529.38|
+----------------+----------------+
only showing top 20 rows



In [8]:
# Agrupa por 'Keyword' e obtém o valor máximo da coluna 'Interaction', formatado com 2 casas decimais,
# ordenando em ordem decrescente de interação máxima
df_video.groupBy('Keyword') \
        .agg(format_number(max('Interaction'), 2).alias('Rank Interactions')) \
        .orderBy(desc('Rank Interactions')) \
        .show()

+----------------+-----------------+
|         Keyword|Rank Interactions|
+----------------+-----------------+
|           music|   922,551,152.00|
|         finance|     9,677,571.00|
|         mukbang|    87,433,858.00|
|            lofi|    86,445,177.00|
|          gaming|     7,385,105.00|
|computer science|     7,199,563.00|
|        tutorial|    69,616,442.00|
|          movies|    65,253,870.00|
|game development|     6,692,070.00|
|          marvel|    56,247,330.00|
|             bed|   532,691,631.00|
|          how-to|    53,053,975.00|
|            food|    48,754,479.00|
|         history|   440,187,490.00|
|         physics|    43,463,298.00|
|           apple|   429,916,936.00|
|      literature|     4,294,863.00|
|            asmr|    34,411,125.00|
|        nintendo|    32,268,486.00|
|         mrbeast|   300,397,699.00|
+----------------+-----------------+
only showing top 20 rows



In [9]:
# Agrupa por 'Keyword' e calcula a média e a variância da coluna 'Views', formatando os números,
# e ordena pelo nome da palavra-chave em ordem decrescente
df_video.groupBy('Keyword') \
        .agg(
            format_number(avg('Views'), 2).alias('Views Mean'),
            format_number(variance('Views'), 2).alias('Variance Views')
        ) \
        .orderBy(desc('Keyword')) \
        .show()

+----------------+-------------+--------------------+
|         Keyword|   Views Mean|      Variance Views|
+----------------+-------------+--------------------+
|            xbox| 1,739,205.85|18,418,777,237,37...|
|        tutorial| 6,761,032.02|136,962,659,686,4...|
|        trolling| 1,420,141.02|6,932,651,793,973.29|
|            tech| 1,918,806.94|6,469,796,473,930.02|
|          sports| 8,601,204.73|309,771,202,558,8...|
|             sat| 1,065,868.71|8,285,094,966,049.21|
|        reaction|   149,439.94|   94,551,525,626.39|
|         physics| 3,692,387.28|47,414,096,705,62...|
|        nintendo| 1,665,042.25|22,531,339,745,29...|
|            news|   247,492.18|  106,751,257,667.26|
|           music|29,364,893.26|19,247,971,071,87...|
|         mukbang|10,904,772.36|558,607,323,897,3...|
|         mrbeast|66,764,003.98|3,824,123,679,605...|
|          movies| 4,862,426.45|151,688,590,780,7...|
|       minecraft| 1,784,606.79|24,351,962,909,48...|
|   mathchemistry| 3,328,125

In [10]:
# Agrupa por 'Keyword' e calcula a média, o valor máximo e o valor mínimo da coluna 'Views',
# formatando os números e ordenando pelo nome da palavra-chave em ordem decrescente
df_video.groupBy('Keyword') \
        .agg(
            format_number(avg('Views'), 2).alias('Views Mean'),
            format_number(max('Views'), 0).alias('Higher Views'),
            format_number(min('Views'), 0).alias('Lower Views')
          ) \
        .orderBy(desc('Keyword')) \
        .show()

+----------------+-------------+------------+-----------+
|         Keyword|   Views Mean|Higher Views|Lower Views|
+----------------+-------------+------------+-----------+
|            xbox| 1,739,205.85|  22,439,610|      4,368|
|        tutorial| 6,761,032.02|  68,512,549|     19,323|
|        trolling| 1,420,141.02|  14,286,302|      5,388|
|            tech| 1,918,806.94|  11,422,924|      3,461|
|          sports| 8,601,204.73| 106,014,469|        867|
|             sat| 1,065,868.71|  18,116,954|      7,163|
|        reaction|   149,439.94|   1,765,945|      9,533|
|         physics| 3,692,387.28|  42,252,029|     30,388|
|        nintendo| 1,665,042.25|  31,667,627|      2,225|
|            news|   247,492.18|   1,465,011|     10,642|
|           music|29,364,893.26| 915,457,091|      2,944|
|         mukbang|10,904,772.36|  86,169,225|      3,618|
|         mrbeast|66,764,003.98| 285,526,909|    889,300|
|          movies| 4,862,426.45|  65,067,408|      2,758|
|       minecr

In [14]:
# Agrupa por 'Keyword' e obtém as primeiras e últimas datas de publicação ('Published At')
df_video.groupBy('Keyword') \
        .agg(
            first('Published At').alias('First Published At'),
            last('Published At').alias('Last Published At')
        ) \
        .show()

+----------------+------------------+-----------------+
|         Keyword|First Published At|Last Published At|
+----------------+------------------+-----------------+
|computer science|        2022-02-08|       2020-09-08|
|            lofi|        2022-06-07|       2020-07-19|
|         finance|        2020-09-23|       2017-12-31|
|             cnn|        2022-08-17|       2022-08-13|
|           apple|        2022-08-22|       2022-08-02|
|            news|        2022-08-22|       2022-08-23|
|         mukbang|        2020-04-18|       2022-08-24|
|       education|        2015-02-06|       2010-10-14|
|       interview|        2021-08-03|       2018-10-05|
|          crypto|        2022-08-23|       2022-08-22|
|   mathchemistry|        2020-08-11|       2019-10-04|
|            food|        2022-07-17|       2022-08-20|
|    data science|        2019-08-18|       2021-08-06|
|        trolling|        2022-08-23|       2022-07-20|
|        tutorial|        2018-06-01|       2022

In [17]:
# Conta o total de títulos no DataFrame (pode conter duplicados)
total_titles = df_video.select('Title').count()
print(f"Total Titles: {total_titles}")

# Conta o número de títulos únicos no DataFrame (sem duplicados)
unique_titles = df_video.select('Title').distinct().count()
print(f"Unique Titles: {unique_titles}")

Total Titles: 1869
Unique Titles: 1854


In [20]:
# Agrupa por 'Year' e conta o total de vídeos por ano, ordenando por ano ascendente
df_video.groupBy('Year') \
        .agg(count('*').alias('Total Videos Per Year')) \
        .orderBy('Year') \
        .show()

+----+---------------------+
|Year|Total Videos Per Year|
+----+---------------------+
|2007|                    2|
|2008|                    1|
|2009|                    9|
|2010|                    6|
|2011|                    4|
|2012|                   12|
|2013|                    6|
|2014|                   10|
|2015|                   15|
|2016|                   34|
|2017|                   47|
|2018|                   57|
|2019|                   86|
|2020|                  158|
|2021|                  229|
|2022|                 1193|
+----+---------------------+



In [27]:
# Converte as colunas 'Year' e 'Month' para tipo inteiro, para facilitar ordenação e cálculos
df_video = df_video.withColumn('Year', col('Year').cast('int'))
df_video = df_video.withColumn('Month', col('Month').cast('int'))

# Agrupa por 'Year' e 'Month', conta total de vídeos por combinação ano-mês,
# ordena os resultados por ano e mês em ordem crescente e exibe até 100 linhas
df_video.groupBy('Year', 'Month') \
        .agg(count('*').alias('Total Videos Per Year And Month')) \
        .orderBy(col('Year').asc(), col('Month').asc()) \
        .show(100)

+----+-----+-------------------------------+
|Year|Month|Total Videos Per Year And Month|
+----+-----+-------------------------------+
|2007|    7|                              1|
|2007|   12|                              1|
|2008|    7|                              1|
|2009|    2|                              2|
|2009|    6|                              2|
|2009|    7|                              1|
|2009|    8|                              1|
|2009|   10|                              1|
|2009|   12|                              2|
|2010|    3|                              1|
|2010|    5|                              2|
|2010|    6|                              1|
|2010|    9|                              1|
|2010|   10|                              1|
|2011|    2|                              1|
|2011|    5|                              1|
|2011|    9|                              1|
|2011|   10|                              1|
|2012|    1|                              1|
|2012|    

In [31]:
# Define uma janela para cálculo acumulado por 'Keyword' ordenado por 'Year'
# Essa janela considera todas as linhas anteriores e a linha atual (cumulative)
window = Window.partitionBy('Keyword').orderBy('Year') \
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Cria uma nova coluna 'Cumulative Avg Likes' com a média acumulada da coluna 'Likes' por palavra-chave ao longo dos anos
df_resultado = df_video.withColumn('Cumulative Avg Likes', avg('Likes').over(window))

# Exibe as colunas 'Keyword', 'Year', 'Likes' e 'Cumulative Avg Likes', ordenadas por ano
df_resultado.select('Keyword', 'Year', 'Likes', 'Cumulative Avg Likes') \
          .orderBy('Year') \
          .show()

+----------------+----+-------+--------------------+
|         Keyword|Year|  Likes|Cumulative Avg Likes|
+----------------+----+-------+--------------------+
|             bed|2007| 515049|            515049.0|
|             bed|2007| 119272|            317160.5|
|       education|2008|  50074|             50074.0|
|         biology|2009| 132214|             75283.5|
|         animals|2009|1357197|           1357197.0|
|             bed|2009|1430457|   688259.3333333334|
|             bed|2009|1072689|           784366.75|
|             bed|2009| 160501|            659593.6|
|computer science|2009|  61813|             61813.0|
|         biology|2009|  18353|             18353.0|
|        business|2009| 229141|            229141.0|
|           cubes|2009|2307773|           2307773.0|
|         animals|2010| 338601|            587977.0|
|             bed|2010| 144517|            573747.5|
|         animals|2010|  68133|            712665.0|
|       education|2010| 130817|             90