In [1]:
# Instala o PySpark (caso rode em ambiente que precise, ex: Google Colab)
!pip install pyspark



In [33]:
# Importa as bibliotecas necessárias do PySpark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import month
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler, PCA
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
# Cria uma SparkSession, que é o ponto de entrada para usar Spark
spark = SparkSession.builder.getOrCreate()

In [6]:
# Lê um arquivo parquet contendo dados tratados de vídeos no Google Drive
df_video = spark.read.option('header', True).parquet('/content/drive/MyDrive/aulas/videos-tratados-parquet')
# Mostra as primeiras linhas do dataframe para visualização inicial
df_video.show(10)

In [16]:
# Cria uma nova coluna 'Month' extraída da coluna 'Published At' (mês da data de publicação)
df_video = df_video.withColumn('Month', month('Published At'))
df_video.show(10)

+--------------------+-----------+------------+----------------+------+--------+---------+-----------+----+-----+-------------+
|               Title|   Video ID|Published At|         Keyword| Likes|Comments|    Views|Interaction|Year|Month|Keyword Index|
+--------------------+-----------+------------+----------------+------+--------+---------+-----------+----+-----+-------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|         mukbang|378858|   18860| 17975269|   18372987|2020|    4|         30.0|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|            news|  6379|    4853|   808787|     820019|2022|    8|         37.0|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|            news|  1029|    2347|    97434|     100810|2022|    8|         37.0|
|Celebrating My 40...|-64r1hcxtV4|  2022-05-30|         mukbang| 45628|   17264|  5283664|    5346556|2022|    5|         30.0|
|Physics Review - ...|-6IgkG5yZfo|  2017-01-02|         physics| 10959|     525|   844015|     855499|20

In [14]:
# Converte a coluna categórica 'Keyword' para índices numéricos, facilitando o uso em ML
indexer = StringIndexer(inputCol='Keyword', outputCol='Keyword Index')
df_video = indexer.fit(df_video).transform(df_video)
df_video.show(10)

+--------------------+-----------+------------+---------+------+--------+--------+-----------+----+-----+-------------+
|               Title|   Video ID|Published At|  Keyword| Likes|Comments|   Views|Interaction|Year|month|Keyword Index|
+--------------------+-----------+------------+---------+------+--------+--------+-----------+----+-----+-------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|  mukbang|378858|   18860|17975269|   18372987|2020|    4|         30.0|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|     news|  6379|    4853|  808787|     820019|2022|    8|         37.0|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|     news|  1029|    2347|   97434|     100810|2022|    8|         37.0|
|Celebrating My 40...|-64r1hcxtV4|  2022-05-30|  mukbang| 45628|   17264| 5283664|    5346556|2022|    5|         30.0|
|Physics Review - ...|-6IgkG5yZfo|  2017-01-02|  physics| 10959|     525|  844015|     855499|2017|    1|          7.0|
|Eating ONLY KOREA...|-7hzaGya86g|  2022

In [30]:
# Define as colunas que serão usadas para criar o vetor de características (features)
feature_cols = ['Likes', 'Views', 'Year', 'Month', 'Keyword Index']

# Remove linhas com valores nulos nas colunas usadas para features (obrigatório para alguns transformadores)
df_video = df_video.na.drop(subset=feature_cols)

# Remove a coluna 'Features' existente caso já exista para evitar conflito
if 'Features' in df_video.columns:
    df_video = df_video.drop('Features')

# Cria um vetor de características chamado 'Features' a partir das colunas selecionadas
assembler = VectorAssembler(inputCols=feature_cols, outputCol='Features')
df_video = assembler.transform(df_video)

In [31]:
# Remoção de linhas com nulos na coluna 'Features' (corrigindo seu erro: estava usando variável 'features' minúscula)
df_video = df_video.na.drop(subset=['Features'])

# Remove a coluna 'Features Normal' caso exista (para evitar conflito)
if 'Features Normal' in df_video.columns:
    df_video = df_video.drop('Features Normal')

# Aplica a normalização MinMax para escalar os dados entre 0 e 1, produzindo 'Features Normal'
scaler = MinMaxScaler(inputCol='Features', outputCol='Features Normal')
scaler_model = scaler.fit(df_video)
df_video = scaler_model.transform(df_video)
df_video.show(10)

+--------------------+-----------+------------+---------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|  Keyword| Likes|Comments|   Views|Interaction|Year|Month|Keyword Index|        Features PCA|            Features|     Features Normal|
+--------------------+-----------+------------+---------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|  mukbang|378858|   18860|17975269|   18372987|2020|    4|         30.0|[-1.7977902050831...|[378858.0,1.79752...|[0.02303716158264...|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|     news|  6379|    4853|  808787|     820019|2022|    8|         37.0|[-808813.5260760847]|[6379.0,808787.0,...|[3.87946679100418...|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|     news|  1029|    2347|   97434|   

In [27]:
# Aplica PCA para reduzir dimensionalidade (k=1), gerando uma nova coluna 'Features PCA'
pca = PCA(k=1, inputCol='Features', outputCol='Features PCA')
pca_model = pca.fit(df_video)
df_video = pca_model.transform(df_video)
df_video.show(10)

+--------------------+-----------+------------+---------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|  Keyword| Likes|Comments|   Views|Interaction|Year|Month|Keyword Index|            Features|     Features Normal|        Features PCA|
+--------------------+-----------+------------+---------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|  mukbang|378858|   18860|17975269|   18372987|2020|    4|         30.0|[378858.0,18860.0...|[0.02303716158264...|[-1.7977902050831...|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|     news|  6379|    4853|  808787|     820019|2022|    8|         37.0|[6379.0,4853.0,80...|[3.87946679100418...|[-808813.5260760847]|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|     news|  1029|    2347|   97434|   

In [32]:
# Divide o dataset em 80% para treino e 20% para teste, com seed fixo para reprodutibilidade
df_train, df_test = df_video.randomSplit([0.8, 0.2], seed=42)
print(f'Train: {df_train.count()}')
print(f'Test: {df_test.count()}')

Train: 1541
Test: 328


In [35]:
# Cria um modelo de regressão linear para prever o valor da coluna 'Comments'
# usando como features a coluna normalizada 'Features Normal'
lr = LinearRegression(featuresCol='Features Normal', labelCol='Comments')
lr_model = lr.fit(df_train)

# Aplica o modelo nos dados de teste para gerar previsões
predictions = lr_model.transform(df_test)

# Exibe as primeiras previsões junto com os valores reais
predictions.select('Comments', 'prediction').show(10)

# Avalia o modelo calculando RMSE (erro quadrático médio) e R2 (coeficiente de determinação)
evaluator_rmse = RegressionEvaluator(labelCol='Comments', predictionCol='prediction', metricName='rmse')
evaluator_r2 = RegressionEvaluator(labelCol='Comments', predictionCol='prediction', metricName='r2')
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

+--------+------------------+
|Comments|        prediction|
+--------+------------------+
|    4043| 6226.043963974927|
|     595|-709.5924419754601|
|    7921|1894.1868192825377|
|    1035|-360.3658439220635|
|     292|1148.0004563140067|
|    2293|1064.2382039989695|
|      37|337.60873773678986|
|     199| 810.2822547693347|
|  131040|132684.92433314162|
|    7270|17455.702984320327|
+--------+------------------+
only showing top 10 rows

RMSE: 8949.141405909737
R2: 0.8505524810197589


In [36]:
# Salva o dataframe final, já processado e com colunas de features, em formato parquet
df_video.write.mode('overwrite').parquet('/content/drive/MyDrive/aulas/videos-preparados-parquet')

In [37]:
# Encerra a sessão Spark para liberar recursos
spark.stop()