In [1]:
!pip install -q pyspark

In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("Modulo Preparacao de Dados")
    .getOrCreate()
)


In [3]:
from google.colab import files

# Open the Colab file picker for a manual upload. The selected file is saved
# into the runtime (see the "Saving ... to ..." line printed by this cell).
uploaded = files.upload()


Saving videos-comments-tratados.snappy.parquet to videos-comments-tratados.snappy.parquet


In [6]:
import os

# Load the prepared video dataset.
# The upload cell reports saving "videos-comments-tratados.snappy.parquet",
# which is not the name hard-coded here, so a fresh Restart & Run All would
# fail with a path-not-found error. Keep the original path when it exists;
# otherwise fall back to the file that was uploaded (only when exactly one
# file was uploaded, so the choice is unambiguous).
video_parquet_path = "videos-tratados.snappy.parquet"
if not os.path.exists(video_parquet_path) and len(uploaded) == 1:
    video_parquet_path = next(iter(uploaded))

df_video = spark.read.parquet(video_parquet_path)
df_video.show()


+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments|  Views|Interaction|Year|             Comment|Sentiment|Likes Comment|
+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Let's not forget ...|        1|           95|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Here in NZ 50% of...|        0|           19|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|I will forever ac...|        2|          161|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Whenever I go to ...|        0|            8|
|wAZZ-UWGVHI|

In [7]:
from pyspark.sql.functions import month

# Derive the month number from the "Published At" date and store it as "Month".
published_month = month(df_video["Published At"])
df_video = df_video.withColumn("Month", published_month)

# Show the source date next to the derived month as a quick sanity check.
df_video.select("Published At", "Month").show()


+------------+-----+
|Published At|Month|
+------------+-----+
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-23|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
|  2022-08-24|    8|
+------------+-----+
only showing top 20 rows



In [9]:
from pyspark.ml.feature import StringIndexer

# Encode the categorical "Keyword" column as a numeric "Keyword Index" column.
# StringIndexer raises "Output column ... already exists" when the output
# column is already present, so discard any copy left by a previous run of
# this cell first (DataFrame.drop is a no-op when the column is absent).
# This keeps the cell safe to re-run.
keyword_input = df_video.drop("Keyword Index")

indexer = StringIndexer(inputCol="Keyword", outputCol="Keyword Index")
df_video = indexer.fit(keyword_input).transform(keyword_input)
df_video.select("Keyword", "Keyword Index").show()

+-------+-------------+
|Keyword|Keyword Index|
+-------+-------------+
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
|   tech|         17.0|
+-------+-------------+
only showing top 20 rows



In [11]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import IntegerType

# Columns combined into the "Features" vector.
feature_cols = ["Likes", "Views", "Year", "Month", "Keyword Index"]

# Cast the 'Year' column to IntegerType before it is used as a feature.
df_video = df_video.withColumn("Year", df_video["Year"].cast(IntegerType()))

# VectorAssembler's default handleInvalid="error" raises on null inputs, so
# rows with nulls in the feature columns are removed *before* assembling.
# A "Features" column left by a previous run is dropped as well (no-op when
# absent) so that re-running this cell does not fail with
# "Output column Features already exists".
assembler_input = df_video.na.drop(subset=feature_cols).drop("Features")

assembler = VectorAssembler(inputCols=feature_cols, outputCol="Features")

df_video = assembler.transform(assembler_input)
df_video.select("Features").show(truncate=False)

+-----------------------------------+
|Features                           |
+-----------------------------------+
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[3407.0,135612.0,2022.0,8.0,17.0]  |
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
|[76779.0,1758063.0,2022.0,8.0,17.0]|
+-----------------------------------+
only showing top 20 rows



In [12]:
from pyspark.ml.feature import Normalizer

# Keep only rows with no nulls in the columns that feed the feature vector.
df_video = df_video.na.drop(subset=["Likes", "Views", "Year", "Month", "Keyword Index"])

# Rescale each row's "Features" vector to unit L2 norm (p=2).
# NOTE(review): Normalizer works per row, not per column, so the largest
# feature ("Views" in the output below) dominates every normalized vector.
# Confirm that per-feature scaling (e.g. StandardScaler) was not intended.
normalizer = Normalizer(inputCol="Features", outputCol="Features Normal", p=2)

# Drop a "Features Normal" column left by a previous run (no-op when absent);
# otherwise transform() fails with "Output column ... already exists".
df_video = normalizer.transform(df_video.drop("Features Normal"))
df_video.select("Features Normal").show(truncate=False)


+--------------------------------------------------------------------------------------------------------+
|Features Normal                                                                                         |
+--------------------------------------------------------------------------------------------------------+
|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[0.02511243093431334,0.9995735203592899,0.014903826049070024,5.896667081728991E-5,1.2530417548674107E-4]|
|[0.02511243093431334,0.9995735203592

In [13]:
from pyspark.ml.feature import PCA

# Project the "Features" vector onto its first principal component (k=1).
# NOTE(review): the input is the unscaled "Features" column, not
# "Features Normal" — confirm this is intended, since PCA is scale-sensitive.
#
# Drop a "Features PCA" column left by a previous run (no-op when absent);
# otherwise fit()/transform() fail with "Output column ... already exists".
pca_input = df_video.drop("Features PCA")

pca = PCA(k=1, inputCol="Features", outputCol="Features PCA")
pca_model = pca.fit(pca_input)
df_video = pca_model.transform(pca_input)
df_video.select("Features PCA").show(truncate=False)


+---------------------+
|Features PCA         |
+---------------------+
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-135636.63188203107]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
|[-1758667.8498040342]|
+---------------------+
only showing top 20 rows



In [14]:
# Split the prepared rows 80/20 into training and test sets, passing a fixed
# seed (42) to randomSplit.
split_weights = [0.8, 0.2]
df_train, df_test = df_video.randomSplit(split_weights, seed=42)

# Report how many rows ended up on each side of the split.
train_rows = df_train.count()
test_rows = df_test.count()
print("Treino:", train_rows, "| Teste:", test_rows)


Treino: 14789 | Teste: 3620


In [15]:
from pyspark.ml.regression import LinearRegression

# Fit a linear regression that predicts "Comments" from the normalized
# feature vector, using the training split.
lr = LinearRegression(
    featuresCol="Features Normal",
    labelCol="Comments",
)
lr_model = lr.fit(df_train)

# Evaluate on the test split and report RMSE and R2.
test_results = lr_model.evaluate(df_test)
rmse = test_results.rootMeanSquaredError
print("RMSE:", rmse)
r2 = test_results.r2
print("R2:", r2)


RMSE: 43345.23343236093
R2: 0.008252984288786291
