In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType, LongType, IntegerType
from pyspark.sql.functions import when, col, min, max, month, count
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, MinMaxScaler, PCA, VectorAssembler

In [8]:
# Obtain the Spark session used by every cell below; getOrCreate() returns the
# already-active session when there is one instead of starting a second.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [9]:
# Load the treated videos dataset from the 'videos-tratados-parquet' path into 'df_video'.
# 'schema' declares the column layout this notebook expects from that dataset.
schema = StructType([
    StructField('Title', StringType(), True),
    StructField('Video ID', StringType(), True),
    StructField('Published At', DateType(), True),
    StructField('Keyword', StringType(), True),
    StructField('Likes', LongType(), True),
    StructField('Comments', LongType(), True),
    StructField('Views', LongType(), True),
    StructField('Interaction', LongType(), True),
    StructField('Year', IntegerType(), True),
])

# The schema must be attached with .schema(): keyword arguments to
# DataFrameReader.parquet() are treated as Parquet data-source options, so a
# 'schema=' keyword is not applied, and 'header' is a CSV-only option.
df_video = spark.read.schema(schema).parquet('videos-tratados-parquet')

# Print the resulting schema to confirm the column names and types.
df_video.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: long (nullable = true)
 |-- Comments: long (nullable = true)
 |-- Views: long (nullable = true)
 |-- Interaction: long (nullable = true)
 |-- Year: integer (nullable = true)



In [10]:
# Derive a 'Month' column holding the month number extracted from 'Published At'.
published_month = month(df_video['Published At'])
df_video = df_video.withColumn('Month', published_month)

In [11]:
# Encode the string column 'Keyword' as a numeric 'Keyword Index' column so it
# can later be placed in a feature vector.
indexador = StringIndexer(
    inputCol='Keyword',
    outputCol='Keyword Index',
)

# Fit the label-to-index mapping on df_video, then append the indexed column.
modelo_indexer = indexador.fit(df_video)
df_video = modelo_indexer.transform(df_video)

In [12]:
# Build one aggregate per column that counts the rows where that column is null.
# when() without otherwise() yields null for non-matching rows, and count()
# skips nulls, so each aggregate is the number of missing values in its column.
null_counters = []
for column_name in df_video.columns:
    is_missing = col(column_name).isNull()
    null_counters.append(count(when(is_missing, column_name)).alias(column_name))

df_video_countnull = df_video.select(null_counters)

# Display the single-row summary of null counts.
df_video_countnull.show()

+-----+--------+------------+-------+-----+--------+-----+-----------+----+-----+-------------+
|Title|Video ID|Published At|Keyword|Likes|Comments|Views|Interaction|Year|Month|Keyword Index|
+-----+--------+------------+-------+-----+--------+-----+-----------+----+-----+-------------+
|    0|       0|           0|      0|    0|       0|    0|          0|   0|    0|            0|
+-----+--------+------------+-------+-----+--------+-----+-----------+----+-----+-------------+



In [13]:
# Drop every row that contains a null value before assembling features.
df_video = df_video.dropna()

# Columns combined into the single vector column 'Features'.
feature_columns = ['Likes', 'Views', 'Year', 'Month', 'Keyword Index']
vetor = VectorAssembler(inputCols=feature_columns, outputCol='Features')

# Append the assembled 'Features' vector column to df_video.
df_video = vetor.transform(df_video)

In [14]:
# Rescale the 'Features' vector with MinMaxScaler (default output range) and
# store the result in a new 'Features Normal' column.
scaler = MinMaxScaler(
    inputCol='Features',
    outputCol='Features Normal',
)

# Fitting collects the per-feature minimum and maximum from df_video;
# transforming then appends the rescaled vector column.
modelo_scaler = scaler.fit(df_video)
df_video = modelo_scaler.transform(df_video)

In [15]:
# Project the 5-dimensional 'Features Normal' vector onto a single principal
# component (k=1) and store it in a new 'Features PCA' column.
pca = PCA(
    k=1,
    inputCol='Features Normal',
    outputCol='Features PCA',
)

# Fit the projection on df_video, then append the reduced column.
modelo_pca = pca.fit(df_video)
df_video = modelo_pca.transform(df_video)

In [16]:
# Randomly split df_video into a training set (weight 0.8) and a test set
# (weight 0.2); the fixed seed keeps the split repeatable.
# NOTE(review): the StringIndexer, MinMaxScaler and PCA in the cells above were
# fitted on the full df_video before this split, so test rows contributed to
# those transforms -- confirm whether that is acceptable for this evaluation.
split_weights = [0.8, 0.2]
train_data, test_data = df_video.randomSplit(split_weights, seed=42)

# Report how many rows ended up in each partition.
n_train = train_data.count()
n_test = test_data.count()
print(f'Treino: {n_train} Teste: {n_test}')

Treino: 1551 Teste: 330


In [17]:
# Fit a linear regression that predicts 'Comments' from the normalized feature
# vector 'Features Normal' (the 'Features PCA' column is not used by this model).
regressao_linear = LinearRegression(featuresCol='Features Normal', labelCol='Comments')
modelo_lr = regressao_linear.fit(train_data)

# Evaluate the fitted model on the held-out test set.
avaliar_test = modelo_lr.evaluate(test_data)

# rootMeanSquaredError is the RMSE (root of the mean squared error); the label
# is spelled accordingly so the printed metric name matches the value shown.
print('RMSE (Raiz do Erro Quadrático Médio):', avaliar_test.rootMeanSquaredError)
print('r2 (Coeficiente de Determinação):', avaliar_test.r2)

MSRE (Erro Quadratico Médio da Raiz): 5832.752938141989
r2 (Coeficiente de Determinação): 0.9282984934013104


In [18]:
# Persist df_video (including the derived feature columns) to
# 'videos-preparados-parquet' in Parquet format, replacing any existing output.
# Parquet stores the schema with the data, so the CSV-oriented 'header' and
# 'inferSchema' options are not needed here and have been dropped.
(
    df_video.write
    .mode('overwrite')
    .parquet('videos-preparados-parquet')
)