In [1]:
from pyspark.sql.types import *
import pandas as pd
import pyspark 
import os 
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, sum as spark_sum
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


In [2]:
# Start a Spark session
spark = SparkSession.builder.appName("s33ding").getOrCreate()

# Read the Parquet file into a DataFrame
df = spark.read.parquet("dataset/enem.parquet")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/10 16:55:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/10 16:55:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [3]:
# Assemble the features
assembler = VectorAssembler(
    inputCols=["NOTA_CH_CIENCIAS_HUMANAS", "NOTA_LC_LINGUAGENS_E_CODIGOS", "NOTA_MT_MATEMATICA", "NOTA_REDACAO", "NOTA_MEDIA_5_NOTAS"],
    outputCol="features"
)
df = assembler.transform(df)

In [4]:
df.show()

23/06/10 16:56:03 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+------+------------+----------------+---------------+---------+--------------------+-------------+--------------------+--------------------+---------------+-------------+--------------+----------------------+------------------+---------------+-----------+-----------+------------------+--------------------+----------------------------+------------------------+----------------------------+------------------+------------+------------------+--------------------+
|NU_ANO|NU_INSCRICAO| TP_FAIXA_ETARIA|Idade_Calculada|  TP_SEXO|     TP_ESTADO_CIVIL|  TP_COR_RACA|    TP_NACIONALIDADE|     TP_ST_CONCLUSAO|TP_ANO_CONCLUIU|    TP_ESCOLA|     TP_ENSINO|TP_DEPENDENCIA_ADM_ESC|TP_LOCALIZACAO_ESC|TP_SIT_FUNC_ESC|CO_UF_PROVA|SG_UF_PROVA|CO_MUNICIPIO_PROVA|  NO_MUNICIPIO_PROVA|NOTA_CN_CIENCIAS_DA_NATUREZA|NOTA_CH_CIENCIAS_HUMANAS|NOTA_LC_LINGUAGENS_E_CODIGOS|NOTA_MT_MATEMATICA|NOTA_REDACAO|NOTA_MEDIA_5_NOTAS|            features|
+------+------------+----------------+---------------+---------+--------

                                                                                

In [5]:
# Split the data into training and testing sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

# Linear Regression model
linear_regression = LinearRegression(labelCol="NOTA_CN_CIENCIAS_DA_NATUREZA")
linear_regression_model = linear_regression.fit(train_data)

# Decision Tree Regression model
decision_tree = DecisionTreeRegressor(labelCol="NOTA_CN_CIENCIAS_DA_NATUREZA")
decision_tree_model = decision_tree.fit(train_data)

# Random Forest Regression model
random_forest = RandomForestRegressor(labelCol="NOTA_CN_CIENCIAS_DA_NATUREZA")
random_forest_model = random_forest.fit(train_data)

# Make predictions
linear_regression_predictions = linear_regression_model.transform(test_data)
decision_tree_predictions = decision_tree_model.transform(test_data)
random_forest_predictions = random_forest_model.transform(test_data)

# Join the predictions of all models
joined_predictions = linear_regression_predictions \
    .join(decision_tree_predictions, "NOTA_CN_CIENCIAS_DA_NATUREZA") \
    .join(random_forest_predictions, "NOTA_CN_CIENCIAS_DA_NATUREZA") \
    .select(
        linear_regression_predictions["NOTA_CN_CIENCIAS_DA_NATUREZA"],
        linear_regression_predictions["prediction"].alias("Linear Regression"),
        decision_tree_predictions["prediction"].alias("Decision Tree"),
        random_forest_predictions["prediction"].alias("Random Forest")
    )

# Display the comparison table
joined_predictions.show()

23/06/10 16:56:08 WARN Instrumentation: [a0d8642e] regParam is zero, which might cause numerical instability and overfitting.
23/06/10 16:56:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/06/10 16:56:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
23/06/10 16:56:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
[Stage 34:>                                                         (0 + 1) / 1]

+----------------------------+-----------------+-----------------+------------------+
|NOTA_CN_CIENCIAS_DA_NATUREZA|Linear Regression|    Decision Tree|     Random Forest|
+----------------------------+-----------------+-----------------+------------------+
|                       607.1|607.0999626637774|615.0572091725065| 608.8109228035678|
|                       607.1|607.0999626637774|615.0572091725065| 611.2194083932978|
|                       607.1|607.0999626637774|615.0572091725065| 608.8502278198961|
|                       607.1|607.0999626637774|615.0572091725065|  616.878763796022|
|                       607.1|607.0999626637774|615.0572091725065| 599.2841532149544|
|                       607.1|607.0999626637774|615.0572091725065| 539.9509205585699|
|                       607.1|607.0999626637774|615.0572091725065| 566.1729586801608|
|                       607.1|607.0999626637774|615.0572091725065| 516.2101385738076|
|                       607.1|607.0999626637774|615.05

                                                                                

In [7]:
import matplotlib.pyplot as plt

# Order the joined DataFrame by NOTA_CN_CIENCIAS_DA_NATUREZA
ordered_predictions_pd = joined_predictions.orderBy("NOTA_CN_CIENCIAS_DA_NATUREZA")
ordered_predictions_pd = joined_predictions.sample(False, 0.1).toPandas()

# Create the line graph using matplotlib
plt.figure(figsize=(10, 6))
plt.plot(ordered_predictions_pd["NOTA_CN_CIENCIAS_DA_NATUREZA"], ordered_predictions_pd["Linear Regression"], color="red", label="Linear Regression")
plt.plot(ordered_predictions_pd["NOTA_CN_CIENCIAS_DA_NATUREZA"], ordered_predictions_pd["Decision Tree"], color="blue", label="Decision Tree")
plt.plot(ordered_predictions_pd["NOTA_CN_CIENCIAS_DA_NATUREZA"], ordered_predictions_pd["Random Forest"], color="green", label="Random Forest")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Comparison of Predicted Grades from Different Models")
plt.legend()
plt.show()


ConnectionRefusedError: [Errno 111] Connection refused