In [43]:
from pyspark.sql.types import *
import pandas as pd
import pyspark 
import os 
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, sum as spark_sum
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


In [44]:
# Start a Spark session
spark = SparkSession.builder.appName("s33ding").getOrCreate()

# Read the Parquet file into a DataFrame
df = spark.read.parquet("dataset/enem.parquet")

In [50]:
# Select the relevant columns
selected_cols = ["NOTA_CH_CIENCIAS_HUMANAS", "NOTA_LC_LINGUAGENS_E_CODIGOS", "NOTA_MT_MATEMATICA", "NOTA_REDACAO"]
df_selected = df.select(selected_cols)

# Handle missing or null values (if any)
df_selected = df_selected.na.drop()

# Split the dataset into training and testing sets
train_data, test_data = df_selected.randomSplit([0.8, 0.2], seed=42)

# Prepare the feature vector and the target column
assembler = VectorAssembler(inputCols=["NOTA_LC_LINGUAGENS_E_CODIGOS", "NOTA_MT_MATEMATICA", "NOTA_REDACAO"], outputCol="features")
train_data = assembler.transform(train_data).select("features", "NOTA_CH_CIENCIAS_HUMANAS")
test_data = assembler.transform(test_data).select("features", "NOTA_CH_CIENCIAS_HUMANAS")

# Train the machine learning models
lr = LinearRegression(labelCol="NOTA_CH_CIENCIAS_HUMANAS")
lr_model = lr.fit(train_data)

dt = DecisionTreeRegressor(labelCol="NOTA_CH_CIENCIAS_HUMANAS")
dt_model = dt.fit(train_data)

rf = RandomForestRegressor(labelCol="NOTA_CH_CIENCIAS_HUMANAS")
rf_model = rf.fit(train_data)

# Make predictions using the trained models
lr_predictions = lr_model.transform(test_data)
dt_predictions = dt_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)

23/06/12 02:35:00 WARN Instrumentation: [6c758799] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

In [53]:
# Create temporary views for the predictions
lr_predictions.createOrReplaceTempView("lr_predictions")
dt_predictions.createOrReplaceTempView("dt_predictions")
rf_predictions.createOrReplaceTempView("rf_predictions")

# Join the prediction views
joined_predictions = spark.sql("""
    SELECT lr_predictions.*, dt_predictions.prediction AS dt_prediction, rf_predictions.prediction AS rf_prediction
    FROM lr_predictions
    JOIN dt_predictions ON lr_predictions.features = dt_predictions.features
    JOIN rf_predictions ON lr_predictions.features = rf_predictions.features
""")

# Show the joined predictions
joined_predictions.show()


                                                                                

+--------------------+------------------------+------------------+------------------+------------------+
|            features|NOTA_CH_CIENCIAS_HUMANAS|        prediction|     dt_prediction|     rf_prediction|
+--------------------+------------------------+------------------+------------------+------------------+
|[438.600006103515...|                   312.0| 427.2173158397308|431.31284618680303| 433.4427031739048|
|[402.600006103515...|                   315.8|441.09851806307626|431.31284618680303|452.26744537016594|
|[403.0,470.700012...|                   318.9| 433.0226707416916|431.31284618680303| 448.4609288775179|
|[397.799987792968...|                   322.7|  427.406093463856|431.31284618680303| 445.1902182127048|
|[393.799987792968...|                   324.9| 399.7897680506144|431.31284618680303|428.91260725591854|
|[431.600006103515...|                   326.7|  425.010834104182|431.31284618680303|  434.390219283859|
|[342.5,395.299987...|                   328.4| 372.846

In [61]:
from pyspark.ml.evaluation import RegressionEvaluator

# Define the evaluator
evaluator = RegressionEvaluator(labelCol="NOTA_CH_CIENCIAS_HUMANAS", predictionCol="prediction")

# Calculate MSE for linear regression
lr_mse = evaluator.evaluate(lr_predictions, {evaluator.metricName: "mse"})

# Calculate MSE for decision tree regression
dt_mse = evaluator.evaluate(dt_predictions, {evaluator.metricName: "mse"})

# Calculate MSE for random forest regression
rf_mse = evaluator.evaluate(rf_predictions, {evaluator.metricName: "mse"})

# Calculate MAE for linear regression
lr_mae = evaluator.evaluate(lr_predictions, {evaluator.metricName: "mae"})

# Calculate MAE for decision tree regression
dt_mae = evaluator.evaluate(dt_predictions, {evaluator.metricName: "mae"})

# Calculate MAE for random forest regression
rf_mae = evaluator.evaluate(rf_predictions, {evaluator.metricName: "mae"})

# Print the MSE and MAE values for comparison
print("Linear Regression - MSE:", lr_mse)
print("Decision Tree Regression - MSE:", dt_mse)
print("Random Forest Regression - MSE:", rf_mse)
print("Linear Regression - MAE:", lr_mae)
print("Decision Tree Regression - MAE:", dt_mae)
print("Random Forest Regression - MAE:", rf_mae)




Linear Regression - MSE: 3486.2284549459614
Decision Tree Regression - MSE: 3570.4170485784316
Random Forest Regression - MSE: 3525.2685756522715
Linear Regression - MAE: 46.53917523663759
Decision Tree Regression - MAE: 47.47608486753266
Random Forest Regression - MAE: 47.557332279815576


                                                                                

In [63]:
# Specify the path to save the model
model_path = "models/linear_regression_model"

# Save the Linear Regression model
lr_model.save(model_path)