In [1]:
from pyspark.sql.types import *
import pandas as pd
import pyspark 
import os 
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, sum as spark_sum
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


In [11]:
# Get (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("s33ding")
    .getOrCreate()
)

# Load the semicolon-delimited CSV (the source is CSV, not Parquet): the first
# row supplies the column names and column types are inferred from the data.
df = (
    spark.read
    .options(sep=';', inferSchema=True, header=True)
    .csv("dataset/score_cn.csv")
)

In [15]:
# Keep only the four score columns used in this notebook and preview the result.
selected_cols = [
    "NOTA_CH_CIENCIAS_HUMANAS",
    "NOTA_LC_LINGUAGENS_E_CODIGOS",
    "NOTA_MT_MATEMATICA",
    "NOTA_REDACAO",
]
df = df.select(selected_cols)
df.show()

+------------------------+----------------------------+------------------+------------+
|NOTA_CH_CIENCIAS_HUMANAS|NOTA_LC_LINGUAGENS_E_CODIGOS|NOTA_MT_MATEMATICA|NOTA_REDACAO|
+------------------------+----------------------------+------------------+------------+
|                   385.4|                       461.4|             493.4|         500|
|                   562.8|                       590.4|             577.1|         580|
|                   487.3|                       447.5|             431.4|         500|
|                   427.1|                       478.2|             458.9|         540|
|                   465.5|                       507.4|             455.1|        null|
|                   705.0|                       622.6|             645.1|         700|
|                   485.6|                       495.9|             536.0|         540|
|                   518.9|                       561.5|             555.0|         560|
|                   542.2|      

In [6]:
# Register each model's predictions as a temporary view so they can be
# combined with Spark SQL.
lr_predictions.createOrReplaceTempView("lr_predictions")
dt_predictions.createOrReplaceTempView("dt_predictions")
rf_predictions.createOrReplaceTempView("rf_predictions")

# Put the three models' predictions side by side: every column of the linear
# regression output, plus the decision tree and random forest predictions
# under distinct column names.
# NOTE(review): the join key is the `features` vector, so any rows that share
# an identical feature vector are matched against each other and multiplied in
# the result. Presumably all three frames come from the same test split; if
# so, carrying a unique row id through the pipeline and joining on it would be
# safer -- confirm against the cells that build the predictions.
joined_predictions = spark.sql("""
    SELECT lr_predictions.*, dt_predictions.prediction AS dt_prediction, rf_predictions.prediction AS rf_prediction
    FROM lr_predictions
    JOIN dt_predictions ON lr_predictions.features = dt_predictions.features
    JOIN rf_predictions ON lr_predictions.features = rf_predictions.features
""")

# Preview the joined predictions
joined_predictions.show()

# Persist for the dashboards. The directory is `models`, matching where the
# model-comparison table is written (the path previously read "moldes").
joined_predictions.write.mode('overwrite').parquet('data_for_dashboards/models/joined_predictions.parquet')

                                                                                

+--------------------+------------------------+------------------+------------------+------------------+
|            features|NOTA_CH_CIENCIAS_HUMANAS|        prediction|     dt_prediction|     rf_prediction|
+--------------------+------------------------+------------------+------------------+------------------+
|[438.600006103515...|                   312.0| 427.2173158397308|445.56465763382846| 433.7275720494611|
|[402.600006103515...|                   315.8|441.09851806307626|429.52856984921965|451.90907202865935|
|[403.0,470.700012...|                   318.9| 433.0226707416916|429.52856984921965| 445.6404117874622|
|[397.799987792968...|                   322.7|  427.406093463856|429.52856984921965| 440.6019828343871|
|[393.799987792968...|                   324.9| 399.7897680506144|429.52856984921965| 428.4943453442205|
|[431.600006103515...|                   326.7|  425.010834104182|429.52856984921965|433.84513935371905|
|[342.5,395.299987...|                   328.4| 372.846

                                                                                

In [9]:
# Compare the three regressors on the same label with two error metrics.
# `RegressionEvaluator` comes from the notebook's top import cell. A single
# evaluator is reused and the metric is chosen per call by overriding its
# `metricName` param.
evaluator = RegressionEvaluator(labelCol="NOTA_CH_CIENCIAS_HUMANAS", predictionCol="prediction")

# Mean squared error per model
lr_mse = evaluator.evaluate(lr_predictions, {evaluator.metricName: "mse"})
dt_mse = evaluator.evaluate(dt_predictions, {evaluator.metricName: "mse"})
rf_mse = evaluator.evaluate(rf_predictions, {evaluator.metricName: "mse"})

# Mean absolute error per model
lr_mae = evaluator.evaluate(lr_predictions, {evaluator.metricName: "mae"})
dt_mae = evaluator.evaluate(dt_predictions, {evaluator.metricName: "mae"})
rf_mae = evaluator.evaluate(rf_predictions, {evaluator.metricName: "mae"})

# One row per (model, metric) pair. The metric kind is encoded in the label,
# so the "Metric" column holds MSE values in the first three rows and MAE
# values in the last three.
data_comparing_models = [
    ("Linear Regression - MSE", lr_mse),
    ("Decision Tree Regression - MSE", dt_mse),
    ("Random Forest Regression - MSE", rf_mse),
    ("Linear Regression - MAE", lr_mae),
    ("Decision Tree Regression - MAE", dt_mae),
    ("Random Forest Regression - MAE", rf_mae)
]

# Explicit schema: a string label and a double-valued metric
schema_comparing_models = StructType([
    StructField("Model", StringType(), True),
    StructField("Metric", DoubleType(), True)
])

# Build the comparison table
df_comparing_models = spark.createDataFrame(data_comparing_models, schema_comparing_models)

# Persist the comparison table for the dashboards, replacing any previous run
df_comparing_models.write.mode('overwrite')\
    .parquet('data_for_dashboards/models/comparing_models.parquet')

# Show the comparison table
df_comparing_models.show()

                                                                                

+--------------------+------------------+
|               Model|            Metric|
+--------------------+------------------+
|Linear Regression...|3486.2284549459614|
|Decision Tree Reg...|3575.3004645918954|
|Random Forest Reg...|3557.4326972395693|
|Linear Regression...| 46.53917523663759|
|Decision Tree Reg...| 47.50908573152333|
|Random Forest Reg...| 47.89572724833537|
+--------------------+------------------+



In [63]:
# Directory where the fitted linear regression model is persisted
model_path = "models/linear_regression_model"

# Save the Linear Regression model. A plain `save()` raises if the target path
# already exists, which breaks every re-run of the notebook; going through the
# writer with `overwrite()` replaces the previous export instead.
lr_model.write().overwrite().save(model_path)