In [47]:
from pyspark.sql.types import *
import pandas as pd
import pyspark 
import os 
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, sum as spark_sum
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegressionModel


In [48]:
# Start a Spark session
spark = SparkSession.builder.appName("s33ding").getOrCreate()

# Read the Parquet file into a DataFrame
df = spark.read.csv("dataset/score_cn.csv", sep=';',inferSchema=True,header=True)

In [49]:
# Select the relevant columns
selected_cols = ["NOTA_CH_CIENCIAS_HUMANAS", "NOTA_LC_LINGUAGENS_E_CODIGOS", "NOTA_MT_MATEMATICA", "NOTA_REDACAO"]
df = df.select(*selected_cols)

# filling nulls
for my_col in selected_cols:
    print('col:',my_col)
    # Calculate the mean value for the column
    mean_val = df.agg({my_col: "mean"}).collect()[0][0]
    mean_val = round(mean_val,1)
    # Fill missing values with the mean
    df = df.na.fill(mean_val, [my_col])
    df = df.withColumn(my_col, df[my_col].cast('float'))

#rmv outliers
for my_col in selected_cols:
    # Calculate values used for outlier filtering
    mean_val = df.agg({my_col: "mean"}).collect()[0][0]
    stddev_val = df.agg({my_col: "stddev"}).collect()[0][0]

    # Create three standard deviation (μ ± 3σ) lower and upper bounds for data
    low_bound = mean_val - (3 * stddev_val)
    hi_bound = mean_val + (3 * stddev_val)

    # Filter the data to fit between the lower and upper bounds
    df = df.where((df[my_col] < hi_bound) & (df[my_col] > low_bound))

df.show()
df.dtypes

col: NOTA_CH_CIENCIAS_HUMANAS
col: NOTA_LC_LINGUAGENS_E_CODIGOS
col: NOTA_MT_MATEMATICA
col: NOTA_REDACAO
+------------------------+----------------------------+------------------+------------+
|NOTA_CH_CIENCIAS_HUMANAS|NOTA_LC_LINGUAGENS_E_CODIGOS|NOTA_MT_MATEMATICA|NOTA_REDACAO|
+------------------------+----------------------------+------------------+------------+
|                   385.4|                       461.4|             493.4|       500.0|
|                   562.8|                       590.4|             577.1|       580.0|
|                   487.3|                       447.5|             431.4|       500.0|
|                   427.1|                       478.2|             458.9|       540.0|
|                   465.5|                       507.4|             455.1|       610.0|
|                   705.0|                       622.6|             645.1|       700.0|
|                   485.6|                       495.9|             536.0|       540.0|
|             

[('NOTA_CH_CIENCIAS_HUMANAS', 'float'),
 ('NOTA_LC_LINGUAGENS_E_CODIGOS', 'float'),
 ('NOTA_MT_MATEMATICA', 'float'),
 ('NOTA_REDACAO', 'float')]

In [50]:
# Import the necessary libraries
from pyspark.ml.regression import RandomForestRegressionModel

# Specify the path of the saved Random Forest model
model_path = "models/nota_ch_ciencias_humanas/random_forest"

# Load the Random Forest model from the specified path
rf_model = RandomForestRegressionModel.load(model_path)


In [57]:
# Create a VectorAssembler to assemble the features
assembler = VectorAssembler(inputCols=["NOTA_LC_LINGUAGENS_E_CODIGOS", "NOTA_MT_MATEMATICA", "NOTA_REDACAO"], outputCol="features")

# Apply the VectorAssembler to transform the DataFrame
df = assembler.transform(df)

# Make predictions using the loaded Random Forest model
predictions = rf_model.transform(df)


IllegalArgumentException: Output column features already exists.

In [59]:
predictions.show()

# Get a sample from the joined_predictions DataFrame
sample_predictions = predictions.sample(fraction=0.1, seed=42)
sample_predictions.write.mode('overwrite').parquet('data_for_dashboards/models/prediction_score_nota_ch_ciencias_humanas.parquet')

+------------------------+----------------------------+------------------+------------+--------------------+------------------+
|NOTA_CH_CIENCIAS_HUMANAS|NOTA_LC_LINGUAGENS_E_CODIGOS|NOTA_MT_MATEMATICA|NOTA_REDACAO|            features|        prediction|
+------------------------+----------------------------+------------------+------------+--------------------+------------------+
|                   385.4|                       461.4|             493.4|       500.0|[461.399993896484...|448.11820412149837|
|                   562.8|                       590.4|             577.1|       580.0|[590.400024414062...| 518.5185769791453|
|                   487.3|                       447.5|             431.4|       500.0|[447.5,431.399993...|430.94040639160204|
|                   427.1|                       478.2|             458.9|       540.0|[478.200012207031...|  446.670221071543|
|                   465.5|                       507.4|             455.1|       610.0|[507.399993896484