In [321]:
from pyspark.sql.types import *
import pandas as pd
import pyspark 
import os 
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, udf, rank, asc, sum as spark_sum
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.window import Window

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml import Pipeline


In [338]:
# Start a Spark session
spark = SparkSession.builder.appName("s33ding").getOrCreate()

# Read the Parquet file into a DataFrame
df = spark.read.parquet("dataset/enem.parquet")
df.dtypes
df.show()

+------+------------+----------------+---------------+---------+--------------------+-------------+--------------------+--------------------+---------------+-------------+--------------+----------------------+------------------+---------------+-----------+-----------+------------------+--------------------+----------------------------+------------------------+----------------------------+------------------+------------+------------------+
|NU_ANO|NU_INSCRICAO| TP_FAIXA_ETARIA|Idade_Calculada|  TP_SEXO|     TP_ESTADO_CIVIL|  TP_COR_RACA|    TP_NACIONALIDADE|     TP_ST_CONCLUSAO|TP_ANO_CONCLUIU|    TP_ESCOLA|     TP_ENSINO|TP_DEPENDENCIA_ADM_ESC|TP_LOCALIZACAO_ESC|TP_SIT_FUNC_ESC|CO_UF_PROVA|SG_UF_PROVA|CO_MUNICIPIO_PROVA|  NO_MUNICIPIO_PROVA|NOTA_CN_CIENCIAS_DA_NATUREZA|NOTA_CH_CIENCIAS_HUMANAS|NOTA_LC_LINGUAGENS_E_CODIGOS|NOTA_MT_MATEMATICA|NOTA_REDACAO|NOTA_MEDIA_5_NOTAS|
+------+------------+----------------+---------------+---------+--------------------+-------------+---------------

In [339]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, when

# Calculate mean, lowest, and highest values of "NOTA_MEDIA_5_NOTAS" column
mean_value = df.agg({"NOTA_MEDIA_5_NOTAS": "mean"}).collect()[0][0]
lowest_value = df.agg({"NOTA_MEDIA_5_NOTAS": "min"}).collect()[0][0]
highest_value = df.agg({"NOTA_MEDIA_5_NOTAS": "max"}).collect()[0][0]

# Create a column with categorized values based on mean, lowest, and highest values
df = df.withColumn(
    "NOTA_MEDIA_5_NOTAS",
    when(col("NOTA_MEDIA_5_NOTAS") < lowest_value, "low")
    .when(col("NOTA_MEDIA_5_NOTAS") < mean_value, "medium_low")
    .when(col("NOTA_MEDIA_5_NOTAS") < highest_value, "medium_high")
    .otherwise("high")
)

lst_cols = ['NOTA_MEDIA_5_NOTAS', 'TP_FAIXA_ETARIA','CO_UF_PROVA', 'TP_COR_RACA', 'TP_ESCOLA', 'TP_DEPENDENCIA_ADM_ESC', 'TP_LOCALIZACAO_ESC']
df = df.select(*lst_cols)

drop_values = ['Não informado', 'Não respondeu']

# Loop through each column and drop rows with specific values
for column in df.columns:
    if column != 'TP_COR_RACA':
        df = df.filter(~col(column).isin(drop_values))

In [340]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, when

# Select the columns for feature selection
feature_cols = lst_cols

# Convert string columns to numerical using StringIndexer
indexers = [StringIndexer(inputCol=col_name, outputCol=col_name+"_index").fit(df) for col_name in feature_cols]
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(df).transform(df)

# Select the features and the label column
# Select the features and the label column
selected_cols = feature_cols + ['TP_COR_RACA_index']
selected_df = indexed_df.select(*selected_cols)


# Compute the chi-square statistics for feature selection
selector = ChiSqSelector(numTopFeatures=3, featuresCol='features', outputCol='selected_features', labelCol='TP_COR_RACA_index')
selector_model = selector.fit(selected_df)
selected_features = selector_model.transform(selected_df)

# Get the selected feature names
selected_feature_names = [feature_cols[i] for i in selector_model.selectedFeatures]

print("Selected Features:")
for feature in selected_feature_names:
    print(feature)


IllegalArgumentException: features does not exist. Available: NOTA_MEDIA_5_NOTAS, TP_FAIXA_ETARIA, CO_UF_PROVA, TP_COR_RACA, TP_ESCOLA, TP_DEPENDENCIA_ADM_ESC, TP_LOCALIZACAO_ESC, TP_COR_RACA_index

In [None]:
# Convert string columns to numerical using StringIndexer
indexers = [StringIndexer(inputCol=col_name, outputCol=col_name+"_index") for col_name in df.columns if col_name != 'TP_COR_RACA']
pipeline_stages = indexers + [StringIndexer(inputCol="TP_COR_RACA", outputCol="label")]
pipeline = Pipeline(stages=pipeline_stages)
df = pipeline.fit(df).transform(df)

# Split the data into training and testing sets
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=123)

In [330]:
from pyspark.ml.feature import VectorAssembler

# Define the input features
input_cols = [x+'_index' for x in lst_cols if x!='TP_COR_RACA']

# Create a VectorAssembler to combine the input features into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

# Transform the training and testing data using the VectorAssembler
trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)

In [327]:
# Logistic Regression Classifier
lr = LogisticRegression(labelCol="label", featuresCol="features")
lr_model = lr.fit(trainingData)
lr_predictions = lr_model.transform(testData)
lr_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
lr_accuracy = lr_evaluator.evaluate(lr_predictions)
print("Logistic Regression Classifier Accuracy:", lr_accuracy)



Logistic Regression Classifier Accuracy: 0.5383949056515428


                                                                                

In [331]:
# Decision Tree Classifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt_model = dt.fit(trainingData)
dt_predictions = dt_model.transform(testData)
dt_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
dt_accuracy = dt_evaluator.evaluate(dt_predictions)
print("Decision Tree Classifier Accuracy:", dt_accuracy)



Decision Tree Classifier Accuracy: 0.5933651730111907


                                                                                

In [332]:
# Naive Bayes Classifier
nb = NaiveBayes(labelCol="label", featuresCol="features")
nb_model = nb.fit(trainingData)
nb_predictions = nb_model.transform(testData)
nb_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
nb_accuracy = nb_evaluator.evaluate(nb_predictions)
print("Naive Bayes Classifier Accuracy:", nb_accuracy)



Naive Bayes Classifier Accuracy: 0.5349299995317695


                                                                                

In [333]:
# Create a Random Forest Classifier
rf = RandomForestClassifier(featuresCol='features', labelCol='label')

# Train the Random Forest Classifier
nb_model = rf.fit(trainingData)

# Make predictions on the test data
nb_predictions = nb_model.transform(testData)

# Evaluate the model
nb_evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
nb_accuracy = nb_evaluator.evaluate(nb_predictions)

print("Random Forest Classifier Accuracy:", nb_accuracy)



Random Forest Classifier Accuracy: 0.5848667884066114


                                                                                