In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, mean_squared_error, silhouette_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import numpy as np
import optuna
import pandas as pd

df = pd.read_csv('./datasets/original_treated.csv')

model_random_forest = None
model_ridge = None
model_xgboost = None

### Random Forest (classificação) - Previsão de aprovação de alunos

In [22]:
df_rf = df.copy()

def prepare_data(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline',
        'skipped_discipline',
        'class_skips',
        'id',
        'student_id',
    ]

    df_clean = dataframe.drop(columns=cols_to_drop, errors='ignore')

    X = df_clean.drop(columns=['is_approved'])
    y = df_clean['is_approved']

    return X, y

X, y = prepare_data(df_rf)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    hiperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    clf = RandomForestClassifier(
        **hiperparams,
        random_state=42,
        n_jobs=-1,
    )

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor Acurácia: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_random_forest = RandomForestClassifier(**study.best_params, random_state=42)
model_random_forest.fit(X, y)

[I 2025-11-30 15:10:43,702] A new study created in memory with name: no-name-14e822bf-0107-4508-a018-f559463caf0b
[I 2025-11-30 15:10:44,134] Trial 0 finished with value: 0.9638250140213124 and parameters: {'n_estimators': 68, 'max_depth': 25, 'min_samples_split': 19, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.9638250140213124.
[I 2025-11-30 15:10:45,231] Trial 1 finished with value: 0.9638250140213124 and parameters: {'n_estimators': 190, 'max_depth': 22, 'min_samples_split': 16, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9638250140213124.
[I 2025-11-30 15:10:45,866] Trial 2 finished with value: 0.9632641615255187 and parameters: {'n_estimators': 178, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.9638250140213124.
[I 2025-11-30 15:10:47,091] Trial 3 finished with value: 0.9641054402692092 and parameters: {'n_estimators': 255, 'max_depth': 9, 'min_samples_split': 12, 'min_samples_leaf': 10}. Best is trial 3 with valu


--- RESULTADOS ---
Melhor Acurácia: 0.9652
Melhores parâmetros:
	- n_estimators: 278
	- max_depth: 30
	- min_samples_split: 8
	- min_samples_leaf: 1


0,1,2
,n_estimators,278
,criterion,'gini'
,max_depth,30
,min_samples_split,8
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Regressão Linear Múltipla (regressão) - Previsão da nota final

In [23]:
df_rlm = df.copy()

def prepare_regression_data(dataframe: pd.DataFrame):
    cols_to_drop = ['id', 'student_id', 'final_grade', 'is_approved', 'status', 'g2']

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['final_grade']

    return X, y

X, y = prepare_regression_data(df_rlm)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    hiperparams = {
        'alpha': trial.suggest_float('alpha', 0.01, 100.0, log=True),
        'solver': trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr']),
    }

    model = Ridge(**hiperparams, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, preds))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor RMSE (Erro Médio na Nota): {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_ridge = Ridge(**study.best_params)
model_ridge.fit(X, y)

[I 2025-11-30 15:11:56,972] A new study created in memory with name: no-name-e42211b4-9231-4d6a-b65e-c025af9f6505


[I 2025-11-30 15:11:56,997] Trial 0 finished with value: 0.09607870251914413 and parameters: {'alpha': 36.8007003670937, 'solver': 'lsqr'}. Best is trial 0 with value: 0.09607870251914413.
[I 2025-11-30 15:11:57,008] Trial 1 finished with value: 0.0960997809078382 and parameters: {'alpha': 0.04187424135126884, 'solver': 'cholesky'}. Best is trial 0 with value: 0.09607870251914413.
[I 2025-11-30 15:11:57,019] Trial 2 finished with value: 0.0961619529227311 and parameters: {'alpha': 52.41790523866149, 'solver': 'cholesky'}. Best is trial 0 with value: 0.09607870251914413.
[I 2025-11-30 15:11:57,029] Trial 3 finished with value: 0.09609788264784438 and parameters: {'alpha': 1.1716559951684211, 'solver': 'auto'}. Best is trial 0 with value: 0.09607870251914413.
[I 2025-11-30 15:11:57,038] Trial 4 finished with value: 0.09609981722716802 and parameters: {'alpha': 0.021102378365858203, 'solver': 'cholesky'}. Best is trial 0 with value: 0.09607870251914413.
[I 2025-11-30 15:11:57,056] Trial 5


--- RESULTADOS ---
Melhor RMSE (Erro Médio na Nota): 0.0961
Melhores parâmetros:
	- alpha: 14.367319074315201
	- solver: lsqr


0,1,2
,alpha,14.367319074315201
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'lsqr'
,positive,False
,random_state,


### XGBoost (classificação) - Previsão de aprovação de alunos

In [24]:
df_xgb = df.copy()

def prepare_data_xgb(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline', 
        'skipped_discipline',
        'class_skips', 
        'id',
        'student_id',
    ]

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['is_approved']

    return X, y

X, y = prepare_data_xgb(df_xgb)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_state': 42,
        'n_jobs': -1,
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor Acurácia: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_xgboost = XGBClassifier(**study.best_params, random_state=42)
model_xgboost.fit(X, y)

[I 2025-11-30 15:11:57,858] A new study created in memory with name: no-name-07c97dd5-3742-4354-b6af-5e0431c04e14


[I 2025-11-30 15:11:58,387] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 408, 'max_depth': 4, 'learning_rate': 0.09245839431975725}. Best is trial 0 with value: 1.0.
[I 2025-11-30 15:11:58,725] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 409, 'max_depth': 6, 'learning_rate': 0.08450809899433231}. Best is trial 0 with value: 1.0.
[I 2025-11-30 15:11:58,925] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 168, 'max_depth': 8, 'learning_rate': 0.09243931628493761}. Best is trial 0 with value: 1.0.
[I 2025-11-30 15:11:59,309] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 227, 'max_depth': 4, 'learning_rate': 0.017783870879315978}. Best is trial 0 with value: 1.0.
[I 2025-11-30 15:11:59,706] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 285, 'max_depth': 10, 'learning_rate': 0.03606617106381433}. Best is trial 0 with value: 1.0.
[I 2025-11-30 15:12:00,001] Trial 5 finished with value: 1.0 an


--- RESULTADOS ---
Melhor Acurácia: 1.0000
Melhores parâmetros:
	- n_estimators: 408
	- max_depth: 4
	- learning_rate: 0.09245839431975725


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False
