In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, mean_squared_error, silhouette_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import numpy as np
import optuna
import pandas as pd

df = pd.read_csv('./datasets/original_treated.csv')

  from .autonotebook import tqdm as notebook_tqdm


### Random Forest (classificação) - Previsão de aprovação de alunos

In [2]:
df_rf = df.copy()

def prepare_data(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline', 
        'skipped_discipline',
        'class_skips',
        'id',
        'student_id'
    ]

    df_clean = dataframe.drop(columns=cols_to_drop, errors='ignore')

    X = df_clean.drop(columns=['is_approved'])
    y = df_clean['is_approved']

    return X, y

X, y = prepare_data(df_rf)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    n_estimators = trial.suggest_int('n_estimators', 10, 300)
    max_depth = trial.suggest_int('max_depth', 2, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,
    )

    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)

    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')

print("Iniciando otimização com Optuna...")

study.optimize(objective, n_trials=50)

print("\n--- Resultados ---")
print(f"Melhor Acurácia encontrada: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

best_params = study.best_params
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X, y)
print("\nModelo final treinado com os melhores parâmetros.")

[I 2025-11-30 13:54:11,980] A new study created in memory with name: no-name-10892c17-f5bc-4f9a-a23a-f6be9e5306ce
[I 2025-11-30 13:54:12,151] Trial 0 finished with value: 0.9646662927650028 and parameters: {'n_estimators': 15, 'max_depth': 21, 'min_samples_split': 16, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.9646662927650028.


Iniciando otimização com Optuna...


[I 2025-11-30 13:54:13,458] Trial 1 finished with value: 0.9638250140213124 and parameters: {'n_estimators': 229, 'max_depth': 13, 'min_samples_split': 17, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9646662927650028.
[I 2025-11-30 13:54:13,839] Trial 2 finished with value: 0.963684800897364 and parameters: {'n_estimators': 67, 'max_depth': 17, 'min_samples_split': 19, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.9646662927650028.
[I 2025-11-30 13:54:14,660] Trial 3 finished with value: 0.9648065058889512 and parameters: {'n_estimators': 157, 'max_depth': 11, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 3 with value: 0.9648065058889512.
[I 2025-11-30 13:54:14,808] Trial 4 finished with value: 0.9631239484015703 and parameters: {'n_estimators': 17, 'max_depth': 24, 'min_samples_split': 14, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.9648065058889512.
[I 2025-11-30 13:54:15,017] Trial 5 finished with value: 0.963684800897364 and parameters: {


--- Resultados ---
Melhor Acurácia encontrada: 0.9648
Melhores parâmetros:
  n_estimators: 157
  max_depth: 11
  min_samples_split: 8
  min_samples_leaf: 2

Modelo final treinado com os melhores parâmetros.


### Regressão Linear Múltipla (regressão) - Previsão da nota final

In [3]:
df_rlm = df.copy()

def prepare_regression_data(dataframe: pd.DataFrame):
    cols_to_drop = ['id', 'student_id', 'final_grade', 'is_approved', 'status', 'g2']

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['final_grade']

    return X, y

X, y = prepare_regression_data(df_rlm)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial: optuna.trial.Trial):
    alpha = trial.suggest_float('alpha', 0.01, 100.0, log=True)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr'])

    model = Ridge(alpha=alpha, solver=solver, random_state=42)

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, preds))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("\n--- Resultado da Regressão ---")
print(f"Melhor RMSE (Erro Médio na Nota): {study.best_value:.4f}")
print("Melhores parâmetros:")
print(study.best_params)

final_model = Ridge(**study.best_params)
final_model.fit(X, y)

[I 2025-11-30 13:54:47,035] A new study created in memory with name: no-name-23209ea5-0f3f-486d-b668-31f2e6b13058


[I 2025-11-30 13:54:47,053] Trial 0 finished with value: 0.09609976123109967 and parameters: {'alpha': 0.05314008941753026, 'solver': 'svd'}. Best is trial 0 with value: 0.09609976123109967.
[I 2025-11-30 13:54:47,070] Trial 1 finished with value: 0.0960618271843787 and parameters: {'alpha': 0.07488523777322567, 'solver': 'lsqr'}. Best is trial 1 with value: 0.0960618271843787.
[I 2025-11-30 13:54:47,085] Trial 2 finished with value: 0.0960618255378404 and parameters: {'alpha': 0.07589987305354857, 'solver': 'lsqr'}. Best is trial 2 with value: 0.0960618255378404.
[I 2025-11-30 13:54:47,100] Trial 3 finished with value: 0.09609547528026713 and parameters: {'alpha': 2.7558390084737154, 'solver': 'svd'}. Best is trial 2 with value: 0.0960618255378404.
[I 2025-11-30 13:54:47,114] Trial 4 finished with value: 0.0960617566560483 and parameters: {'alpha': 0.11841381483070955, 'solver': 'lsqr'}. Best is trial 4 with value: 0.0960617566560483.
[I 2025-11-30 13:54:47,122] Trial 5 finished with 


--- Resultado da Regressão ---
Melhor RMSE (Erro Médio na Nota): 0.0961
Melhores parâmetros:
{'alpha': 14.021353427175534, 'solver': 'lsqr'}


0,1,2
,alpha,14.021353427175534
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'lsqr'
,positive,False
,random_state,


### XGBoost (classificação) - Previsão de aprovação de alunos

In [4]:
df_xgb = df.copy()

def prepare_data_xgb(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline', 
        'skipped_discipline',
        'class_skips', 
        'id',
        'student_id',
    ]

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['is_approved']

    return X, y

X, y = prepare_data_xgb(df_xgb)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_state': 42,
        'n_jobs': -1,
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')

print("Iniciando otimização com XGBoost...")
study.optimize(objective, n_trials=50)

print("\n--- Resultados XGBoost ---")
print(f"Melhor Acurácia: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

best_params = study.best_params
final_model_xgb = XGBClassifier(**best_params, random_state=42)
final_model_xgb.fit(X, y)
print("\nModelo XGBoost final treinado.")


[I 2025-11-30 13:54:47,882] A new study created in memory with name: no-name-a9d45524-b290-4bc0-b5f4-56eb86e4b6df


Iniciando otimização com XGBoost...


[I 2025-11-30 13:54:48,558] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 481, 'max_depth': 3, 'learning_rate': 0.023738085008367583}. Best is trial 0 with value: 1.0.
[I 2025-11-30 13:54:48,790] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 247, 'max_depth': 9, 'learning_rate': 0.050443358454023944}. Best is trial 0 with value: 1.0.
[I 2025-11-30 13:54:49,197] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 408, 'max_depth': 10, 'learning_rate': 0.02963317766183363}. Best is trial 0 with value: 1.0.
[I 2025-11-30 13:54:49,554] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 362, 'max_depth': 8, 'learning_rate': 0.08017815834489309}. Best is trial 0 with value: 1.0.
[I 2025-11-30 13:54:49,773] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 261, 'max_depth': 5, 'learning_rate': 0.08125021158785276}. Best is trial 0 with value: 1.0.
[I 2025-11-30 13:54:50,129] Trial 5 finished with value: 1.0 a


--- Resultados XGBoost ---
Melhor Acurácia: 1.0000
Melhores parâmetros:
  n_estimators: 481
  max_depth: 3
  learning_rate: 0.023738085008367583

Modelo XGBoost final treinado.
