In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, mean_squared_error, silhouette_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import numpy as np
import optuna
import pandas as pd

model_random_forest = None
model_ridge = None
model_xgboost = None

### Random Forest (classificação) - Previsão de aprovação de alunos

In [50]:
df_rf = pd.read_csv('./datasets/original_treated_equalized.csv')

def prepare_data(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline',
        'skipped_discipline',
        'class_skips',
        'id',
        'student_id',
    ]

    df_clean = dataframe.drop(columns=cols_to_drop, errors='ignore')

    X = df_clean.drop(columns=['is_approved'])
    y = df_clean['is_approved']

    return X, y

X, y = prepare_data(df_rf)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    hiperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    clf = RandomForestClassifier(
        **hiperparams,
        random_state=42,
        n_jobs=-1,
    )

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor Acurácia: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_random_forest = RandomForestClassifier(**study.best_params, random_state=42)
model_random_forest.fit(X, y)

[I 2025-11-30 16:24:37,984] A new study created in memory with name: no-name-4b41becd-63cf-4c1b-a681-c723e7824b8b


[I 2025-11-30 16:24:38,141] Trial 0 finished with value: 0.8538461538461538 and parameters: {'n_estimators': 58, 'max_depth': 10, 'min_samples_split': 17, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8538461538461538.
[I 2025-11-30 16:24:38,475] Trial 1 finished with value: 0.8596153846153847 and parameters: {'n_estimators': 160, 'max_depth': 26, 'min_samples_split': 13, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.8596153846153847.
[I 2025-11-30 16:24:38,558] Trial 2 finished with value: 0.8557692307692307 and parameters: {'n_estimators': 27, 'max_depth': 12, 'min_samples_split': 9, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.8596153846153847.
[I 2025-11-30 16:24:38,843] Trial 3 finished with value: 0.8480769230769231 and parameters: {'n_estimators': 139, 'max_depth': 3, 'min_samples_split': 14, 'min_samples_leaf': 10}. Best is trial 1 with value: 0.8596153846153847.
[I 2025-11-30 16:24:39,238] Trial 4 finished with value: 0.8557692307692307 and parameters:


--- RESULTADOS ---
Melhor Acurácia: 0.8712
Melhores parâmetros:
	- n_estimators: 270
	- max_depth: 18
	- min_samples_split: 4
	- min_samples_leaf: 2


0,1,2
,n_estimators,270
,criterion,'gini'
,max_depth,18
,min_samples_split,4
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Regressão Linear Múltipla (regressão) - Previsão da nota final

In [51]:
df_rlm = pd.read_csv('./datasets/original_treated.csv')

def prepare_regression_data(dataframe: pd.DataFrame):
    cols_to_drop = [
        'id',
        'student_id',
        'final_grade',
        'is_approved',
        'status',
        'g2',
        'canceled_discipline',
        'skipped_discipline',
        'class_skips',
    ]

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['final_grade']

    return X, y

X, y = prepare_regression_data(df_rlm)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    hiperparams = {
        'alpha': trial.suggest_float('alpha', 0.01, 100.0, log=True),
        'solver': trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr']),
    }

    model = Ridge(**hiperparams, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, preds))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor RMSE (Erro Médio na Nota): {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_ridge = Ridge(**study.best_params)
model_ridge.fit(X, y)

[I 2025-11-30 16:25:04,075] A new study created in memory with name: no-name-cb49febb-974b-42c8-aa9a-28d0485da588
[I 2025-11-30 16:25:04,082] Trial 0 finished with value: 0.09739712346104566 and parameters: {'alpha': 49.48615374033999, 'solver': 'cholesky'}. Best is trial 0 with value: 0.09739712346104566.
[I 2025-11-30 16:25:04,094] Trial 1 finished with value: 0.0975547145194691 and parameters: {'alpha': 77.47251920134468, 'solver': 'lsqr'}. Best is trial 0 with value: 0.09739712346104566.
[I 2025-11-30 16:25:04,108] Trial 2 finished with value: 0.0973084883457964 and parameters: {'alpha': 0.013565324301873698, 'solver': 'lsqr'}. Best is trial 2 with value: 0.0973084883457964.
[I 2025-11-30 16:25:04,115] Trial 3 finished with value: 0.09730848886036296 and parameters: {'alpha': 0.07520357907061416, 'solver': 'cholesky'}. Best is trial 2 with value: 0.0973084883457964.
[I 2025-11-30 16:25:04,128] Trial 4 finished with value: 0.09730599741109232 and parameters: {'alpha': 4.700844260091


--- RESULTADOS ---
Melhor RMSE (Erro Médio na Nota): 0.0973
Melhores parâmetros:
	- alpha: 7.5317088026771914
	- solver: lsqr


0,1,2
,alpha,7.5317088026771914
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'lsqr'
,positive,False
,random_state,


### XGBoost (classificação) - Previsão de aprovação de alunos

In [52]:
df_rf = pd.read_csv('./datasets/original_treated_equalized.csv')

def prepare_data_xgb(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline', 
        'skipped_discipline',
        'class_skips', 
        'id',
        'student_id',
    ]

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['is_approved']

    return X, y

X, y = prepare_data_xgb(df_rf)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_state': 42,
        'n_jobs': -1,
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor Acurácia: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_xgboost = XGBClassifier(**study.best_params, random_state=42)
model_xgboost.fit(X, y)

[I 2025-11-30 16:25:04,690] A new study created in memory with name: no-name-51bbbb49-a22b-41ba-8ad0-d1b921f4701b


[I 2025-11-30 16:25:04,776] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 132, 'max_depth': 7, 'learning_rate': 0.11001524778314276}. Best is trial 0 with value: 1.0.
[I 2025-11-30 16:25:04,845] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 121, 'max_depth': 9, 'learning_rate': 0.193307350867768}. Best is trial 0 with value: 1.0.
[I 2025-11-30 16:25:04,981] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 273, 'max_depth': 4, 'learning_rate': 0.04405308497506903}. Best is trial 0 with value: 1.0.
[I 2025-11-30 16:25:05,087] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 165, 'max_depth': 9, 'learning_rate': 0.011707630207990278}. Best is trial 0 with value: 1.0.
[I 2025-11-30 16:25:05,207] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 199, 'max_depth': 3, 'learning_rate': 0.11720599690107185}. Best is trial 0 with value: 1.0.
[I 2025-11-30 16:25:05,299] Trial 5 finished with value: 1.0 and p


--- RESULTADOS ---
Melhor Acurácia: 1.0000
Melhores parâmetros:
	- n_estimators: 132
	- max_depth: 7
	- learning_rate: 0.11001524778314276


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False
