In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, mean_squared_error, silhouette_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import numpy as np
import optuna
import pandas as pd

model_random_forest = None
model_ridge = None
model_xgboost = None

### Random Forest (classificação) - Previsão de aprovação de alunos

In [58]:
df_rf = pd.read_csv('./datasets/original_treated.csv')

def prepare_data(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline',
        'skipped_discipline',
        'class_skips',
        'id',
        'student_id',
    ]

    df_clean = dataframe.drop(columns=cols_to_drop, errors='ignore')

    X = df_clean.drop(columns=['is_approved'])
    y = df_clean['is_approved']

    return X, y

X, y = prepare_data(df_rf)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    hiperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    clf = RandomForestClassifier(
        **hiperparams,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'
    )

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor Acurácia: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_random_forest = RandomForestClassifier(**study.best_params, random_state=42)
model_random_forest.fit(X, y)

[I 2025-11-30 16:34:45,970] A new study created in memory with name: no-name-5270bc06-a2c5-417a-a2a8-68822bb86118
[I 2025-11-30 16:34:46,816] Trial 0 finished with value: 0.9693800212217675 and parameters: {'n_estimators': 167, 'max_depth': 15, 'min_samples_split': 8, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.9693800212217675.
[I 2025-11-30 16:34:47,192] Trial 1 finished with value: 0.969531605275125 and parameters: {'n_estimators': 65, 'max_depth': 28, 'min_samples_split': 14, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.969531605275125.
[I 2025-11-30 16:34:48,035] Trial 2 finished with value: 0.968470516901622 and parameters: {'n_estimators': 169, 'max_depth': 13, 'min_samples_split': 20, 'min_samples_leaf': 8}. Best is trial 1 with value: 0.969531605275125.
[I 2025-11-30 16:34:48,758] Trial 3 finished with value: 0.9680157647415492 and parameters: {'n_estimators': 228, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 1 with value: 0.


--- RESULTADOS ---
Melhor Acurácia: 0.9732
Melhores parâmetros:
	- n_estimators: 48
	- max_depth: 30
	- min_samples_split: 2
	- min_samples_leaf: 1


0,1,2
,n_estimators,48
,criterion,'gini'
,max_depth,30
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Regressão Linear Múltipla (regressão) - Previsão da nota final

In [59]:
df_rlm = pd.read_csv('./datasets/original_treated.csv')

def prepare_regression_data(dataframe: pd.DataFrame):
    cols_to_drop = [
        'id',
        'student_id',
        'final_grade',
        'is_approved',
        'status',
        'g2',
        'canceled_discipline',
        'skipped_discipline',
        'class_skips',
    ]

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['final_grade']

    return X, y

X, y = prepare_regression_data(df_rlm)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    hiperparams = {
        'alpha': trial.suggest_float('alpha', 0.01, 100.0, log=True),
        'solver': trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr']),
    }

    model = Ridge(**hiperparams, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, preds))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor RMSE (Erro Médio na Nota): {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_ridge = Ridge(**study.best_params)
model_ridge.fit(X, y)

[I 2025-11-30 16:35:19,680] A new study created in memory with name: no-name-e8f1f618-3829-4335-bdab-683cf32d5cf0
[I 2025-11-30 16:35:19,686] Trial 0 finished with value: 0.09730852329224689 and parameters: {'alpha': 0.03091077238231144, 'solver': 'cholesky'}. Best is trial 0 with value: 0.09730852329224689.
[I 2025-11-30 16:35:19,691] Trial 1 finished with value: 0.09733441209008604 and parameters: {'alpha': 30.846039761745562, 'solver': 'auto'}. Best is trial 0 with value: 0.09730852329224689.
[I 2025-11-30 16:35:19,701] Trial 2 finished with value: 0.09730786167330249 and parameters: {'alpha': 0.8649437653723232, 'solver': 'lsqr'}. Best is trial 2 with value: 0.09730786167330249.
[I 2025-11-30 16:35:19,710] Trial 3 finished with value: 0.09730851018895 and parameters: {'alpha': 0.047734898114253785, 'solver': 'svd'}. Best is trial 2 with value: 0.09730786167330249.
[I 2025-11-30 16:35:19,716] Trial 4 finished with value: 0.09730841254743573 and parameters: {'alpha': 0.17435070944962


--- RESULTADOS ---
Melhor RMSE (Erro Médio na Nota): 0.0973
Melhores parâmetros:
	- alpha: 7.7559477075239025
	- solver: lsqr


0,1,2
,alpha,7.7559477075239025
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'lsqr'
,positive,False
,random_state,


### XGBoost (classificação) - Previsão de aprovação de alunos

In [61]:
df_rf = pd.read_csv('./datasets/original_treated.csv')

def prepare_data_xgb(dataframe: pd.DataFrame):
    cols_to_drop = [
        'status',
        'g2',
        'final_grade',
        'canceled_discipline', 
        'skipped_discipline',
        'class_skips', 
        'id',
        'student_id',
        'is_approved',
    ]

    X = dataframe.drop(columns=cols_to_drop, errors='ignore')
    y = dataframe['is_approved']

    return X, y

X, y = prepare_data_xgb(df_rf)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

def objective(trial: optuna.trial.Trial):
    # sugestões de hiperparâmetros do Optuna
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_state': 42,
        'n_jobs': -1,
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("\n--- RESULTADOS ---")
print(f"Melhor Acurácia: {study.best_value:.4f}")
print("Melhores parâmetros:")
for key, value in study.best_params.items():
    print(f"\t- {key}: {value}")

model_xgboost = XGBClassifier(**study.best_params, random_state=42)
model_xgboost.fit(X, y)

[I 2025-11-30 16:36:14,917] A new study created in memory with name: no-name-a20ceb1a-bb08-404d-8ae3-b6759569106a
[I 2025-11-30 16:36:16,169] Trial 0 finished with value: 0.9731696225557072 and parameters: {'n_estimators': 395, 'max_depth': 9, 'learning_rate': 0.2179394319074811}. Best is trial 0 with value: 0.9731696225557072.
[I 2025-11-30 16:36:16,435] Trial 1 finished with value: 0.9727148703956344 and parameters: {'n_estimators': 122, 'max_depth': 5, 'learning_rate': 0.1489746514793036}. Best is trial 0 with value: 0.9731696225557072.
[I 2025-11-30 16:36:17,716] Trial 2 finished with value: 0.974685463089283 and parameters: {'n_estimators': 313, 'max_depth': 10, 'learning_rate': 0.07940717634085298}. Best is trial 2 with value: 0.974685463089283.
[I 2025-11-30 16:36:18,116] Trial 3 finished with value: 0.9737759587691375 and parameters: {'n_estimators': 96, 'max_depth': 10, 'learning_rate': 0.10864050812819233}. Best is trial 2 with value: 0.974685463089283.
[I 2025-11-30 16:36:18


--- RESULTADOS ---
Melhor Acurácia: 0.9754
Melhores parâmetros:
	- n_estimators: 443
	- max_depth: 8
	- learning_rate: 0.0442847973662326


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False
