In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import lightgbm as lgb

In [None]:
data_path = "../data/processed/adult_balanced.csv"

In [None]:
df = pd.read_csv(data_path)

In [None]:
df

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income
0,3.316630,-0.538790,-0.439738,-0.147445,10.555814,-1.914161,False,True,False,False,...,False,False,False,False,False,False,True,False,False,0
1,1.184831,-0.467906,-2.400559,-0.147445,9.427915,-0.077734,False,True,False,False,...,False,False,False,False,False,False,True,False,False,0
2,0.195067,0.708645,-0.047574,-0.147445,9.427915,-0.077734,False,True,False,False,...,False,False,False,False,False,False,True,False,False,0
3,-0.337883,0.256222,-0.439738,-0.147445,9.106365,0.339636,False,True,False,False,...,False,False,False,False,False,False,True,False,False,0
4,-0.033340,-0.370964,-1.616231,-0.147445,9.106365,-0.077734,False,True,False,False,...,False,False,False,False,False,False,True,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45303,0.507883,-0.802791,-0.439738,-0.147445,-0.218586,-0.077734,False,True,False,False,...,False,False,False,False,False,False,True,False,False,1
45304,1.461288,0.333465,-0.439738,0.110457,-0.218586,1.591745,False,True,True,False,...,False,False,False,False,False,False,True,False,False,1
45305,0.220682,-0.344527,-0.699963,-0.147445,-0.218586,-0.077734,False,True,False,False,...,False,False,False,False,False,False,True,False,False,1
45306,1.056800,0.668374,1.128918,-0.147445,-0.218586,0.757005,False,True,False,False,...,False,False,False,False,False,False,True,False,False,1


In [None]:
X = df.drop('income', axis=1)
y = df['income']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (36246, 96), Test shape: (9062, 96)


In [None]:
def evaluate_model(y_true, y_pred, name=""):
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred)
    }

In [None]:
model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

In [None]:
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1', n_jobs=-1)

NameError: name 'param_grid' is not defined

In [None]:
param_grid = {
    'n_estimators': [100],
    'max_depth': [5],
    'learning_rate': [0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

In [None]:
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1', n_jobs=-1)

In [None]:
grid_search.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
def train_xgboost(X_train, y_train):
    param_grid = {
        'n_estimators': [100],
        'max_depth': [5],
        'learning_rate': [0.1],
        'subsample': [0.8],
        'colsample_bytree': [0.8]
    }

    model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_, grid_search.best_params_

In [None]:
def train_lightgbm(X_train, y_train):
    param_grid = {
        'n_estimators': [100],
        'max_depth': [5],
        'learning_rate': [0.1],
        'num_leaves': [31],
        'subsample': [0.8],
        'colsample_bytree': [0.8]
    }

    model = lgb.LGBMClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_, grid_search.best_params_

In [None]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)


def train_eval_nn(X_train, y_train, X_val, y_val, hidden_dim=64, lr=0.001, batch_size=64, epochs=10, dropout=0.5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = SimpleNN(X_train.shape[1], hidden_dim, dropout).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_ds = TensorDataset(
        torch.tensor(X_train.values, dtype=torch.float32),
        torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
    )
    loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    model.train()
    for epoch in range(epochs):
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        X_val_t = torch.tensor(X_val.values, dtype=torch.float32).to(device)
        preds = model(X_val_t).cpu().numpy()
        preds_labels = (preds > 0.5).astype(int)

    f1 = f1_score(y_val.values, preds_labels)
    return model, preds_labels, f1


def search_best_nn(X_train, y_train, X_val, y_val):
    param_grid = {
        'hidden_dim': [64],
        'lr': [0.001],
        'batch_size': [64],
        'dropout': [0.5]
    }

    best_model, best_f1, best_params = None, 0, None
    for params in ParameterGrid(param_grid):
        model, preds, f1 = train_eval_nn(
            X_train, y_train, X_val, y_val,
            hidden_dim=params['hidden_dim'],
            lr=params['lr'],
            batch_size=params['batch_size'],
            dropout=params['dropout']
        )
        if f1 > best_f1:
            best_model = model
            best_f1 = f1
            best_params = params

    return best_model, best_params

In [None]:
def save_model(model, name, model_type='sklearn'):
    os.makedirs("models", exist_ok=True)
    path = f"models/{name}.pkl" if model_type == 'sklearn' else f"models/{name}.pt"

    if model_type == 'sklearn':
        joblib.dump(model, path)
    elif model_type == 'torch':
        torch.save(model.state_dict(), path)
    print(f"✅ Saved: {path}")

In [None]:
xgb_model, xgb_params = train_xgboost(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
xgb_metrics = evaluate_model(y_test, xgb_preds)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
lgb_model, lgb_params = train_lightgbm(X_train, y_train)
lgb_preds = lgb_model.predict(X_test)
lgb_metrics = evaluate_model(y_test, lgb_preds)

[LightGBM] [Info] Number of positive: 18123, number of negative: 18123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1690
[LightGBM] [Info] Number of data points in the train set: 36246, number of used features: 87
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [None]:
nn_model, nn_params = search_best_nn(X_train, y_train, X_test, y_test)
nn_model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(next(nn_model.parameters()).device)
    nn_preds = nn_model(X_test_tensor).cpu().numpy()
    nn_preds_labels = (nn_preds > 0.5).astype(int).flatten()
nn_metrics = evaluate_model(y_test, nn_preds_labels)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
print(X_train.dtypes)
print(X_train.head())

age                               float64
fnlwgt                            float64
education.num                     float64
capital.gain                      float64
capital.loss                      float64
                                   ...   
native.country_Thailand              bool
native.country_Trinadad&Tobago       bool
native.country_United-States         bool
native.country_Vietnam               bool
native.country_Yugoslavia            bool
Length: 96, dtype: object
            age    fnlwgt  education.num  capital.gain  capital.loss  \
17196  1.565509 -0.462246       0.344590     -0.147445     -0.218586   
6169   0.423474 -0.536992      -0.047574     -0.147445     -0.218586   
26874 -1.403782  0.201182      -0.047574     -0.147445     -0.218586   
21275 -0.337883 -0.142022      -0.047574     -0.147445     -0.218586   
26071 -0.566290  1.461471      -0.439738     -0.147445     -0.218586   

       hours.per.week  workclass_Local-gov  workclass_Private  \
17196       -0

In [None]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)


def train_eval_nn(X_train, y_train, X_val, y_val, hidden_dim=64, lr=0.001, batch_size=64, epochs=10, dropout=0.5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    X_train = X_train.astype(np.float32)
    y_train = y_train.astype(np.float32)
    X_val = X_val.astype(np.float32)
    y_val = y_val.astype(np.float32)
    
    model = SimpleNN(X_train.shape[1], hidden_dim, dropout).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_ds = TensorDataset(
        torch.tensor(X_train.values, dtype=torch.float32),
        torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
    )
    loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    model.train()
    for epoch in range(epochs):
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        X_val_t = torch.tensor(X_val.values, dtype=torch.float32).to(device)
        preds = model(X_val_t).cpu().numpy()
        preds_labels = (preds > 0.5).astype(int)

    f1 = f1_score(y_val.values, preds_labels)
    return model, preds_labels, f1


def search_best_nn(X_train, y_train, X_val, y_val):
    param_grid = {
        'hidden_dim': [64],
        'lr': [0.001],
        'batch_size': [64],
        'dropout': [0.5]
    }

    best_model, best_f1, best_params = None, 0, None
    for params in ParameterGrid(param_grid):
        model, preds, f1 = train_eval_nn(
            X_train, y_train, X_val, y_val,
            hidden_dim=params['hidden_dim'],
            lr=params['lr'],
            batch_size=params['batch_size'],
            dropout=params['dropout']
        )
        if f1 > best_f1:
            best_model = model
            best_f1 = f1
            best_params = params

    return best_model, best_params

In [None]:
nn_model, nn_params = search_best_nn(X_train, y_train, X_test, y_test)
nn_model.eval()
with torch.no_grad():
    X_test = X_test.astype(np.float32)
    
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(next(nn_model.parameters()).device)
    nn_preds = nn_model(X_test_tensor).cpu().numpy()
    nn_preds_labels = (nn_preds > 0.5).astype(int).flatten()

nn_metrics = evaluate_model(y_test, nn_preds_labels)

In [None]:
results = pd.DataFrame({
    "XGBoost": xgb_metrics,
    "LightGBM": lgb_metrics,
    "NeuralNet": nn_metrics
}).T

In [None]:
print("\n📊 Сравнение моделей по метрикам:")
print(results)


📊 Сравнение моделей по метрикам:
           accuracy  precision    recall        f1
XGBoost    0.875745   0.852850  0.908188  0.879649
LightGBM   0.882366   0.861918  0.910616  0.885598
NeuralNet  0.857868   0.824755  0.908850  0.864763


In [None]:
save_model(xgb_model, "xgboost_model")
save_model(lgb_model, "lightgbm_model")
save_model(nn_model, "neural_net_model", model_type='torch')


print(X_train.dtypes)
print(X_train.head())

✅ Saved: models/xgboost_model.pkl
✅ Saved: models/lightgbm_model.pkl
✅ Saved: models/neural_net_model.pt
age                               float64
fnlwgt                            float64
education.num                     float64
capital.gain                      float64
capital.loss                      float64
                                   ...   
native.country_Thailand              bool
native.country_Trinadad&Tobago       bool
native.country_United-States         bool
native.country_Vietnam               bool
native.country_Yugoslavia            bool
Length: 96, dtype: object
            age    fnlwgt  education.num  capital.gain  capital.loss  \
17196  1.565509 -0.462246       0.344590     -0.147445     -0.218586   
6169   0.423474 -0.536992      -0.047574     -0.147445     -0.218586   
26874 -1.403782  0.201182      -0.047574     -0.147445     -0.218586   
21275 -0.337883 -0.142022      -0.047574     -0.147445     -0.218586   
26071 -0.566290  1.461471      -0.439738     -0

In [None]:
joblib.dump(xgb_model, 'models/xgboost_model.pkl')

joblib.dump(lgb_model, 'models/lightgbm_model.pkl')

torch.save(nn_model.state_dict(), 'models/neural_net_model.pt')

In [None]:
joblib.dump(xgb_model, 'models/xgboost_model.pkl')

['models/xgboost_model.pkl']

In [None]:
joblib.dump(xgb_model, '../models/xgboost_model.pkl')

['../models/xgboost_model.pkl']

In [None]:
joblib.dump(lgb_model, '../models/lightgbm_model.pkl')

['../models/lightgbm_model.pkl']

In [None]:
torch.save(nn_model.state_dict(), '../models/neural_net_model.pt')