In [45]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer

In [46]:
X_train_path = "data/train/X_train.npy"
y_train_path = "data/train/y_train.npy"
X_test_path  = "data/test/X_test.npy"
ids_train_path = "data/train/ids.npy"
ids_test_path = "data/test/ids.npy"
metadata_train_path = "data/train/metadata_train.csv"
metadata_test_path = "data/test/metadata_test.csv"

In [47]:
def load_data(X_train_path, y_train_path, X_test_path, ids_train_path, ids_test_path):
    X_train = np.load(X_train_path)
    y_train = np.load(y_train_path)
    X_test = np.load(X_test_path)
    ids_train = np.load(ids_train_path)
    ids_test = np.load(ids_test_path)
    return X_train, y_train, X_test, ids_train, ids_test

def load_metadata(metadata_train_path, metadata_test_path):
    metadata_train = pd.read_csv(metadata_train_path)
    metadata_test = pd.read_csv(metadata_test_path)
    return metadata_train, metadata_test

In [48]:
X_train, y_train, X_test, ids_train, ids_test = load_data(X_train_path, y_train_path, X_test_path, ids_train_path, ids_test_path)
metadata_train, metadata_test = load_metadata(metadata_train_path, metadata_test_path)

In [49]:
#pretreatment for data correlation
categorical_cols = [
    "Organism group",
    "Isolation type",
    "Location",
    "Isolation source",
    "Laboratory typing platform",
    "Testing standard"
]
numerical_cols = ["Days Since Ref Date"]

def prepare_metadata(metadata_train, metadata_test):
    #unknown is a new value possible for missing values
    metadata_train[categorical_cols] = metadata_train[categorical_cols].fillna("Unknown")
    metadata_test[categorical_cols] = metadata_test[categorical_cols].fillna("Unknown")
    
    #remove timezones
    metadata_train["Create date"] = pd.to_datetime(metadata_train["Create date"], utc=True).dt.tz_localize(None)
    metadata_test["Create date"]  = pd.to_datetime(metadata_test["Create date"], utc=True).dt.tz_localize(None)
    
    #compute days since ref_date
    #print(min(metadata_train["Create date"].min(), metadata_test["Create date"].min())) #2013
    ref_date = pd.to_datetime("2010-01-01") # smaller than 2013
    metadata_train["Days Since Ref Date"] = (pd.to_datetime(metadata_train["Create date"]) - ref_date).dt.days
    metadata_test["Days Since Ref Date"]  = (pd.to_datetime(metadata_test["Create date"]) - ref_date).dt.days
        
    metadata_train.drop(columns=["Create date", "Unnamed: 0"], inplace=True)
    metadata_test.drop(columns=["Create date", "Unnamed: 0"], inplace=True)

    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    
    metadata_train_cat = encoder.fit_transform(metadata_train[categorical_cols])
    metadata_test_cat  = encoder.transform(metadata_test[categorical_cols])
    
    scaler = StandardScaler()
    
    metadata_train_num = scaler.fit_transform(metadata_train[numerical_cols])
    metadata_test_num  = scaler.transform(metadata_test[numerical_cols])

    metadata_train_features = np.hstack([metadata_train_cat, metadata_train_num])
    metadata_test_features  = np.hstack([metadata_test_cat, metadata_test_num])

    return metadata_train_features, metadata_test_features

In [50]:
metadata_train_features, metadata_test_features = prepare_metadata(metadata_train, metadata_test)

In [51]:
def select_top_variance_features(X_train, X_test, top_k=50000, batch_size=10000):
    n_features = X_train.shape[1]
    variances = np.zeros(n_features)

    #compute variance in batches
    for start in tqdm(range(0, n_features, batch_size), desc="Computing variance in batches"):
        end = min(start + batch_size, n_features)
        batch = X_train[:, start:end]
        variances[start:end] = np.var(batch, axis=0)

    # select top_k features
    top_features = np.argsort(variances)[-top_k:]

    # reduce matrices
    X_train_reduced = X_train[:, top_features]
    X_test_reduced  = X_test[:, top_features]

    return X_train_reduced, X_test_reduced, top_features

def transform_tfidf(X_train, X_test, n_components=1000):
    tfidf = TfidfTransformer()
    X_train_reduced = tfidf.fit_transform(X_train)
    X_test_reduced = tfidf.transform(X_test)

    svd = TruncatedSVD(n_components=n_components)
    normalizer = Normalizer()

    X_train_svd = normalizer.fit_transform(svd.fit_transform(X_train_reduced)).astype(np.float32)
    X_test_svd = normalizer.transform(svd.transform(X_test_reduced)).astype(np.float32)
    return X_train_svd, X_test_svd

In [52]:
X_train_reduced, X_test_reduced, top_features_variance = select_top_variance_features(X_train, X_test, top_k=100000, batch_size=10000)

X_train_reduced, X_test_reduced = transform_tfidf(X_train_reduced, X_test_reduced)

Computing variance in batches: 100%|██████████| 100/100 [00:07<00:00, 12.66it/s]


In [53]:
#X_train_reduced = np.hstack([X_train_reduced.toarray(), metadata_train_features])
#X_test_reduced = np.hstack([X_test_reduced.toarray(),  metadata_test_features]) 

X_train_reduced = np.hstack([X_train_reduced, metadata_train_features])
X_test_reduced = np.hstack([X_test_reduced,  metadata_test_features]) 

In [54]:
print(X_train_reduced.shape)

(1939, 1105)


In [None]:
# Random Search
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import joblib
import warnings
warnings.filterwarnings('ignore')

def tune_xgboost(X_train, y_train, n_iter=20, cv_folds=3):
    """Tune XGBoost hyperparameters"""
    print("Tuning XGBoost...")
    
    n_pos = np.sum(y_train == 1)
    n_neg = np.sum(y_train == 0)
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1
    
    param_dist = {
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.2),
        'n_estimators': randint(100, 400),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'min_child_weight': randint(1, 7),
        'gamma': uniform(0, 0.3),
        'reg_alpha': uniform(0, 1),
        'reg_lambda': uniform(0, 2)
    }
    
    base_model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        tree_method='hist',
        device='cpu',
        random_state=42,
        scale_pos_weight=scale_pos_weight
    )
    
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )
    
    random_search.fit(X_train, y_train)
    
    print(f"Best params: {random_search.best_params_}")
    print(f"Best F1: {random_search.best_score_:.4f}\n")
    
    return random_search.best_params_

def tune_lightgbm(X_train, y_train, n_iter=20, cv_folds=3):
    """Tune LightGBM hyperparameters"""
    print("Tuning LightGBM...")
    
    n_pos = np.sum(y_train == 1)
    n_neg = np.sum(y_train == 0)
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1
    
    param_dist = {
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.2),
        'n_estimators': randint(100, 400),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'min_child_weight': uniform(1, 10),
        'reg_alpha': uniform(0, 1),
        'reg_lambda': uniform(0, 2),
        'num_leaves': randint(20, 100)
    }
    
    base_model = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        device='cpu',
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        verbose=-1
    )
    
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )
    
    random_search.fit(X_train, y_train)
    
    print(f"Best params: {random_search.best_params_}")
    print(f"Best F1: {random_search.best_score_:.4f}\n")
    
    return random_search.best_params_

def tune_catboost(X_train, y_train, n_iter=20, cv_folds=3):
    """Tune CatBoost hyperparameters"""
    print("Tuning CatBoost...")
    
    n_pos = np.sum(y_train == 1)
    n_neg = np.sum(y_train == 0)
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1
    
    param_dist = {
        'depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.2),
        'iterations': randint(100, 400),
        'l2_leaf_reg': uniform(1, 10),
        'border_count': randint(32, 255),
        'bagging_temperature': uniform(0, 1)
    }
    
    base_model = CatBoostClassifier(
        random_seed=42,
        verbose=False,
        scale_pos_weight=scale_pos_weight
    )
    
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )
    
    random_search.fit(X_train, y_train)
    
    print(f"Best params: {random_search.best_params_}")
    print(f"Best F1: {random_search.best_score_:.4f}\n")
    
    return random_search.best_params_

def tune_random_forest(X_train, y_train, n_iter=20, cv_folds=3):
    """Tune Random Forest hyperparameters"""
    print("Tuning Random Forest...")
    
    param_dist = {
        'n_estimators': randint(50, 300),
        'max_depth': randint(5, 30),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10),
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False]
    }
    
    base_model = RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )
    
    random_search.fit(X_train, y_train)
    
    print(f"Best params: {random_search.best_params_}")
    print(f"Best : {random_search.best_score_:.4f}\n")
    
    return random_search.best_params_

def tune_logistic_regression(X_train, y_train, n_iter=15, cv_folds=3):
    """Tune Logistic Regression hyperparameters"""
    print("Tuning Logistic Regression...")
    
    param_dist = {
        'C': uniform(0.001, 10),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['saga'],
        'l1_ratio': uniform(0, 1),  # Only used with elasticnet
        'max_iter': [200, 500, 1000]
    }
    
    base_model = LogisticRegression(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=2,
        random_state=42
    )
    
    random_search.fit(X_train, y_train)
    
    print(f"Best params: {random_search.best_params_}")
    print(f"Best F1: {random_search.best_score_:.4f}\n")
    
    return random_search.best_params_

# Run hyperparameter tuning
print("="*80)
print("HYPERPARAMETER TUNING FOR ALL MODELS")
print("="*80)

best_params = {}
best_params['xgboost'] = tune_xgboost(X_train_reduced, y_train, n_iter=50, cv_folds=4)
best_params['lightgbm'] = tune_lightgbm(X_train_reduced, y_train, n_iter=50, cv_folds=4)
best_params['catboost'] = tune_catboost(X_train_reduced, y_train, n_iter=50, cv_folds=4)
best_params['random_forest'] = tune_random_forest(X_train_reduced, y_train, n_iter=50, cv_folds=4)
best_params['logistic'] = tune_logistic_regression(X_train_reduced, y_train, n_iter=50, cv_folds=4)

# Save best parameters
joblib.dump(best_params, 'best_hyperparameters.joblib')
print("\n" + "="*80)
print("saved to 'best_hyperparameters.joblib'")
print("="*80)

HYPERPARAMETER TUNING FOR ALL MODELS
Tuning XGBoost...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
[CV] END colsample_bytree=0.749816047538945, gamma=0.2852142919229748, learning_rate=0.15639878836228102, max_depth=7, min_child_weight=5, n_estimators=202, reg_alpha=0.44583275285359114, reg_lambda=0.19994983163600577, subsample=0.7836995567863468; total time=  12.3s
[CV] END colsample_bytree=0.7334834444556088, gamma=0.04286004537658223, learning_rate=0.14017769458977059, max_depth=7, min_child_weight=2, n_estimators=393, reg_alpha=0.0007787658410143283, reg_lambda=1.984423118582435, subsample=0.8469926038510867; total time=  22.3s
[CV] END colsample_bytree=0.8446612641953124, gamma=0.002119891565915222, learning_rate=0.014612485008283152, max_depth=5, min_child_weight=3, n_estimators=269, reg_alpha=0.04666566321361543, reg_lambda=1.9475110376829183, subsample=0.6931085361721216; total time=  34.4s
[CV] END colsample_bytree=0.6362425738131283, gamma=0.1855158027999262,




[CV] END colsample_bytree=0.7540390914407701, gamma=0.25534100145505706, learning_rate=0.07338440103125553, max_depth=4, min_child_weight=1, n_estimators=322, reg_alpha=0.8442131407263114, reg_lambda=1.8600336696216637, subsample=0.6281664523398175; total time=  24.7s
[CV] END colsample_bytree=0.6835674870461441, gamma=0.20134305504721517, learning_rate=0.08172935625923278, max_depth=9, min_child_weight=3, n_estimators=372, reg_alpha=0.8486697949246744, reg_lambda=0.27324266288405763, subsample=0.8835643987640474; total time=  22.0s
[CV] END colsample_bytree=0.8211279907631631, gamma=0.08895304309433955, learning_rate=0.0939561712892553, max_depth=4, min_child_weight=1, n_estimators=112, reg_alpha=0.08159418040024036, reg_lambda=0.010369725547973552, subsample=0.8511577659794545; total time=  10.8s
[CV] END colsample_bytree=0.6777095814048169, gamma=0.021282275099978296, learning_rate=0.08935676544277768, max_depth=8, min_child_weight=4, n_estimators=242, reg_alpha=0.375582952639944, 




[CV] END colsample_bytree=0.7540390914407701, gamma=0.25534100145505706, learning_rate=0.07338440103125553, max_depth=4, min_child_weight=1, n_estimators=322, reg_alpha=0.8442131407263114, reg_lambda=1.8600336696216637, subsample=0.6281664523398175; total time=  24.5s
[CV] END colsample_bytree=0.6835674870461441, gamma=0.20134305504721517, learning_rate=0.08172935625923278, max_depth=9, min_child_weight=3, n_estimators=372, reg_alpha=0.8486697949246744, reg_lambda=0.27324266288405763, subsample=0.8835643987640474; total time=  22.5s
[CV] END colsample_bytree=0.8211279907631631, gamma=0.08895304309433955, learning_rate=0.0939561712892553, max_depth=4, min_child_weight=1, n_estimators=112, reg_alpha=0.08159418040024036, reg_lambda=0.010369725547973552, subsample=0.8511577659794545; total time=  10.4s
[CV] END colsample_bytree=0.6777095814048169, gamma=0.021282275099978296, learning_rate=0.08935676544277768, max_depth=8, min_child_weight=4, n_estimators=242, reg_alpha=0.375582952639944, 




[CV] END colsample_bytree=0.7540390914407701, gamma=0.25534100145505706, learning_rate=0.07338440103125553, max_depth=4, min_child_weight=1, n_estimators=322, reg_alpha=0.8442131407263114, reg_lambda=1.8600336696216637, subsample=0.6281664523398175; total time=  24.1s
[CV] END colsample_bytree=0.6835674870461441, gamma=0.20134305504721517, learning_rate=0.08172935625923278, max_depth=9, min_child_weight=3, n_estimators=372, reg_alpha=0.8486697949246744, reg_lambda=0.27324266288405763, subsample=0.8835643987640474; total time=  22.7s
[CV] END colsample_bytree=0.8211279907631631, gamma=0.08895304309433955, learning_rate=0.0939561712892553, max_depth=4, min_child_weight=1, n_estimators=112, reg_alpha=0.08159418040024036, reg_lambda=0.010369725547973552, subsample=0.8511577659794545; total time=  10.2s
[CV] END colsample_bytree=0.6777095814048169, gamma=0.021282275099978296, learning_rate=0.08935676544277768, max_depth=8, min_child_weight=4, n_estimators=242, reg_alpha=0.375582952639944, 



[CV] END colsample_bytree=0.8603553891795411, learning_rate=0.021282315805420053, max_depth=6, min_child_weight=10.385527090157503, n_estimators=291, num_leaves=79, reg_alpha=0.18340450985343382, reg_lambda=0.6084844859190754, subsample=0.8099025726528951; total time= 6.0min
[CV] END colsample_bytree=0.7727780074568463, learning_rate=0.06824582803960838, max_depth=5, min_child_weight=4.998609717152554, n_estimators=287, num_leaves=99, reg_alpha=0.23277134043030423, reg_lambda=0.1812128690656416, subsample=0.8473544037332349; total time= 3.8min
[CV] END colsample_bytree=0.7529847965068651, learning_rate=0.20664617716135766, max_depth=3, min_child_weight=1.4645041271999772, n_estimators=234, num_leaves=40, reg_alpha=0.450499251969543, reg_lambda=0.026529922319733057, subsample=0.9768807022739411; total time= 1.7min
[CV] END colsample_bytree=0.8253152871382157, learning_rate=0.08708330050798323, max_depth=4, min_child_weight=1.9767211400638387, n_estimators=191, num_leaves=79, reg_alpha=0



[CV] END colsample_bytree=0.8603553891795411, learning_rate=0.021282315805420053, max_depth=6, min_child_weight=10.385527090157503, n_estimators=291, num_leaves=79, reg_alpha=0.18340450985343382, reg_lambda=0.6084844859190754, subsample=0.8099025726528951; total time= 5.8min
[CV] END colsample_bytree=0.7727780074568463, learning_rate=0.06824582803960838, max_depth=5, min_child_weight=4.998609717152554, n_estimators=287, num_leaves=99, reg_alpha=0.23277134043030423, reg_lambda=0.1812128690656416, subsample=0.8473544037332349; total time= 3.7min
[CV] END colsample_bytree=0.7529847965068651, learning_rate=0.20664617716135766, max_depth=3, min_child_weight=1.4645041271999772, n_estimators=234, num_leaves=40, reg_alpha=0.450499251969543, reg_lambda=0.026529922319733057, subsample=0.9768807022739411; total time= 1.8min
[CV] END colsample_bytree=0.8253152871382157, learning_rate=0.08708330050798323, max_depth=4, min_child_weight=1.9767211400638387, n_estimators=191, num_leaves=79, reg_alpha=0



[CV] END colsample_bytree=0.8603553891795411, learning_rate=0.021282315805420053, max_depth=6, min_child_weight=10.385527090157503, n_estimators=291, num_leaves=79, reg_alpha=0.18340450985343382, reg_lambda=0.6084844859190754, subsample=0.8099025726528951; total time= 6.1min
[CV] END colsample_bytree=0.7727780074568463, learning_rate=0.06824582803960838, max_depth=5, min_child_weight=4.998609717152554, n_estimators=287, num_leaves=99, reg_alpha=0.23277134043030423, reg_lambda=0.1812128690656416, subsample=0.8473544037332349; total time= 3.8min
[CV] END colsample_bytree=0.7529847965068651, learning_rate=0.20664617716135766, max_depth=3, min_child_weight=1.4645041271999772, n_estimators=234, num_leaves=40, reg_alpha=0.450499251969543, reg_lambda=0.026529922319733057, subsample=0.9768807022739411; total time= 1.8min
[CV] END colsample_bytree=0.8253152871382157, learning_rate=0.08708330050798323, max_depth=4, min_child_weight=1.9767211400638387, n_estimators=191, num_leaves=79, reg_alpha=0



[CV] END colsample_bytree=0.8603553891795411, learning_rate=0.021282315805420053, max_depth=6, min_child_weight=10.385527090157503, n_estimators=291, num_leaves=79, reg_alpha=0.18340450985343382, reg_lambda=0.6084844859190754, subsample=0.8099025726528951; total time= 5.9min
[CV] END colsample_bytree=0.7727780074568463, learning_rate=0.06824582803960838, max_depth=5, min_child_weight=4.998609717152554, n_estimators=287, num_leaves=99, reg_alpha=0.23277134043030423, reg_lambda=0.1812128690656416, subsample=0.8473544037332349; total time= 4.0min
[CV] END colsample_bytree=0.7529847965068651, learning_rate=0.20664617716135766, max_depth=3, min_child_weight=1.4645041271999772, n_estimators=234, num_leaves=40, reg_alpha=0.450499251969543, reg_lambda=0.026529922319733057, subsample=0.9768807022739411; total time= 1.9min
[CV] END colsample_bytree=0.8253152871382157, learning_rate=0.08708330050798323, max_depth=4, min_child_weight=1.9767211400638387, n_estimators=191, num_leaves=79, reg_alpha=0



Best params: {'colsample_bytree': np.float64(0.8603553891795411), 'learning_rate': np.float64(0.021282315805420053), 'max_depth': 6, 'min_child_weight': np.float64(10.385527090157503), 'n_estimators': 291, 'num_leaves': 79, 'reg_alpha': np.float64(0.18340450985343382), 'reg_lambda': np.float64(0.6084844859190754), 'subsample': np.float64(0.8099025726528951)}
Best ROC-AUC: 0.6162

Tuning CatBoost...
Fitting 4 folds for each of 50 candidates, totalling 200 fits

[CV] END colsample_bytree=0.9394679179698697, learning_rate=0.037324266288405766, max_depth=8, min_child_weight=4.594911512197552, n_estimators=269, num_leaves=45, reg_alpha=0.8101133946791808, reg_lambda=1.7341446371602074, subsample=0.9652962210225885; total time= 6.0min
[CV] END colsample_bytree=0.8045369595443751, learning_rate=0.11030325893743992, max_depth=7, min_child_weight=2.942739535120422, n_estimators=363, num_leaves=35, reg_alpha=0.050768531039396936, reg_lambda=1.7732342979013198, subsample=0.6110467087494819; total




[CV] END bootstrap=False, max_depth=21, max_features=log2, min_samples_leaf=8, min_samples_split=5, n_estimators=51; total time=   1.2s
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=19, n_estimators=267; total time=   5.2s
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=19, n_estimators=267; total time=   5.6s
[CV] END bootstrap=False, max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=15, n_estimators=144; total time=   2.5s
[CV] END bootstrap=False, max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=15, n_estimators=144; total time=   3.9s
[CV] END bootstrap=False, max_depth=19, max_features=log2, min_samples_leaf=7, min_samples_split=9, n_estimators=262; total time=   6.2s
[CV] END bootstrap=False, max_depth=19, max_features=log2, min_samples_leaf=7, min_samples_split=9, n_estimators=262; total time=   5.9s
[CV] END bootstrap=True, max_depth=19, 




[CV] END bootstrap=True, max_depth=7, max_features=sqrt, min_samples_leaf=5, min_samples_split=11, n_estimators=280; total time=  10.8s
[CV] END bootstrap=False, max_depth=13, max_features=None, min_samples_leaf=9, min_samples_split=9, n_estimators=61; total time= 1.7min
[CV] END bootstrap=False, max_depth=5, max_features=None, min_samples_leaf=7, min_samples_split=6, n_estimators=148; total time= 3.1min
[CV] END bootstrap=False, max_depth=12, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=276; total time=   5.9s
[CV] END bootstrap=False, max_depth=12, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=276; total time=   6.0s
[CV] END bootstrap=False, max_depth=27, max_features=None, min_samples_leaf=7, min_samples_split=10, n_estimators=256; total time= 8.4min
[CV] END bootstrap=True, max_depth=14, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=290; total time=  12.0s
[CV] END bootstrap=True, max_depth=14, max_




[CV] END bootstrap=False, max_depth=27, max_features=None, min_samples_leaf=7, min_samples_split=10, n_estimators=256; total time=10.5min
[CV] END bootstrap=True, max_depth=5, max_features=None, min_samples_leaf=8, min_samples_split=16, n_estimators=108; total time= 1.3min
[CV] END bootstrap=True, max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=13, n_estimators=267; total time= 3.5min
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=12, n_estimators=233; total time= 7.0min
[CV] END bootstrap=True, max_depth=21, max_features=None, min_samples_leaf=6, min_samples_split=6, n_estimators=165; total time= 2.8min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=196; total time=   7.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=3, min_samples_split=5, n_estimators=196; total time=   7.5s
[CV] END bootstrap=False, max_depth=7, max_



Best params: {'C': np.float64(6.833635188254583), 'l1_ratio': np.float64(0.6099966577826209), 'max_iter': 1000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best F1: 0.6444


saved to 'best_hyperparameters.joblib'


In [60]:
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score, f1_score, precision_recall_curve
# -----------------------------
# Load best parameters (or use defaults)
# -----------------------------
try:
    best_params = joblib.load('best_hyperparameters.joblib')
    print("Loaded tuned hyperparameters!")
    print("="*80)
except:
    print("No tuned parameters found, using defaults...")
    print("="*80)
    best_params = {
        'xgboost': {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 200, 
                    'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 
                    'gamma': 0.1, 'reg_alpha': 0.1, 'reg_lambda': 1.0},
        'lightgbm': {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 200, 
                     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 
                     'reg_alpha': 0.1, 'reg_lambda': 1.0, 'num_leaves': 31},
        'catboost': {'depth': 6, 'learning_rate': 0.1, 'iterations': 200, 
                     'l2_leaf_reg': 3, 'border_count': 128, 'bagging_temperature': 0.5},
        'random_forest': {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 10, 
                         'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True},
        'logistic': {'C': 0.1, 'penalty': 'l2', 'max_iter': 500}
    }

# -----------------------------
# Compute class weights
# -----------------------------
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1

print(f"Class distribution - Negative: {n_neg}, Positive: {n_pos}")
print(f"Scale pos weight: {scale_pos_weight:.4f}\n")

# -----------------------------
# Define models with tuned parameters
# -----------------------------
def get_models(best_params, scale_pos_weight):
    return {
        'xgboost': xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            tree_method='hist',
            device='cpu',
            random_state=42,
            scale_pos_weight=scale_pos_weight,
            **best_params['xgboost']
        ),
        'lightgbm': lgb.LGBMClassifier(
            objective='binary',
            metric='binary_logloss',
            device='cpu',
            random_state=42,
            scale_pos_weight=scale_pos_weight,
            verbose=-1,
            **best_params['lightgbm']
        ),
        'catboost': CatBoostClassifier(
            random_seed=42,
            verbose=False,
            scale_pos_weight=scale_pos_weight,
            **best_params['catboost']
        ),
        'random_forest': RandomForestClassifier(
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
            **best_params['random_forest']
        ),
        'logistic': LogisticRegression(
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
            **best_params['logistic']
        )
    }

# -----------------------------
# Cross-validation for all models
# -----------------------------
k = 5
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

model_scores = {name: {'acc': [], 'logloss': [], 'roc_auc': [], 'pr_auc': [], 'f1': [], 'thresh': []} 
                for name in ['xgboost', 'lightgbm', 'catboost', 'random_forest', 'logistic']}

print("="*80)
print("CROSS-VALIDATION FOR ALL MODELS")
print("="*80)

oof_preds = {name: np.zeros(len(X_train_reduced)) for name in model_names}

for model_name in ['xgboost', 'lightgbm', 'catboost', 'random_forest', 'logistic']:
    print(f"\n{'='*80}")
    print(f"Model: {model_name.upper()}")
    print(f"{'='*80}")
    
    for fold, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_train_reduced, y_train), 
                                                       total=k, 
                                                       desc=f"{model_name} CV"), 1):
        X_tr, X_val = X_train_reduced[train_idx], X_train_reduced[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        # Get fresh model for this fold
        models = get_models(best_params, scale_pos_weight)
        model = models[model_name]
        
        # Train model
        if model_name == 'catboost':
            model.fit(X_tr, y_tr, verbose=False)
        elif model_name == 'lightgbm':
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
        else:
            model.fit(X_tr, y_tr)
        
        # Predict probabilities
        y_prob = model.predict_proba(X_val)[:, 1]
        y_pred = (y_prob >= 0.5).astype(int)

        oof_preds[model_name][val_idx] = y_prob
        
        # Metrics
        acc = accuracy_score(y_val, y_pred)
        ll = log_loss(y_val, y_prob)
        roc = roc_auc_score(y_val, y_prob)
        pr = average_precision_score(y_val, y_prob)
        
        # Find best threshold by F1
        precisions, recalls, thresholds = precision_recall_curve(y_val, y_prob)
        f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-12)
        best_idx = np.argmax(f1_scores) if f1_scores.size else 0
        best_thresh = thresholds[best_idx] if f1_scores.size else 0.5
        best_f1 = f1_scores[best_idx] if f1_scores.size else f1_score(y_val, y_pred)
        
        model_scores[model_name]['acc'].append(acc)
        model_scores[model_name]['logloss'].append(ll)
        model_scores[model_name]['roc_auc'].append(roc)
        model_scores[model_name]['pr_auc'].append(pr)
        model_scores[model_name]['f1'].append(best_f1)
        model_scores[model_name]['thresh'].append(best_thresh)
        
        print(f"Fold {fold}: acc={acc:.4f}, logloss={ll:.4f}, ROC-AUC={roc:.4f}, PR-AUC={pr:.4f}, F1={best_f1:.4f} @thresh={best_thresh:.3f}")

# -----------------------------
# Print CV summary
# -----------------------------
print(f"\n{'='*80}")
print("CROSS-VALIDATION SUMMARY")
print(f"{'='*80}")
print(f"{'Model':<15} {'Accuracy':<12} {'LogLoss':<12} {'ROC-AUC':<12} {'PR-AUC':<12} {'F1':<12}")
print("-"*80)

for model_name in ['xgboost', 'lightgbm', 'catboost', 'random_forest', 'logistic']:
    scores = model_scores[model_name]
    print(f"{model_name:<15} "
          f"{np.mean(scores['acc']):.4f}±{np.std(scores['acc']):.4f}  "
          f"{np.mean(scores['logloss']):.4f}±{np.std(scores['logloss']):.4f}  "
          f"{np.mean(scores['roc_auc']):.4f}±{np.std(scores['roc_auc']):.4f}  "
          f"{np.mean(scores['pr_auc']):.4f}±{np.std(scores['pr_auc']):.4f}  "
          f"{np.mean(scores['f1']):.4f}±{np.std(scores['f1']):.4f}")

# -----------------------------
# Train final models on full data
# -----------------------------
print(f"\n{'='*80}")
print("TRAINING FINAL MODELS ON FULL DATASET")
print(f"{'='*80}\n")

final_models = get_models(best_params, scale_pos_weight)
test_predictions = {}

for model_name, model in tqdm(final_models.items(), desc="Training final models"):
    if model_name == 'catboost':
        model.fit(X_train_reduced, y_train, verbose=False)
    elif model_name == 'lightgbm':
        model.fit(X_train_reduced, y_train, callbacks=[lgb.log_evaluation(0)])
    else:
        model.fit(X_train_reduced, y_train)
    
    test_predictions[model_name] = model.predict_proba(X_test_reduced)[:, 1]
    
    # Save model
    joblib.dump(model, f"{model_name}_final_model.joblib")
    print(f"Saved {model_name}_final_model.joblib")



Loaded tuned hyperparameters!
Class distribution - Negative: 1666, Positive: 273
Scale pos weight: 6.1026

CROSS-VALIDATION FOR ALL MODELS

Model: XGBOOST


xgboost CV:  20%|██████▌                          | 1/5 [00:12<00:51, 12.91s/it]

Fold 1: acc=0.8763, logloss=0.2814, ROC-AUC=0.8921, PR-AUC=0.6297, F1=0.6050 @thresh=0.514


xgboost CV:  40%|█████████████▏                   | 2/5 [00:25<00:38, 12.84s/it]

Fold 2: acc=0.8866, logloss=0.2606, ROC-AUC=0.9178, PR-AUC=0.7051, F1=0.6571 @thresh=0.370


xgboost CV:  60%|███████████████████▊             | 3/5 [00:37<00:24, 12.47s/it]

Fold 3: acc=0.8686, logloss=0.2769, ROC-AUC=0.9058, PR-AUC=0.6131, F1=0.6000 @thresh=0.348


xgboost CV:  80%|██████████████████████████▍      | 4/5 [00:53<00:13, 13.60s/it]

Fold 4: acc=0.8789, logloss=0.2729, ROC-AUC=0.9019, PR-AUC=0.6566, F1=0.6299 @thresh=0.461


xgboost CV: 100%|█████████████████████████████████| 5/5 [01:05<00:00, 13.07s/it]


Fold 5: acc=0.8863, logloss=0.2603, ROC-AUC=0.9284, PR-AUC=0.6906, F1=0.6607 @thresh=0.556

Model: LIGHTGBM


lightgbm CV:  20%|██████▍                         | 1/5 [00:04<00:17,  4.39s/it]

Fold 1: acc=0.8866, logloss=0.2571, ROC-AUC=0.9030, PR-AUC=0.6633, F1=0.6504 @thresh=0.403


lightgbm CV:  40%|████████████▊                   | 2/5 [00:08<00:13,  4.41s/it]

Fold 2: acc=0.8918, logloss=0.2417, ROC-AUC=0.9158, PR-AUC=0.7009, F1=0.6885 @thresh=0.376


lightgbm CV:  60%|███████████████████▏            | 3/5 [00:14<00:09,  4.79s/it]

Fold 3: acc=0.8789, logloss=0.2481, ROC-AUC=0.9120, PR-AUC=0.6707, F1=0.6338 @thresh=0.292


lightgbm CV:  80%|█████████████████████████▌      | 4/5 [00:18<00:04,  4.63s/it]

Fold 4: acc=0.8892, logloss=0.2499, ROC-AUC=0.9129, PR-AUC=0.6803, F1=0.6415 @thresh=0.568


lightgbm CV: 100%|████████████████████████████████| 5/5 [00:22<00:00,  4.60s/it]


Fold 5: acc=0.8889, logloss=0.2295, ROC-AUC=0.9379, PR-AUC=0.6953, F1=0.6600 @thresh=0.625

Model: CATBOOST


catboost CV:  20%|██████▍                         | 1/5 [00:04<00:19,  4.75s/it]

Fold 1: acc=0.8814, logloss=0.2735, ROC-AUC=0.8993, PR-AUC=0.6448, F1=0.6412 @thresh=0.162


catboost CV:  40%|████████████▊                   | 2/5 [00:09<00:14,  4.82s/it]

Fold 2: acc=0.8995, logloss=0.2406, ROC-AUC=0.9113, PR-AUC=0.6884, F1=0.7154 @thresh=0.320


catboost CV:  60%|███████████████████▏            | 3/5 [00:14<00:09,  4.96s/it]

Fold 3: acc=0.8918, logloss=0.2338, ROC-AUC=0.9196, PR-AUC=0.7146, F1=0.6552 @thresh=0.364


catboost CV:  80%|█████████████████████████▌      | 4/5 [00:20<00:05,  5.14s/it]

Fold 4: acc=0.8840, logloss=0.2973, ROC-AUC=0.8879, PR-AUC=0.5718, F1=0.6078 @thresh=0.582


catboost CV: 100%|████████████████████████████████| 5/5 [00:25<00:00,  5.00s/it]


Fold 5: acc=0.8889, logloss=0.2356, ROC-AUC=0.9293, PR-AUC=0.6499, F1=0.6400 @thresh=0.325

Model: RANDOM_FOREST


random_forest CV:  20%|█████▍                     | 1/5 [00:02<00:08,  2.07s/it]

Fold 1: acc=0.8866, logloss=0.4051, ROC-AUC=0.8586, PR-AUC=0.5559, F1=0.6195 @thresh=0.498


random_forest CV:  40%|██████████▊                | 2/5 [00:04<00:06,  2.08s/it]

Fold 2: acc=0.8943, logloss=0.4020, ROC-AUC=0.9020, PR-AUC=0.6236, F1=0.6557 @thresh=0.458


random_forest CV:  60%|████████████████▏          | 3/5 [00:06<00:04,  2.09s/it]

Fold 3: acc=0.8711, logloss=0.4032, ROC-AUC=0.8872, PR-AUC=0.5806, F1=0.6026 @thresh=0.407


random_forest CV:  80%|█████████████████████▌     | 4/5 [00:08<00:02,  2.10s/it]

Fold 4: acc=0.8943, logloss=0.4005, ROC-AUC=0.8877, PR-AUC=0.6157, F1=0.6496 @thresh=0.479


random_forest CV: 100%|███████████████████████████| 5/5 [00:10<00:00,  2.10s/it]


Fold 5: acc=0.8941, logloss=0.4046, ROC-AUC=0.9132, PR-AUC=0.6851, F1=0.6496 @thresh=0.490

Model: LOGISTIC


logistic CV:  20%|██████▍                         | 1/5 [00:21<01:24, 21.02s/it]

Fold 1: acc=0.8608, logloss=0.3397, ROC-AUC=0.9038, PR-AUC=0.6761, F1=0.6408 @thresh=0.790


logistic CV:  40%|████████████▊                   | 2/5 [00:40<01:00, 20.18s/it]

Fold 2: acc=0.8686, logloss=0.3400, ROC-AUC=0.9118, PR-AUC=0.6196, F1=0.6885 @thresh=0.608


logistic CV:  60%|███████████████████▏            | 3/5 [01:01<00:40, 20.48s/it]

Fold 3: acc=0.8634, logloss=0.3301, ROC-AUC=0.9186, PR-AUC=0.6350, F1=0.6370 @thresh=0.531


logistic CV:  80%|█████████████████████████▌      | 4/5 [01:19<00:19, 19.67s/it]

Fold 4: acc=0.8737, logloss=0.3114, ROC-AUC=0.9032, PR-AUC=0.7233, F1=0.6833 @thresh=0.606


logistic CV: 100%|████████████████████████████████| 5/5 [01:39<00:00, 19.90s/it]


Fold 5: acc=0.8630, logloss=0.3121, ROC-AUC=0.9401, PR-AUC=0.7069, F1=0.6815 @thresh=0.562

CROSS-VALIDATION SUMMARY
Model           Accuracy     LogLoss      ROC-AUC      PR-AUC       F1          
--------------------------------------------------------------------------------
xgboost         0.8793±0.0067  0.2704±0.0086  0.9092±0.0127  0.6590±0.0349  0.6306±0.0253
lightgbm        0.8871±0.0044  0.2453±0.0093  0.9163±0.0116  0.6821±0.0142  0.6548±0.0190
catboost        0.8891±0.0063  0.2562±0.0251  0.9095±0.0146  0.6539±0.0484  0.6519±0.0353
random_forest   0.8881±0.0090  0.4031±0.0017  0.8897±0.0183  0.6122±0.0439  0.6354±0.0207
logistic        0.8659±0.0047  0.3267±0.0127  0.9155±0.0135  0.6722±0.0400  0.6662±0.0225

TRAINING FINAL MODELS ON FULL DATASET



Training final models:  20%|████▍                 | 1/5 [00:13<00:53, 13.30s/it]

Saved xgboost_final_model.joblib


Training final models:  40%|████████▊             | 2/5 [00:21<00:31, 10.50s/it]

Saved lightgbm_final_model.joblib

[CV] END C=1.613212872540044, l1_ratio=0.9296976523425731, max_iter=200, penalty=l2, solver=saga; total time=   5.2s
[CV] END C=6.335037565104235, l1_ratio=0.8714605901877177, max_iter=500, penalty=l2, solver=saga; total time=   8.1s
[CV] END C=5.394422419156507, l1_ratio=0.8074401551640625, max_iter=200, penalty=elasticnet, solver=saga; total time=   7.4s
[CV] END C=9.06928441545754, l1_ratio=0.2721322493846353, max_iter=1000, penalty=l2, solver=saga; total time=  10.6s
[CV] END C=1.6475585314294172, l1_ratio=0.534089419375442, max_iter=500, penalty=l1, solver=saga; total time=  15.7s
[CV] END C=6.925360328902704, l1_ratio=0.2694123337985215, max_iter=1000, penalty=elasticnet, solver=saga; total time=  18.9s
[CV] END C=3.233029320207552, l1_ratio=0.5187906217433661, max_iter=200, penalty=l1, solver=saga; total time=   6.5s
[CV] END C=0.6499224710898156, l1_ratio=0.2539154139343447, max_iter=500, penalty=elasticnet, solver=saga; total time=   2.3s
[CV

Training final models:  60%|█████████████▏        | 3/5 [00:27<00:16,  8.20s/it]

Saved catboost_final_model.joblib


Training final models:  80%|█████████████████▌    | 4/5 [00:29<00:05,  5.96s/it]

Saved random_forest_final_model.joblib


Training final models: 100%|██████████████████████| 5/5 [00:56<00:00, 11.25s/it]

Saved logistic_final_model.joblib





In [None]:
print(oof_preds)

In [61]:
from scipy.optimize import minimize
from sklearn.metrics import f1_score
# -----------------------------
# Ensemble predictions
# -----------------------------
print(f"\n{'='*80}")
print("ENSEMBLE PREDICTIONS")
print(f"{'='*80}\n")

# Weighted average based on CV F1 scores
#weights = {name: np.mean(model_scores[name]['f1']) for name in final_models.keys()}
#total_weight = sum(weights.values())
#weights = {name: w/total_weight for name, w in weights.items()}

##################### test


model_names = list(final_models.keys())
n_models = len(model_names)

def objective(weights):
    weights = np.array(weights)
    weights = weights / weights.sum()  # normalize
    ensemble_pred = sum(weights[i] * oof_preds[m] for i, m in enumerate(model_names))
    return log_loss(y_train, ensemble_pred)

# Initial equal weights
x0 = np.ones(n_models) / n_models

# Constraints: weights sum to 1
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
bounds = [(0, 1) for _ in range(n_models)]

res = minimize(objective, x0, bounds=bounds, constraints=constraints)
best_weights = res.x / res.x.sum()
weights = dict(zip(model_names, best_weights))


#####################






print("Model weights (based on CV F1):")
for name, weight in weights.items():
    print(f"  {name}: {weight:.4f}")

# Compute weighted ensemble
ensemble_probs = np.zeros(len(X_test_reduced))
for model_name, prob in test_predictions.items():
    ensemble_probs += weights[model_name] * prob

# Compute average threshold from all models
avg_threshold = np.mean([np.mean(model_scores[name]['thresh']) for name in final_models.keys()])
print(f"\nAverage optimal threshold: {avg_threshold:.4f}")

# -----------------------------
# Generate submissions
# -----------------------------
import pandas as pd

# Individual model submissions
for model_name, prob in test_predictions.items():
    thresh = np.mean(model_scores[model_name]['thresh'])
    pred = (prob >= thresh).astype(int)
    submission = pd.DataFrame({"id": ids_test, "label": pred})
    submission.to_csv(f"submission_{model_name}.csv", index=False)
    print(f"Saved submission_{model_name}.csv (threshold={thresh:.4f})")

# Ensemble submission
ensemble_pred = (ensemble_probs >= avg_threshold).astype(int)
submission_ensemble = pd.DataFrame({"id": ids_test, "label": ensemble_pred})
submission_ensemble.to_csv("submission_ensemble.csv", index=False)
print(f"\nSaved submission_ensemble.csv (weighted average, threshold={avg_threshold:.4f})")

# Conservative ensemble (require multiple models to agree)
conservative_pred = np.zeros(len(X_test_reduced))
for prob in test_predictions.values():
    conservative_pred += (prob >= 0.5).astype(int)
conservative_pred = (conservative_pred >= 3).astype(int)  # At least 3 models agree
submission_conservative = pd.DataFrame({"id": ids_test, "label": conservative_pred})
submission_conservative.to_csv("submission_conservative.csv", index=False)
print(f"Saved submission_conservative.csv (majority voting, 3/5 agreement)")

print(f"\n{'='*80}")
print("COMPLETE!")
print(f"{'='*80}")


ENSEMBLE PREDICTIONS

Model weights (based on CV F1):
  xgboost: 0.0000
  lightgbm: 0.4692
  catboost: 0.4560
  random_forest: 0.0000
  logistic: 0.0747

Average optimal threshold: 0.4678
Saved submission_xgboost.csv (threshold=0.4499)
Saved submission_lightgbm.csv (threshold=0.4527)
Saved submission_catboost.csv (threshold=0.3503)
Saved submission_random_forest.csv (threshold=0.4665)
Saved submission_logistic.csv (threshold=0.6194)

Saved submission_ensemble.csv (weighted average, threshold=0.4678)
Saved submission_conservative.csv (majority voting, 3/5 agreement)

COMPLETE!
