<a href="https://www.kaggle.com/code/m000sey/ps-3-23-7-model-ensemble?scriptVersionId=151046151" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Trying different models and stacking to see what works
- three major sources were the most helpful for this notebook ([here](https://www.kaggle.com/competitions/playground-series-s3e23/discussion/445245), [here](https://www.kaggle.com/code/adelinmil/pg-s03e23-quick-and-dirty-implementation/notebook), and [here](https://www.kaggle.com/code/ambrosm/pss3e23-eda-which-makes-sense)) Check out their work! They are far smarter than I.

In [1]:
### Import libraries and select file path
import pandas as pd
import platform
from sklearn.model_selection import StratifiedKFold


if platform.system() == "Linux":
    train_path = '/kaggle/input/playground-series-s3e23/train.csv'
    test_path = '/kaggle/input/playground-series-s3e23/test.csv'
    sample_path = '/kaggle/input/playground-series-s3e23/sample_submission.csv'
    print("Using Linux file path...")
else:
    raise OSError("Unsupported operating system")

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample = pd.read_csv(sample_path)

# Small bit of feature engineering
train['defects'] = train['defects'].map({False: 0, True: 1})
test_id = test['id']

X_train = train.iloc[:, :-1]
y_train = train['defects']

kf = StratifiedKFold(n_splits=12)

print("Data added...")

Using Linux file path...
Data added...


In [2]:
!pip install xgboost==2.0.0
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import time


XGB = XGBClassifier(random_state=42,
                    n_estimators=880,
                    eta=0.02,
                    gamma=6.3,
                    max_depth=8,
                    colsample_bytree=0.64,
                    reg_lambda=5.7,
                    reg_alpha=9.4,
                    subsample=0.6,
                    min_child_weight=2.6,
                    scale_pos_weight=2.7)

XGB_start_time = time.time() 

scores = cross_val_score(XGB, X_train, y_train, scoring='roc_auc', cv=kf)

XGB_end_time = time.time()

XGB_run_time_min = (XGB_end_time - XGB_start_time)/60

print(f'{XGB.__class__.__name__} auroc score: {scores.mean()}')
print(f'{XGB.__class__.__name__} run time: {XGB_run_time_min:.2f} minutes')

Collecting xgboost==2.0.0
  Downloading xgboost-2.0.0-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 1.7.6
    Uninstalling xgboost-1.7.6:
      Successfully uninstalled xgboost-1.7.6
Successfully installed xgboost-2.0.0
XGBClassifier auroc score: 0.7928723295937307
XGBClassifier run time: 1.24 minutes


In [3]:
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
import time

CAT = CatBoostClassifier(n_estimators=1000,
                         learning_rate=0.04,
                         max_depth=3,
                         l2_leaf_reg=0.00771,
                         border_count=196,
                         bagging_temperature=0.04,
                         one_hot_max_size=98,
                         loss_function='CrossEntropy',
                         random_state=42, 
                         verbose=0)

CAT_start_time = time.time() 

scores = cross_val_score(CAT, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)

CAT_end_time = time.time()

CAT_run_time_min = (CAT_end_time - CAT_start_time)/60

print(f'{CAT.__class__.__name__} auroc score: {scores.mean()}')
print(f'{CAT.__class__.__name__} run time: {CAT_run_time_min:.2f} minutes')

CatBoostClassifier auroc score: 0.791936001911021
CatBoostClassifier run time: 4.05 minutes


In [4]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
import time

LGBM = LGBMClassifier(random_state=42,
                      n_estimators=230,
                      learning_rate=0.05,
                      max_depth=9,
                      num_leaves=20,
                      min_child_samples=6,
                      subsample=0.9,
                      colsample_bytree=0.5,
                      reg_alpha=9.7,
                      reg_lambda=9.5,
                      scale_pos_weight=1.2)

LGBM_start_time = time.time() 

scores = cross_val_score(LGBM, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)

LGBM_end_time = time.time()

LGBM_run_time_min = (LGBM_end_time - LGBM_start_time)/60

print(f'{LGBM.__class__.__name__} auroc score: {scores.mean()}')
print(f'{LGBM.__class__.__name__} run time: {LGBM_run_time_min:.2f} minutes')

LGBMClassifier auroc score: 0.7928270190192018
LGBMClassifier run time: 0.94 minutes


In [5]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
import time

HGB = HistGradientBoostingClassifier(random_state=42,
                                     max_iter=244,
                                     max_depth=4, 
                                     learning_rate=0.077,
                                     n_iter_no_change=14,
                                     tol=2.2045e-05,
                                     validation_fraction=0.16377,
                                     max_bins=78,
                                     l2_regularization=8.9424657,
                                     max_leaf_nodes=84)
    
    
HGB_start_time = time.time() 

scores = cross_val_score(HGB, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)

HGB_end_time = time.time()

HGB_run_time_min = (HGB_end_time - HGB_start_time)/60

print(f'{HGB.__class__.__name__} auroc score: {scores.mean()}')
print(f'{HGB.__class__.__name__} run time: {HGB_run_time_min:.2f} minutes')

HistGradientBoostingClassifier auroc score: 0.7919029899861955
HistGradientBoostingClassifier run time: 0.58 minutes


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import time

RF = RandomForestClassifier(n_estimators=450,
                            max_depth=10,
                            max_features=1.0,
                            min_samples_leaf=150,
                            min_samples_split=12,
                            criterion='entropy',
                            random_state=42)

RF_start_time = time.time() 

scores = cross_val_score(RF, X_train, y_train, scoring='roc_auc', cv=kf, n_jobs=-1)

RF_end_time = time.time()

RF_run_time_min = (RF_end_time - RF_start_time)/60 

print(f'{RF.__class__.__name__} auroc score: {scores.mean()}')
print(f'{RF.__class__.__name__} run time: {RF_run_time_min:.2f} minutes')

RandomForestClassifier auroc score: 0.7913795361413465
RandomForestClassifier run time: 85.57 minutes


In [7]:
from sklearn.ensemble import ExtraTreesClassifier  
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
import time
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import make_pipeline

# this code came directly from this discussion: 
# https://www.kaggle.com/competitions/playground-series-s3e23/discussion/446078#2478396

ExT_start_time = time.time() 

ExT = make_pipeline(PowerTransformer(), 
                    ExtraTreesClassifier(min_samples_leaf=100,
                                         max_features=1.0,
                                         max_depth=24,
                                         min_samples_split=0.1,
                                         criterion='entropy',
                                         class_weight='balanced',
                                         n_estimators=480,
                                         random_state=42))
kf = RepeatedStratifiedKFold(n_splits=12,n_repeats=1,random_state=42)
drop_cols = ['iv(g)', 't', 'b', 'n', 'lOCode', 'v', 'branchCount', 'e', 'i', 'lOComment']
scores = cross_val_score(ExT,X_train.drop(drop_cols,axis=1),
                         y_train,scoring='roc_auc', 
                         cv=kf,
                         n_jobs=-1)

ExT_end_time = time.time()

ExT_run_time_min = (ExT_end_time - ExT_start_time)/60

print(f'{ExT.__class__.__name__} auroc score: {scores.mean()}')
print(f'{ExT.__class__.__name__} run time: {ExT_run_time_min:.2f} minutes')

Pipeline auroc score: 0.7877073000475673
Pipeline run time: 12.20 minutes


In [8]:
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
import time

# Non-tree based model
pipeline = make_pipeline(
    FunctionTransformer(np.log1p),
    Nystroem(n_components=470, random_state=42),
    StandardScaler(),
    LogisticRegression(
        dual=False,
        C=0.0024,
        class_weight='balanced',
        max_iter=1500,
        random_state=42,
        solver='newton-cholesky'
    )
)

pipe_start_time = time.time() 

scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=kf)

pipe_end_time = time.time()

pipe_run_time_min = (pipe_end_time - pipe_start_time)/60

print(f'{pipeline.__class__.__name__} auroc score: {scores.mean()}')
print(f'{pipeline.__class__.__name__} run time: {pipe_run_time_min:.2f} minutes')

Pipeline auroc score: 0.7907975400936346
Pipeline run time: 1.50 minutes


In [9]:
from sklearn.ensemble import VotingClassifier
import time

vclf = VotingClassifier(
     estimators=[('XGB', XGB),
                 ('CAT', CAT),
                 ('LGBM', LGBM),
                 ('HGB', HGB),
                 ('RF', RF),
                 ('ExT', ExT),
                 ('LR', pipeline)],
     voting='soft',
     weights=[0.35, 0.2, 0.2, 0.05, 0.05, 0.05, 0.1])

vclf_start_time = time.time()  

scores = cross_val_score(vclf, X_train, y_train, scoring='roc_auc', cv=kf)
mean_score = scores.mean()

vclf_end_time = time.time()

vclf_run_time_min = (vclf_end_time - vclf_start_time)/60

print(f'{vclf.__class__.__name__} auroc score: {mean_score}')
print(f'{vclf.__class__.__name__} run time: {vclf_run_time_min:.2f} minutes')

VotingClassifier auroc score: 0.7930385106780653
VotingClassifier run time: 97.20 minutes


In [10]:
# Submission form

vclf.fit(X_train, y_train)
submission = pd.DataFrame({'id': test_id,
                          'defects': pd.Series(vclf.predict_proba(test)[:, 1])})
submission.head()
submission.to_csv('submission.csv', index = False)