In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
from copy import deepcopy
from functools import partial
from itertools import combinations
import random
import gc

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from category_encoders import OrdinalEncoder, CountEncoder, CatBoostEncoder, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer, LabelEncoder # OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.decomposition import PCA, NMF
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Import libraries for Hypertuning
import optuna

# Import libraries for gradient boosting
import lightgbm as lgb
import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.svm import NuSVC, SVC
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

# Useful line of code to set the display option so we could see all the columns in pd dataframe
pd.set_option('display.max_columns', None)

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")
sample_submission = pd.read_csv("./sample_submission.csv")
original = pd.read_csv("./machine failure.csv")

print(f'Data Successfully Loaded \n')

target_col = 'Machine failure'

num_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]'
]
binary_cols = [
    'TWF',
    'HDF',
    'PWF',
    'OSF',
    'RNF'
]
cat_cols = df_test.select_dtypes(include=['object']).columns.tolist()

df_train['is_generated'] = 1
df_test['is_generated'] = 1
original['is_generated'] = 0

print(f'[INFO] Shapes:'
      f'\n original: {original.shape}'
      f'\n train: {df_train.shape}'
      f'\n test: {df_test.shape}\n')

print(f'[INFO] Any missing values:'
      f'\n original: {original.isna().any().any()}'
      f'\n train: {df_train.isna().any().any()}'
      f'\n test: {df_test.isna().any().any()}')

Data Successfully Loaded 

[INFO] Shapes:
 original: (10000, 15)
 train: (136429, 15)
 test: (90954, 14)

[INFO] Any missing values:
 original: False
 train: False
 test: False


In [5]:
def conversion(df):
    df['Process temperature [K]'] = df['Process temperature [K]'] - 272.15
    df['Air temperature [K]'] = df['Air temperature [K]'] - 272.15
    
    return df

def create_features(df):
  
    # Create a new feature by divided 'Air temperature' from 'Process temperature'
    df["Temperature ratio"] = df['Process temperature [K]'] / df['Air temperature [K]']
    
    # Create a new feature by multiplying 'Torque' and 'Rotational speed'
    df['Torque * Rotational speed'] = df['Torque [Nm]'] * df['Rotational speed [rpm]']

    # Create a new feature by multiplying 'Torque' by 'Tool wear'
    df['Torque * Tool wear'] = df['Torque [Nm]'] * df['Tool wear [min]']
    
    # Create a new feature by multiplying 'Torque' by 'Rotational speed'
    df['Torque * Rotational speed'] = df['Torque [Nm]'] * df['Rotational speed [rpm]']
        
    new_cols = [ 
        'Temperature ratio', 
        'Torque * Rotational speed',
        'Torque * Tool wear',  
        'Torque * Rotational speed'
    ]
    
    return df, new_cols

def replace_Type(df):
    
    df['Type'] = df['Type'].replace({'L':0,'M':1,'H':2})
    
    return df

def cat_encoder(X_train, X_test, cat_cols, encode='label'):
    
    if encode == 'label':
        ## Label Encoder
        encoder = OrdinalEncoder(cols=cat_cols)
        train_encoder = encoder.fit_transform(X_train[cat_cols]).astype(int)
        test_encoder = encoder.transform(X_test[cat_cols]).astype(int)
        X_train[cat_cols] = train_encoder[cat_cols]
        X_test[cat_cols] = test_encoder[cat_cols]
        encoder_cols = cat_cols
    
    else:
        ## OneHot Encoder
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        train_encoder = encoder.fit_transform(X_train[cat_cols]).astype(int)
        test_encoder = encoder.transform(X_test[cat_cols]).astype(int)
        X_train = pd.concat([X_train, train_encoder], axis=1)
        X_test = pd.concat([X_test, test_encoder], axis=1)
        X_train.drop(cat_cols, axis=1, inplace=True)
        X_test.drop(cat_cols, axis=1, inplace=True)
        encoder_cols = list(train_encoder.columns)
        
    return X_train, X_test, encoder_cols

def rename_cols(df):
    df.rename(columns={"Process temperature [K]": "Process temperature C", "Air temperature [K]": "Air temperature C"}, inplace=True)
    df.columns = df.columns.str.replace('[\[\]]', '', regex=True)
    return df

In [6]:
train = pd.concat([df_train, original])
test = df_test.copy()

X_train = train.drop([f'{target_col}'],axis=1).reset_index(drop=True)
y_train = train[f'{target_col}'].reset_index(drop=True)
X_test = test.reset_index(drop=True)

# Conversion
X_train = conversion(X_train)
X_test = conversion(X_test)

# Category Encoders
X_train = replace_Type(X_train)
X_test = replace_Type(X_test)
X_train, X_test, _ = cat_encoder(X_train, X_test, ['Product ID'], encode='label')
cat_cols = ['Type', 'Product ID']

# Create Features
new_cols = []
X_train, _ = create_features(X_train)
X_test, new_cols = create_features(X_test)

# StandardScaler
sc = StandardScaler() # MinMaxScaler or StandardScaler
X_train[num_cols+new_cols] = sc.fit_transform(X_train[num_cols+new_cols])
X_test[num_cols+new_cols] = sc.transform(X_test[num_cols+new_cols])

# Drop_col
drop_cols = ['id', 'is_generated', 'RNF'] # binary_cols
X_train.drop(drop_cols, axis=1, inplace=True)
X_test.drop(drop_cols, axis=1, inplace=True)

# Rename
X_train = rename_cols(X_train)
X_test = rename_cols(X_test)

del train, test, df_train, df_test

X_train.drop(['UDI'], axis=1, inplace=True)

print(f"X_train shape :{X_train.shape} , y_train shape :{y_train.shape}")
print(f"X_test shape :{X_test.shape}")

X_train.head()

X_train shape :(146429, 14) , y_train shape :(146429,)
X_test shape :(90954, 14)


Unnamed: 0,Product ID,Type,Air temperature C,Process temperature C,Rotational speed rpm,Torque Nm,Tool wear min,TWF,HDF,PWF,OSF,Temperature ratio,Torque * Rotational speed,Torque * Tool wear
0,1,0,0.388563,-0.248148,0.524194,-0.490542,0.552766,0,0,0,0,-0.91314,-0.289927,0.300523
1,2,1,1.456753,1.547557,1.672486,-1.303478,1.491004,0,0,0,0,-0.992249,-0.961066,0.575923
2,3,0,-0.305761,-1.038258,1.996544,-1.605426,-1.245523,0,0,0,0,-0.504422,-1.311256,-1.278352
3,4,0,0.602201,0.685618,0.016972,0.461755,1.444092,0,0,0,0,-0.42643,0.743354,1.621112
4,5,1,-1.000084,-0.679117,0.841207,-0.571836,-1.104788,0,0,0,0,1.06907,-0.240255,-1.083811


In [7]:
class Splitter:
    def __init__(self, n_splits=5, cat_df=pd.DataFrame(), test_size=0.5):
        self.n_splits = n_splits
        self.cat_df = cat_df
        self.test_size = test_size

    def split_data(self, X, y, random_state_list):
        for random_state in random_state_list:
            kf = KFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
            for train_index, val_index in kf.split(X, y):
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                yield X_train, X_val, y_train, y_val, val_index

In [8]:
class Classifier:
    def __init__(self, n_estimators=100, device="cpu", random_state=42):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self.get_models()
        self.models_name = list(self.get_models().keys())
        self.len_models = len(self.models)
        
    def get_models(self):
        
        xgb_optuna1 = {
            'n_estimators': 1500,
            'learning_rate': 0.08901459197907591,
            'booster': 'gbtree',
            'lambda': 8.550251116462702,
            'alpha': 6.92130114930949,
            'eta': 0.7719873740829137,
            'grow_policy': 'lossguide',
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'random_state': self.random_state
        }
        
        xgb_optuna2 = {
            'n_estimators': 550,
            'learning_rate': 0.014551680348136895,
            'booster': 'gbtree',
            'lambda': 0.028738149876528587,
            'alpha': 0.014056635017117198,
            'subsample': 0.538653498449084,
            'colsample_bytree': 0.518050828371974, 
            'max_depth': 4, 'min_child_weight': 4,
            'eta': 0.6953619445477833,
            'gamma': 0.9036568111424781,
            'grow_policy': 'lossguide',
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'random_state': self.random_state
        }
        
        xgb1_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.0503196477566407,
            'booster': 'gbtree',
            'lambda': 0.00379319640405843,
            'alpha': 0.106754104302093,
            'subsample': 0.938028434508189,
            'colsample_bytree': 0.212545425027345,
            'max_depth': 9,
            'min_child_weight': 2,
            'eta': 1.03662446190642E-07,
            'gamma': 0.000063826049787043,
            'grow_policy': 'lossguide',
            'n_jobs': -1,
            'objective': 'binary:logistic',
            #'eval_metric': 'auc',
            'verbosity': 0,
            'random_state': self.random_state,
        }
        xgb2_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.00282353606391198,
            'booster': 'gbtree',
            'lambda': 0.399776698351379,
            'alpha': 1.01836149061356E-07,
            'subsample': 0.957123754766769,
            'colsample_bytree': 0.229857555596548,
            'max_depth': 9,
            'min_child_weight': 4,
            'eta': 2.10637756839133E-07,
            'gamma': 0.00314857715085414,
            'grow_policy': 'depthwise',
            'n_jobs': -1,
            'objective': 'binary:logistic',
            #'eval_metric': 'auc',
            'verbosity': 0,
            'random_state': self.random_state,
        }
        xgb3_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.00349356650247156,
            'booster': 'gbtree',
            'lambda': 0.0002963239871324443,
            'alpha': 0.0000162103492458353,
            'subsample': 0.822994064549709,
            'colsample_bytree': 0.244618079894501,
            'max_depth': 10,
            'min_child_weight': 2,
            'eta': 8.03406601824666E-06,
            'gamma': 3.91180893163099E-07,
            'grow_policy': 'depthwise',
            'n_jobs': -1,
            'objective': 'binary:logistic',
            #'eval_metric': 'auc',
            'verbosity': 0,
            'random_state': self.random_state,
        }
        
        lgb_optuna1 = {
            'num_iterations': 200,
            'learning_rate': 0.024714536811915398,
            'max_depth': 9,
            'lambda': 9.498413255934212,
            'alpha': 7.627590925937886,
            'subsample': 0.9680186598781285,
            'colsample_bytree': 0.5645599877042381,
            'min_child_weight': 1,
            'device': self.device,
            'random_state': self.random_state
        }
        
        lgb_optuna2 = {
            'num_iterations': 950,
            'learning_rate': 0.012019976156417951,
            'max_depth': 4,
            'lambda': 6.958643473661789,
            'alpha': 0.0012598800466591953, 
            'subsample': 0.9344619448867001,
            'colsample_bytree': 0.9864399750557648, 
            'min_child_weight': 1,
            'device': self.device,
            'random_state': self.random_state
        }
        
        lgb1_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.0124415817896377,
            'reg_alpha': 0.00139174509988134,
            'reg_lambda': 0.000178964551019674,
            'num_leaves': 249,
            'colsample_bytree': 0.675264038614975,
            'subsample': 0.421482143660471,
            'subsample_freq': 4,
            'min_child_samples': 8,
            'objective': 'binary',
            'metric': 'binary_error',
            'boosting_type': 'gbdt',
            'is_unbalance':True,
            # 'n_jobs': -1,
            #'force_row_wise': True,
            'device': self.device,
            'random_state': self.random_state
        }
        lgb2_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.0247403801218241,
            'reg_alpha': 6.84813726047269E-06,
            'reg_lambda': 3.40443691552308E-08,
            'num_leaves': 223,
            'colsample_bytree': 0.597332047776164,
            'subsample': 0.466442641250326,
            'subsample_freq': 2,
            'min_child_samples': 5,
            'objective': 'binary',
            'metric': 'binary_error',
            'boosting_type': 'gbdt',
            'is_unbalance':True,
            # 'n_jobs': -1,
            #'force_row_wise': True,
            'device': self.device,
            'random_state': self.random_state
        }
        lgb3_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.0109757020463629,
            'reg_alpha': 0.174927073496136,
            'reg_lambda': 2.45325882544558E-07,
            'num_leaves': 235,
            'colsample_bytree': 0.756605772162953,
            'subsample': 0.703911560320816,
            'subsample_freq': 5,
            'min_child_samples': 21,
            'objective': 'binary',
            'metric': 'binary_error',
            'boosting_type': 'gbdt',
            'is_unbalance':True,
            # 'n_jobs': -1,
            #'force_row_wise': True,
            'device': self.device,
            'random_state': self.random_state
        }
        
        cat_optuna1 = {
            'iterations': 600,
            'learning_rate': 0.019499308200732167,
            'depth': 8,
            'l2_leaf_reg': 9.024309909697191,
            'bagging_temperature': 7.9669359481998825,
            'random_strength': 5.293875378529096,
            'border_count': 235,
            'auto_class_weights': 'Balanced',
            'task_type': self.device.upper(),
            'verbose': False,
            'allow_writing_files': False,
            'random_state': self.random_state
        }
        
        cat_optuna2 = {
            'iterations': 1000,
            'learning_rate': 0.013171032440433215,
            'depth': 5, 
            'l2_leaf_reg': 2.805405544410651,
            'bagging_temperature': 5.869195302151575,
            'random_strength': 9.103415468292203,
            'task_type': self.device.upper(),
            'verbose': False,
            'allow_writing_files': False,
            'random_state': self.random_state
        }
        
        cat1_params = {
            'iterations': self.n_estimators,
            'depth': 3,
            'learning_rate': 0.020258010893459,
            'l2_leaf_reg': 0.583685138705941,
            'random_strength': 0.177768021213223,
            'od_type': "Iter", 
            'od_wait': 116,
            'bootstrap_type': "Bayesian",
            'grow_policy': 'Depthwise',
            'bagging_temperature': 0.478048798393903,
            'eval_metric': 'Logloss', # AUC
            'loss_function': 'Logloss',
            'auto_class_weights': 'Balanced',
            'task_type': self.device.upper(),
            'verbose': False,
            'allow_writing_files': False,
            'random_state': self.random_state
        }
        cat2_params = {
            'iterations': self.n_estimators,
            'depth': 5,
            'learning_rate': 0.00666304601039438,
            'l2_leaf_reg': 0.0567881687170355,
            'random_strength': 0.00564702921370138,
            'od_type': "Iter", 
            'od_wait': 93,
            'bootstrap_type': "Bayesian",
            'grow_policy': 'Depthwise',
            'bagging_temperature': 2.48298505165348,
            'eval_metric': 'Logloss', # AUC
            'loss_function': 'Logloss',
            'auto_class_weights': 'Balanced',
            'task_type': self.device.upper(),
            'verbose': False,
            'allow_writing_files': False,
            'random_state': self.random_state
        }
        cat3_params = {
            'iterations': self.n_estimators,
            'depth': 5,
            'learning_rate': 0.0135730417743519,
            'l2_leaf_reg': 0.0597353604503262,
            'random_strength': 0.0675876600077264,
            'od_type': "Iter", 
            'od_wait': 122,
            'bootstrap_type': "Bayesian",
            'grow_policy': 'Depthwise',
            'bagging_temperature': 1.85898154006468,
            'eval_metric': 'Logloss', # AUC
            'loss_function': 'Logloss',
            'auto_class_weights': 'Balanced',
            'task_type': self.device.upper(),
            'verbose': False,
            'allow_writing_files': False,
            'random_state': self.random_state
        }
        
        models = {
            "xgbo1": xgb.XGBClassifier(**xgb_optuna1),
            "xgbo2": xgb.XGBClassifier(**xgb_optuna2),
            "xgb1": xgb.XGBClassifier(**xgb1_params),
            "xgb2": xgb.XGBClassifier(**xgb2_params),
            "xgb3": xgb.XGBClassifier(**xgb3_params),
            "lgbo1": lgb.LGBMClassifier(**lgb_optuna1),
            "lgbo2": lgb.LGBMClassifier(**lgb_optuna2),
            #"lgb1": lgb.LGBMClassifier(**lgb1_params),
            #"lgb2": lgb.LGBMClassifier(**lgb2_params),
            #"lgb3": lgb.LGBMClassifier(**lgb3_params),
            "cato1": CatBoostClassifier(**cat_optuna1),
            "cato2": CatBoostClassifier(**cat_optuna2),
            #"cat1": CatBoostClassifier(**cat1_params),
            #"cat2": CatBoostClassifier(**cat2_params),
           # "cat3": CatBoostClassifier(**cat3_params),
            #'rf': RandomForestClassifier(n_estimators=500, n_jobs=-1, class_weight="balanced", random_state=self.random_state),
            #'lr': LogisticRegressionCV(max_iter=2000, random_state=self.random_state)
        }
        return models

In [9]:
class OptunaWeights:
    def __init__(self, random_state, n_trials=100):
        self.study = None
        self.weights = None
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        # Define the weights for the predictions from each model
        weights = [trial.suggest_float(f"weight{n}", 1e-15, 1) for n in range(len(y_preds))]

        # Calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=weights)

        # Calculate the score for the weighted prediction
        score = roc_auc_score(y_true, weighted_pred)
        return score

    def fit(self, y_true, y_preds):
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        pruner = optuna.pruners.HyperbandPruner()
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights", direction='maximize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]

    def predict(self, y_preds):
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        self.fit(y_true, y_preds)
        return self.predict(y_preds)
    
    def weights(self):
        return self.weights

In [10]:
n_splits = 10
random_state = 42
random_state_list =[42]
n_estimators = 100
device = 'cpu'
early_stopping_rounds = 444
verbose = False

# Split Data
splitter = Splitter(n_splits=n_splits, cat_df= y_train)
splits = splitter.split_data(X_train, y_train, random_state_list=random_state_list)

# Initialize an array for storing test predictions
classifier = Classifier(n_estimators=n_estimators, device=device, random_state=random_state)
test_predss = np.zeros((X_test.shape[0]))
oof_predss = np.zeros((X_train.shape[0]))
ensemble_score = []
weights = []
models_name = [_ for _ in classifier.models_name if ('xgb' in _) or ('lgb' in _) or ('cat' in _)]
trained_models = dict(zip(models_name, [[] for _ in range(classifier.len_models)]))
score_dict = dict(zip(classifier.models_name, [[] for _ in range(len(classifier.models_name))]))

for i, (X_train_, X_val, y_train_, y_val, val_index) in enumerate(splits):
    
    n = i % n_splits
    m = i // n_splits
    

    # Classifier models
    classifier = Classifier(n_estimators, device, random_state)
    models = classifier.models

    # Store oof and test predictions for each base model
    oof_preds = []
    test_preds = []

    # Loop over each base model and fit it
    for name, model in models.items():
        if ('xgb' in name) or ('lgb' in name):
            model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], early_stopping_rounds=early_stopping_rounds, verbose=verbose)
            
        elif 'cat' in name :
                model.fit(
                    Pool(X_train_, y_train_, cat_features=cat_cols), eval_set=Pool(X_val, y_val, cat_features=cat_cols),
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
        else:
            model.fit(X_train_, y_train_)
            
        if name in trained_models.keys():
            trained_models[f'{name}'].append(deepcopy(model))

        test_pred = model.predict_proba(X_test)[:, 1]
        y_val_pred = model.predict_proba(X_val)[:, 1]

        score = roc_auc_score(y_val, y_val_pred)
        score_dict[name].append(score)
        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] ROC-AUC score: {score:.5f}')

        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)

    # Use OptunaWeights
    optweights = OptunaWeights(random_state)
    y_val_pred = optweights.fit_predict(y_val.values, oof_preds)

    score = roc_auc_score(y_val, y_val_pred)
    print(f'Ensemble [FOLD-{n} SEED-{random_state_list[m]}] ROC-AUC score {score:.5f} \n')
    ensemble_score.append(score)
    weights.append(optweights.weights)

    # Predict to X_test by the best ensemble weights
    test_predss += optweights.predict(test_preds) / (n_splits * len(random_state_list))
    oof_predss[X_val.index] = optweights.predict(oof_preds)

    gc.collect()

xgbo1 [FOLD-0 SEED-42] ROC-AUC score: 0.97044
xgbo2 [FOLD-0 SEED-42] ROC-AUC score: 0.96803
xgb1 [FOLD-0 SEED-42] ROC-AUC score: 0.97551
xgb2 [FOLD-0 SEED-42] ROC-AUC score: 0.96867
xgb3 [FOLD-0 SEED-42] ROC-AUC score: 0.97720
lgbo1 [FOLD-0 SEED-42] ROC-AUC score: 0.97088
lgbo2 [FOLD-0 SEED-42] ROC-AUC score: 0.96916
cato1 [FOLD-0 SEED-42] ROC-AUC score: 0.98007
cato2 [FOLD-0 SEED-42] ROC-AUC score: 0.98125
Ensemble [FOLD-0 SEED-42] ROC-AUC score 0.98050 

xgbo1 [FOLD-1 SEED-42] ROC-AUC score: 0.96599
xgbo2 [FOLD-1 SEED-42] ROC-AUC score: 0.96386
xgb1 [FOLD-1 SEED-42] ROC-AUC score: 0.96813
xgb2 [FOLD-1 SEED-42] ROC-AUC score: 0.96054
xgb3 [FOLD-1 SEED-42] ROC-AUC score: 0.96743
lgbo1 [FOLD-1 SEED-42] ROC-AUC score: 0.96633
lgbo2 [FOLD-1 SEED-42] ROC-AUC score: 0.96650
cato1 [FOLD-1 SEED-42] ROC-AUC score: 0.97801
cato2 [FOLD-1 SEED-42] ROC-AUC score: 0.97672
Ensemble [FOLD-1 SEED-42] ROC-AUC score 0.97810 

xgbo1 [FOLD-2 SEED-42] ROC-AUC score: 0.95914
xgbo2 [FOLD-2 SEED-42] ROC-AUC s

In [11]:
sample_submission[f'{target_col}'] = test_predss
sample_submission.to_csv(f'submission.csv', index=False)
sample_submission

Unnamed: 0,id,Machine failure
0,136429,0.081740
1,136430,0.083714
2,136431,0.080263
3,136432,0.083978
4,136433,0.081691
...,...,...
90949,227378,0.083161
90950,227379,0.087410
90951,227380,0.084344
90952,227381,0.092627
