In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Lift pandas' display truncation for both columns and rows
# (None means "no limit" for these options).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Show the installed scikit-learn version; as the last expression in the cell
# it is rendered as the cell's output.
import sklearn
sklearn.__version__

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier

In [None]:
# Load the competition's train / test / sample-submission CSV files.
train, test, sample_solution = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-sep-2021/train.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
        "../input/tabular-playground-series-sep-2021/sample_solution.csv",
    )
)

In [None]:
# Constructor keyword arguments for the three gradient-boosting classifiers.
# Each dict is unpacked with ** into the matching estimator class.
# NOTE(review): the fractional values look like the output of a tuning run -- confirm.

# XGBoost
xgb_params = {
    "n_estimators": 3600,
    "reg_lambda": 3,
    "reg_alpha": 26,
    "subsample": 0.6000000000000001,
    "colsample_bytree": 0.6000000000000001,
    "max_depth": 9,
    "min_child_weight": 5,
    "gamma": 13.054739572819486,
    "learning_rate": 0.01,
    # GPU settings
    "tree_method": "gpu_hist",
    "booster": "gbtree",
}

# LightGBM
lgbm_params = {
    "objective": "binary",
    "learning_rate": 0.008,
    "device": "gpu",
    "n_estimators": 3205,
    "num_leaves": 184,
    "min_child_samples": 63,
    "feature_fraction": 0.6864594334728974,
    "bagging_fraction": 0.9497327922401265,
    "bagging_freq": 1,
    "reg_alpha": 19,
    "reg_lambda": 19,
    # GPU settings
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
    "verbose": -1,
}

# CatBoost
catb_params = {
    "iterations": 15585,
    "objective": "CrossEntropy",
    "bootstrap_type": "Bernoulli",
    "od_wait": 1144,
    "learning_rate": 0.023575206684596582,
    "reg_lambda": 36.30433203563295,
    "random_strength": 43.75597655616195,
    "depth": 7,
    "min_data_in_leaf": 11,
    "leaf_estimation_iterations": 1,
    "subsample": 0.8227911142845009,
    # GPU settings
    "task_type": "GPU",
    "devices": "0",
    "verbose": 0,
}

# ---- quantile (75th percentile) + median imputation ----

In [None]:
# Quantile + median dataset: build the feature matrix X and target Y from `train`.
# Columns listed in `median_set` are imputed with their median; every other
# f1..f118 column is imputed with its 75th percentile.
median_set = ['f9', 'f12', 'f26', 'f27', 'f28', 'f32', 'f33', 'f35', 'f62', 'f74', 'f82', 'f86', 'f98', 'f108', 'f116']
feature_cols = [f'f{i}' for i in range(1, 119)]

# One fill value per feature column, computed on `train`.
fill_values = train[feature_cols].quantile(0.75)
fill_values.loc[median_set] = train[median_set].median()

# DataFrame.fillna with a Series fills column-by-column (matched on column
# label) and returns a new frame. This replaces the previous
# `frame[col].fillna(..., inplace=True)` calls: an in-place fill on a column
# selection is a deprecated chained-assignment pattern and does not update the
# parent frame under pandas Copy-on-Write, which would leave NaNs in X.
quantile_train = train.fillna(fill_values)

drop_set = ['id', 'claim']
X = quantile_train.drop(drop_set, axis=1)
Y = quantile_train['claim']

# Drop the reference to the intermediate frame; only X and Y are used below.
quantile_train = None

In [None]:
# 5-fold stratified out-of-fold (OOF) AUC for XGBoost, LightGBM and CatBoost
# on the current X / Y.
# Each entry: (label used in the summary line, estimator class, constructor params).
cv_candidates = [
    ('XgBoost', XGBClassifier, xgb_params),
    ('LGBM', LGBMClassifier, lgbm_params),
    ('Catboost', CatBoostClassifier, catb_params),
]

# With an integer random_state every skf.split(X, Y) call yields the same
# folds, so all three models are scored on identical splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for label, estimator_cls, params in cv_candidates:
    model = estimator_cls(**params)
    # OOF buffer sized from the data rather than a hard-coded row count.
    train_oof = np.zeros(len(X))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

        # No fit-time `verbose` argument: LightGBM 4.x rejects it, LightGBM and
        # CatBoost are already silenced through their params, and XGBoost only
        # logs during fit when an eval_set is supplied.
        model = model.fit(X_train, y_train)
        temp_oof = model.predict_proba(X_val)[:, 1]
        train_oof[val_idx] = temp_oof
        print(f'Fold {fold} AUC : ', roc_auc_score(y_val, temp_oof))

    print(f'{label} OOF AUC : ', roc_auc_score(Y, train_oof))

# ---- quantile + median imputation, plus n_missing (per-row missing-value count) ----

In [None]:
# Quantile + missing dataset: same imputation as the plain quantile variant,
# plus a new feature `n_missing` = number of null cells in each row of `train`.
median_set = ['f9', 'f12', 'f26', 'f27', 'f28', 'f32', 'f33', 'f35', 'f62', 'f74', 'f82', 'f86', 'f98', 'f108', 'f116']
feature_cols = [f'f{i}' for i in range(1, 119)]

# Fill values: median for the columns in `median_set`, 75th percentile for
# the remaining feature columns.
fill_values = train[feature_cols].quantile(0.75)
fill_values.loc[median_set] = train[median_set].median()

# A single DataFrame.fillna(Series) call fills column-by-column and returns a
# new frame. The previous `frame[col].fillna(..., inplace=True)` pattern is a
# deprecated chained in-place call that does not update the parent frame
# under pandas Copy-on-Write.
quantile_train = train.fillna(fill_values)

# Counted on the raw `train`, i.e. before imputation removed the nulls.
quantile_train['n_missing'] = train.isnull().sum(axis=1)

drop_set = ['id', 'claim']
X = quantile_train.drop(drop_set, axis=1)
Y = quantile_train['claim']

# Drop the reference to the intermediate frame; only X and Y are used below.
quantile_train = None

In [None]:
# 5-fold stratified out-of-fold (OOF) AUC for XGBoost, LightGBM and CatBoost
# on the current X / Y.
# Each entry: (label used in the summary line, estimator class, constructor params).
cv_candidates = [
    ('XgBoost', XGBClassifier, xgb_params),
    ('LGBM', LGBMClassifier, lgbm_params),
    ('Catboost', CatBoostClassifier, catb_params),
]

# With an integer random_state every skf.split(X, Y) call yields the same
# folds, so all three models are scored on identical splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for label, estimator_cls, params in cv_candidates:
    model = estimator_cls(**params)
    # OOF buffer sized from the data rather than a hard-coded row count.
    train_oof = np.zeros(len(X))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

        # No fit-time `verbose` argument: LightGBM 4.x rejects it, LightGBM and
        # CatBoost are already silenced through their params, and XGBoost only
        # logs during fit when an eval_set is supplied.
        model = model.fit(X_train, y_train)
        temp_oof = model.predict_proba(X_val)[:, 1]
        train_oof[val_idx] = temp_oof
        print(f'Fold {fold} AUC : ', roc_auc_score(y_val, temp_oof))

    print(f'{label} OOF AUC : ', roc_auc_score(Y, train_oof))

# ---- quantile + multiply (row-wise product of all features) ----

In [None]:
# Quantile + multiply dataset: quantile/median imputation, plus a new feature
# `multiply` = row-wise product of all feature columns.
#
# Two fixes relative to the earlier version of this cell:
#   * the product was taken over un-imputed data, so any row with a missing
#     value got multiply = NaN;
#   * X and Y were never rebuilt, so the CV cell that follows silently re-used
#     the previous experiment's features.
# NOTE(review): assumes columns 1..118 of `train` are f1..f118 (id first,
# claim last), as the other dataset cells imply -- confirm.
features = list(train.columns[1:119])
median_set = ['f9', 'f12', 'f26', 'f27', 'f28', 'f32', 'f33', 'f35', 'f62', 'f74', 'f82', 'f86', 'f98', 'f108', 'f116']

# Fill values: median for the columns in `median_set`, 75th percentile for
# the remaining feature columns.
fill_values = train[features].quantile(0.75)
fill_values.loc[median_set] = train[median_set].median()
quantile_train = train.fillna(fill_values)

# Product across the (now fully populated) feature columns of each row.
quantile_train['multiply'] = quantile_train[features].prod(axis=1)

drop_set = ['id', 'claim']
X = quantile_train.drop(drop_set, axis=1)
Y = quantile_train['claim']

# Drop the reference to the intermediate frame; only X and Y are used below.
quantile_train = None

In [None]:
# 5-fold stratified out-of-fold (OOF) AUC for XGBoost, LightGBM and CatBoost
# on the current X / Y.
# Each entry: (label used in the summary line, estimator class, constructor params).
cv_candidates = [
    ('XgBoost', XGBClassifier, xgb_params),
    ('LGBM', LGBMClassifier, lgbm_params),
    ('Catboost', CatBoostClassifier, catb_params),
]

# With an integer random_state every skf.split(X, Y) call yields the same
# folds, so all three models are scored on identical splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for label, estimator_cls, params in cv_candidates:
    model = estimator_cls(**params)
    # OOF buffer sized from the data rather than a hard-coded row count.
    train_oof = np.zeros(len(X))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

        # No fit-time `verbose` argument: LightGBM 4.x rejects it, LightGBM and
        # CatBoost are already silenced through their params, and XGBoost only
        # logs during fit when an eval_set is supplied.
        model = model.fit(X_train, y_train)
        temp_oof = model.predict_proba(X_val)[:, 1]
        train_oof[val_idx] = temp_oof
        print(f'Fold {fold} AUC : ', roc_auc_score(y_val, temp_oof))

    print(f'{label} OOF AUC : ', roc_auc_score(Y, train_oof))

# ---- mean imputation + n_missing (per-row missing-value count) ----

In [None]:
# Mean + missing dataset: every null is replaced by its column mean, and
# `n_missing` records how many feature values were null in each row.
features = list(train.columns[1:119])

mean_data = train.fillna(train.mean())

# Counted on the raw `train` and attached AFTER the fill. Previously the
# column was added to a copy that was then overwritten by
# `train.fillna(...)`, so `n_missing` never reached X.
mean_data['n_missing'] = train[features].isna().sum(axis=1)

drop_set = ['id', 'claim']
X = mean_data.drop(drop_set, axis=1)
Y = mean_data['claim']

# Drop the reference to the intermediate frame; only X and Y are used below.
mean_data = None

In [None]:
# 5-fold stratified out-of-fold (OOF) AUC for XGBoost, LightGBM and CatBoost
# on the current X / Y.
# Each entry: (label used in the summary line, estimator class, constructor params).
cv_candidates = [
    ('XgBoost', XGBClassifier, xgb_params),
    ('LGBM', LGBMClassifier, lgbm_params),
    ('Catboost', CatBoostClassifier, catb_params),
]

# With an integer random_state every skf.split(X, Y) call yields the same
# folds, so all three models are scored on identical splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for label, estimator_cls, params in cv_candidates:
    model = estimator_cls(**params)
    # OOF buffer sized from the data rather than a hard-coded row count.
    train_oof = np.zeros(len(X))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

        # No fit-time `verbose` argument: LightGBM 4.x rejects it, LightGBM and
        # CatBoost are already silenced through their params, and XGBoost only
        # logs during fit when an eval_set is supplied.
        model = model.fit(X_train, y_train)
        temp_oof = model.predict_proba(X_val)[:, 1]
        train_oof[val_idx] = temp_oof
        print(f'Fold {fold} AUC : ', roc_auc_score(y_val, temp_oof))

    print(f'{label} OOF AUC : ', roc_auc_score(Y, train_oof))

# ---- mean imputation + n_missing + row-wise min / max / mean / std ----

In [None]:
# Mean + all-features dataset: every null is replaced by its column mean, then
# `n_missing` and row-wise summary statistics are added as new features.
features = list(train.columns[1:119])

mean_data = train.fillna(train.mean())

# Counted on the raw `train` and attached AFTER the fill. Previously the
# column was added to a copy that was then overwritten by
# `train.fillna(...)`, so `n_missing` never reached X.
mean_data['n_missing'] = train[features].isna().sum(axis=1)

# Row-wise statistics over the imputed feature columns only (`features`
# excludes id, claim and the engineered columns).
mean_data['min'] = mean_data[features].min(axis=1)
mean_data['max'] = mean_data[features].max(axis=1)
mean_data['mean'] = mean_data[features].mean(axis=1)
mean_data['std'] = mean_data[features].std(axis=1)

drop_set = ['id', 'claim']
X = mean_data.drop(drop_set, axis=1)
Y = mean_data['claim']

# Drop the reference to the intermediate frame; only X and Y are used below.
mean_data = None

In [None]:
# 5-fold stratified out-of-fold (OOF) AUC for XGBoost, LightGBM and CatBoost
# on the current X / Y.
# Each entry: (label used in the summary line, estimator class, constructor params).
cv_candidates = [
    ('XgBoost', XGBClassifier, xgb_params),
    ('LGBM', LGBMClassifier, lgbm_params),
    ('Catboost', CatBoostClassifier, catb_params),
]

# With an integer random_state every skf.split(X, Y) call yields the same
# folds, so all three models are scored on identical splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for label, estimator_cls, params in cv_candidates:
    model = estimator_cls(**params)
    # OOF buffer sized from the data rather than a hard-coded row count.
    train_oof = np.zeros(len(X))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

        # No fit-time `verbose` argument: LightGBM 4.x rejects it, LightGBM and
        # CatBoost are already silenced through their params, and XGBoost only
        # logs during fit when an eval_set is supplied.
        model = model.fit(X_train, y_train)
        temp_oof = model.predict_proba(X_val)[:, 1]
        train_oof[val_idx] = temp_oof
        print(f'Fold {fold} AUC : ', roc_auc_score(y_val, temp_oof))

    print(f'{label} OOF AUC : ', roc_auc_score(Y, train_oof))