## Import libraries

In [None]:
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer

## Load datasets

In [None]:
# Read the training split and index its rows by the 'id' column.
train_df = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/train.csv"
).set_index('id')
print(f"train_df: {train_df.shape}")
train_df.head()

In [None]:
# Read the test split and index its rows by the 'id' column.
test_df = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/test.csv"
).set_index('id')
print(f"test_df: {test_df.shape}")
test_df.head()

## Feature Engineering

In [None]:
# Column names of the test frame, used as the raw feature list for both splits.
features = test_df.columns.tolist()

# Add the same row-wise summary columns to both splits. Every statistic is
# computed over the fixed `features` list, so the columns added here never
# feed into one another.
for frame in (train_df, test_df):
    raw = frame[features]
    missing_mask = raw.isna()
    magnitudes = raw.abs()

    frame['num_missing'] = missing_mask.sum(axis=1)
    frame['num_missing_std'] = missing_mask.std(axis=1).astype('float')
    frame['median'] = raw.median(axis=1)
    frame['std'] = raw.std(axis=1)
    # min / max are taken over absolute values, not the signed values.
    frame['min'] = magnitudes.min(axis=1)
    frame['max'] = magnitudes.max(axis=1)
    frame['sem'] = raw.sem(axis=1)

print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")
train_df.head()

In [None]:
# Imputation strategy ('Mean', 'Median' or 'Mode') for each of f1..f118.
# 'Median' is the most frequent choice, so every column starts there and the
# 'Mean' / 'Mode' columns are overridden afterwards. Updating existing keys
# keeps the f1..f118 insertion order.
fill_value_dict = {f"f{idx}": 'Median' for idx in range(1, 119)}
fill_value_dict.update({
    f"f{idx}": 'Mean'
    for idx in (1, 2, 6, 7, 15, 22, 29, 34, 40, 42, 46, 47, 50, 56, 57,
                65, 70, 75, 76, 90, 97, 100, 118)
})
fill_value_dict.update({
    f"f{idx}": 'Mode'
    for idx in (3, 4, 5, 9, 10, 11, 13, 17, 23, 31, 38, 43, 49, 55, 60,
                61, 66, 69, 91, 94, 105, 110, 115)
})


# Impute missing values one column at a time. The fill value is always
# computed from the training split and then applied to both splits, using the
# strategy that fill_value_dict assigns to the column.
for col in tqdm(features):
    strategy = fill_value_dict.get(col)
    if strategy == 'Mean':
        fill_value = train_df[col].mean()
    elif strategy == 'Median':
        fill_value = train_df[col].median()
    elif strategy == 'Mode':
        # mode() can return several values; take the first one.
        fill_value = train_df[col].mode().iloc[0]
    else:
        # Without this guard an unlisted column would silently reuse the
        # previous column's fill value (or fail with a NameError on the very
        # first iteration).
        raise ValueError(
            f"No imputation strategy configured for column {col!r}"
        )

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `df[col]`: that form mutates an intermediate selection, which pandas
    # warns about and which no longer updates the frame under Copy-on-Write.
    train_df[col] = train_df[col].fillna(fill_value)
    test_df[col] = test_df[col].fillna(fill_value)

train_df.head()

In [None]:
# Every training column except the missing-value counters and the target.
features = [
    col for col in train_df.columns
    if col not in ['num_missing','num_missing_std','claim']
]

for col in tqdm(features):
    # A fresh transformer per column, fitted on the training values only and
    # then applied to both splits.
    transformer = QuantileTransformer(
        n_quantiles=3000,
        random_state=42,
        output_distribution="normal",
    )

    # The transformer works on 2-D input, so present each column as (n, 1).
    train_column = train_df[col].values.reshape(-1, 1)
    test_column = test_df[col].values.reshape(-1, 1)
    transformer.fit(train_column)

    # Flatten the (n, 1) results back to 1-D before storing them.
    train_df[col] = transformer.transform(train_column).ravel()
    test_df[col] = transformer.transform(test_column).ravel()

print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

In [None]:
def kmeans_fet(train, test, features, n_clusters):
    """Add a KMeans cluster-label column to both frames.

    A single KMeans model (random_state=42) is fitted on the `features`
    columns of `train` and `test` stacked together, train rows first. Each
    row's cluster label is then written to a 'clusters_k' column.

    Args:
        train: DataFrame with the training rows; modified in place.
        test: DataFrame with the test rows; modified in place.
        features: Column names used as clustering inputs.
        n_clusters: Number of clusters to fit.

    Returns:
        The same `train` and `test` objects, each with 'clusters_k' added.
    """
    n_train = train.shape[0]
    stacked = pd.concat([train[features], test[features]], axis=0)

    labels = KMeans(n_clusters=n_clusters, random_state=42).fit(stacked).labels_

    # Labels follow the stacked row order, so split them at the train size.
    train['clusters_k'] = labels[:n_train]
    test['clusters_k'] = labels[n_train:]
    return train, test

In [None]:
# Attach the 4-cluster KMeans label to both splits, then take independent
# copies to use as the modelling inputs.
train_df, test_df = kmeans_fet(train_df, test_df, features, n_clusters=4)

Xtrain, Xtest = train_df.copy(), test_df.copy()

print(f"Xtrain: {Xtrain.shape} \nXtest: {Xtest.shape}")

## PyCaret

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import *
from time import time

# Initialise the PyCaret classification experiment on the engineered
# training frame, with 'claim' as the target column.
clf = setup(
    Xtrain,
    target='claim',
    session_id=42,
    # data split and cross-validation options
    train_size=0.8,
    fold_strategy='stratifiedkfold',
    fold_shuffle=True,
    # preprocessing options
    remove_multicollinearity=True,
    multicollinearity_threshold=0.90,
    # execution options
    use_gpu=True,
    n_jobs=-1,
    silent=True,
)

### Model (CatBoost)

In [None]:
# Collects each tuned model; the list is later passed to blend_models.
tuned_models = []

In [None]:
# Time the whole cell: baseline fit followed by the hyper-parameter search.
cell_start_time = time()

catboost = create_model('catboost', fold=5)
catboost = tune_model(
    catboost,
    n_iter=50,
    fold=5,
    optimize='AUC',
    choose_better=True,
)
# Keep the result for the blend built later in the notebook.
tuned_models.append(catboost)

cell_end_time = time()
print("CELL RUN TIME : ", cell_end_time - cell_start_time)

In [None]:
# Draw PyCaret's 'auc' plot for the catboost model returned by tune_model.
plot_model(catboost, plot='auc')

In [None]:
# Draw PyCaret's 'feature' plot for the catboost model.
plot_model(catboost, plot='feature')

In [None]:
# Run PyCaret's model interpretation on the catboost model (default plot type).
interpret_model(catboost)

In [None]:
# Draw PyCaret's 'confusion_matrix' plot for the catboost model.
plot_model(catboost, plot = 'confusion_matrix')

### Model (LightGBM + Optuna)

In [None]:
# Time the whole cell: baseline fit followed by the Optuna-driven search.
cell_start_time = time()

lightgbm = create_model('lightgbm', fold=5)
lightgbm = tune_model(
    lightgbm,
    n_iter=50,
    fold=5,
    optimize='AUC',
    choose_better=True,
    search_library='optuna',
)
# Keep the result for the blend built later in the notebook.
tuned_models.append(lightgbm)

cell_end_time = time()
print("CELL RUN TIME : ", cell_end_time - cell_start_time)

In [None]:
# Draw PyCaret's 'auc' plot for the lightgbm model returned by tune_model.
plot_model(lightgbm, plot='auc')

In [None]:
# Draw PyCaret's 'feature' plot for the lightgbm model.
plot_model(lightgbm, plot='feature')

In [None]:
# Run PyCaret's model interpretation on the lightgbm model (default plot type).
interpret_model(lightgbm)

In [None]:
# Draw PyCaret's 'confusion_matrix' plot for the lightgbm model.
plot_model(lightgbm, plot = 'confusion_matrix')

### Model (LDA + Optuna)

In [None]:
# Time the whole cell: baseline fit followed by the Optuna-driven search.
cell_start_time = time()

lda = create_model('lda', fold=5)
lda = tune_model(
    lda,
    n_iter=50,
    fold=5,
    optimize='AUC',
    choose_better=True,
    search_library='optuna',
)
# Keep the result for the blend built later in the notebook.
tuned_models.append(lda)

cell_end_time = time()
print("CELL RUN TIME : ", cell_end_time - cell_start_time)

In [None]:
# Draw PyCaret's 'auc' plot for the lda model returned by tune_model.
plot_model(lda, plot='auc')

In [None]:
# Draw PyCaret's 'feature' plot for the lda model.
plot_model(lda, plot='feature')

In [None]:
# PyCaret documents interpret_model's default SHAP summary plot for
# tree-based models only, so it rejects a linear discriminant model. Report
# that and carry on, rather than letting the error abort the rest of the
# notebook run (blending and submission come after this cell).
try:
    interpret_model(lda)
except (TypeError, ValueError) as err:
    print(f"interpret_model skipped for lda: {err}")

In [None]:
# Confusion matrix for the lda model, mirroring the last diagnostic of the
# catboost and lightgbm sections. The 'feature' plot for lda is already drawn
# two cells earlier, so repeating it here was a copy-paste slip.
plot_model(lda, plot = 'confusion_matrix')

In [None]:
# Blend every model collected in tuned_models with soft voting, evaluated over
# 5 folds with AUC as the optimisation metric.
blended = blend_models(estimator_list=tuned_models, method='soft', fold=5, optimize='AUC')

In [None]:
# Finalise the blended model with PyCaret before predicting on the test set.
final_model = finalize_model(blended)

## Submission

In [None]:
# Fetch the experiment's preprocessing pipeline, append the finalised model as
# its last step, and use the combined pipeline to score the raw test frame.
# NOTE(review): get_config presumably returns the live pipeline object, in
# which case re-running this cell appends a second 'trained_model' step - confirm.
prep_pipe = get_config("prep_pipe") 
prep_pipe.steps.append(['trained_model', final_model])
# `prections` (sic) is read by the next cell, so the name is kept as-is.
prections = prep_pipe.predict_proba(Xtest)

In [None]:
# Keep only the value at index 1 of every predict_proba row.
k = [row[1] for row in prections]

In [None]:
# Start from the sample submission file, set its 'claim' column to the
# predicted values, and write the result out.
submit_data = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/sample_solution.csv"
).assign(claim=k)
submit_data.to_csv("submission.csv", index=False)
submit_data.head(10)