In [None]:
# Switch off the Jedi-based completer for tab completion in this kernel.
%config Completer.use_jedi = False

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
from scipy import stats
from statsmodels import robust

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow, imread

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.impute import SimpleImputer
import scikitplot as skplt

import scipy.stats as stats

import lightgbm as lgb
import warnings

import optuna

import gc

In [None]:
# Seed shared by the LightGBM model (random_state) and the StratifiedKFold shuffling.
R_SEED = 37
# Number of cross-validation splits.
N_FOLDS = 5

In [None]:
# Read the three competition files in one pass: sample submission, train, test.
submission_ex, train_data_a, test_data_a = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
    )
)

In [None]:
# Set the target and the id columns aside before they are stripped from the feature frames.
target_data = train_data_a[['claim']].copy()
for_me_data_train = train_data_a[['id']].copy()
for_me_data_test = test_data_a[['id']].copy()
submit_data = test_data_a[['id']].copy()

# From here on both frames hold feature columns only.
train_data_a = train_data_a.drop(columns=['id', 'claim'])
test_data_a = test_data_a.drop(columns=['id'])

# Per-row count of missing feature values in train; used later as the stratification key.
n_missing_train = train_data_a.isna().sum(axis=1)

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so both get identical preprocessing.
all_data = pd.concat([train_data_a, test_data_a], ignore_index=True)

In [None]:
# Standardise every feature over train+test together.
# The scaled array is wrapped directly: no intermediate name and no extra
# .copy(), so no redundant full-size array stays referenced after this cell.
all_data = pd.DataFrame(
    StandardScaler().fit_transform(all_data),
    columns=all_data.columns,
)
gc.collect()

In [None]:
# Row-wise missing-value count ...
new_features_dict = {'n_of_miss': all_data.isna().astype(int).sum(axis=1)}

# ... followed by the indicator flags "more than k values missing" for k = 0..9.
new_features_dict.update({
    'miss_bt_' + str(k): (new_features_dict['n_of_miss'] > k).astype(int)
    for k in range(10)
})

# Row-wise summary statistics over the standardised features.
new_features_dict.update({
    'r_min': all_data.min(axis=1),
    'r_std': all_data.std(axis=1),
    'r_max': all_data.max(axis=1),
    'r_median': all_data.median(axis=1),
    'r_mean': all_data.mean(axis=1),
    'r_var': all_data.var(axis=1),
    'r_sum': all_data.sum(axis=1),
    'r_sem': all_data.sem(axis=1),
    'r_skew': all_data.skew(axis=1),
})

In [None]:
# Append the engineered columns in insertion order, releasing each one from the
# dict as soon as it has been written into the frame.
for feature_name in list(new_features_dict):
    all_data[feature_name] = new_features_dict.pop(feature_name)
del new_features_dict

In [None]:
# Force a full garbage-collection pass; the return value is echoed as the cell output.
gc.collect()

#### code [start]

In [None]:
# 0/1 missing-value flags of the raw training features, with the target joined alongside.
code_data = pd.concat(
    [train_data_a.isna().astype(int), target_data],
    axis=1,
    join='inner',
)
code_data.head()

In [None]:
# Columns whose name starts with 'f' are the per-feature missing-value flags.
features = [col for col in code_data.columns if col.startswith('f')]

# For each flag column: the number of positive claims among rows where that feature is missing.
f_t1 = np.array(
    [code_data.loc[code_data[col] == 1, 'claim'].sum() for col in features],
    dtype=float,
)

In [None]:
# Sum only the flag columns that precede 'claim'. Slicing up to the position of
# 'claim' (rather than "everything but the last column") keeps the count correct
# when this cell is re-run, i.e. after 'n_of_miss' has already been appended.
claim_pos = code_data.columns.get_loc('claim')
code_data['n_of_miss'] = code_data.iloc[:, :claim_pos].sum(axis=1)
code_data.head()

In [None]:
# Observed claim rate for every distinct missing-value count in the training data,
# stored as (count, rate) pairs in ascending order of count.
n_of_miss_unique = np.unique(code_data['n_of_miss'])
f_n_of_miss_to_prob = [
    (n_miss, code_data.loc[code_data['n_of_miss'] == n_miss, 'claim'].mean())
    for n_miss in n_of_miss_unique
]

In [None]:
# Initialise as float: the column is filled with claim rates, so starting from an
# integer 0 would force a dtype upcast on the first assignment.
all_data['prob_1_for_miss'] = 0.0

In [None]:
# NOTE(review): this blanket filter stays active for the rest of the notebook and
# hides every later warning too; kept so downstream cell output is unchanged.
warnings.filterwarnings("ignore")

# Map each row's missing-value count to the claim rate observed for that count in
# the training data. Counts never seen in training get 0.
# A single whole-column assignment replaces the former chained indexing
# (df[col][mask] = value), which does not reliably write back into the frame.
miss_to_prob = dict(f_n_of_miss_to_prob)
all_data['prob_1_for_miss'] = all_data['n_of_miss'].map(miss_to_prob).fillna(0.0)

In [None]:
# The flag/target helper frame is no longer needed once the claim rates are mapped.
del code_data
# Force a full garbage-collection pass; the return value is echoed as the cell output.
gc.collect()

#### code [end]

In [None]:
# Externally prepared imputation table, one column per feature it covers.
# NOTE(review): presumably row-aligned with all_data and zero wherever the original
# value was present - confirm against the notebook that produced the file.
tmp_a = pd.read_csv('/kaggle/input/0049-imputerdata-from-ae/imputer_data_all.csv')
for col in tmp_a.columns:
    # NaNs in the existing column become 0, then the table's value is added on top.
    all_data[col] = np.add(np.nan_to_num(all_data[col].to_numpy()), tmp_a[col].to_numpy())
    gc.collect()

del tmp_a
gc.collect()

In [None]:
# Extra garbage-collection pass after the imputation merge; the return value is the cell output.
gc.collect()

#### hm

In [None]:
new_features_dict = {}

# Column-wise absolute z-scores of the whole frame. The matrix is the same for
# every feature derived below, so it is computed once instead of once per threshold.
abs_zscores = np.abs(stats.zscore(all_data))

# Sum of absolute z-scores per row.
new_features_dict['r_zscore'] = abs_zscores.sum(axis=1)
# 1 when every value in the row lies within i standard deviations, else 0.
for i in range(3, 9):
    new_features_dict['r_zscore_' + str(i)] = (abs_zscores < i).all(axis=1).astype(int)

# Drop the large intermediate before the next full-size computation.
del abs_zscores
gc.collect()

new_features_dict['median_abs_deviation'] = stats.median_abs_deviation(all_data, axis=1)

gc.collect()

# Disabled experiments, kept for reference.
# new_features_dict['dist_from_center_cos'] = cosine_similarity(all_data, [all_data.mean(axis=0).values])

# gc.collect()

# new_features_dict['dist_from_center_euc'] = euclidean_distances(all_data, [all_data.mean(axis=0).values])

# gc.collect()

# new_features_dict['mean_abs_deviation'] = all_data.mad(axis=1)

# gc.collect()

In [None]:
# Append the z-score / MAD columns in insertion order, releasing each one from the
# dict as soon as it has been written into the frame.
for feature_name in list(new_features_dict):
    all_data[feature_name] = new_features_dict.pop(feature_name)
del new_features_dict

In [None]:
# Display the freshly added column as a quick sanity check.
all_data['median_abs_deviation']

In [None]:
# Garbage-collection pass after feature engineering; the return value is the cell output.
gc.collect()

In [None]:
# Undo the earlier concatenation: the first n_train_rows rows are train, the rest test.
n_train_rows = train_data_a.shape[0]
train_data_a = all_data.iloc[:n_train_rows, :].copy()
test_data_a = all_data.iloc[n_train_rows:, :].copy()

In [None]:
# Final modelling names; these are plain aliases of the split frames, not copies.
train_data, test_data = train_data_a, test_data_a

In [None]:
# train_data / test_data still reference the split frames, so removing the *_a names
# frees nothing by itself; dropping all_data releases the combined frame.
del train_data_a, test_data_a, all_data

In [None]:
# Reclaim the memory of the combined frame deleted above; the return value is the cell output.
gc.collect()

In [None]:
# LightGBM hyper-parameters: binary objective evaluated with AUC.
params_1 = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    n_estimators=35000,
    learning_rate=0.005,
    num_leaves=29,
    min_child_samples=236,
    lambda_l1=7.702002052840491,
    lambda_l2=9.738840335016775,
    feature_fraction=0.07,  # 0.3811137625854881,
    bagging_fraction=0.7345219542805319,
    bagging_freq=3,
    min_child_weight=280.0714278010327,
)

# NOTE(review): a regressor wrapper is combined with the binary objective; its
# predict() output is used directly as the claim score further down.
model_1 = lgb.LGBMRegressor(
    n_jobs=-1,
    random_state=R_SEED,
    **params_1,
)

In [None]:
# Garbage-collection pass before training starts; the return value is the cell output.
gc.collect()

In [None]:
def plot_fea_imp(model, model_name):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        model_name: text placed at the front of the plot title.

    Feature names are taken from the notebook-level ``train_data`` columns, so the
    model is expected to have been fitted on those columns in that order.
    """
    print('Plotting feature importances...')
    importance_table = pd.DataFrame({'imp': model.feature_importances_, 'col': train_data.columns})
    # Ascending importance puts the strongest features at the top of a barh chart.
    importance_table = importance_table.sort_values(by=['imp', 'col'], ascending=[True, False])#.iloc[-10:]
    ax = importance_table.plot(kind='barh', x='col', y='imp', figsize=(20, 70), legend=None)
    ax.set_title('%s - Feature Importance' % (model_name))
    ax.set_ylabel('Features')
    ax.set_xlabel('Importance')

In [None]:
kfolds = StratifiedKFold(n_splits = N_FOLDS, shuffle = True, random_state = R_SEED)
pred = []
lgb_oof_1 = np.zeros(train_data.shape[0])

# Early stopping and periodic logging are passed as callbacks: the `verbose=` and
# `early_stopping_rounds=` keyword arguments of fit() were removed in LightGBM 4.0.
# Releases before 3.3 call the logging callback `print_evaluation`.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

fm_i = 0
# Folds are stratified on the per-row missing-value count, not on the target.
for train_index, test_index in kfolds.split(X=train_data, y=n_missing_train):

    X_train, X_val = train_data.iloc[train_index], train_data.iloc[test_index]
    y_train, y_val = target_data.iloc[train_index], target_data.iloc[test_index]

    gc.collect()

    # Fold size and number of positive claims in the training part.
    print(y_train.shape[0], y_train['claim'].sum())

    model_1.fit(
        X_train,
        np.ravel(y_train),
        eval_metric = "auc",
        eval_set = [(X_val, np.ravel(y_val))],
        callbacks = [
            lgb.early_stopping(stopping_rounds = 3000),
            _log_evaluation(period = 100),
        ])

    plot_fea_imp(model_1, 'lightGBM_' + str(fm_i))

    # Out-of-fold predictions for the rows held out in this fold.
    oof_pred_1 = model_1.predict(X_val)
    lgb_oof_1[test_index] = oof_pred_1

    # Test-set predictions of this fold's model; averaged over folds below.
    _p = model_1.predict(test_data)
    pred.append(_p)

    for_me_data_test['hm_' + str(fm_i)] = _p
    fm_i += 1

for_me_data_train['hm_1'] = lgb_oof_1

# Final submission = plain average of the per-fold test predictions.
final_p = np.sum(pred, axis = 0) / len(pred)

submit_data['claim'] = final_p
submit_data.to_csv('submission.csv', index=False)
for_me_data_train.to_csv('for_me_data_train.csv', index=False)
for_me_data_test.to_csv('for_me_data_test.csv', index=False)

The ROC curve below is drawn from the out-of-fold (OOF) predictions.

In [None]:
# plot_roc expects one probability per class, so pair each OOF score with its complement.
p_1 = list(zip(1 - lgb_oof_1, lgb_oof_1))

fig, ax = plt.subplots(figsize=(10, 10))

skplt.metrics.plot_roc(
    target_data['claim'].values,
    p_1,
    plot_micro=False,
    plot_macro=False,
    classes_to_plot=[1],
    ax=ax,
    cmap='Reds',
)
# skplt.metrics.plot_roc(target_data.claim.values, p_1, plot_micro=False, plot_macro=False, classes_to_plot=[0], ax=ax, cmap='ocean')
plt.show()

In [None]:
# Density of the out-of-fold predictions on the training set.
fig = plt.figure(figsize = (30, 15))
ax = fig.gca()
ax.set_facecolor('cadetblue')

# These are train-set OOF scores, so the legend entry must not read 'test'.
sns.kdeplot(for_me_data_train['hm_1'], color = "aliceblue", ax = ax, linewidth=3, label='train (OOF)')

plt.legend()
plt.show()

In [None]:
# Density of the fold-averaged test predictions that go into the submission file.
fig, ax = plt.subplots(figsize=(30, 15))
ax.set_facecolor('cadetblue')

sns.kdeplot(
    submit_data['claim'],
    color="aliceblue",
    ax=ax,
    linewidth=3,
    label='test',
)

ax.legend()
plt.show()

In [None]:
# Density of the test predictions of each individual fold model, overlaid on one axis.
fig = plt.figure(figsize = (30, 15))
ax = fig.gca()
ax.set_facecolor('cadetblue')
for f in for_me_data_test.columns:
    if f.startswith('hm'):
        # Label each curve with its own column name so the folds can be told apart.
        sns.kdeplot(for_me_data_test[f], ax = ax, linewidth=3, label=f)

plt.legend()
plt.show()