In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# To impute missing Values
from sklearn.impute import SimpleImputer

In [2]:
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

In [3]:
# prepare dataframe for modeling
X = train.drop(columns=['id','claim']).copy()
y = train['claim']

test_data = test.drop(columns=['id']).copy()

In [4]:
# feature Engineering
def get_stats_per_row(data):
    data['mv_row'] = data.isna().sum(axis=1)
    data['std_row'] = data.std(axis=1)
    data['var_row'] = data.var(axis=1)
    data['mean'] = data.mean(axis=1)
    data['median'] = data.median(axis=1)
    data['root_mean_square'] = (data ** 2).sum(1).pow(1/2)
    data['abs'] = data.abs().sum(axis=1)
    return data
X = get_stats_per_row(X)
test_data = get_stats_per_row(test_data)

In [5]:
from tqdm import tqdm

features = [x for x in X.columns.values if x[0]=="f"]

In [6]:
skewed_feat = X[features].skew()
skewed_feat = [*skewed_feat[abs(skewed_feat.values) > 1].index]

for feat in tqdm(skewed_feat):
    median = X[feat].median()
    X[feat] = X[feat].fillna(median)
    test_data[feat] = test_data[feat].fillna(median)

100%|██████████| 67/67 [00:01<00:00, 35.46it/s]


In [7]:
fill_value_dict = {
    'f1': 'Mean', 
    'f2': 'Mode', 
    'f3': 'Median', 
    'f4': 'Median', 
    'f5': 'Mode', 
    'f6': 'Mean', 
    'f7': 'Median', 
    'f8': 'Median', 
    'f9': 'Median', 
    'f10': 'Median', 
    'f11': 'Mode', 
    'f12': 'Median', 
    'f13': 'Mode', 
    'f14': 'Median', 
    'f15': 'Mean', 
    'f16': 'Median', 
    'f17': 'Mean', 
    'f18': 'Median', 
    'f19': 'Median', 
    'f20': 'Median', 
    'f21': 'Median', 
    'f22': 'Mean', 
    'f23': 'Mode', 
    'f24': 'Median', 
    'f25': 'Median', 
    'f26': 'Median', 
    'f27': 'Median', 
    'f28': 'Median', 
    'f29': 'Mode', 
    'f30': 'Median', 
    'f31': 'Mode', 
    'f32': 'Median', 
    'f33': 'Median', 
    'f34': 'Mean', 
    'f35': 'Median', 
    'f36': 'Mean', 
    'f37': 'Median', 
    'f38': 'Median', 
    'f39': 'Median', 
    'f40': 'Mode', 
    'f41': 'Median', 
    'f42': 'Mode', 
    'f43': 'Mean', 
    'f44': 'Median', 
    'f45': 'Median', 
    'f46': 'Mean', 
    'f47': 'Mode', 
    'f48': 'Mean', 
    'f49': 'Mode', 
    'f50': 'Mode', 
    'f51': 'Median', 
    'f52': 'Median', 
    'f53': 'Median', 
    'f54': 'Mean', 
    'f55': 'Median', 
    'f56': 'Median', 
    'f57': 'Mean', 
    'f58': 'Mode', 
    'f59': 'Median', 
    'f60': 'Median', 
    'f61': 'Mode', 
    'f62': 'Median', 
    'f63': 'Median', 
    'f64': 'Median', 
    'f65': 'Mode', 
    'f66': 'Mode', 
    'f67': 'Median', 
    'f68': 'Median', 
    'f69': 'Median', 
    'f70': 'Mode', 
    'f71': 'Median', 
    'f72': 'Median', 
    'f73': 'Median', 
    'f74': 'Mode', 
    'f75': 'Mode', 
    'f76': 'Mean', 
    'f77': 'Mode', 
    'f78': 'Median', 
    'f79': 'Mean', 
    'f80': 'Mode', 
    'f81': 'Mode', 
    'f82': 'Median', 
    'f83': 'Mode', 
    'f84': 'Mode', 
    'f85': 'Median', 
    'f86': 'Median', 
    'f87': 'Median', 
    'f88': 'Median', 
    'f89': 'Median', 
    'f90': 'Median', 
    'f91': 'Mode', 
    'f92': 'Median', 
    'f93': 'Median', 
    'f94': 'Mode', 
    'f95': 'Median', 
    'f96': 'Median', 
    'f97': 'Mean', 
    'f98': 'Median', 
    'f99': 'Median', 
    'f100': 'Mode', 
    'f101': 'Median', 
    'f102': 'Median', 
    'f103': 'Mode', 
    'f104': 'Median', 
    'f105': 'Median', 
    'f106': 'Median', 
    'f107': 'Median', 
    'f108': 'Median', 
    'f109': 'Mode', 
    'f110': 'Median', 
    'f111': 'Median', 
    'f112': 'Mode', 
    'f113': 'Median', 
    'f114': 'Median', 
    'f115': 'Median', 
    'f116': 'Mode', 
    'f117': 'Median', 
    'f118': 'Median'
    }

for col in tqdm(features):
    if fill_value_dict.get(col)=='Mean':
        fill_value = X[col].mean()
    elif fill_value_dict.get(col)=='Median':
        fill_value = X[col].median()
    elif fill_value_dict.get(col)=='Mode':
        fill_value = X[col].mode().iloc[0]
    
    X[col].fillna(fill_value, inplace=True)
    test_data[col].fillna(fill_value, inplace=True)


100%|██████████| 118/118 [00:03<00:00, 36.58it/s]


In [8]:
#handling outliers
tenth_percentile = np.percentile(X, 10)
ninetieth_percentile = np.percentile(X, 90)
# print(tenth_percentile, ninetieth_percentile)
b = np.where(X<tenth_percentile, tenth_percentile, X)
b = np.where(b>ninetieth_percentile, ninetieth_percentile, b)
# print("Sample:", sample)
#print("New array:",b)

In [9]:
X = pd.DataFrame(data = b, columns= X.columns)

In [10]:
tenth_percentile = np.percentile(test_data, 10)
ninetieth_percentile = np.percentile(test_data, 90)
# print(tenth_percentile, ninetieth_percentile)
c = np.where(test_data<tenth_percentile, tenth_percentile, test_data)
c = np.where(c>ninetieth_percentile, ninetieth_percentile, c)
# print("Sample:", sample)
#print("New array:",c)

In [11]:
test = pd.DataFrame(data = c, columns= test_data.columns)

In [12]:
from sklearn.metrics import roc_auc_score, roc_curve, auc
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [13]:
import random
import os

SEED = 12345

In [14]:
def seed_everything(seed=64):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(SEED)

In [15]:
params = {
    'max_depth': 3, 
    'colsample_bytree': 0.3,  
    'subsample': 0.5, 
    'reg_alpha': 18, 
    'reg_lambda': 17,
    'num_leaves': 7,
    'objective' : 'binary',
    'importance_type': 'gain',
}

In [16]:
from sklearn.model_selection import KFold, StratifiedKFold

kf = KFold(n_splits = 7, shuffle = True, random_state = SEED)
skf = StratifiedKFold(n_splits = 7, shuffle = True, random_state = SEED)

In [17]:
oof_lgb = np.zeros(len(X))
predictions_lgb = np.zeros(len(test))
i = 1

for train_ix, test_ix in kf.split(X.values):
        
    print("Out of fold predictions generating for fold  {} \n".format(i))
        
    train_X, train_y = X.values[train_ix], y.values[train_ix]
    test_X, test_y = X.values[test_ix], y.values[test_ix]
    
    model_lgb = LGBMClassifier(
        random_state = SEED,
        n_estimators = 20000,
        learning_rate = 0.095,
   #     device = 'gpu',
        **params
    )   
       
    model_lgb.fit(
        train_X, 
        train_y,
        eval_set = [(test_X, test_y)],
        eval_metric = "auc",
        early_stopping_rounds = 300,
        verbose = 100,
    )
    
    oof_lgb[test_ix] = oof_lgb[test_ix] + model_lgb.predict_proba(test_X)[:, 1]
    predictions_lgb = predictions_lgb + model_lgb.predict_proba(test)[:, 1]
    
    print("AUC for fold {} \t\t {} \n".format(i, round(roc_auc_score(test_y, oof_lgb[test_ix]), 7)))
    
    i = i + 1
    
print("AUC for Training Set: \t\t {} \n".format(round(roc_auc_score(y, oof_lgb), 7)))

Out of fold predictions generating for fold  1 

Training until validation scores don't improve for 300 rounds
[100]	valid_0's auc: 0.807858	valid_0's binary_logloss: 0.511976
[200]	valid_0's auc: 0.810193	valid_0's binary_logloss: 0.510671
[300]	valid_0's auc: 0.811384	valid_0's binary_logloss: 0.509977
[400]	valid_0's auc: 0.812199	valid_0's binary_logloss: 0.5095
[500]	valid_0's auc: 0.812757	valid_0's binary_logloss: 0.509167
[600]	valid_0's auc: 0.813226	valid_0's binary_logloss: 0.508887
[700]	valid_0's auc: 0.813595	valid_0's binary_logloss: 0.508677
[800]	valid_0's auc: 0.813826	valid_0's binary_logloss: 0.508518
[900]	valid_0's auc: 0.814027	valid_0's binary_logloss: 0.508386
[1000]	valid_0's auc: 0.814132	valid_0's binary_logloss: 0.508311
[1100]	valid_0's auc: 0.814309	valid_0's binary_logloss: 0.50821
[1200]	valid_0's auc: 0.814476	valid_0's binary_logloss: 0.508104
[1300]	valid_0's auc: 0.814514	valid_0's binary_logloss: 0.508075
[1400]	valid_0's auc: 0.814573	valid_0's bi

In [18]:
submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
submission['claim'] = predictions_lgb / 7
submission.to_csv("submission.csv", index = False)
submission.head(10)

Unnamed: 0,id,claim
0,957919,0.55452
1,957920,0.118785
2,957921,0.617732
3,957922,0.13965
4,957923,0.140801
5,957924,0.183717
6,957925,0.813451
7,957926,0.170561
8,957927,0.592622
9,957928,0.727793
