In [None]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams["figure.figsize"] = (12,10)
import seaborn as sns


from sklearn.impute import SimpleImputer

#Model library
from sklearn.model_selection import train_test_split,cross_val_score, KFold, StratifiedKFold
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
## import packages
from tpot import TPOTClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
import datatable as dt

from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score,roc_auc_score,mean_squared_error
from tqdm import tqdm
import gc

In [None]:
# Load the competition files: training set, test set and the sample submission.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-sep-2021/train.csv",
        "../input/tabular-playground-series-sep-2021/test.csv",
        "../input/tabular-playground-series-sep-2021/sample_solution.csv",
    )
)
# Cell output: (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the training frame's column summary (dtypes and non-null counts).
train.info()

In [None]:
# Summary statistics with one row per column (transposed describe()).
summary_stats = train.describe().T

# Draw in-cell bars for the mean column ...
summary_styler = summary_stats.style.bar(subset=['mean'], color='#FF595E')
# ... and colour the median ('50%') column with a diverging gradient.
# The styler is the last expression so the notebook renders it.
summary_styler.background_gradient(subset=['50%'], cmap='PiYG')

In [None]:
# Raw feature columns are the ones whose name begins with "f".
features = [column for column in train.columns.values if column[:1] == "f"]

In [None]:
# Add per-row summary statistics of the raw feature columns to both frames.
# They are computed from `features` only, so the newly added columns never
# feed into each other. Running the same code on train and test keeps the two
# frames consistent; the previous copy-pasted version filled test['max'] with
# the row minimum.
for frame in (train, test):
    # Selecting with a list returns a copy, so the additions below do not
    # change the values being aggregated.
    row_values = frame[features]

    frame['n_missing'] = row_values.isna().sum(axis=1)  # count of NaNs in the row
    frame['abs_sum'] = row_values.abs().sum(axis=1)
    frame['sem'] = row_values.sem(axis=1)               # standard error of the mean
    frame['std'] = row_values.std(axis=1)
    frame['avg'] = row_values.mean(axis=1)
    frame['max'] = row_values.max(axis=1)
    frame['min'] = row_values.min(axis=1)

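Missing values in each feature are imputed with a statistic chosen from the shape of that feature's distribution:

1. Mean: approximately normal distribution
2. Median: unimodal and skewed distribution
3. Mode: all other cases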

In [None]:
# Imputation statistic per feature, keyed 'f1' .. 'f118' in numeric order.
# Only the exceptions are listed; every feature not named below uses the median.
mean_imputed_ids = {
    1, 6, 11, 13, 15, 22, 34, 36, 43, 46, 48,
    54, 55, 57, 69, 76, 79, 90, 97, 113, 118,
}
mode_imputed_ids = {
    5, 23, 29, 40, 42, 47, 49, 50, 56, 65,
    70, 74, 75, 77, 81, 83, 91, 100, 109, 116,
}


def _strategy_for(feature_id):
    """Return 'Mean', 'Mode' or 'Median' for the feature numbered `feature_id`."""
    if feature_id in mean_imputed_ids:
        return 'Mean'
    if feature_id in mode_imputed_ids:
        return 'Mode'
    return 'Median'


fill_value_dict = {
    'f' + str(feature_id): _strategy_for(feature_id)
    for feature_id in range(1, 119)
}

In [None]:
# Impute every raw feature using the statistic named in `fill_value_dict`.
# The statistic is always computed on the training column and then applied to
# both train and test, so the test set never influences the fill value.
for col in tqdm(features):
    strategy = fill_value_dict.get(col)
    if strategy == 'Mean':
        fill_value = train[col].mean()
    elif strategy == 'Median':
        fill_value = train[col].median()
    elif strategy == 'Mode':
        # mode() can return several values; take the first one.
        fill_value = train[col].mode().iloc[0]
    else:
        # Fail loudly instead of silently reusing the previous column's value.
        raise ValueError(
            f"No valid imputation strategy for column {col!r}: {strategy!r}"
        )

    # Assign the filled column back rather than calling fillna(inplace=True) on
    # the `frame[col]` selection: that chained in-place update does not modify
    # the parent frame when pandas Copy-on-Write is enabled.
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

In [None]:
# Separate the target column from the predictors.
y = train["claim"]
X = train.drop(columns=["claim"])

# The test frame is used as-is (same object, no copy).
X_test = test


In [None]:
# Fit the scaler on the training predictors only, then apply the same fitted
# transform to both train and test.
scaler = RobustScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [None]:
# The frames and the fitted scaler are no longer needed once X, X_test and y
# exist; drop the names and trigger a garbage-collection pass.
del scaler
del train, test
gc.collect()

In [None]:
# Shell out to nvidia-smi to show the GPU available to this session
# (the LightGBM parameters in this notebook request "device": "gpu").
!nvidia-smi

In [None]:
# Model hyperparameters
SEED = 1

lgbm_params = {
    # --- task ---
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'AUC',
    'n_estimators': 10000,

    # --- tree shape ---
    'num_leaves': 6,
    'max_depth': 2,
    'min_child_samples': 55,
    'min_child_weight': 256,

    # --- regularisation ---
    'reg_alpha': 25.0,
    'reg_lambda': 76.7,

    # --- row / column sampling ---
    'subsample': 1,
    'subsample_freq': 1,
    'colsample_bytree': 0.7,

    # --- reproducibility: every seed is tied to SEED ---
    'random_state': SEED,
    'bagging_seed': SEED,
    'feature_fraction_seed': SEED,

    # --- runtime ---
    'n_jobs': -1,
    'verbosity': -1,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
}

In [None]:
%%time

splits = 5
kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=SEED)

preds = np.zeros(len(X_test))

for train_idx, valid_idx in kf.split(X, y):    
    lgb_train = lgb.Dataset(X[train_idx], y[train_idx], free_raw_data=False)
    lgb_valid = lgb.Dataset(X[valid_idx], y[valid_idx], free_raw_data=False)

    lgbm_params['learning_rate'] = 0.2
    
    model = lgb.train(lgbm_params,
                      lgb_train,
                      verbose_eval=-1,
                      early_stopping_rounds=1000,
                      valid_sets=[lgb_valid])
    
    lgbm_params['learning_rate'] = 0.1
    
    model = lgb.train(lgbm_params,
                      lgb_train,
                      init_model=model,
                      verbose_eval=-1,
                      early_stopping_rounds=1000,
                      valid_sets=[lgb_valid])
    
    preds += model.predict(X_test) / splits
    
    gc.collect()

In [None]:
# Use the sample submission (indexed by id) as the template, replace its
# 'claim' column with the fold-averaged LightGBM predictions, and save it.
submission = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    index_col='id',
).assign(claim=preds)
submission.to_csv('submission_lgbm.csv')

In [None]:
# Collect the rank-transformed 'claim' predictions of several submissions,
# one column per file (columns are labelled 0, 1, 2, ... in list order).
# Rows are matched by position, so every file must have the same rows in the
# same order.
submission_paths = [
    "../input/tpssep21avgsubs/submission (10).csv",
    "../input/tpssep21avgsubs/submission (9).csv",
    "../input/tpssep21avgsubs/tps-sep21-avg-best5.csv",
    "../input/tpssep21avgsubs/tps-sep21-power_of-avg.csv",
    "./submission_lgbm.csv",
]

df = pd.DataFrame()
for i, f in enumerate(submission_paths):
    print(f)
    input_df = pd.read_csv(f)
    # Column assignment aligns on the index and would silently pad with NaN or
    # truncate a file of a different length, so reject that case explicitly.
    if i > 0 and len(input_df) != len(df):
        raise ValueError(
            f"{f} has {len(input_df)} rows, expected {len(df)} like the first file"
        )
    df[i] = input_df.claim.rank()

In [None]:
# Pairwise correlation between the rank columns of the collected submissions.
df.corr()

In [None]:
# Blend: the row-wise mean across the columns of `df` becomes the final
# 'claim' value, written into the sample-submission layout without the index.
blended_claim = df.mean(axis=1)

submit_df = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
submit_df['claim'] = blended_claim
submit_df.to_csv("submission.csv", index=False)