## Importing Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import optuna
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
import os
# Print the full path of every file found below the Kaggle input directory.
for input_path in (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_path)

In [None]:
# Load the raw competition CSVs (train first, then test). Later cells work on
# copies of these two frames rather than on the frames themselves.
df_train_ogi, df_test_ogi = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/test.csv",
    )
)

In [None]:
# Work on copies so the frames read from disk stay unmodified.
df_train, df_test = (raw_frame.copy() for raw_frame in (df_train_ogi, df_test_ogi))

In [None]:
#df_train.head()

In [None]:
# Display pandas' descriptive statistics for the training frame.
df_train.describe()

In [None]:
# Print the training frame's column/dtype/memory summary.
df_train.info()

In [None]:
# Show every column when a DataFrame is displayed.
# The option is spelled out in full: the bare "max_columns" pattern is ambiguous
# on recent pandas (it also matches "styler.render.max_columns") and raises
# OptionError there.
pd.set_option("display.max_columns", None)

In [None]:
# Number of missing values in each column of the training frame.
df_train.isnull().sum()

# Uni-variate Analysis

In [None]:
# Number of rows in the training frame.
df_train.shape[0]

In [None]:
# df = pd.concat([df_train.drop(["id", "claim"], axis=1), df_test.drop("id", axis=1)], axis=0)
# columns = df.columns.values

# cols = 4
# rows = len(columns) // cols + 1

# fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(25,230), sharex=False)

# plt.subplots_adjust(hspace = 0.3)
# i=0

# for r in np.arange(0, rows, 1):
#     for c in np.arange(0, cols, 1):
#         if i >= len(columns):
#             axs[r, c].set_visible(False)
#         else:
#             hist1 = axs[r, c].hist(df_train[columns[i]].values,
#                                    range=(df[columns[i]].min(),
#                                           df[columns[i]].max()),
#                                    bins=40,
#                                    color="blue",
#                                    edgecolor="black",
#                                    alpha=0.7,
#                                    label="Train Dataset")
#             hist2 = axs[r, c].hist(df_test[columns[i]].values,
#                                    range=(df[columns[i]].min(),
#                                           df[columns[i]].max()),
#                                    bins=40,
#                                    color="yellow",
#                                    edgecolor="black",
#                                    alpha=0.7,
#                                    label="Test Dataset")
#             axs[r, c].set_title(columns[i], fontsize=12, pad=5)
#             axs[r, c].set_yticks(axs[r, c].get_yticks())
#             axs[r, c].set_yticklabels([str(int(i/1000))+"k" for i in axs[r, c].get_yticks()])
#             axs[r, c].tick_params(axis="y", labelsize=10)
#             axs[r, c].tick_params(axis="x", labelsize=10)
#             axs[r, c].grid(axis="y")
#             axs[r, c].legend(fontsize=13)
                                  
#         i+=1
# plt.show();

In [None]:
# for x in df_train.columns : 
#     plt.hist(df_train[x])
#     plt.show()

In [None]:
# for x in df_train.columns : 
#     sns.boxplot(df_train[x] , orient = 'Vertical', color = 'yellow')
#     plt.show();

In [None]:
# Class balance of the target: how many rows carry each value of 'claim'.
df_train['claim'].value_counts()

In [None]:
# Remove the row identifier from the training frame.
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns="id", errors="ignore")

# Handling the Missing Values

In [None]:
def add_row_stats(frame, cols):
    """Add per-row summary statistics of ``cols`` to ``frame`` in place.

    The new columns are, in this order: 'n_missing', 'mean', 'median', 'std',
    'min', 'max', 'sem', 'skew' and 'mad'. All of them are computed from the
    values of ``cols`` only, so the result does not depend on columns added
    earlier by this function.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that receives the new columns (modified in place).
    cols : list of str
        Names of the columns the statistics are computed over.

    Returns
    -------
    None
    """
    values = frame[cols]
    row_mean = values.mean(axis=1)

    frame['n_missing'] = values.isna().sum(axis=1)
    frame['mean'] = row_mean
    frame['median'] = values.median(axis=1)
    frame['std'] = values.std(axis=1)
    frame['min'] = values.min(axis=1)
    frame['max'] = values.max(axis=1)
    frame['sem'] = values.sem(axis=1)
    frame['skew'] = values.skew(axis=1)
    # DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0, so the
    # mean absolute deviation around the row mean is computed explicitly.
    frame['mad'] = values.sub(row_mean, axis=0).abs().mean(axis=1)


# Feature columns are taken from the test frame, which has no target column.
features = [col for col in df_test.columns if 'f' in col]
df_target = 'claim'

target = df_train[df_target].copy()

# Identical feature engineering for both frames, computed before any imputation
# so that 'n_missing' reflects the raw data.
add_row_stats(df_train, features)
add_row_stats(df_test, features)

In [None]:
# Register the engineered row-statistic columns as model features.
# Only names not already present are appended, so re-running this cell does not
# produce duplicate entries in `features`.
features.extend(
    stat_name
    for stat_name in ['n_missing','mean','median','std','min','max','sem','skew','mad']
    if stat_name not in features
)

##### Idea taken from https://www.kaggle.com/realtimshady/single-simple-lightgbm

In [None]:
from tqdm import tqdm
# Imputation strategy for each raw feature f1..f118: 'Mean', 'Median' or 'Mode'.
# Only the exceptions are listed (by feature number); every feature that is in
# neither set is imputed with the median.
_MEAN_FILLED = {
    1, 6, 11, 13, 15, 22, 34, 36, 43, 46, 48,
    54, 55, 57, 69, 76, 79, 90, 97, 113, 118,
}
_MODE_FILLED = {
    5, 23, 29, 40, 42, 47, 49, 50, 56, 65,
    70, 74, 75, 77, 81, 83, 91, 100, 109, 116,
}

fill_value_dict = {
    f'f{number}': (
        'Mean' if number in _MEAN_FILLED
        else 'Mode' if number in _MODE_FILLED
        else 'Median'
    )
    for number in range(1, 119)
}


# Impute missing values column by column. The fill value is always derived from
# the training frame and then applied to both frames, so train and test are
# imputed consistently.
for col in tqdm(features):
    # Columns without an entry in fill_value_dict (the engineered row
    # statistics) fall back to their own median. Without a default, such a
    # column would silently reuse the fill value left over from the previous
    # loop iteration.
    strategy = fill_value_dict.get(col, 'Median')
    if strategy == 'Mean':
        fill_value = df_train[col].mean()
    elif strategy == 'Median':
        fill_value = df_train[col].median()
    elif strategy == 'Mode':
        # mode() can return several values; take the first one.
        fill_value = df_train[col].mode().iloc[0]
    else:
        raise ValueError(f"Unknown fill strategy {strategy!r} for column {col!r}")

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: the latter is chained assignment and does not update
    # the frame when pandas copy-on-write is active.
    df_train[col] = df_train[col].fillna(fill_value)
    df_test[col] = df_test[col].fillna(fill_value)

In [None]:
# Separate the model inputs from the target, and strip the identifier from the
# test frame so both input frames contain feature columns only.
X, y = df_train.drop(columns='claim'), df_train['claim']
X_test = df_test.drop(columns='id')

In [None]:
# for x in df_train.columns: 
#     df_train[x] = df_train[x].fillna(df_train[x].mean())

# Splitting the data into train and test

In [None]:
#df_train.head()

In [None]:
#train.head()

In [None]:
#y.head()

### We will not delete the outlier values; we only scale them

In [None]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible.
holdout_parts = train_test_split(X, y, test_size=0.2, random_state=0)
xtrain, xtest, ytrain, ytest = holdout_parts

# Scaling Data

In [None]:
# Fit the scaler on the training part only, then apply the same transformation
# to both the training part and the held-out part.
scaler = MinMaxScaler().fit(xtrain)
train_scaled, test_scaled = (scaler.transform(part) for part in (xtrain, xtest))

## Model Building

### Without Intel Extension

In [None]:
# %%time
# lr = LogisticRegression(solver='saga', penalty = 'elasticnet', random_state = 0 ,max_iter = 500 , l1_ratio = 0.6 )
# model1 = lr.fit(train_scaled , ytrain)

In [None]:
# from sklearn import metrics
# pred= model1.predict(test_scaled)
# print(pred)
# #
# ytest
# print(accuracy_score(pred , ytest))
# fpr, tpr, thresholds = metrics.roc_curve(pred, ytest, pos_label=1)
# print(metrics.auc(fpr, tpr))

# Using Intel Extension for Scikit-Learn

### Uncomment the cells below to use the Intel Extension

In [None]:
#!pip install scikit-learn-intelex --progress-bar off >> /tmp/pip_sklearnex.log

In [None]:
# from sklearnex import patch_sklearn
# patch_sklearn()

In [None]:
# %%time
# lr = RandomForestClassifier()
# model = lr.fit(train_scaled,ytrain)

In [None]:
# pred= model.predict(test_scaled)
# accuracy_score(pred , ytest)
# fpr, tpr, thresholds = metrics.roc_curve(pred, ytest, pos_label=1)
# print(metrics.auc(fpr, tpr))

In [None]:
# from sklearn.metrics import roc_auc_score
# pred= model.predict(test_scaled)
# print(roc_auc_score(pred,ytest))

## Using XGBoost

In [None]:
# def objective(trial,data=train_scaled,target=ytrain):
    
#     param = {

#         'lambda': trial.suggest_uniform('lambda',0.001,0.1),
#         'alpha': trial.suggest_uniform('alpha',0.1,0.5),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1,1.0),
#         'subsample': trial.suggest_uniform('subsample', 0.5,0.9),
#         'learning_rate': trial.suggest_uniform('learning_rate', 0.05,0.10),
#         'n_estimators': trial.suggest_int('n_estimators', 1000,30000),
#         'max_depth': trial.suggest_int('max_depth', 3,8),
#         'min_child_weight': trial.suggest_int('min_child_weight', 10,100),        
#         'objective': trial.suggest_categorical('objective',['binary:logistic']), 
#         'tree_method': trial.suggest_categorical('tree_method',['gpu_hist']),  # 'gpu_hist','hist'
#         'eval_metric' : 'logloss'
#     }
#     model = xgb.XGBClassifier(**param)      
#     model.fit(train_scaled,ytrain,eval_set=[(test_scaled,ytest)],early_stopping_rounds=100,verbose=False)
#     preds = model.predict(test_scaled)
#     auc = roc_auc_score(ytest, preds)
    
#     return auc

In [None]:
# import optuna
# from optuna.samplers import TPESampler
# import sklearn
# sampler = TPESampler(seed=0)
# study = optuna.create_study(direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=200)
# params = study.best_params #getting best params from study
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)


In [None]:
# params = {'lambda': 0.01053687237713984, 
#           'alpha': 0.4495749711394468, 
#           'colsample_bytree': 0.8378659612970323, 
#           'subsample': 0.5939624665626383, 
#           'learning_rate': 0.05402737627291443 ,
#           'n_estimators': 2564, 
#           'max_depth': 3, 
#           'random_state': 0, 
#           'min_child_weight': 24, 
#           'objective': 'binary:logistic', 
#           'tree_method': 'gpu_hist', 
#           'use_label_encoder': False}

In [None]:
# %%time
# xgb = xgb.XGBClassifier(**params)
# model = xgb.fit(train_scaled,ytrain)

In [None]:
# from sklearn.metrics import roc_auc_score
# pred= model.predict(test_scaled)
# print(roc_auc_score(pred,ytest))

## Using LightGBM

In [None]:
# def create_model(trial):
#     num_leaves = trial.suggest_int("num_leaves", 100, 200)
#     n_estimators = trial.suggest_int("n_estimators", 30000, 50000)
#     min_child_samples = trial.suggest_int('min_child_samples', 100, 200)
#     min_child_weight = trial.suggest_int('min_child_weight', 10, 200)
#     learning_rate = trial.suggest_uniform('learning_rate', 0.001, 0.1)
#     reg_alpha = trial.suggest_uniform('reg_alpha', 10, 100)
#     reg_lambda = trial.suggest_uniform('reg_lambda', 10, 100)
#     colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.50, 1.0)
#     device =  'gpu'

#     model = lgb.LGBMClassifier(
#         objective='binary',
#         metric='binary_logloss',
#         num_leaves = num_leaves,
#         n_estimators = n_estimators,
#         min_child_samples = min_child_samples,
#         min_child_weight = min_child_weight,
#         learning_rate = learning_rate,
#         reg_alpha = reg_alpha,
#         reg_lambda = reg_lambda,
#         colsample_bytree = colsample_bytree,
#         device =  'gpu',
#         random_state=0,
#         verbosity = -1
#     )
    
#     return model

# def objective(trial):
#     model = create_model(trial)
#     model.fit(train_scaled,ytrain,eval_set=[(test_scaled,ytest)],early_stopping_rounds=10,verbose=False)
#     preds = model.predict(test_scaled)
#     auc = roc_auc_score(ytest, preds)
# #     model.fit(train_scaled, ytrain)
# #     score = sklearn.metrics.roc_auc_score(test_scaled, model.predict_proba(ytest)[:,1])
# #     return score
#     return auc


In [None]:
# import optuna
# from optuna.samplers import TPESampler
# import sklearn
# sampler = TPESampler(seed=0)
# study = optuna.create_study(direction="maximize", sampler=sampler,pruner=optuna.pruners.HyperbandPruner())
# study.optimize(objective, n_trials=100)
# params = study.best_params #getting best params from study
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# Hyper-parameters passed to LightGBM for the final cross-validated model.
# NOTE(review): num_leaves=156 together with max_depth=4 looks inconsistent
# (a depth-4 binary tree cannot have that many leaves) - confirm intent.
# NOTE(review): device='gpu' presumably needs a GPU-enabled LightGBM build -
# confirm before running on a CPU-only machine.
lgbm_params = dict(
    objective='binary',
    boosting_type='gbdt',
    num_leaves=156,
    max_depth=4,
    learning_rate=0.01,
    n_estimators=40026,
    reg_alpha=25.5,
    reg_lambda=96.6,
    random_state=0,
    bagging_seed=0,
    feature_fraction_seed=0,
    n_jobs=4,
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.95,
    min_child_samples=95,
    min_child_weight=145,
    metric='AUC',
    verbosity=-1,
    device='gpu',
)

In [None]:
# import lightgbm as lgb
# model = lgb.LGBMClassifier(**params, device ='gpu')
# model.fit(train_scaled , ytrain)

In [None]:
# from sklearn.metrics import roc_auc_score
# pred= model.predict(test_scaled)
# print(roc_auc_score(pred,ytest))

In [None]:
# Rebuild the inputs from the full training frame for the final model.
# The scaler is fitted on all training rows and the same transformation is then
# applied to the test rows; after this cell X and X_test hold the scaler's
# output instead of DataFrames.
scaler = MinMaxScaler()
y = df_train['claim']
X = scaler.fit_transform(df_train.drop(columns='claim'))
X_test = scaler.transform(df_test.drop(columns='id'))

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold
import gc

# Stratified K-fold training: one LightGBM model per fold, each early-stopped on
# its validation fold; the test predictions of all folds are averaged.
SEED = 0
splits = 3
kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=SEED)

# kf.split() yields positional indices. Converting the target to a NumPy array
# makes y_values[idx] positional as well, independent of the pandas index of y.
y_values = np.asarray(y)

# Per-run copy of the parameters so the shared lgbm_params dict is not mutated.
fold_params = {**lgbm_params, 'learning_rate': 0.01}

# Early stopping and log silencing are passed as callbacks: the
# early_stopping_rounds / verbose_eval keyword arguments of lgb.train were
# removed in LightGBM 4.0 (lgb.log_evaluation needs LightGBM >= 3.3).
fold_callbacks = [
    lgb.early_stopping(stopping_rounds=10),
    lgb.log_evaluation(period=-1),
]

preds = np.zeros(len(X_test))

for train_idx, test_idx in kf.split(X, y_values):
    train = lgb.Dataset(X[train_idx], y_values[train_idx], free_raw_data=False)
    test = lgb.Dataset(X[test_idx], y_values[test_idx], free_raw_data=False)

    model = lgb.train(fold_params,
                      train,
                      valid_sets=[test],
                      callbacks=fold_callbacks)

    # Predict with the iteration selected by early stopping and accumulate the
    # fold average.
    preds += model.predict(X_test, num_iteration=model.best_iteration) / splits
    gc.collect()

In [None]:
# Build the submission file from the competition's sample file.
# The absolute path matches the one used for train.csv / test.csv, so the cell
# does not depend on the notebook's working directory.
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv',
    index_col='id',
)

# preds follows the row order of df_test; assigning it positionally is only
# correct if the sample file lists the same ids in the same order.
if not np.array_equal(submission.index.to_numpy(), df_test['id'].to_numpy()):
    raise ValueError("sample_solution.csv ids do not match the row order of test.csv")

submission['claim'] = preds
submission.to_csv('submission.csv')

In [None]:
# Read the written submission back and show its first rows as a sanity check.
df1 = pd.read_csv(filepath_or_buffer="submission.csv")
df1.head(n=5)