In [None]:
import numpy as np
import pandas as pd 
import optuna 
import sklearn
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history,plot_param_importances
import warnings
warnings.filterwarnings("ignore")
from xgboost import XGBClassifier

In [None]:
# Read the competition CSVs. `df` is the training table (it carries the
# `claim` target used further down); `dftest` is the table to predict on.
train_csv = "../input/tabular-playground-series-sep-2021/train.csv"
test_csv = "../input/tabular-playground-series-sep-2021/test.csv"
df, dftest = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Raw model inputs: every column except the row id and the `claim` target.
useful_features = [col for col in df.columns if col not in ('id', 'claim')]

# Row-wise summary features. They are derived from the raw columns *before*
# imputation and scaling, so they reflect the original values and the
# original pattern of missing entries.
for frame in (df, dftest):
    raw = frame[useful_features]
    frame['n_missing'] = raw.isna().sum(axis=1)
    frame['std'] = raw.std(axis=1)
    frame['max'] = raw.max(axis=1)
    frame['min'] = raw.min(axis=1)

# Fill the remaining NaNs with column means. The test set is filled with the
# training-set means (taken from the already-imputed training frame).
df[useful_features] = df[useful_features].fillna(df[useful_features].mean())
train_means = df[useful_features].mean()
dftest[useful_features] = dftest[useful_features].fillna(train_means)

# Min-max scale the raw features: the scaler is fitted on the training set
# only and the same fitted scaler is then applied to the test set.
sc = MinMaxScaler()
df[useful_features] = sc.fit_transform(df[useful_features])
dftest[useful_features] = sc.transform(dftest[useful_features])

# From here on the engineered columns belong to the feature set as well.
# They were not part of the list that was imputed/scaled above, so they keep
# their original scale.
useful_features = [col for col in df.columns if col not in ('id', 'claim')]

# Plain arrays used by the cross-validation loops below.
X = df[useful_features].values
y = df['claim'].values

In [None]:
def objective(trial):
    """Optuna objective: mean held-out ROC AUC of an XGBoost classifier.

    Draws one hyper-parameter set from ``trial``, trains a GPU
    ``XGBClassifier`` on every split of a 10-fold stratified CV over the
    module-level ``X`` / ``y`` arrays and scores each held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to suggest the hyper-parameters.

    Returns
    -------
    float
        Mean ROC AUC over the 10 held-out folds.
    """
    # Per-fold ROC AUC on the training part and on the held-out part.
    train_score = []
    test_score = []

    # Search space suggested by Optuna, plus the fixed XGBoost settings.
    param_grid = {
        'objective': 'binary:logistic',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_int('n_estimators', 500, 20000),
        'learning_rate': trial.suggest_discrete_uniform('learning_rate', 0.01, 0.1, 0.01),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.3, 1.0, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.3, 1.0, 0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'booster': 'gbtree',
        'gamma': trial.suggest_uniform('gamma', 1.0, 10.0),
        'reg_alpha': trial.suggest_int('reg_alpha', 50, 100),
        'reg_lambda': trial.suggest_int('reg_lambda', 50, 100),
        'random_state': 42,
    }

    # Fixed seed: every trial is evaluated on exactly the same folds.
    fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Split on the arrays that are indexed below; this avoids materialising a
    # fresh DataFrame copy of the whole training set for every trial.
    for train_id, test_id in fold.split(X, y):
        xtrain, xtest = X[train_id], X[test_id]
        ytrain, ytest = y[train_id], y[test_id]

        model = XGBClassifier(**param_grid,
                              tree_method='gpu_hist',
                              predictor='gpu_predictor')

        # NOTE(review): the held-out fold doubles as the early-stopping set,
        # so the AUC reported for it is presumably slightly optimistic.
        model.fit(xtrain, ytrain,
                  verbose=500,
                  eval_set=[(xtest, ytest)],
                  eval_metric=['logloss', 'auc'],
                  early_stopping_rounds=500)

        # Positive-class probabilities for the held-out and training parts.
        test_pred = model.predict_proba(xtest)[:, 1]
        train_pred = model.predict_proba(xtrain)[:, 1]

        # Score each part once and reuse the value for bookkeeping and logging.
        train_auc = roc_auc_score(ytrain, train_pred)
        test_auc = roc_auc_score(ytest, test_pred)
        train_score.append(train_auc)
        test_score.append(test_auc)

        print("Train roc_auc_score is {}".format(train_auc))
        print("Test roc_auc_score is {}".format(test_auc))

    # The value Optuna optimises: average held-out AUC across all folds.
    return np.mean(test_score)

In [None]:
# Search for the hyper-parameters that maximise the mean CV AUC returned by
# `objective`. The sampler is seeded (42, like every other random_state in
# this notebook) so the sequence of suggested parameters is repeatable on a
# fresh "Restart & Run All".
# NOTE(review): GPU training itself may still add some run-to-run noise.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42), study_name='xgb_optuna')
study.optimize(objective, n_trials=25)

In [None]:
# Summarise the finished study: how many trials ran and which
# hyper-parameters scored best.
n_trials_run = len(study.trials)
print(f"Number of trails : {n_trials_run}")
print("Best parameters :", study.best_params, sep="\n")

In [None]:
# Render Optuna's optimization-history plot for the finished study. As the
# last expression in the cell, the returned figure is what the cell displays.
plot_optimization_history(study)

In [None]:
# Render Optuna's hyper-parameter importance plot for the finished study. As
# the last expression in the cell, the returned figure is what the cell displays.
plot_param_importances(study)

In [None]:

# Final XGBoost configuration used for the model trained below. The tuned
# values are hard-coded (presumably copied from an earlier Optuna run -
# confirm), so this cell does not read anything from `study`.
xgb_params = dict(
    objective='binary:logistic',
    use_label_encoder=False,
    n_estimators=11904,
    learning_rate=0.06999999999999999,
    subsample=0.7,
    colsample_bytree=1.0,
    max_depth=2,
    gamma=2.358054251805544,
    reg_alpha=59,
    reg_lambda=71,
    booster='gbtree',
    random_state=42,
)

In [None]:
# XGBoost classifier, model #1: 10-fold stratified CV with the fixed
# `xgb_params`. Produces
#   * `oof`       - out-of-fold positive-class probability for every training row
#   * `xgb_preds` - test-set probabilities, averaged over the 10 fold models
oof = np.zeros(df.shape[0])

xgb_preds = []
test_score = []
train_score = []
fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# The test matrix is the same for every fold, so build it once up front.
x_test_data = dftest[useful_features].values

# Split on the arrays that are indexed below (same folds as splitting on the
# DataFrame, without copying it).
for train_id, test_id in fold.split(X, y):
    # Training part and held-out part of this fold.
    xtrain, xtest = X[train_id], X[test_id]
    ytrain, ytest = y[train_id], y[test_id]

    model = XGBClassifier(**xgb_params,
                          tree_method='gpu_hist',
                          predictor='gpu_predictor')

    # The held-out fold is also the early-stopping evaluation set.
    model.fit(xtrain, ytrain,
              verbose=500,
              eval_set=[(xtest, ytest)],
              eval_metric=['logloss', 'auc'],
              early_stopping_rounds=500)

    # Positive-class probabilities for the held-out and training parts.
    test_pred = model.predict_proba(xtest)[:, 1]
    train_pred = model.predict_proba(xtrain)[:, 1]
    # Reuse the held-out predictions instead of predicting the fold again.
    oof[test_id] = test_pred

    test_auc = roc_auc_score(ytest, test_pred)
    train_auc = roc_auc_score(ytrain, train_pred)

    train_score.append(train_auc)
    test_score.append(test_auc)

    print("Train roc_auc_score is {}".format(train_auc))
    print("Test roc_auc_score is {}".format(test_auc))

    # This fold model's prediction for the test set (positive class).
    xgb_preds.append(model.predict_proba(x_test_data)[:, 1])

print("all folds done")
print("Average train auc : {} with standard devation : {}".format(np.mean(train_score),np.std(train_score)))
print("Average test auc : {} with standard devation : {}".format(np.mean(test_score),np.std(test_score)))
# Stack the per-fold test predictions as columns and average across folds.
xgb_preds = np.mean(np.column_stack(xgb_preds), axis=1)

In [None]:
# Build the submission from the provided sample file: set its `claim` column
# to the fold-averaged test predictions and write it out without the index.
# NOTE(review): relies on the sample file listing rows in the same order as
# test.csv - confirm.
submission_template = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
sample = submission_template.assign(claim=xgb_preds)
sample.to_csv('submission.csv', index=False)

In [None]:
# Persist this model's out-of-fold predictions keyed by training-row id
# (presumably for later blending/stacking - confirm with the consumer).
# The frame is indexed by `id` and the index is written to the file. The
# previous `index=False` dropped the ids, leaving only the `xgb_1` column.
oof_pd = pd.DataFrame({'id': df['id'].values, 'xgb_1': oof}).set_index('id')
oof_pd.to_csv('xgb_train_minmax.csv')