In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition files sit in the same input folder; build each path from it.
DATA_DIR = '../input/tabular-playground-series-nov-2021'

train_file = f'{DATA_DIR}/train.csv'
test_file = f'{DATA_DIR}/test.csv'
sub_file = f'{DATA_DIR}/sample_submission.csv'

In [None]:
# Read the train, test and sample-submission CSVs (in that order) into DataFrames.
train_df, test_df, submission = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, sub_file)
)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Quick size / completeness check on the training frame.
n_rows, n_cols = train_df.shape
n_missing = train_df.isna().sum().sum()  # per-column NaN counts, then grand total

print(f'Number of rows: {n_rows}')
print(f'Number of columns: {n_cols}')
print(f'No of missing values: {n_missing}')

In [None]:
# Display pandas' summary statistics for the training frame.
train_df.describe()

In [None]:
# Separate the label from the features.
# NOTE(review): only "target" is dropped, so any identifier column present in
# train.csv stays in X as a feature -- confirm that is intended.
y = train_df["target"].copy()
X = train_df.drop(columns="target")

In [None]:
# Work on an independent copy of the test frame so test_df itself is never modified.
X_test = test_df.copy(deep=True)
X_test.head(n=5)

In [None]:
# Tunable XGBoost hyper-parameters, unpacked into XGBClassifier(**params).
params = dict(
    # tree shape / ensemble size
    max_depth=15,
    n_estimators=9500,
    learning_rate=0.007279718158350149,
    # row and column sampling
    subsample=0.7,
    colsample_bytree=0.8308786155701784,
    colsample_bylevel=0.6000000000000001,
    # regularisation
    min_child_weight=7.0,
    reg_lambda=0.6855965452788153,
    reg_alpha=140.0,
    gamma=2.948775828927923,
)

In [None]:
%%time

# 5-fold stratified cross-validation: fit one XGBoost model per fold, score it
# on the held-out part, and keep its test-set probabilities for later blending.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Estimator settings that stay constant across folds (the tunables are in `params`).
# NOTE(review): 'gpu_hist' / 'gpu_predictor' look like they require a GPU session,
# and `use_label_encoder` plus fit-time `early_stopping_rounds` are tied to the
# installed xgboost version -- confirm against the environment this runs in.
fixed_model_kwargs = dict(
    booster='gbtree',
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor="gpu_predictor",
    use_label_encoder=False,
)

scores = []  # validation AUC, one entry per fold
preds = []   # positive-class probabilities on X_test, one array per fold

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
    # Positional row selection for this fold's train / validation parts.
    X_train = X.iloc[idx_train]
    y_train = y.iloc[idx_train]
    X_valid = X.iloc[idx_valid]
    y_valid = y.iloc[idx_valid]

    model = XGBClassifier(**params, **fixed_model_kwargs)

    # The held-out fold is the evaluation set used for early stopping.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    )

    # Fold AUC from the ROC curve of the positive-class probabilities.
    pred_valid = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print('||'*40)

    # Keep this fold's test predictions; they are averaged in a later cell.
    test_preds = model.predict_proba(X_test)[:, 1]
    preds.append(test_preds)

print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
import datatable as dt
# Reuse the sample submission already loaded into `submission` from `sub_file`
# instead of re-reading the same CSV from a duplicated hard-coded path; the copy
# keeps `submission` untouched when the target column is overwritten below.
ss = submission.copy()

In [None]:
# Blend the folds: stack each fold's test probabilities as a column and take
# the row-wise mean, giving one averaged probability per test row.
predictions = np.mean(np.column_stack(preds), axis=1)

ss['target'] = predictions
# Write with an explicit .csv extension so the output is recognised as a CSV
# submission file (the original wrote an extension-less './first_sub').
ss.to_csv('./first_sub.csv', index=False)
ss.head()