In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# StratifiedKFold cross validation to make sure the same proportion of both classes maintained during each sampling process
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
plt.figure(figsize = (20, 18))
import xgboost as xgb
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
# hyperopt is hyperparameter optimization by defining an objective function and declaring a search space
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [None]:
# Loading the training data
oct_data = pd.read_csv("../input/tabular-playground-series-oct-2021/train.csv")
oct_data.shape

In [None]:
oct_data.head()

In [None]:
# Drops the ID column as it carries no signal for the model.
# Rebinding (instead of inplace=True) with errors="ignore" keeps this cell
# safe to re-run: a second execution no longer raises KeyError on "id".
oct_data = oct_data.drop(columns=["id"], errors="ignore")

In [None]:
# Checks for data types used in the data set
oct_data.dtypes.unique()

In [None]:
# Total number of missing cells over every column (0 means the frame has no NaNs)
oct_data.isna().sum().sum()

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to the smallest dtype that fits.

    Rules applied per column:
      * object columns become ``category``;
      * signed / unsigned integer columns become the narrowest signed /
        unsigned integer type whose range (bounds inclusive) contains the
        column's min and max;
      * float columns wider than 32 bits become ``float32`` when their finite
        range fits (this keeps ~7 significant digits, so it is lossy);
      * everything else (bool, datetime, timedelta, float16/float32 and pandas
        extension dtypes such as category or nullable integers) is left
        untouched, so no column is ever widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes have no reliable min()/max() (e.g. unordered
        # categoricals raise), so they are skipped rather than guessed at.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a column whose max is exactly 127
                # still fits in int8. An empty column yields NaN here, which
                # fails every comparison and leaves the dtype unchanged.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            f32 = np.finfo(np.float32)
            # inf or all-NaN columns fail this test and stay at full width.
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
        # bool / datetime / timedelta / narrow floats: already compact.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that reports zero bytes.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Compresses the training data as Kaggle kernel resets due to large size of the training data 
oct_data = reduce_mem_usage(oct_data)

In [None]:
# Shows the column data types after data compression
oct_data.dtypes

In [None]:
# Checks distribution of categorical target variable
oct_data.groupby(['target']).size()

In [None]:
# Number of missing values in the target column itself. The previous
# groupby(...).size().isnull() form only tested the group *counts* for NaN
# (never the case) and groupby drops NaN keys, so it could not detect a
# missing target.
oct_data['target'].isnull().sum()

In [None]:
oct_data.nunique()


# Feature selection with SelectKBest

#### Subset ten columns at a time and apply SelectKBest to pick the most important features in each group

In [None]:
# ANOVA F-test on f1..f10: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(1, 11)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

# Boolean mask over `features` columns -> names -> sub-frame of the winners
features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()


#### f1, f3, f4, f8 are the important features

In [None]:
# ANOVA F-test on f11..f20: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(11, 21)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### The important features are f12, f17,f18,f19

In [None]:
# ANOVA F-test on f21..f30: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(21, 31)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f22, f26, f27, f29 are the features to consider for model training

In [None]:
# ANOVA F-test on f31..f40: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(31, 41)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f33,f34,f35,f40 are the features


In [None]:
# ANOVA F-test on f41..f50: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(41, 51)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f42,f43,f44,f48 are the features

In [None]:
# ANOVA F-test on f51..f60: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(51, 61)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f52, f53, f56, f58 are the features

In [None]:
# ANOVA F-test on f61..f70: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(61, 71)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f63, f64, f65, f69 are the features

In [None]:
# ANOVA F-test on f71..f80: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(71, 81)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f72,f73,f77,f78 features

In [None]:
# ANOVA F-test on f81..f90: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(81, 91)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f82,f85, f86, f90 features

In [None]:
# ANOVA F-test on f91..f100: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(91, 101)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f92, f95, f96, f99 features

In [None]:

# ANOVA F-test on f101..f110: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(101, 111)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f103,f104, f107, f108 features

In [None]:
# ANOVA F-test on f111..f120: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(111, 121)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f112, f114, f117, f119 features

In [None]:
# ANOVA F-test on f121..f130: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(121, 131)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f125,f127, f129, f130 features

In [None]:
# ANOVA F-test on f131..f140: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(131, 141)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f134, f136,f138,f139

In [None]:
# ANOVA F-test on f141..f150: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(141, 151)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f141, f143, f144, f150 features

In [None]:
df.head()

In [None]:
# ANOVA F-test on f151..f160: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(151, 161)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f152, f154, f156,f159 features

In [None]:
df.head()

In [None]:
# ANOVA F-test on f161..f170: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(161, 171)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f162, f163, f164, f169 features


In [None]:
df.head()

In [None]:

# ANOVA F-test on f171..f180: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(171, 181)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f173, f174, f177, f179 selected features

In [None]:
# ANOVA F-test on f181..f190: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(181, 191)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f181,f184,f187,f188 selected features

In [None]:
# ANOVA F-test on f191..f200: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(191, 201)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f192, f195, f199,f200 features

In [None]:

# ANOVA F-test on f201..f210: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(201, 211)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f201, f206, f208, f210 features

In [None]:
# ANOVA F-test on f211..f220: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(211, 221)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f211, f213, f214, f219 features

In [None]:
# ANOVA F-test on f221..f230: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(221, 231)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f222,f224,f227,f229 are the features

In [None]:
# ANOVA F-test on f231..f240: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(231, 241)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f231, f232, f239, f240 are the selected features

In [None]:
# ANOVA F-test on f241..f250: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(241, 251)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f241, f243, f245, f247 features. Categorical features

In [None]:
# ANOVA F-test on f251..f260: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(251, 261)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f252, f256, f258, f260 features. Categorical features

In [None]:
df.head()

In [None]:
# ANOVA F-test on f261..f270: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(261, 271)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f265, f266, f267, f269 categorical features

In [None]:
df.head()

In [None]:
# ANOVA F-test on f271..f280: keep the 4 columns with the highest score.
df = oct_data[[f'f{i}' for i in range(271, 281)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f274, f275,f278,f279 categorical features

In [None]:
df.head()

In [None]:

# Last group only has f281..f284, so k=4 keeps every one of them.
df = oct_data[[f'f{i}' for i in range(281, 285)] + ['target']]
target = df['target']
features = df.drop(columns='target')

select_univariate = SelectKBest(score_func=f_classif, k=4)
select_univariate.fit(features, target)

features_mask = select_univariate.get_support()
selected_columns = features.columns[features_mask]
selected_features = features.loc[:, selected_columns]
selected_features.head()

#### f281, f282, f283,f284 categorical features

In [None]:
df.head()

#### Manual data selection completed and we will train the model with the selected features

In [None]:
# Training frame restricted to the features picked group-by-group with
# SelectKBest, plus the target.
# .copy() makes `df` an independent frame: a later cell overwrites columns of
# `df` in place, and writing into a selection of `oct_data` is what triggers
# pandas' SettingWithCopyWarning.
df = oct_data[[
'f1','f3','f4','f8','f12', 'f17','f18','f19',
'f22','f26','f27','f29','f33','f34','f35','f40',
'f42','f43','f44','f48','f52','f53', 'f56','f58',
'f63','f64','f65', 'f69','f72','f73','f77','f78',
'f82','f85', 'f86', 'f90',
'f92', 'f95', 'f96', 'f99',
'f103','f104', 'f107', 'f108',
'f112','f114', 'f117', 'f119' ,
'f125','f127', 'f129', 'f130',
'f134', 'f136','f138','f139',
'f141', 'f143', 'f144', 'f150',
'f152', 'f154', 'f156','f159',
'f162', 'f163', 'f164', 'f169',
'f173', 'f174', 'f177', 'f179',
'f181', 'f184', 'f187', 'f188',
'f192', 'f195', 'f199','f200',
'f201', 'f206','f208', 'f210',
'f211', 'f213', 'f214', 'f219' ,
'f222','f224','f227','f229',
'f231', 'f232', 'f239', 'f240',
'f241', 'f243', 'f245','f247',
'f252','f256', 'f258', 'f260',
'f265', 'f266', 'f267', 'f269',
'f274', 'f275','f278','f279',
'f281', 'f282', 'f283','f284',
'target']].copy()

In [None]:
del oct_data

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Columns standardised to zero mean / unit variance. The remaining selected
# columns (f22, f43 and f243 onwards) are left unscaled.
# The list is written once so the selection on both sides of the assignment
# cannot drift apart.
scale_cols = [
    'f1', 'f3', 'f4', 'f8', 'f12', 'f17', 'f18', 'f19',
    'f26', 'f27', 'f29', 'f33', 'f34', 'f35', 'f40',
    'f42', 'f44', 'f48', 'f52', 'f53', 'f56', 'f58',
    'f63', 'f64', 'f65', 'f69', 'f72', 'f73', 'f77', 'f78',
    'f82', 'f85', 'f86', 'f90',
    'f92', 'f95', 'f96', 'f99',
    'f103', 'f104', 'f107', 'f108',
    'f112', 'f114', 'f117', 'f119',
    'f125', 'f127', 'f129', 'f130',
    'f134', 'f136', 'f138', 'f139',
    'f141', 'f143', 'f144', 'f150',
    'f152', 'f154', 'f156', 'f159',
    'f162', 'f163', 'f164', 'f169',
    'f173', 'f174', 'f177', 'f179',
    'f181', 'f184', 'f187', 'f188',
    'f192', 'f195', 'f199', 'f200',
    'f201', 'f206', 'f208', 'f210',
    'f211', 'f213', 'f214', 'f219',
    'f222', 'f224', 'f227', 'f229',
    'f231', 'f232', 'f239', 'f240',
    'f241',
]

# Assign the scaled ndarray directly: rows and columns are matched by
# position, so the result no longer depends on `df` having a 0..n-1 index
# (wrapping it in a fresh DataFrame aligned on index labels and would have
# produced NaNs for any other index).
df[scale_cols] = preprocessing.scale(df[scale_cols])

In [None]:
# Preview the standardised columns. In `df` they are exactly the columns from
# f1 through f241 (label slice, both ends inclusive) minus the two unscaled
# columns f22 and f43 that sit inside that span.
df.loc[:, 'f1':'f241'].drop(columns=['f22', 'f43']).head()

In [None]:
# Separate the label column (y) from the model inputs (X)

y = df["target"]
X = df.drop(columns="target")

In [None]:
del df

In [None]:
X.head()

In [None]:
y.sample(10)

# StratifiedKFold Cross Validation

In [None]:
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state= 40)

In [None]:
# Baseline: stratified 5-fold ROC AUC of an untuned XGBoost classifier

model = XGBClassifier(
    n_estimators=500,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='gpu_hist',
)
# Passing the splitter itself yields the same folds as skf.split(X, y),
# because skf has a fixed random_state.
model_score = cross_val_score(
    estimator=model, X=X, y=y, scoring='roc_auc', cv=skf, n_jobs=-1, verbose=10
)

In [None]:
print(model_score.mean())

In [None]:
del model_score, model

# Hyperparameter Tuning with Bayesian Optimization

# a) bayes_opt

In [None]:
# Search bounds (low, high) per hyperparameter, handed to BayesianOptimization
# as `pbounds`. Values are sampled as floats; n_estimators and max_depth are
# truncated with int() inside xgboost_hyper_param.
parameter_space = {
    'learning_rate': (0.01, 1.0),
    'n_estimators': (100, 1000),
    'max_depth': (2,10),
    'subsample': (0.4, 1.0),
    'colsample_bytree' :(0.4, 1.0),
    'gamma': (0, 5)}

def xgboost_hyper_param(learning_rate,
                        n_estimators,
                        max_depth,
                        subsample,
                        colsample_bytree,
                        gamma):
    """Objective for BayesianOptimization: mean 5-fold ROC AUC of an XGBoost model.

    Every argument is a float sampled from the bounds in ``parameter_space``.
    ``max_depth`` and ``n_estimators`` are truncated to int because XGBoost
    requires integers for them.

    ``subsample`` and ``colsample_bytree`` are forwarded to the classifier;
    previously they were accepted but never used, so the optimizer was
    searching two dimensions that had no effect on the score.

    Returns
    -------
    float
        Mean ROC AUC over ``cv=5`` folds on the notebook-level ``X`` and ``y``
        (higher is better; the optimizer maximises it).
    """
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = XGBClassifier(
        tree_method='gpu_hist',
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective = 'binary:logistic',
        eval_metric='auc',
        gamma=gamma)
    return np.mean(cross_val_score(clf, X, y, cv=5, scoring='roc_auc'))

# Bayesian optimizer over `parameter_space`, using xgboost_hyper_param (mean
# cross-validated ROC AUC) as the function to maximise.
# random_state is fixed, presumably so the sampled points repeat between runs.
optimizer = BayesianOptimization(
    f=xgboost_hyper_param,
    pbounds=parameter_space,
    random_state=100,
)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
optimizer.maximize(init_points=2, n_iter=5, acq='ei', xi=0.0)

In [None]:
optimizer.res

In [None]:
# Best point found by the optimizer.
# The objective truncated these two values with int(), so int() is used here
# as well; round() could report e.g. max_depth=6 for a trial that was actually
# evaluated with max_depth=5.
params_gbm = optimizer.max['params']
params_gbm['max_depth'] = int(params_gbm['max_depth'])
params_gbm['n_estimators'] = int(params_gbm['n_estimators'])
params_gbm

In [None]:


# Single 80/20 hold-out split with a fixed seed. No `stratify=` is passed, so
# class proportions in the two parts are only approximately equal.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# Wrap the hold-out split in DMatrix objects, then release the pandas copies
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_test, label=y_test)
del X_train, y_train, X_test, y_test

# Hyperparameters are hard-coded here rather than read from `params_gbm`;
# the fixed training settings are merged into the same dict.
params = {
    'colsample_bytree': 0.9717997218792195,
    'gamma': 4.7790561362268145,
    'learning_rate': 0.052474259458759866,
    'max_depth': 6,
    'subsample': 0.9128209886114327,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
}

# Up to 2000 rounds; stop once the eval AUC has not improved for 50 rounds
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=2000,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=50,
    verbose_eval=200,
)

In [None]:
# Load the test set and discard its ID column, which is not a model input
test = pd.read_csv(
    "../input/tabular-playground-series-oct-2021/test.csv"
).drop(columns=["id"])

In [None]:
# Loads submission data set that acts just as a template for submission
submission = pd.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv")


In [None]:
# Keep the same feature columns, in the same order, as the training frame.
# .copy() makes `test` an independent frame: the next cell overwrites columns
# in place, and writing into a selection of another frame is what triggers
# pandas' SettingWithCopyWarning.
test = test[[
'f1','f3','f4','f8','f12', 'f17','f18','f19',
'f22','f26','f27','f29','f33','f34','f35','f40',
'f42','f43','f44','f48','f52','f53', 'f56','f58',
'f63','f64','f65', 'f69','f72','f73','f77','f78',
'f82','f85', 'f86', 'f90',
'f92', 'f95', 'f96', 'f99',
'f103','f104', 'f107', 'f108',
'f112','f114', 'f117', 'f119' ,
'f125','f127', 'f129', 'f130',
'f134', 'f136','f138','f139',
'f141', 'f143', 'f144', 'f150',
'f152', 'f154', 'f156','f159',
'f162', 'f163', 'f164', 'f169',
'f173', 'f174', 'f177', 'f179',
'f181', 'f184', 'f187', 'f188',
'f192', 'f195', 'f199','f200',
'f201', 'f206','f208', 'f210',
'f211', 'f213', 'f214', 'f219' ,
'f222','f224','f227','f229',
'f231', 'f232', 'f239', 'f240',
'f241', 'f243', 'f245','f247',
'f252','f256', 'f258', 'f260',
'f265', 'f266', 'f267', 'f269',
'f274', 'f275','f278','f279',
'f281', 'f282', 'f283','f284']].copy()

In [None]:
# Columns standardised to zero mean / unit variance (same set as for the
# training frame); f22, f43 and f243 onwards are left unscaled.
# The list is written once so the selection on both sides of the assignment
# cannot drift apart.
scale_cols = [
    'f1', 'f3', 'f4', 'f8', 'f12', 'f17', 'f18', 'f19',
    'f26', 'f27', 'f29', 'f33', 'f34', 'f35', 'f40',
    'f42', 'f44', 'f48', 'f52', 'f53', 'f56', 'f58',
    'f63', 'f64', 'f65', 'f69', 'f72', 'f73', 'f77', 'f78',
    'f82', 'f85', 'f86', 'f90',
    'f92', 'f95', 'f96', 'f99',
    'f103', 'f104', 'f107', 'f108',
    'f112', 'f114', 'f117', 'f119',
    'f125', 'f127', 'f129', 'f130',
    'f134', 'f136', 'f138', 'f139',
    'f141', 'f143', 'f144', 'f150',
    'f152', 'f154', 'f156', 'f159',
    'f162', 'f163', 'f164', 'f169',
    'f173', 'f174', 'f177', 'f179',
    'f181', 'f184', 'f187', 'f188',
    'f192', 'f195', 'f199', 'f200',
    'f201', 'f206', 'f208', 'f210',
    'f211', 'f213', 'f214', 'f219',
    'f222', 'f224', 'f227', 'f229',
    'f231', 'f232', 'f239', 'f240',
    'f241',
]

# NOTE(review): preprocessing.scale standardises the test set with the test
# set's own mean/std, not the statistics of the training data the model was
# fitted on. Reusing a StandardScaler fitted on the training frame would keep
# both on the same scale; the training statistics are not retained in this
# notebook, so the original behaviour is preserved here.
#
# The scaled ndarray is assigned directly (matched by position), so the
# result no longer depends on `test` having a 0..n-1 index.
test[scale_cols] = preprocessing.scale(test[scale_cols])

In [None]:
test.head()

In [None]:
dtest = xgb.DMatrix(data=test)
# The booster was trained with early stopping, so it also contains the rounds
# grown after the best validation score. Restrict prediction to the trees up
# to and including the best iteration instead of using all of them.
predictions = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

In [None]:
submission["target"] = predictions

In [None]:
# Checks the submission file before saving
submission

In [None]:
# Saves test predictions
submission.to_csv("./submission.csv", index=False) # 0.84868 score

In [None]:
del model, predictions

# b) Hyperopt

In [None]:
# Hyperopt scores every trial on one fixed train/validation split.
# The previous loop rebuilt dtrain/dval for all folds and overwrote them each
# time, so only the LAST StratifiedKFold split was ever used. Take that split
# directly and build the DMatrix pair once.
train_index, test_index = list(skf.split(X, y))[-1]
print('Fold = ', skf.get_n_splits())

y_val = y.iloc[test_index]
dtrain = xgb.DMatrix(data=X.iloc[train_index], label=y.iloc[train_index])
dval = xgb.DMatrix(data=X.iloc[test_index], label=y_val)

In [None]:
# Hyperopt search space: uniform priors for the continuous parameters and
# quantised-uniform (step 1) priors for the integer-valued ones.
hyperparameter_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    max_depth=hp.quniform("max_depth", 2, 6, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 8, 1),
    reg_alpha=hp.uniform('reg_alpha', 1e-8, 100),
    reg_lambda=hp.uniform('reg_lambda', 1e-8, 100),
    gamma=hp.uniform ('gamma', 0.0, 1.0),
    subsample=hp.uniform("subsample", 0.1, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.1, 1.0),
)


   

In [None]:
def optimize_hyppara(hyperparameter_space):
    """Hyperopt objective: train XGBoost on ``dtrain`` and score it on ``dval``.

    Parameters
    ----------
    hyperparameter_space : dict
        One sampled point of the search space (parameter name -> value).
        It is copied before being edited, so the caller's dict is not mutated.

    Returns
    -------
    dict
        ``{"loss": -roc_auc, "status": STATUS_OK}``; the AUC is negated
        because hyperopt minimises the loss.

    Uses the notebook-level ``dtrain``, ``dval`` and ``y_val``.
    """
    params = dict(hyperparameter_space)
    # quniform samples floats; XGBoost needs an integer depth
    params["max_depth"] = int(params["max_depth"])
    params["objective"] = "binary:logistic"
    params["eval_metric"] = "auc"
    params["tree_method"] = "gpu_hist"

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        early_stopping_rounds=50, verbose_eval=False)

    # Score with the trees up to the best early-stopping round, not with the
    # extra rounds grown after the validation AUC stopped improving.
    predictions = model.predict(dval, iteration_range=(0, model.best_iteration + 1))

    roc_auc = roc_auc_score(y_val, predictions)

    return {"loss": -roc_auc, "status": STATUS_OK}

In [None]:
# Starts hyperparameter tuning: 50 TPE-suggested evaluations of optimize_hyppara.
# `trials` records every evaluation; `best_model_params` receives the best point found.
# NOTE(review): no `rstate` is passed to fmin, so the search is presumably not
# reproducible between runs - confirm and seed it if repeatability matters.
trials = Trials()
best_model_params = fmin(fn=optimize_hyppara,space=hyperparameter_space, max_evals=50,algo=tpe.suggest,trials=trials)

In [None]:
best_model_params

In [None]:
del dtrain, dval,y_val

In [None]:
# 80/20 hold-out split, wrapped in DMatrix objects; the pandas copies are
# released straight away
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_test, label=y_test)
del X_train, y_train, X_test, y_test

# Hyperparameters are hard-coded here rather than read from
# `best_model_params`; the fixed training settings are merged into the same dict.
params = {
    'colsample_bytree': 0.24949960835732582,
    'gamma': 0.40334695196873604,
    'learning_rate': 0.06678598441625683,
    'max_depth': 5,
    'min_child_weight': 6.0,
    'reg_alpha': 57.68556763950645,
    'reg_lambda': 47.221038452153344,
    'subsample': 0.8481553666497129,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
}

# Up to 2000 rounds; stop once the eval AUC has not improved for 50 rounds
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=2000,
    evals=[(dtrain, 'train'), (dval, 'eval')],
    early_stopping_rounds=50,
    verbose_eval=200,
)

In [None]:
# Complete the tuned parameters with the fixed training settings; max_depth
# comes back from hyperopt as a float and must be an int for XGBoost
best_model_params.update(
    max_depth=int(best_model_params["max_depth"]),
    objective="binary:logistic",
    eval_metric="auc",
    tree_method="gpu_hist",
)

In [None]:
dtest = xgb.DMatrix(data=test)
# The booster was trained with early stopping, so it also contains the rounds
# grown after the best validation score. Restrict prediction to the trees up
# to and including the best iteration instead of using all of them.
predictions = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

In [None]:
submission["target"] = predictions

# Checks the submission file before saving
submission


In [None]:
# Saves test predictions
submission.to_csv("./submission.csv", index=False) # 0.85 score

In [None]:
del model, dtest, predictions

In [None]:
# Trains one model per stratified fold with the tuned parameters and stores
# each model's test-set predictions for later averaging.

test_predictions = []

dtest = xgb.DMatrix(data=test)

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print("fold", fold)

    dtrain = xgb.DMatrix(data=X.iloc[train_index], label=y.iloc[train_index])
    dval = xgb.DMatrix(data=X.iloc[val_index], label=y.iloc[val_index])

    model = xgb.train(
        best_model_params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, 'train'), (dval, 'eval')],
        early_stopping_rounds=50, verbose_eval=200)

    # Predict with the trees up to the best early-stopping round, not with
    # the extra rounds grown after the validation AUC stopped improving.
    predictions = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

    test_predictions.append(predictions)

    # Free the per-fold objects before the next fold is built
    del predictions, model, dval, dtrain

In [None]:
test_predictions

In [None]:
del dtest, test


In [None]:
# Stack the per-fold predictions side by side (one column per fold) and use
# their row-wise mean as the final target
submission["target"] = np.column_stack(test_predictions).mean(axis=1)

# Checks the submission file before saving
submission


In [None]:
# Saves test predictions
submission.to_csv("./submission.csv", index=False)