In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data read

In [None]:
# StratifiedKFold cross-validation keeps the class proportions of the target the same in every fold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import SelectKBest
from yellowbrick.target import FeatureCorrelation
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Set the default figure size instead of calling plt.figure(), which created
# (and rendered) an empty figure in the middle of the import cell.
plt.rcParams['figure.figsize'] = (20, 18)
import xgboost as xgb
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
# hyperopt is hyperparameter optimization by defining an objective function and declaring a search space
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
pd.set_option('display.max_columns', 500)

In [None]:
# Read the competition's training table and preview its first rows.
TRAIN_CSV = "/kaggle/input/tabular-playground-series-feb-2022/train.csv"
dfTrain = pd.read_csv(TRAIN_CSV)
dfTrain.head()

In [None]:
# (number of rows, number of columns) of the training frame.
dfTrain.shape

In [None]:
# Number of rows per target label.
dfTrain["target"].value_counts()

In [None]:
# Summary of the frame (index, column dtypes, memory usage) as printed by pandas.
dfTrain.info()
 

In [None]:
# Column labels of the training frame.
dfTrain.columns

In [None]:
# Remove the identifier column. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: the in-place version raised a
# KeyError on a second execution because "row_id" was already gone.
dfTrain = dfTrain.drop(columns=["row_id"], errors="ignore")

In [None]:
# Total number of missing cells over the whole frame (0 means nothing to impute).
dfTrain.isna().sum().sum()

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Memory usage before and
    after, and the relative saving, are printed.

    Per column:
      * ``object`` columns are converted to ``category``.
      * Signed / unsigned integer columns are cast to the narrowest signed /
        unsigned integer type whose range contains the column's min and max
        (limits inclusive).
      * Float columns wider than 32 bits are cast to ``float32`` when min and
        max lie strictly inside the float32 range (this reduces precision).
      * Everything else (bool, datetime, categorical and other extension
        dtypes) and empty columns are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. The previous
        # string test on the dtype name sent bool and uint columns into the
        # float branch and crashed on categorical/datetime columns.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        if df[col].empty:
            # min()/max() of an empty column are NaN; nothing to measure.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            # Compare as Python ints so mixed signed/unsigned numpy
            # comparisons cannot overflow or be promoted to float.
            lowest, highest = int(c_min), int(c_max)
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            # Never widen a column that is already float32 or narrower.
            limits = np.finfo(np.float32)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Compress the data: convert column dtypes via reduce_mem_usage, which prints
# the memory usage before and after.
dfTrain = reduce_mem_usage(dfTrain)

# Feature selection with SelectKBest

In [None]:
df = dfTrain[['A0T0G0C10','A0T0G1C9','A0T0G2C8','A0T0G3C7','A0T0G4C6','A0T0G5C5','A0T0G6C4','A0T0G7C3','A0T0G8C2','A0T0G9C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
sns.set(rc = {'figure.figsize':(20,6)})

In [None]:
ax = sns.boxplot(x="target", y="A0T0G4C6", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A0T0G10C0','A0T1G0C9','A0T1G1C8','A0T1G2C7','A0T1G3C6','A0T1G4C5','A0T1G5C4','A0T1G6C3','A0T1G7C2','A0T1G8C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A0T1G3C6", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A0T1G9C0','A0T2G0C8','A0T2G1C7','A0T2G2C6','A0T2G3C5','A0T2G4C4','A0T2G5C3','A0T2G6C2','A0T2G7C1','A0T2G8C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A0T2G3C5", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A0T3G0C7','A0T3G1C6','A0T3G2C5','A0T3G3C4','A0T3G4C3','A0T3G5C2','A0T3G6C1','A0T3G7C0','A0T4G0C6','A0T4G1C5','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A0T3G5C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A0T4G2C4','A0T4G3C3','A0T4G4C2','A0T4G5C1','A0T4G6C0','A0T5G0C5','A0T5G1C4','A0T5G2C3','A0T5G3C2','A0T5G4C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A0T4G2C4", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A0T5G5C0','A0T6G0C4','A0T6G1C3','A0T6G2C2','A0T6G3C1','A0T6G4C0','A0T7G0C3','A0T7G1C2','A0T7G2C1','A0T7G3C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A0T7G2C1", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A0T8G0C2','A0T8G1C1','A0T8G2C0','A0T9G0C1','A0T9G1C0','A0T10G0C0','A1T0G0C9','A1T0G1C8','A1T0G2C7','A1T0G3C6','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A1T0G3C6", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A1T0G4C5','A1T0G5C4','A1T0G6C3','A1T0G7C2','A1T0G8C1','A1T0G9C0','A1T1G0C8','A1T1G1C7','A1T1G2C6','A1T1G3C5','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A1T1G3C5", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A1T1G4C4','A1T1G5C3','A1T1G6C2','A1T1G7C1','A1T1G8C0','A1T2G0C7','A1T2G1C6','A1T2G2C5','A1T2G3C4','A1T2G4C3','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A1T2G4C3", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A1T2G5C2','A1T2G6C1','A1T2G7C0','A1T3G0C6','A1T3G1C5','A1T3G2C4','A1T3G3C3','A1T3G4C2','A1T3G5C1','A1T3G6C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A1T3G4C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A1T4G0C5','A1T4G1C4','A1T4G2C3','A1T4G3C2','A1T4G4C1','A1T4G5C0','A1T5G0C4','A1T5G1C3','A1T5G2C2','A1T5G3C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A1T5G2C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A1T5G4C0','A1T6G0C3','A1T6G1C2','A1T6G2C1','A1T6G3C0','A1T7G0C2','A1T7G1C1','A1T7G2C0','A1T8G0C1','A1T8G1C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A1T7G1C1", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A1T9G0C0','A2T0G0C8','A2T0G1C7','A2T0G2C6','A2T0G3C5','A2T0G4C4','A2T0G5C3','A2T0G6C2','A2T0G7C1','A2T0G8C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A2T0G5C3", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A2T1G0C7','A2T1G1C6','A2T1G2C5','A2T1G3C4','A2T1G4C3','A2T1G5C2','A2T1G6C1','A2T1G7C0','A2T2G0C6','A2T2G1C5','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A2T1G5C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A2T2G2C4','A2T2G3C3','A2T2G4C2','A2T2G5C1','A2T2G6C0','A2T3G0C5','A2T3G1C4','A2T3G2C3','A2T3G3C2','A2T3G4C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A2T3G3C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A2T3G5C0','A2T4G0C4','A2T4G1C3','A2T4G2C2','A2T4G3C1','A2T4G4C0','A2T5G0C3','A2T5G1C2','A2T5G2C1','A2T5G3C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A2T5G3C0", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A2T6G0C2','A2T6G1C1','A2T6G2C0','A2T7G0C1','A2T7G1C0','A2T8G0C0','A3T0G0C7','A3T0G1C6','A3T0G2C5','A3T0G3C4','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A2T7G1C0", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A3T0G4C3','A3T0G5C2','A3T0G6C1','A3T0G7C0','A3T1G0C6','A3T1G1C5','A3T1G2C4','A3T1G3C3','A3T1G4C2','A3T1G5C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A3T1G4C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A3T1G6C0','A3T2G0C5','A3T2G1C4','A3T2G2C3','A3T2G3C2','A3T2G4C1','A3T2G5C0','A3T3G0C4','A3T3G1C3','A3T3G2C2','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A3T3G2C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A3T3G3C1','A3T3G4C0','A3T4G0C3','A3T4G1C2','A3T4G2C1','A3T4G3C0','A3T5G0C2','A3T5G1C1','A3T5G2C0','A3T6G0C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A3T6G0C1", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A3T6G1C0','A3T7G0C0','A4T0G0C6','A4T0G1C5','A4T0G2C4','A4T0G3C3','A4T0G4C2','A4T0G5C1','A4T0G6C0','A4T1G0C5','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A4T0G3C3", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A4T1G1C4','A4T1G2C3','A4T1G3C2','A4T1G4C1','A4T1G5C0','A4T2G0C4','A4T2G1C3','A4T2G2C2','A4T2G3C1','A4T2G4C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A4T2G1C3", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A4T3G0C3','A4T3G1C2','A4T3G2C1','A4T3G3C0','A4T4G0C2','A4T4G1C1','A4T4G2C0','A4T5G0C1','A4T5G1C0','A4T6G0C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A4T5G0C1", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A5T0G0C5','A5T0G1C4','A5T0G2C3','A5T0G3C2','A5T0G4C1','A5T0G5C0','A5T1G0C4','A5T1G1C3','A5T1G2C2','A5T1G3C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A5T1G3C1", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A5T1G4C0','A5T2G0C3','A5T2G1C2','A5T2G2C1','A5T2G3C0','A5T3G0C2','A5T3G1C1','A5T3G2C0','A5T4G0C1','A5T4G1C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A5T2G1C2", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A5T5G0C0','A6T0G0C4','A6T0G1C3','A6T0G2C2','A6T0G3C1','A6T0G4C0','A6T1G0C3','A6T1G1C2','A6T1G2C1','A6T1G3C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A6T1G3C0", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A6T2G0C2','A6T2G1C1','A6T2G2C0','A6T3G0C1','A6T3G1C0','A6T4G0C0','A7T0G0C3','A7T0G1C2','A7T0G2C1','A7T0G3C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A6T4G0C0", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A7T1G0C2','A7T1G1C1','A7T1G2C0','A7T2G0C1','A7T2G1C0','A7T3G0C0','A8T0G0C2','A8T0G1C1','A8T0G2C0','A8T1G0C1','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A7T3G0C0", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
df = dfTrain[['A8T1G1C0','A8T2G0C0','A9T0G0C1','A9T0G1C0','A9T1G0C0','A10T0G0C0','target']]

target = df['target']
features = df.drop('target', axis=1)

select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

features_mask = select_univariate.get_support()

features_mask

selected_columns = features.columns[features_mask]

selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
ax = sns.boxplot(x="target", y="A8T2G0C0", data=df)
ax.tick_params(axis='x', labelrotation=45)

In [None]:
# Drop the name of the last per-block slice; it is not used beyond this point.
del df

In [None]:
# Modelling inputs: the label column and the hand-picked subset of feature columns.
selected_feature_names = [
    'A0T0G4C6', 'A0T0G5C5', 'A0T0G6C4', 'A0T0G7C3',
    'A0T1G3C6', 'A0T1G4C5', 'A0T1G5C4', 'A0T1G6C3',
    'A0T2G3C5', 'A0T2G4C4', 'A0T2G5C3', 'A0T2G6C2',
    'A0T3G2C5', 'A0T3G3C4', 'A0T3G4C3', 'A0T3G5C2',
    'A0T4G2C4', 'A0T4G3C3', 'A0T4G4C2', 'A0T5G2C3',
    'A0T6G1C3', 'A0T7G0C3', 'A0T7G1C2', 'A0T7G2C1',
    'A0T8G1C1', 'A0T8G2C0', 'A1T0G2C7', 'A1T0G3C6',
    'A1T0G4C5', 'A1T0G5C4', 'A1T1G2C6', 'A1T1G3C5',
    'A1T1G4C4', 'A1T1G5C3', 'A1T2G3C4', 'A1T2G4C3',
    'A1T2G5C2', 'A1T3G2C4', 'A1T3G3C3', 'A1T3G4C2',
    'A1T4G2C3', 'A1T4G3C2', 'A1T5G1C3', 'A1T5G2C2',
    'A1T6G1C2', 'A1T6G2C1', 'A1T7G0C2', 'A1T7G1C1',
    'A2T0G2C6', 'A2T0G3C5', 'A2T0G4C4', 'A2T0G5C3',
    'A2T1G2C5', 'A2T1G3C4', 'A2T1G4C3', 'A2T1G5C2',
    'A2T2G2C4', 'A2T2G3C3', 'A2T2G4C2', 'A2T3G3C2',
    'A2T5G0C3', 'A2T5G1C2', 'A2T5G2C1', 'A2T5G3C0',
    'A2T6G0C2', 'A2T6G1C1', 'A2T7G0C1', 'A2T7G1C0',
    'A3T0G4C3', 'A3T1G2C4', 'A3T1G3C3', 'A3T1G4C2',
    'A3T2G2C3', 'A3T2G3C2', 'A3T3G1C3', 'A3T3G2C2',
    'A3T5G0C2', 'A3T5G1C1', 'A3T5G2C0', 'A3T6G0C1',
    'A3T6G1C0', 'A3T7G0C0', 'A4T0G3C3', 'A4T0G4C2',
    'A4T1G2C3', 'A4T1G3C2', 'A4T2G1C3', 'A4T2G2C2',
    'A4T4G0C2', 'A4T4G2C0', 'A4T5G0C1', 'A4T5G1C0',
    'A5T0G3C2', 'A5T1G1C3', 'A5T1G2C2', 'A5T1G3C1',
    'A5T2G1C2', 'A5T3G1C1', 'A5T3G2C0', 'A5T4G1C0',
    'A5T5G0C0', 'A6T1G1C2', 'A6T1G2C1', 'A6T1G3C0',
    'A6T2G1C1', 'A6T3G0C1', 'A6T3G1C0', 'A6T4G0C0',
    'A7T1G1C1', 'A7T2G0C1', 'A7T2G1C0', 'A7T3G0C0',
    'A8T1G1C0', 'A8T2G0C0', 'A9T0G1C0', 'A9T1G0C0',
]
X = dfTrain[selected_feature_names]
y = dfTrain['target']

In [None]:
# From here on only X and y are used, so drop the name of the full training frame.
del dfTrain

# XGBoost and CV

In [None]:
# Create the StratifiedKFold splitter: 5 shuffled folds with a fixed seed so
# the same folds are produced on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state= 40)

In [None]:
# Baseline: accuracy of an untuned XGBClassifier on each of the stratified folds.
# NOTE(review): n_jobs=-1 runs folds in parallel while tree_method='gpu_hist'
# targets the GPU — confirm the device copes with concurrent fits.
model = XGBClassifier(tree_method='gpu_hist')
model_score = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=10,
)

In [None]:
# Mean accuracy over the cross-validation folds.
print(model_score.mean())

In [None]:
# The baseline estimator and its scores are not needed again.
del model_score, model

In [None]:
# (low, high) search bounds handed to BayesianOptimization, one entry per
# argument of xgboost_hyper_param.
parameter_space = dict(
    learning_rate=(0.01, 1.0),
    n_estimators=(100, 1000),
    max_depth=(2, 10),
    subsample=(0.4, 1.0),
    colsample_bytree=(0.4, 1.0),
    gamma=(0, 5),
)

def xgboost_hyper_param(learning_rate,
                        n_estimators,
                        max_depth,
                        subsample,
                        colsample_bytree,
                        gamma):
    """Objective for the Bayesian search: mean 5-fold CV accuracy of one configuration.

    All arguments arrive as floats from the optimiser; ``max_depth`` and
    ``n_estimators`` are truncated to integers before use. The model is
    evaluated on the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    learning_rate, gamma, subsample, colsample_bytree : float
        Passed to ``XGBClassifier`` unchanged.
    n_estimators, max_depth : float
        Truncated with ``int()`` and passed to ``XGBClassifier``.

    Returns
    -------
    float
        Mean accuracy over the 5 cross-validation folds (higher is better).
    """
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = XGBClassifier(
        tree_method='gpu_hist',
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        # These two were part of the search space but were never forwarded, so
        # the optimiser was exploring dimensions that had no effect on the score.
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma)
    return np.mean(cross_val_score(clf, X, y, cv=5, scoring='accuracy'))

# Bayesian optimiser that maximises xgboost_hyper_param over parameter_space;
# random_state fixes the optimiser's seed.
optimizer = BayesianOptimization(
    f=xgboost_hyper_param,
    pbounds=parameter_space,
    random_state=100,
)

In [None]:
# Run the search with init_points=2 and n_iter=5, i.e. only a handful of
# objective evaluations, each of which is a full 5-fold cross-validation.
# NOTE(review): whether maximize() still accepts `acq` / `xi` depends on the
# installed bayes_opt version — confirm against the environment.
optimizer.maximize(init_points=2, n_iter=5, acq='ei', xi=0.0)

In [None]:
# Every evaluated point together with its score.
optimizer.res

In [None]:
# Best configuration found by the search. The integer-valued parameters are
# truncated with int(), exactly as xgboost_hyper_param does when it scores a
# point; round() could pick a different max_depth / n_estimators (e.g. 9.7 ->
# 10 instead of 9) than the configuration that was actually evaluated.
params_gbm = optimizer.max['params']
params_gbm['max_depth'] = int(params_gbm['max_depth'])
params_gbm['n_estimators'] = int(params_gbm['n_estimators'])
params_gbm

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed.
# NOTE(review): unlike the cross-validation above this split is not stratified
# (no stratify=y) — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
# `params` is the same dict object as params_gbm; the fixed training settings
# are added on top of the tuned values.
params = params_gbm
params.update(
    max_depth=int(params["max_depth"]),
    objective='multi:softmax',  # multiclass objective
    num_class=10,  # the number of classes that exist in this dataset
    tree_method="gpu_hist",
    eval_metric='mlogloss',
)

In [None]:
# Train the classifier with the Bayesian-tuned parameters.
# NOTE: this rebinds `xgb`, hiding the `xgboost` module imported as `xgb` above.
# The hold-out set is used for early stopping here and for scoring in the cells
# below, so the reported accuracy is optimistic.
# NOTE(review): support for passing early_stopping_rounds to fit() depends on
# the installed xgboost version — confirm.
xgb = XGBClassifier(**params)
xgb.fit(X_train, y_train,
          early_stopping_rounds=200,
          eval_set=[(X_test,y_test)],
          verbose=True)

In [None]:
# Predicted labels for the hold-out rows.
y_pred=xgb.predict(X_test)

In [None]:
# Accuracy of the predictions on the hold-out rows.
accuracy_score(y_test, y_pred)

In [None]:
# Per-class classification report on the hold-out rows.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix: true labels (first argument) against predicted labels.
print(confusion_matrix(y_test,y_pred))

In [None]:
# Load the competition test set and discard the ID column, which is not a
# model input.
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-feb-2022/test.csv"
).drop(columns=["row_id"])

In [None]:
# Load the sample submission; it serves only as a template whose "target"
# column is overwritten with predictions below.
submission = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv")

In [None]:
# Keep exactly the columns the model was trained on, in training order. Taking
# them from X (instead of repeating the 116 names by hand) guarantees the two
# lists can never drift apart.
test = test[list(X.columns)]

In [None]:
# Predict test-set labels with the model currently bound to `xgb`.
predictions = xgb.predict(test)

In [None]:
# Overwrite the template's target column with the predictions.
submission["target"] = predictions

In [None]:
# Inspect the submission frame before writing it out.
submission

In [None]:
# Save the test predictions to the working directory, without the index column.
# Note: a later cell writes to the same ./submission.csv path.
submission.to_csv("./submission.csv", index=False)

In [None]:
# Walk through the stratified folds. Every pass overwrites the previous split,
# so once the loop ends the names below hold the LAST fold only; y_val and
# y_test are the same rows.
fold_no = 1
for train_index, test_index in skf.split(X, y):
    print('Fold = ', fold_no)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_val = y.iloc[test_index]
    fold_no = fold_no + 1

In [None]:
# hyperopt search space; each key is the XGBClassifier argument it tunes.
# quniform(..., 1) draws whole-number steps but still yields floats.
hyperparameter_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    max_depth=hp.quniform("max_depth", 2, 6, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 8, 1),
    reg_alpha=hp.uniform('reg_alpha', 1e-8, 100),
    reg_lambda=hp.uniform('reg_lambda', 1e-8, 100),
    gamma=hp.uniform('gamma', 0.0, 1.0),
    subsample=hp.uniform("subsample", 0.1, 1.0),
    colsample_bytree=hp.uniform('colsample_bytree', 0.1, 1.0),
)

In [None]:
def optimize_hyppara(hyperparameter_space):
    """hyperopt objective: fit one sampled configuration and report its accuracy.

    The model is trained on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test``, which also serve as the early-stopping
    evaluation set.

    Parameters
    ----------
    hyperparameter_space : dict
        One sampled point of the search space. It is copied, not modified.

    Returns
    -------
    dict
        ``{"loss": -accuracy, "status": STATUS_OK}``; the accuracy is negated
        because the result is reported as a loss.
    """
    # Work on a copy so the caller's dict is not mutated.
    model_params = dict(hyperparameter_space)
    # Converts parameter value to int as required by XGBoost
    model_params["max_depth"] = int(model_params["max_depth"])
    model_params["objective"] = "multi:softmax"
    model_params["eval_metric"] = "mlogloss"
    model_params["tree_method"] = "gpu_hist"
    model_params['num_class'] = 10

    # NOTE(review): n_estimators is not set here; if the library default is
    # below 200 boosting rounds, early_stopping_rounds=200 can never trigger —
    # confirm.
    model = XGBClassifier(**model_params)
    model.fit(X_train, y_train,
              early_stopping_rounds=200,
              eval_set=[(X_test, y_test)],
              verbose=False)

    predictions = model.predict(X_test)

    # Score against y_test, the labels that belong to X_test, rather than the
    # separate global y_val that merely happened to hold the same rows.
    acc = accuracy_score(y_test, predictions)

    return {"loss": -acc, "status": STATUS_OK}

In [None]:
# Start hyperparameter tuning: 50 evaluations of optimize_hyppara using the TPE
# algorithm; `trials` collects the evaluations.
# NOTE(review): no random state is passed to fmin, so the search is presumably
# not reproducible between runs — confirm and seed if needed.
trials = Trials()
best_model_params = fmin(fn=optimize_hyppara,space=hyperparameter_space, max_evals=50,algo=tpe.suggest,trials=trials)

In [None]:
# Best parameter values returned by fmin.
best_model_params

In [None]:
# Final model: hyperopt's best values plus the fixed training settings.
# `params` is the same dict object as best_model_params.
params = best_model_params
params.update(
    max_depth=int(params["max_depth"]),
    objective='multi:softmax',  # multiclass objective
    num_class=10,  # the number of classes that exist in this dataset
    tree_method="gpu_hist",
    eval_metric='mlogloss',
)

# Rebinds `xgb` (also the alias of the xgboost module) to the fitted model.
xgb = XGBClassifier(**params)
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=200,
    verbose=False,
)

In [None]:
# Predict test-set labels with the hyperopt-tuned model now bound to `xgb`.
predictions = xgb.predict(test)

In [None]:
# Replace the target column with the new predictions.
submission["target"] = predictions

# Check the submission frame before saving
submission

In [None]:
# Save the test predictions; this overwrites the ./submission.csv written
# earlier in the notebook.
submission.to_csv("./submission.csv", index=False) 

In [None]:
# The predictions are already stored in `submission`; drop the separate name.
del predictions