In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# run this first cell
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)
# the code above only lists the competition's data files; they are read into memory later with pd.read_csv
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**importing the modules**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**importing the data**

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = '/kaggle/input/tabular-playground-series-sep-2021/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = '/kaggle/input/tabular-playground-series-sep-2021/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

In [None]:
# Print a summary of the training frame's columns (dtypes, non-null counts) and its memory usage.
train_df.info()

In [None]:
# Print a summary of the test frame's columns (dtypes, non-null counts) and its memory usage.
test_df.info()

**Exploratory Data Analysis**

# Pie chart comparing the number of rows in the train and test sets.
fig, ax = plt.subplots(figsize=(5,5))
# The wedge/text handles returned by ax.pie are not needed, so they are not kept.
ax.pie([len(train_df), len(test_df)],
       labels=['Train dataset','Test dataset'],  # label typo ("Trian") fixed
       colors=['cyan','violet'],
       autopct='%1.1f%%')  # print each share as a percentage with one decimal
fig.set_facecolor('white')
ax.set_title("Pie chart representing the amount of train and test data")
plt.show()

# Distribution of the target column: horizontal count plot plus the raw counts as text.
claim_values = train_df['claim']
sns.countplot(y=claim_values, palette='Set1')
plt.title("Count plot showing the claim value distribution")
print(claim_values.value_counts())

In [None]:
# Summary statistics of the numeric columns, transposed so that each column becomes one row.
train_df.describe().T

# Heatmap of the boolean missing-value mask of the training frame
# (row tick labels and the colour bar are switched off).
missing_mask = train_df.isna()
plt.figure(figsize=(10, 10))
sns.heatmap(missing_mask, cmap='viridis', yticklabels=False, cbar=False)

In [None]:
# Percentage of missing cells in the training frame.
# np.prod replaces np.product, an alias that NumPy deprecated in 1.25 and removed in 2.0.
total = np.prod(train_df.shape)          # rows * columns = number of cells
missing = (train_df.isna().sum()).sum()  # per-column NaN counts, summed over all columns
print("Percentage of missing values in train data: ", (missing/total)*100)

In [None]:
# Percentage of missing cells in the test frame.
# np.prod replaces np.product, an alias that NumPy deprecated in 1.25 and removed in 2.0.
total = np.prod(test_df.shape)          # rows * columns = number of cells
missing = (test_df.isna().sum()).sum()  # per-column NaN counts, summed over all columns
print("Percentage of missing values in test data: ", (missing/total)*100)

In [None]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# bare drop calls that used to live in this cell never removed 'id'; they only
# copied both full frames and displayed the last one.
# 'id' is deliberately kept on train_df / test_df: later cells exclude it
# themselves when building the feature list and X / X_test, and test_df["id"]
# is needed for the submission file.
# Preview the test columns without 'id'; calling head() first means only five
# rows are copied instead of the whole frame.
test_df.head().drop('id', axis=1)

In [None]:
# Show the first five rows of the training frame.
train_df.head(5)

In [None]:
# Feature columns = every training column except the row identifier and the target.
non_feature_columns = ['id', 'claim']
features = train_df.columns.drop(non_feature_columns)
print(features)

In [None]:
# Add a per-row count of missing feature values to both frames.
for frame in (train_df, test_df):
    frame['n_missing'] = frame[features].isna().sum(axis=1)

In [None]:
# Replace missing feature values with the median of the corresponding training column.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")
# SimpleImputer learns one statistic per column, so a single fit over all
# feature columns gives the same medians as fitting each column separately,
# without a Python-level loop and without assigning (n, 1) arrays to single columns.
train_df[features] = imputer.fit_transform(train_df[features])
# The test set is filled with the medians learned from the training set.
test_df[features] = imputer.transform(test_df[features])

In [None]:
# Sanity check: total number of missing cells left in each frame (train first, then test).
for frame in (train_df, test_df):
    remaining_missing = frame.isna().sum().sum()
    print(remaining_missing)

In [None]:
# Standardise the feature columns. StandardScaler centres each column on zero
# mean and scales it to unit variance; it does NOT map values into [0, 1].
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# The scaler keeps separate statistics for every column, so one fit over all
# feature columns replaces the former per-column loop.
train_df[features] = scaler.fit_transform(train_df[features])
# The test set is scaled with the statistics learned from the training set.
test_df[features] = scaler.transform(test_df[features])

In [None]:
# Split the frames into model inputs: target vector, training matrix and test matrix.
y = train_df["claim"]
X = train_df.drop(columns=['id', 'claim'])
X_test = test_df.drop(columns='id')

In [None]:
# Report the shape of the training matrix, the test matrix and the target, in that order.
for part in (X, X_test, y):
    print(part.shape)

In [None]:
# Show the first five rows of the training matrix.
X.head(5)

# Search space for RandomizedSearchCV. Every value has to be a list (or a
# distribution object exposing ``rvs``); scikit-learn rejects bare strings, so
# the two fixed settings are wrapped in one-element lists.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'learning_rate':[0.01,0.001,0.0001],
        'tree_method': ['gpu_hist'],
        'booster': ['gbtree'],
        'n_estimators' : [3000,3600],
        'reg_alpha' : [25,26],
        }

from xgboost import XGBClassifier
# ``verbosity=0`` is the supported way to silence XGBoost; the ``silent`` flag
# it replaces is deprecated.
xgb = XGBClassifier(objective='binary:logistic', verbosity=0)

folds = 3        # number of stratified cross-validation folds
param_comb = 5   # number of parameter settings sampled by the search

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Pass the splitter itself instead of skf.split(X, y): the latter is a one-shot
# generator, whereas the splitter is applied to whatever data fit() receives.
# With the fixed random_state the folds are the same as before.
random_search = RandomizedSearchCV(
    xgb,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=1001,
)

# Run the search: param_comb settings x folds fits, plus the default final refit.
random_search.fit(X, y)

In [None]:
# Fixed hyper-parameters for the final XGBoost model, passed on as keyword arguments.
xgb_params = dict(
    # ensemble size and regularisation
    n_estimators=3800,
    reg_lambda=3,
    reg_alpha=26,
    # row / column subsampling
    subsample=0.6000000000000001,
    colsample_bytree=0.6000000000000001,
    # tree shape
    max_depth=9,
    min_child_weight=5,
    gamma=13.054739572819486,
    # optimisation and backend
    learning_rate=0.01,
    tree_method='gpu_hist',
    booster='gbtree',
)

In [None]:
# Build the final classifier from the fixed hyper-parameters and train it on the full training set.
from xgboost import XGBClassifier

xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X=X, y=y)

In [None]:
# predict_proba returns one probability column per class; keep column 1
# (presumably the positive claim class; verify against xgb_model.classes_).
class_probabilities = xgb_model.predict_proba(X_test)
predict = class_probabilities[:, 1]

In [None]:
# Peek at the first five predicted probabilities.
predict[:5]

In [None]:
# Build the submission table (test ids alongside the predicted probabilities),
# write it to submission.csv without the index, then preview it.
predictions = pd.DataFrame({"id": test_df["id"], "claim": predict})
predictions.to_csv('submission.csv', index=False, header=True)
predictions.head()