In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input directory,
# then print them one per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import warnings

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

# Install a blanket 'ignore' filter: every warning raised from here on is
# suppressed, including deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Load the Data

In [None]:
# Read the training table, the test table and the sample submission, in that order.
train_df, test_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

# Let's look at the data

In [None]:
# Number of rows in the train frame, then in the test frame (one per line).
for frame in (train_df, test_df):
    print(len(frame))

In [None]:
# Descriptive statistics for the training frame, rendered as the cell output.
train_df.describe()

In [None]:
# Descriptive statistics for the test frame, rendered as the cell output.
test_df.describe()

**Examine missing data**

In [None]:
# Per-feature missing-value summary for the training data: one row per feature
# holding the NaN count and that count divided by the number of training rows.
# The 'id' and 'claim' entries are excluded.
train_nan_counts = train_df.isna().sum(axis=0).drop(['id', 'claim'])
missing_train_df = pd.DataFrame({
    'feature': train_nan_counts.index,
    'count': train_nan_counts.values,
})
missing_train_df['count_percent'] = missing_train_df['count'] / len(train_df)

# Same summary for the test data, where only the 'id' entry is excluded.
test_nan_counts = test_df.isna().sum(axis=0).drop(['id'])
missing_test_df = pd.DataFrame({
    'feature': test_nan_counts.index,
    'count': test_nan_counts.values,
})
missing_test_df['count_percent'] = missing_test_df['count'] / len(test_df)

In [None]:
# First rows of the per-feature missing-value summary for the training data.
missing_train_df.head()

In [None]:
# First rows of the per-feature missing-value summary for the test data.
missing_test_df.head()

In [None]:
# For each training row, count how many feature values are missing
# ('id' and 'claim' are not counted), then tabulate which share of all rows
# has 0, 1, 2, ... missing values.
missing_train_row = train_df.drop(columns=['id', 'claim']).isna().sum(axis=1)
train_row_shares = missing_train_row.value_counts() / len(train_df)
missing_train_feature_numbers = pd.DataFrame({
    'no_of_feature': train_row_shares.index,
    'count_percent': train_row_shares.values,
})

# Same tabulation for the test rows ('id' is not counted).
missing_test_row = test_df.drop(columns=['id']).isna().sum(axis=1)
test_row_shares = missing_test_row.value_counts() / len(test_df)
missing_test_feature_numbers = pd.DataFrame({
    'no_of_feature': test_row_shares.index,
    'count_percent': test_row_shares.values,
})

**Maximum number of missing values/row** 

In [None]:
# Largest number of missing feature values found in any single training row.
missing_train_row.max()

In [None]:
# Show the top of the per-row missing-count table for train; the number of rows
# displayed equals the maximum missing count of any training row.
missing_train_feature_numbers.head(missing_train_row.max())

In [None]:
# Show the top of the per-row missing-count table for test; the number of rows
# displayed equals the maximum missing count of any test row.
missing_test_feature_numbers.head(missing_test_row.max())

# Let's take a quick look at these tables as plots

In [None]:
# Horizontal bar chart: fraction of missing values for every training feature.
fig, ax0_sns = plt.subplots(figsize=(16, 16))
sns.barplot(
    data=missing_train_df,
    x='count_percent',
    y='feature',
    orient='h',
    saturation=1,
    alpha=1,
    linewidth=0,
    zorder=2,
    ax=ax0_sns,
)
ax0_sns.set_xlabel("missing values", weight='bold')
ax0_sns.set_ylabel("features", weight='bold')
# Light major grid on both axes, drawn underneath the bars (zorder 0 < 2).
ax0_sns.grid(which='major', axis='both', zorder=0, color='#EEEEEE')

As the plot shows, every feature has less than 2% missing values.

In [None]:
# Horizontal bar chart: share of training rows for each number of missing values per row.
fig, ax0_sns = plt.subplots(figsize=(16, 16))
sns.barplot(
    data=missing_train_feature_numbers,
    x='count_percent',
    y='no_of_feature',
    orient='h',
    saturation=1,
    alpha=1,
    linewidth=0,
    zorder=2,
    ax=ax0_sns,
)
ax0_sns.set_xlabel("missing values", weight='bold')
ax0_sns.set_ylabel("features", weight='bold')
# Thin, light major grid on both axes, drawn underneath the bars.
ax0_sns.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)

Well, we can't simply throw away the rows with null data: they make up more than 37% of the dataset.

Let's check the same for the test set.

In [None]:
# Horizontal bar chart: fraction of missing values for every test feature.
fig = plt.figure(figsize=(16, 16))
# Both axes must come from the test summary. Previously the bar lengths were read
# from missing_train_df, so the "test" chart displayed the training fractions
# against test feature labels.
ax0_sns = sns.barplot(y=missing_test_df['feature'], x=missing_test_df['count_percent'],
                      zorder=2, linewidth=0, orient='h', saturation=1, alpha=1)
ax0_sns.set_xlabel("missing values", weight='bold')
ax0_sns.set_ylabel("features", weight='bold')
# Light major grid on both axes, drawn underneath the bars (zorder 0 < 2).
ax0_sns.grid(which='major', axis='x', zorder=0, color='#EEEEEE')
ax0_sns.grid(which='major', axis='y', zorder=0, color='#EEEEEE')

In [None]:
# Horizontal bar chart: share of test rows for each number of missing values per row.
fig, ax0_sns = plt.subplots(figsize=(16, 16))
sns.barplot(
    data=missing_test_feature_numbers,
    x='count_percent',
    y='no_of_feature',
    orient='h',
    saturation=1,
    alpha=1,
    linewidth=0,
    zorder=2,
    ax=ax0_sns,
)
ax0_sns.set_xlabel("missing values", weight='bold')
ax0_sns.set_ylabel("features", weight='bold')
# Thin, light major grid on both axes, drawn underneath the bars.
ax0_sns.grid(which='major', axis='both', zorder=0, color='#EEEEEE', linewidth=0.4)

**Check duplicated**

In [None]:
# Keep only the training rows flagged as repeats of an earlier row.
train_df[train_df.duplicated()]

**There are no duplicates**

# A lot of missing values. Let's see the correlation

Let's see why people are so obsessed with the null counts.

In [None]:
# Add a 'num_nulls' column holding, for each row, the number of missing feature
# values. 'id' (and, for train, the 'claim' target) is left out of the count.
train_df['num_nulls'] = train_df.drop(columns=['id', 'claim']).isnull().sum(axis='columns')
test_df['num_nulls'] = test_df.drop(columns=['id']).isnull().sum(axis='columns')

In [None]:
# Correlation (pandas' default method) between the per-row null count and the target.
train_df['num_nulls'].corr(train_df['claim'])

Damn!! That's a large correlation. We need to keep this factor.

# Let's also check whether the classes are imbalanced

In [None]:
# How many training rows fall into each value of the 'claim' target.
train_df['claim'].value_counts()

In [None]:
# Bar chart of the number of rows per target class.
# Pass the column as `x=` explicitly: recent seaborn releases take `data` as the
# first positional argument and expect x/y as keywords, so passing the Series
# positionally no longer means "count the values of this variable".
sns.countplot(x=train_df['claim'])

Good to go. There is no class imbalance.

# So now the remaining work is handling the null values (plus a bit of preprocessing)
But we can't drop the rows, because a large number of rows contain just a single null.

In [None]:
# Working copy of the training frame without the 'id' and 'num_nulls' columns;
# train_df itself is left untouched.
ctrain_df = train_df.drop(columns=['id', 'num_nulls'])
ctrain_df.head()


In [None]:
# Working copy of the test frame without the 'id' and 'num_nulls' columns;
# test_df itself is left untouched.
ctest_df = test_df.drop(columns=['id', 'num_nulls'])
ctest_df.head()

In [None]:
# Separate the target variable from the rest of the variables.
# Select by column name instead of by position: the previous iloc[:, :-1] / iloc[:, -1]
# split was only correct while 'claim' happened to be the last column of ctrain_df.
# drop() also returns a new frame, so later column assignments on X cannot write
# through to ctrain_df.
X = ctrain_df.drop(columns=['claim'])
y = ctrain_df['claim']

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Preview the first values of the target.
y.head()

In [None]:
# Preview the first rows of the test working copy.
ctest_df.head()

**Preprocessing**

In [None]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
from sklearn.impute import SimpleImputer

features = [col for col in X.columns if col not in ['id']]
pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median',missing_values=np.nan)),
        ("scaler", QuantileTransformer(n_quantiles=64,output_distribution='uniform')),
        ('bin', KBinsDiscretizer(n_bins=64, encode='ordinal',strategy='uniform'))
        ])
X[features] = pipe.fit_transform(X[features])
ctest_df[features] = pipe.transform(ctest_df[features])

In [None]:
# Preview the feature matrix after the pipeline output was written back into it.
X.head()

In [None]:
# Preview the test frame after the fitted pipeline was applied to it.
ctest_df.head()

In [None]:
# Preview the target again; the preprocessing cell only assigned into X and ctest_df.
y.head()

**Start XGBOOST**

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the XGBClassifier: xg_cl
xg_cl = xgb.XGBClassifier(objective='binary:logistic',
                          n_estimators=100,
                          eval_metric='logloss',
                          learning_rate=5e-3,
                          seed=42,
                          tree_method='gpu_hist',
                          gpu_id=0)

# Fit the classifier to the training split
xg_cl.fit(X_train, y_train)

# Hard 0/1 labels on the validation split, used only for the accuracy figure below
y_preds = xg_cl.predict(X_val)

# Test-set predictions for the submission: keep the positive-class probability
# (column 1 of predict_proba) instead of thresholded labels. Hard labels discard
# the ranking information a probability/ranking metric needs.
# NOTE(review): this assumes the competition is scored on ROC AUC over predicted
# probabilities; confirm on the competition page.
preds = xg_cl.predict_proba(ctest_df)[:, 1]

# Validation accuracy: fraction of held-out rows whose predicted label matches the target
accuracy = accuracy_score(y_val, y_preds)
print("accuracy: %f" % (accuracy))

In [None]:
# Visualizing
from xgboost import plot_importance

# Pack the whole feature matrix and its target into a DMatrix: claim_dmatrix
claim_dmatrix = xgb.DMatrix(X, label=y)

# Booster settings (logistic objective, maximum tree depth of 4): params
params = dict(objective="reg:logistic", max_depth=4)

# Run 10 boosting rounds with the native training API: xg_reg
xg_reg = xgb.train(params, claim_dmatrix, num_boost_round=10)

In [None]:
# Plot the feature importances (top 30 features of the xg_reg booster).
# The default figure size has to be set BEFORE the plot call: plot_importance
# creates its figure when it is called, so changing rcParams afterwards left this
# figure at whatever size was in effect earlier.
plt.rcParams['figure.figsize'] = [15, 15]
xgb.plot_importance(xg_reg, max_num_features=30)
plt.show()

In [None]:
#-------  Visualize Boosting Trees -----------
# Draw the tree at index 0 of the xg_reg booster.
xgb.plot_tree(xg_reg,num_trees=0)
# NOTE(review): this runs after plot_tree has already been called, so it presumably
# only changes the default size of figures created later, not the one shown here.
# Confirm the intended size before moving it above the plot call (150x150 inches
# is very large).
plt.rcParams['figure.figsize'] = [150, 150]
plt.show()

In [None]:
# Put the test predictions into the 'claim' column of the sample-submission frame
# and write it to disk without the index.
submission = submission.assign(claim=preds.tolist())
submission.to_csv('submission.csv', index=False)
# Read the written file back and display its last 40 rows as a quick check.
s = pd.read_csv('./submission.csv')
s.tail(40)