In [None]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Let pandas print up to 500 rows before truncating displayed frames.
pd.set_option('display.max_rows', 500)
from sklearn.metrics import classification_report
import xgboost as xgb
import missingno as msno

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train, test = (
    pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv"),
    pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv"),
)

In [None]:
# (rows, columns) of the training frame and of the test frame
(train.shape, test.shape)

In [None]:
# Number of distinct ids per frame; compare with the row counts to spot duplicate ids.
(train["id"].nunique(), test["id"].nunique())

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of train
train.describe()

In [None]:
# Bar chart of the target class counts.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional
# parameter of countplot is `data`, so a positional Series is no longer
# read as the variable to count.
# The trailing `;` keeps the Axes repr out of the cell output.
sns.countplot(x=train["claim"]);

In [None]:
# Exact number of rows per target class
train["claim"].value_counts()

# Null Values Visualization

In [None]:
# Count nulls per column once and reuse the result (the frame was previously
# scanned twice). The slice skips the first column — presumably `id`; confirm
# against the CSV header.
null_counts = train.isnull().sum().iloc[1:]
features = null_counts.index
values = null_counts.values

In [None]:
# Horizontal bar chart of the per-column null counts held in `features` / `values`.
# The figure is tall so every column label gets its own row.
fig, ax = plt.subplots(figsize=(10, 30))
ax.barh(features, values, color="maroon")

# Strip the frame and the tick marks so only bars and labels remain.
for side in ("top", "bottom", "left", "right"):
    ax.spines[side].set_visible(False)
ax.xaxis.set_ticks_position("none")
ax.yaxis.set_ticks_position("none")

# Padding between the axes and their tick labels.
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# Faint gridlines. The on/off flag is passed positionally because its keyword
# was renamed from `b` to `visible` in Matplotlib 3.5 and `b` was later
# removed; the positional form works on both old and new releases.
ax.grid(True, color="grey", linestyle="-.", linewidth=0.5, alpha=0.2)

# Put the first column at the top of the chart.
ax.invert_yaxis()

# Write each bar's value just past its right-hand end, on the same Axes
# the bars were drawn on.
for bar in ax.patches:
    ax.text(
        bar.get_width() + 0.2,
        bar.get_y() + 0.5,
        str(round(bar.get_width(), 2)),
        fontsize=10,
        fontweight="bold",
        color="grey",
    )

ax.set_title("Featurewise Missing Values", loc="left")

# Watermark text placed in figure coordinates (bottom-right area).
fig.text(
    0.9,
    0.15,
    "Missing Values Distribution",
    fontsize=12,
    color="grey",
    ha="right",
    va="bottom",
    alpha=0.7,
)

plt.show()

All the features seem to have more than 15,000 missing values, which is around 1.6% of the whole training data.

# Null Value Treatment

### Impute missing values with the mean value

In [None]:
# Pull out the target, then remove the identifier and the target from the
# feature frame. `columns=` replaces the positional axis argument (`1`),
# which pandas 2.0 no longer accepts for DataFrame.drop.
y = train.claim
train = train.drop(columns=["id", "claim"])

# Keep the test ids for the submission file before dropping the column.
test_ids = test.id
test = test.drop(columns="id")

In [None]:
from sklearn.impute import SimpleImputer

# Remember the column labels: the imputer's output carries no column names.
train_columns = train.columns
test_columns = test.columns

# SimpleImputer() with default settings; the statistics are learned from the
# training frame and then reused unchanged on the test frame.
# NOTE(review): the imputer is fit on every training row before the hold-out
# split performed later in the notebook, so hold-out rows contribute to the
# fill values — confirm this is acceptable.
my_imputer = SimpleImputer()
train = pd.DataFrame(my_imputer.fit_transform(train), columns=train_columns)
test = pd.DataFrame(my_imputer.transform(test), columns=test_columns)

# Feature matrix used for modelling (independent copy of the imputed frame).
X = train.copy()

# Train Test Split

In [None]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# XGBOOST

In [None]:
# Build an XGBoost classifier with default hyper-parameters and fit it on the
# training split (fit returns the estimator itself).
xgb_cl = xgb.XGBClassifier().fit(X_train, y_train)

# Class predictions for the hold-out split
y_pred = xgb_cl.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the hold-out split
print(
    classification_report(
        y_test,
        y_pred,
        target_names=["0", "1"],
    )
)

In [None]:
# Predict on the competition test set and write the submission file.
# NOTE(review): `predict` yields class labels; if the competition is scored on
# a ranking metric, probabilities from `predict_proba` may be what is
# expected — confirm against the competition rules.
y_pred_test = xgb_cl.predict(test)

# Start from the sample submission so the file layout matches, then overwrite
# both columns with the real ids and predictions.
sub = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/sample_solution.csv"
).assign(id=test_ids, claim=y_pred_test)
sub.to_csv("submission_xgb.csv", index=False)

In [None]:
# Hold-out classification report, printed again after the submission is written
print(
    classification_report(
        y_test,
        y_pred,
        target_names=["0", "1"],
    )
)