# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from missingno import missingno
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,FunctionTransformer,MinMaxScaler
from sklearn.metrics import roc_auc_score


# Loading Data

In [None]:
# Read the competition train and test tables, using the `id` column as the row index.
df = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/train.csv',
    index_col='id',
)
df_test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col='id',
)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Summary statistics for the training frame's columns.
df.describe()

In [None]:
# Missing-value report for the training frame: one row per column, with the
# absolute count of NaNs and the same count as a percentage of all rows.
df_miss = df.isna().sum().to_frame(name='# Missing Values')
df_miss['% Missing Values'] = df_miss['# Missing Values'] * 100 / len(df)
df_miss

In [None]:
# Missing-value report for the test frame: one row per column, with the
# absolute count of NaNs and the same count as a percentage of all rows.
missing_counts_test = df_test.isna().sum()
df_miss_test = pd.DataFrame({'# Missing Values': missing_counts_test})
df_miss_test['% Missing Values'] = (
    100 * df_miss_test['# Missing Values'] / len(df_test)
)
df_miss_test

There seems to be a nearly equal number of missing values across all columns.

In [None]:
# Bar chart of how many training rows fall into each `claim` class.
sns.set_style('darkgrid')
# Name the column via `x=` / `data=` instead of passing the Series
# positionally: newer seaborn releases treat the first positional argument
# as `data`, not `x`, so the positional form is not portable across versions.
sns.countplot(x='claim', data=df, palette='viridis');

The distribution of the target variable is balanced.

In [None]:
# Visualise where values are missing across the rows and columns of the
# training frame, then render the figure.
missingno.matrix(df)
plt.show()

As we can see from this plot, the missing values are spread across different rows,
and it seems that if we removed every row containing a missing value we would be left with fewer than half of the data points.

In [None]:
# Grid of KDE plots, one panel per feature, for the first 60 columns of the
# training frame, with the density split (stacked) by the `claim` target.
features_to_plot = df.columns[0:60]
ncol = 6
nrow = int(np.ceil(len(features_to_plot) / ncol))

# Create the grid once and draw into its axes directly, rather than
# re-selecting panels through the pyplot state machine with plt.subplot().
fig, axes = plt.subplots(nrow, ncol, figsize=(24, 30))
fig.subplots_adjust(top=0.95)
axes = axes.ravel()

for ax, feature in zip(axes, features_to_plot):
    # `x=` is passed by keyword (a positional Series is interpreted as
    # wide-form `data` by newer seaborn, which cannot be combined with `hue`),
    # and `fill=True` replaces the deprecated `shade=True`.
    sns.kdeplot(
        x=df[feature],
        hue=df['claim'],
        fill=True,
        palette='viridis',
        alpha=0.5,
        multiple="stack",
        ax=ax,
    )
    ax.set_xlabel(feature, fontsize=9)

# Hide any panels left over when the feature count is not a multiple of ncol.
for unused_ax in axes[len(features_to_plot):]:
    unused_ax.set_visible(False)

fig.suptitle('DistPlot: train features vs w.r.t claim', fontsize=20)
plt.show()

This code is from the notebook by des:
https://www.kaggle.com/desalegngeb/sept-2021-tps-eda-model<br><br>
Check out his visualizations; he has done pretty neat work!<br>

# Create Folds

**I will create 10 stratified folds. The dataset will be public, so if you want to create your own folds, use this code.**
<br><br>
Link to dataset  - https://www.kaggle.com/prikshitsingla/tabular-sept-stratified-folds-10

In [None]:
# skf = StratifiedKFold(n_splits = 10 , shuffle = True,random_state = 1)
# df['kfold'] = -1

In [None]:
# NOTE(review): if the cell above (`df['kfold'] = -1`) has run, `kfold` is the
# LAST column, so `df.iloc[:,-1]` is the constant kfold placeholder rather than
# `claim`, and `df.iloc[:,:-1]` still contains `claim`. The split would then
# not be stratified on the target -- confirm, and pass df['claim'] as `y`.
# NOTE(review): `skf.split` yields positional indices while `.loc` is
# label-based; they only coincide if the `id` index is 0..n-1 -- verify.
# for folds,(train_indices,valid_indices) in enumerate(skf.split(df.iloc[:,:-1],df.iloc[:,-1])):
#     df.loc[valid_indices,'kfold'] = folds
# df.head()

In [None]:

# df.to_csv('Tabular Sept Stratified Folds-10.csv')

# Modelling 

In [None]:
# Reload the inputs for modelling: the training table that already carries a
# `kfold` column, the raw test table, and the sample submission file.
df = pd.read_csv(
    '../input/tabular-sept-stratified-folds-10/Tabular Sept Stratified Folds-10.csv',
    index_col='id',
)
df_test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    index_col='id',
)
sub = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/sample_solution.csv'
)

In [None]:
# Preprocessing objects used by the following cells:
#   imputer     - replaces NaNs with the column mean
#   transformer - applies np.log1p element-wise (input is validated)
#   sc          - standard scaler
#   mx          - min-max scaler (not used in the cells visible here)
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
transformer = FunctionTransformer(func=np.log1p, validate=True)
sc = StandardScaler()
mx = MinMaxScaler()

In [None]:

# Mean-impute missing feature values. The imputer learns the column means on
# the training features and reuses them for the test set.
#
# Features are selected by NAME (every column except the target and the fold
# id) rather than by position, so the result does not depend on `claim` and
# `kfold` being the last two columns, and the test columns are guaranteed to
# be presented in the same order the imputer was fitted on.
feature_cols = [col for col in df.columns if col not in ('claim', 'kfold')]
df.loc[:, feature_cols] = imputer.fit_transform(df.loc[:, feature_cols])
df_test.loc[:, feature_cols] = imputer.transform(df_test.loc[:, feature_cols])

In [None]:
# Columns whose mean exceeds 1000; these are log-transformed later instead of
# being standardised.
column_means = df.mean()
cols_log = column_means[column_means > 1000].index.tolist()

In [None]:
# Columns to standardise: everything that is neither log-transformed nor one
# of the bookkeeping columns (`kfold`, `claim`). Original column order is kept.
excluded_cols = set(cols_log) | {'kfold', 'claim'}
sc_cols = [col for col in df.columns if col not in excluded_cols]


In [None]:
# Standardise the `sc_cols` columns: the scaler is fitted on the training
# frame and the same fitted scaler is then applied to the test frame.
train_scaled = sc.fit_transform(df.loc[:, sc_cols])
test_scaled = sc.transform(df_test.loc[:, sc_cols])

df.loc[:, sc_cols] = train_scaled
df_test.loc[:, sc_cols] = test_scaled


In [None]:
# Apply log1p to the large-magnitude columns (`cols_log`) of both frames.
#
# The training frame goes through fit_transform (as with the scaler above in
# this notebook) so the transformer is fitted before it is used on the test
# frame, following the usual estimator contract; the transformed values are
# the same as calling transform directly.
# NOTE(review): np.log1p gives -inf at -1 and NaN below -1 -- confirm these
# columns never take values <= -1.
df.loc[:, cols_log] = transformer.fit_transform(df.loc[:, cols_log])
df_test.loc[:, cols_log] = transformer.transform(df_test.loc[:, cols_log])


In [None]:
# Use the test frame's columns as the model's input columns.
useful_cols = df_test.columns

In [None]:
# Cross-validated XGBoost training over the precomputed `kfold` column.
# For each of the 10 folds: train on the other folds, score AUC on the
# held-out fold, and predict the test set. Collected results:
#   final_test_preds  - list holding one array of test probabilities per fold
#   final_valid_preds - dict mapping training-row id -> out-of-fold probability
#   scores            - list of per-fold validation AUCs
final_test_preds = []
final_valid_preds = {}
scores = []

# The test matrix is the same for every fold, so build it once, with the
# columns in the same order used for training.
x_test = df_test[useful_cols]

for fold in range(10):
    is_valid = df.kfold == fold
    x_train = df[~is_valid].reset_index(drop=True)
    x_valid = df[is_valid]

    # Capture the original row ids BEFORE resetting the index. Reading them
    # after reset_index(drop=True) would yield 0..len-1 in every fold, so each
    # fold would overwrite the previous fold's entries in final_valid_preds.
    valid_ids = x_valid.index.values.tolist()
    x_valid = x_valid.reset_index(drop=True)

    y_train = x_train.claim
    y_valid = x_valid.claim

    x_train = x_train[useful_cols]
    x_valid = x_valid[useful_cols]

    model = XGBClassifier(random_state=fold, tree_method='gpu_hist')

    # NOTE(review): no number of boosting rounds is set here, so whether the
    # 300-round early-stopping patience can ever trigger depends on the
    # library default -- confirm. Also confirm that the installed xgboost
    # version still accepts early_stopping_rounds / eval_metric in fit().
    model.fit(x_train, y_train, early_stopping_rounds=300, eval_metric='auc', eval_set=[(x_valid, y_valid)], verbose=1000)

    preds_valid = model.predict_proba(x_valid)[:, 1]
    test_preds = model.predict_proba(x_test)[:, 1]
    final_test_preds.append(test_preds)
    final_valid_preds.update(dict(zip(valid_ids, preds_valid)))

    auc = roc_auc_score(y_valid, preds_valid)
    scores.append(auc)

    print('***********************************************************')
    print(fold,auc)
    print('***********************************************************')

print("Mean of scores is {} and std is {}".format(np.mean(scores) , np.std(scores)))

# Submission

In [None]:
# Average the per-fold test probabilities into the submission's `claim` column.
# Bracket assignment always writes a DataFrame column; attribute assignment
# (`sub.claim = ...`) would only set a Python attribute if the column were
# missing, and the accompanying pandas warning is suppressed in this notebook.
# NOTE(review): assumes `sub` rows are in the same order as `df_test` -- confirm.
sub['claim'] = np.mean(np.column_stack(final_test_preds), axis=1)

In [None]:
# Write the submission file; the DataFrame index is not written out.
sub.to_csv('submission.csv',index=False)

<b><center>This is my first ever Public Notebook!<br>
If you Like it please Upvote!  <br>
    Please Share your thoughts in comments!
    
    
