In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory
# and echo each one on its own line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition files in one pass: training data, test data and
# the sample submission, in that order.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

Let's have an overview of the data

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df_train.info()
print('*****')
df_test.info()

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Keep every training column whose name contains the letter 'f'.
features = list(filter(lambda name: 'f' in name, df_train.columns))

In [None]:
# Show the list of selected feature column names.
print(features)

### Observing the target distribution in the training data

In [None]:
# Bar chart of how many training rows carry each value of the 'claim' column.
sns.countplot(x = 'claim', data = df_train)

### Conclusion
The number of rows with a claim made and with no claim made is about the same, i.e. the target classes are balanced. This is one less thing to worry about :)

### Observing the distribution of data in the train and test set

In [None]:
import matplotlib.pyplot as plt

# Overlay the train and test histogram of every feature.
# Both sets share one set of bin edges and are drawn as densities, so the two
# shapes stay directly comparable even when the frames differ in row count.
for feature in features:
    # Missing values carry no position on the axis; drop them before binning.
    train_values = df_train[feature].dropna()
    test_values = df_test[feature].dropna()

    # Common edges computed over the pooled non-missing values of both sets.
    bin_edges = np.histogram_bin_edges(pd.concat([train_values, test_values]), bins=30)

    plt.hist(train_values, bins=bin_edges, density=True, alpha=0.5, label='Train set')
    plt.hist(test_values, bins=bin_edges, density=True, alpha=0.5, label='Test set')
    plt.title(feature + " Train/Test")
    plt.xlabel(feature)
    plt.ylabel('Density')

    plt.legend()
    plt.show()

### Conclusion

1. The train and test data distribution are similar. This is great!!!
2. The distributions of the independent features are skewed, which may affect the accuracy of our model. I have put forward some ideas for dealing with skewness in the cells below.

In [None]:
# Count, per row, how many of the original feature columns are missing.
# 'missing' itself is left out of the count so that re-running this cell (when
# it is already part of `features`) produces the same values.
base_features = [col for col in features if col != 'missing']
df_train['missing'] = df_train[base_features].isna().sum(axis=1)
df_test['missing'] = df_test[base_features].isna().sum(axis=1)

# Register the new column as a feature exactly once; a second append would put
# a duplicate column label into every later df[features] selection.
if 'missing' not in features:
    features.append('missing')

## Let us understand why missing is such an important feature

In [None]:
# Distinct per-row missing-value counts that occur in the train and test frames.
train_missing, test_missing = (
    frame['missing'].unique() for frame in (df_train, df_test)
)

In [None]:
# Work on copies so later preprocessing does not touch df_train / df_test:
# feature matrix and target from the training frame, features only from test.
X, y = df_train[features].copy(), df_train['claim'].copy()
x_test = df_test[features].copy()

In [None]:
# Sort both collections of distinct missing-value counts in place.
# NOTE(review): relies on .unique() having returned objects with an in-place
# .sort() — presumably NumPy arrays; confirm.
train_missing.sort()
test_missing.sort()

In [None]:
### Plot the missing value data for train set
# For every distinct missing-value count, tally the rows in total and split by
# claim value, then draw the two claim tallies as overlaid bars.
total = []
ones = []
zeros = []
for val in train_missing:
    # Select the claim column of this group once instead of re-filtering the
    # whole frame for each of the three tallies.
    claims = df_train.loc[df_train['missing'] == val, 'claim']
    total.append(claims.shape[0])
    ones.append((claims == 1).sum())
    zeros.append((claims == 0).sum())

plt.bar(train_missing, ones, alpha=0.5, label='Claim = 1')
plt.bar(train_missing, zeros, alpha=0.5, label='Claim = 0')
# The x-axis carries the number of missing values per row, so label it as such.
plt.title('Claim counts by number of missing values')
plt.xlabel('Number of missing values in the row')
plt.ylabel('Frequency')

plt.legend()
plt.show()

### Conclusion
From the bar plot it is visible that, for rows with at least one missing value, claim = 1 occurs more often than claim = 0. For rows with no missing values, claim = 0 is significantly more frequent than claim = 1.

In [None]:
# Rebuild the modelling frames as fresh copies of the source data.
X = df_train.loc[:, features].copy()
y = df_train.loc[:, 'claim'].copy()

x_test = df_test.loc[:, features].copy()

As it was visible in the plots of training and test data the distribution is skewed. There are two ways I am dealing with the skewness
1. Fill the missing values of columns with skew values greater than 1 with the median value and the rest with mean value
2. Replace the column with its log value or sqrt value whichever has less skew value. Fill the missing values with mean

### Uncomment one of the following two cells below to run one of the two methods

In [None]:
### Method 1
# Disabled alternative to Method 2: median-impute the columns whose absolute
# skew exceeds 1, separately for the train and the test feature frame.
# The commented code below has been corrected so it runs when uncommented:
# the skewed-column selection indexes the skew Series, and the test-side fill
# targets x_test (the frame used for prediction) rather than df_test.

#from scipy.stats import skew

#skew_feat = X.skew()
#skew_feat = list(skew_feat[abs(skew_feat.values)>1].index)

#for feat in skew_feat:
#    median = X[feat].median()
#    X[feat] = X[feat].fillna(median)

#skew_feat = x_test.skew()
#skew_feat = list(skew_feat[abs(skew_feat.values)>1].index)

#for feat in skew_feat:
#    median = x_test[feat].median()
#    x_test[feat] = x_test[feat].fillna(median)

In [None]:
### Method 2
# For every strongly skewed column (|skew| > 1) try a signed log and a signed
# square-root transform and keep whichever lowers the absolute skew the most.
# The choice is made on the training data and the very same transform is then
# applied to the test data, so both frames stay on the same scale.

from scipy.stats import skew


def _signed_log(values):
    """Return sign(v) * log(1 + |v|).

    log1p keeps zeros finite (they map to 0); a plain log(|v|) * sign(v) turns
    every zero into -inf * 0 = NaN. Missing values stay missing.
    """
    return np.sign(values) * np.log1p(np.abs(values))


def _signed_sqrt(values):
    """Return sign(v) * sqrt(|v|); missing values stay missing."""
    return np.sign(values) * np.sqrt(np.abs(values))


skew_feat = X.skew()
skew_feat = list(skew_feat[abs(skew_feat.values)>1].index)


for feat in skew_feat:
    skew_val = abs(X[feat].skew())

    col_log = _signed_log(X[feat])
    skew_log = abs(col_log.skew())

    col_sqrt = _signed_sqrt(X[feat])
    skew_sqrt = abs(col_sqrt.skew())

    # At most one transform is applied per column. The branches are exclusive:
    # a column that already received the log transform must not be overwritten
    # by (train) or stacked with (test) the square-root transform.
    if skew_log < skew_val and skew_log < skew_sqrt:
        X[feat] = col_log
        x_test[feat] = _signed_log(x_test[feat])
    elif skew_sqrt < skew_val:
        X[feat] = col_sqrt
        x_test[feat] = _signed_sqrt(x_test[feat])

In [None]:
# Total number of missing cells, then the container type, for train and test.
for frame in (X, x_test):
    print(frame.isna().sum().sum())

for frame in (X, x_test):
    print(type(frame))

In [None]:
### Filling the remaining NA values with mean
# Both frames are imputed with the column means of the TRAINING features, so
# test rows are filled with the same values used for the training rows (the
# scaler in this notebook is likewise fitted on the training frame only).
train_means = X[features].mean()
X[features] = X[features].fillna(train_means)
x_test[features] = x_test[features].fillna(train_means)

In [None]:
# Total number of missing cells left in the train and test feature frames.
print(X.isna().sum().sum(), x_test.isna().sum().sum(), sep='\n')

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only, then apply that one fitted
# transform to both the training and the test features.
scaler = RobustScaler().fit(X[features])
X[features] = scaler.transform(X[features])
x_test[features] = scaler.transform(x_test[features])

In [None]:
### Hyperparameters list
# One settings dict per booster; each is unpacked into the matching classifier
# constructor. All three select a GPU backend.

lgb_params = dict(
    metric='auc',
    objective='binary',
    device_type='gpu',
    n_estimators=10000,
    learning_rate=0.12230165751633416,
    num_leaves=1400,
    max_depth=8,
    min_child_samples=3100,
    reg_alpha=10,
    reg_lambda=65,
    min_split_gain=5.157818977461183,
    subsample=0.5,
    subsample_freq=1,
    colsample_bytree=0.2,
)

catb_params = dict(
    eval_metric='AUC',
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    task_type='GPU',
    devices='0',
    verbose=0,
)

xgb_params = dict(
    eval_metric='auc',
    objective='binary:logistic',
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    n_estimators=10000,
    learning_rate=0.01063045229441343,
    gamma=0.24652519525750877,
    max_depth=4,
    min_child_weight=366,
    subsample=0.6423040816299684,
    colsample_bytree=0.7751264493218339,
    colsample_bylevel=0.8675692743597421,
    # 'lambda' is a Python keyword, so it cannot be written as a keyword argument.
    **{'lambda': 0, 'alpha': 10},
)


In [None]:
### LGBM Model
# Stratified k-fold training: each fold's model fills its slice of the
# out-of-fold vector (used for the validation AUC) and adds its test
# probabilities to a running sum, which is averaged over the folds at the end.
# The model of the best-scoring fold is kept in `best_lgb_model`.

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

lgb_oof = np.zeros(X.shape[0])
lgb_pred = np.zeros(x_test.shape[0])
best_lgb_model = None
best_roc_score_lgb = 0

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The test matrix is the same for every fold, so it is selected once up front.
X_test = x_test[features]

for fold, (trn_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"===== fold {fold} =====")
    X_train = X[features].iloc[trn_idx]
    y_train = y.iloc[trn_idx]
    X_valid = X[features].iloc[val_idx]
    y_valid = y.iloc[val_idx]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=200,
        verbose=0,
    )

    # Last predict_proba column = probability of the highest class label.
    lgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    lgb_pred += model.predict_proba(X_test)[:, -1]

    auc = roc_auc_score(y_valid, lgb_oof[val_idx])

    if auc>best_roc_score_lgb:
        best_roc_score_lgb = auc
        best_lgb_model = model

    print(f"fold {fold} - lgb auc: {auc:.6f}\n")

print(f"oof lgb roc = {roc_auc_score(y, lgb_oof)}")
# Divide by the fold count configured on `cv` rather than a repeated literal,
# so the average stays correct if n_splits is changed.
lgb_pred = lgb_pred / cv.get_n_splits()

In [None]:
# Importance table of the best LGBM fold model, largest importance first,
# drawn as a horizontal bar chart.
feature_impt = pd.DataFrame({
    'features': best_lgb_model.feature_name_,
    'importance': best_lgb_model.feature_importances_,
}).sort_values(by=['importance'], ascending=False)

plt.figure(figsize=(20, 25))
sns.barplot(x=feature_impt['importance'], y=feature_impt['features'], data=feature_impt);

In [None]:
# Submission-shaped frame: test ids next to the averaged LGBM probabilities.
final_lgb = pd.DataFrame({'id': df_test['id'], 'claim': lgb_pred})

In [None]:
#final_lgb.to_csv('final_lgb', index=False)

In [None]:
 ### XGBoost Model
    
# Stratified k-fold training: each fold's model fills its slice of the
# out-of-fold vector (used for the validation AUC) and adds its test
# probabilities to a running sum, which is averaged over the folds at the end.
# The model of the best-scoring fold is kept in `best_xgb_model`.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import xgboost as xgb

xgb_oof = np.zeros(X.shape[0])
xgb_pred = np.zeros(x_test.shape[0])
best_xgb_model = None
best_roc_score_xgb = 0

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The test matrix is the same for every fold, so it is selected once up front.
X_test = x_test[features]

for fold, (trn_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"===== fold {fold} =====")
    X_train = X[features].iloc[trn_idx]
    y_train = y.iloc[trn_idx]
    X_valid = X[features].iloc[val_idx]
    y_valid = y.iloc[val_idx]

    model = xgb.XGBClassifier(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        early_stopping_rounds=200,
        verbose=0,
    )

    # Last predict_proba column = probability of the highest class label.
    xgb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    xgb_pred += model.predict_proba(X_test)[:, -1]

    auc = roc_auc_score(y_valid, xgb_oof[val_idx])

    if auc>best_roc_score_xgb:
        best_roc_score_xgb = auc
        best_xgb_model = model

    print(f"fold {fold} - xgb auc: {auc:.6f}\n")

print(f"oof xgb roc = {roc_auc_score(y, xgb_oof)}")
# Divide by the fold count configured on `cv` rather than a repeated literal,
# so the average stays correct if n_splits is changed.
xgb_pred = xgb_pred / cv.get_n_splits()

In [None]:
# Importance scores reported by get_fscore() on the best XGBoost fold model,
# sorted once (largest first) and drawn as a horizontal bar chart.
feature_impt = pd.DataFrame(
    list(best_xgb_model.get_booster().get_fscore().items()),
    columns=['feature', 'importance'],
).sort_values('importance', ascending=False)

plt.figure(figsize = (20,25))
sns.barplot(x=feature_impt['importance'],y=feature_impt['feature'],data=feature_impt);

### It is visible from the feature importance that missing is an important feature

In [None]:
# Submission-shaped frame: test ids next to the averaged XGBoost probabilities.
final_xgb = df_test[['id']].assign(claim=xgb_pred)

In [None]:
#final_xgb.to_csv('final_xgb', index=False)

In [None]:
### CATBoost model
# Stratified k-fold training: each fold's model fills its slice of the
# out-of-fold vector (used for the validation AUC) and adds its test
# probabilities to a running sum, which is averaged over the folds at the end.
# The model of the best-scoring fold is kept in `best_catb_model`.

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import catboost as catb

catb_oof = np.zeros(X.shape[0])
catb_pred = np.zeros(x_test.shape[0])
best_catb_model = None
best_roc_score_catb = 0

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The test matrix is the same for every fold, so it is selected once up front.
X_test = x_test[features]

for fold, (trn_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"===== fold {fold} =====")
    X_train = X[features].iloc[trn_idx]
    y_train = y.iloc[trn_idx]
    X_valid = X[features].iloc[val_idx]
    y_valid = y.iloc[val_idx]

    model = catb.CatBoostClassifier(**catb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=200,
        verbose=0,
    )

    # Last predict_proba column = probability of the highest class label.
    catb_oof[val_idx] = model.predict_proba(X_valid)[:, -1]
    catb_pred += model.predict_proba(X_test)[:, -1]

    auc = roc_auc_score(y_valid, catb_oof[val_idx])

    if auc>best_roc_score_catb:
        best_roc_score_catb = auc
        best_catb_model = model

    print(f"fold {fold} - catb auc: {auc:.6f}\n")

print(f"oof catb roc = {roc_auc_score(y, catb_oof)}")
# Divide by the fold count configured on `cv` rather than a repeated literal,
# so the average stays correct if n_splits is changed.
catb_pred = catb_pred / cv.get_n_splits()

In [None]:
# Replace the shared importance table with an empty frame; the CatBoost
# importance plot itself is left disabled below.
feature_impt=pd.DataFrame()
# NOTE(review): the disabled lines read `feature_name_`, the attribute used in
# the LightGBM cell; CatBoost models presumably expose `feature_names_`
# instead, and the barplot call passes no x= values — confirm both before
# re-enabling.
#feature_impt['features']=best_catb_model.feature_name_
#feature_impt['importance']=best_catb_model.feature_importances_

#feature_impt.sort_values(by=['importance'],inplace=True,ascending=False)
#plt.figure(figsize = (20,25))
#sns.barplot(y=feature_impt['features'],data=feature_impt);

In [None]:
# Submission-shaped frame: test ids next to the averaged CatBoost probabilities.
final_catb = pd.DataFrame({'id': df_test['id'], 'claim': catb_pred})

In [None]:
#final_catb.to_csv('final_catb', index=False)

Since XGBoost and LGBM models showed the best results I am trying to submit a prediction with a weighted average of 0.6 for LGBM and 0.4 for XGBoost

In [None]:
# Weighted blend of the two strongest models, using the weights stated in the
# note above this cell: 0.6 for LGBM and 0.4 for XGBoost (a plain mean would
# weight them 0.5 / 0.5 instead).
LGB_WEIGHT = 0.6
XGB_WEIGHT = 0.4
final_lgb_xgb = LGB_WEIGHT * lgb_pred + XGB_WEIGHT * xgb_pred

In [None]:
# Submission-shaped frame: test ids next to the blended LGBM/XGBoost predictions.
final_lgb_xgb_csv = df_test[['id']].assign(claim=final_lgb_xgb)

In [None]:
# Write the blended predictions without the index column. The file name now
# carries the .csv extension so the output is recognisable as a CSV file.
final_lgb_xgb_csv.to_csv('submission.csv', index=False)