In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it,
# one path per line.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install shap

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
import shap

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Random seed shared by the stochastic steps of this notebook
# (it is passed as `random_state` to PCA, LightGBM and XGBoost below).
SEED = 42

In [None]:
# Load the competition train/test tables.
# (An earlier experiment dropped the 'cat10' column from both frames; the full
# column set is used here.)
train_csv_path='../input/tabular-playground-series-mar-2021/train.csv'
test_csv_path='../input/tabular-playground-series-mar-2021/test.csv'

train_data=pd.read_csv(train_csv_path)
test_data=pd.read_csv(test_csv_path)

# NOTE: this array follows the on-disk row order of train.csv. `train_data` is
# shuffled in a later cell, after which this array is no longer row-aligned
# with the frame.
train_target=train_data['target'].values

In [None]:
# Print the number of columns followed by the column index, first for the
# training frame and then for the test frame.
for frame in (train_data,test_data):
    print(len(frame.columns),frame.columns)

In [None]:
# Assign every training row to one of FOLD stratified validation folds.
FOLD=10

# Shuffle once with a fixed seed so the fold assignment is reproducible under
# "Restart & Run All".
train_data=train_data.sample(frac=1,random_state=SEED).reset_index(drop=True)
train_data['kfold']=-1

# Re-derive the label vector from the *shuffled* frame. The array captured at
# load time is in the original file order, so stratifying on it would pair each
# row position with the label of a different row.
train_target=train_data['target'].values

skf=StratifiedKFold(n_splits=FOLD)

# `split` yields positional indices; after reset_index(drop=True) they coincide
# with the index labels, so `.loc` addresses the intended rows.
for fold,(_,valid_idx) in enumerate(skf.split(train_data,train_target)):
    train_data.loc[valid_idx,'kfold']=fold

In [None]:
# Compare the class balance of the whole training set with the balance inside
# each validation fold (they should be nearly identical when the folds are
# stratified on the target).
print('Whole Data')
print(train_data['target'].value_counts())
print()
print(train_data['target'].value_counts(normalize=True),'\n')

for i in range(FOLD):
    # These are cross-validation folds, so label them as such (consistent with
    # the "Fold:" headers printed by the training loops).
    print(f'Fold:{i+1}')
    print(train_data.loc[train_data['kfold']==i,'target'].value_counts(normalize=True))

In [None]:
# Persist the shuffled frame together with its 'kfold' column so the same split
# can be reused elsewhere. The fold count in the file name is taken from FOLD
# so the name stays truthful if the number of folds is changed.
train_data.to_csv(f'train_data_stratified_{FOLD}folds.csv',index=False)

In [None]:
# Names of the model input columns: everything except the row id, the label and
# the 'kfold' bookkeeping column (which is not a predictor and is dropped by
# every training loop in this notebook).
features=train_data.drop(['id','kfold','target'],axis=1).columns
print(features)

In [None]:
# Split the columns by dtype: object-typed columns are treated as categorical,
# float-typed columns as numerical.
cat_features=train_data.select_dtypes(include='object').columns
num_features=train_data.select_dtypes(include='float').columns

for column_group in (cat_features,num_features):
    print(column_group)

In [None]:
# For each categorical column, print its name followed by the frequency of each
# distinct value in the training set.
for cat_col in cat_features:
    level_counts=train_data[cat_col].value_counts()
    print(cat_col)
    print(level_counts,'\n')

In [None]:
# Integer-encode every categorical column. The encoder is fitted on the union
# of train and test values so that both frames share one mapping and a value
# present in only one of them can still be transformed.
le=LabelEncoder()

for feature in cat_features:
    print(feature)
    all_levels=pd.concat([train_data[feature],test_data[feature]],axis=0,sort=False)
    le.fit(all_levels)
    for frame in (train_data,test_data):
        frame[feature]=le.transform(frame[feature])

In [None]:
# Display the dtype of every training column (run after the label-encoding
# cell, so this shows the dtypes the encoded categorical columns ended up with).
train_data.dtypes

In [None]:
# Project the numerical training columns onto 4 principal components and keep
# the result as a frame with columns PCA_0 .. PCA_3.
prefix='PCA_'

train_data_numerical=train_data[num_features].copy()
print(train_data_numerical.shape,end='\n')

pca=PCA(n_components=4,random_state=SEED)
pca_scores=pca.fit_transform(train_data_numerical)

# Name the component columns up front instead of renaming them afterwards.
train_data_numerical_pca=pd.DataFrame(
    pca_scores,
    columns=[str(prefix)+str(component) for component in range(pca_scores.shape[1])],
)

print('PCA:')
print(train_data_numerical_pca.shape)
print((train_data_numerical_pca.columns))
# Share of the total variance captured by each retained component.
print(pca.explained_variance_ratio_)

In [None]:
# Cross-validated LightGBM baseline trained on the categorical columns only.
# Collects the validation ROC AUC of each fold in `valid_auc_cat`.
valid_auc_cat=[]
print('Categorical Variables Only:')
for fold in range(FOLD):
    print(f'Fold:{fold+1}')

    # Rows of the current fold form the validation set; all others train.
    is_valid=train_data['kfold']==fold
    x_train=train_data.loc[~is_valid,cat_features]
    y_train=train_data.loc[~is_valid,'target']
    x_valid=train_data.loc[is_valid,cat_features]
    y_valid=train_data.loc[is_valid,'target']

    # Seeded for reproducibility, consistent with the later model cells.
    model_lgb=lgb.LGBMClassifier(random_state=SEED)
    model_lgb.fit(x_train,y_train,early_stopping_rounds=200,eval_set=[(x_valid,y_valid)],verbose=False)
    y_valid_pred=model_lgb.predict_proba(x_valid)[:,1]

    # Score once, then both report and store the value.
    fold_auc=roc_auc_score(y_valid,y_valid_pred)
    print(fold_auc)
    valid_auc_cat.append(fold_auc)

print(f"Mean ROC AUC:{np.mean(valid_auc_cat)}")

In [None]:
# Cross-validated LightGBM baseline trained on the numerical columns only.
# Collects the validation ROC AUC of each fold in `valid_auc_num`.
valid_auc_num=[]
print('Numerical Variables Only:')
for fold in range(FOLD):
    print(f'Fold:{fold+1}')

    # Rows of the current fold form the validation set; all others train.
    is_valid=train_data['kfold']==fold
    x_train=train_data.loc[~is_valid,num_features]
    y_train=train_data.loc[~is_valid,'target']
    x_valid=train_data.loc[is_valid,num_features]
    y_valid=train_data.loc[is_valid,'target']

    # Seeded for reproducibility, consistent with the later model cells.
    model_lgb=lgb.LGBMClassifier(random_state=SEED)
    model_lgb.fit(x_train,y_train,early_stopping_rounds=200,eval_set=[(x_valid,y_valid)],verbose=False)
    y_valid_pred=model_lgb.predict_proba(x_valid)[:,1]

    # Score once, then both report and store the value.
    fold_auc=roc_auc_score(y_valid,y_valid_pred)
    print(fold_auc)
    valid_auc_num.append(fold_auc)

print(f"Mean ROC AUC:{np.mean(valid_auc_num)}")

## **Categorical Variables are more important than Numerical Variables**

In [None]:
# Cross-validated LightGBM trained on the categorical columns plus the 4 PCA
# components of the numerical columns. Collects the validation ROC AUC of each
# fold in `valid_auc_cat_pca`.
valid_auc_cat_pca=[]

# The combined feature matrix does not depend on the fold, so build it once
# instead of concatenating twice inside every iteration. Both frames carry the
# same 0..n-1 index, so the column-wise concat lines rows up one-to-one.
cat_pca_features=pd.concat([train_data[cat_features],train_data_numerical_pca],axis=1)

print('Cat Variables +PCA 4 features:')
for fold in range(FOLD):
    print(f'Fold:{fold+1}')

    # Rows of the current fold form the validation set; all others train.
    is_valid=train_data['kfold']==fold
    x_train=cat_pca_features[~is_valid]
    y_train=train_data.loc[~is_valid,'target']
    x_valid=cat_pca_features[is_valid]
    y_valid=train_data.loc[is_valid,'target']

    model_lgb=lgb.LGBMClassifier(random_state=SEED,n_estimators=1000)
    model_lgb.fit(x_train,y_train,early_stopping_rounds=200,eval_set=[(x_valid,y_valid)],verbose=False)
    y_valid_pred=model_lgb.predict_proba(x_valid)[:,1]

    # Score once, then both report and store the value.
    fold_auc=roc_auc_score(y_valid,y_valid_pred)
    print("Original Score:",fold_auc)
    valid_auc_cat_pca.append(fold_auc)


print(f"Mean ROC AUC:{np.mean(valid_auc_cat_pca)}") 

In [None]:
# Cross-validated LightGBM with tuned hyper-parameters, trained on every
# categorical and numerical column. Collects the validation ROC AUC of each
# fold in `valid_auc`. After the loop `model_lgb` holds the model of the last
# fold.
valid_auc=[]

# Hyper-parameters are identical for every fold, so define them once.
# The very large n_estimators is only an upper bound; training relies on the
# early-stopping setting passed to fit() below to end earlier.
lgb_params=dict(
    random_state=SEED,
    cat_l2=25.999876242730252,
    cat_smooth=89.2699690675538,
    colsample_bytree=0.2557260109926193,
    learning_rate=0.003,
    max_bin=788,
    max_depth=81,
    metric="auc",
    min_child_samples=292,
    min_data_per_group=177,
    n_estimators=400000,
    n_jobs=-1,
    num_leaves=171,
    reg_alpha=0.7115353581785044,
    reg_lambda=5.658115293998945,
    subsample=0.9262904583735796,
    subsample_freq=1,
    verbose=-1,
)

print('Cat Variables +Numerical Variables:')
for fold in range(FOLD):
    print(f'Fold:{fold+1}')

    # Rows of the current fold form the validation set; all others train.
    is_valid=train_data['kfold']==fold
    x_train=train_data[~is_valid].drop(['id','kfold','target'],axis=1)
    y_train=train_data.loc[~is_valid,'target']
    x_valid=train_data[is_valid].drop(['id','kfold','target'],axis=1)
    y_valid=train_data.loc[is_valid,'target']

    model_lgb = lgb.LGBMClassifier(**lgb_params)

    model_lgb.fit(x_train,y_train,early_stopping_rounds=200,eval_set=[(x_valid,y_valid)],verbose=False)
    y_valid_pred=model_lgb.predict_proba(x_valid)[:,1]

    # Score once, then both report and store the value.
    # (The raw-score prediction over the full training split that used to
    # follow was never read and has been removed; it only cost time.)
    fold_auc=roc_auc_score(y_valid,y_valid_pred)
    print("Original Score:",fold_auc)
    valid_auc.append(fold_auc)

print(f"Mean Original ROC AUC:{np.mean(valid_auc)}") 

In [None]:
# Fit a small LightGBM model on fold 0 and compute SHAP values for the same
# rows, for use by the explanation plots below.
x_valid=train_data[train_data['kfold']==0].drop(['id','kfold','target'],axis=1)
y_valid=train_data[train_data['kfold']==0]['target']

# Use a dedicated name for this throwaway model. Reusing `model_lgb` here would
# replace the cross-validated model, and the submission cell at the end of the
# notebook predicts with `model_lgb`.
# NOTE(review): the model is fitted and early-stopped on the same rows, so the
# stopping criterion is measured on training data; acceptable for a quick
# explanation model, not for performance estimates.
model_lgb_shap=lgb.LGBMClassifier(random_state=SEED,n_estimators=1000)
model_lgb_shap.fit(x_valid,y_valid,early_stopping_rounds=200,eval_set=[(x_valid,y_valid)],verbose=False)

explainer=shap.Explainer(model_lgb_shap,x_valid)
shap_values_lgb=explainer(x_valid,check_additivity=False)

In [None]:
# SHAP summary plot over the fold-0 rows that the SHAP values were computed on.
shap.summary_plot(shap_values_lgb, x_valid)

In [None]:
# Waterfall plot for a single prediction: the first row of the explained set.
# max_display=31 raises the number of features shown individually.
shap.plots.waterfall(shap_values_lgb[0],max_display=31)

In [None]:
# NOTE(review): this repeats the summary plot drawn two cells earlier with the
# same arguments; confirm whether a different plot was intended here.
shap.summary_plot(shap_values_lgb, x_valid)

In [None]:
# Cross-validated XGBoost on every categorical and numerical column.
# `valid_auc` is reset here and then filled with one validation ROC AUC per
# fold; after the loop `model_xgb` holds the model of the last fold.
valid_auc=[]

for fold in range(FOLD):
    print(f'Fold:{fold+1}')

    # Rows of the current fold form the validation set; all others train.
    in_fold=train_data['kfold']==fold
    train_part=train_data[~in_fold]
    valid_part=train_data[in_fold]

    x_train=train_part.drop(['id','kfold','target'],axis=1)
    y_train=train_part['target']
    x_valid=valid_part.drop(['id','kfold','target'],axis=1)
    y_valid=valid_part['target']

    model_xgb=xgb.XGBClassifier(n_estimators=1000,n_jobs=-1,random_state=SEED)
    model_xgb.fit(
        x_train,
        y_train,
        eval_set=[(x_valid,y_valid)],
        eval_metric='auc',
        early_stopping_rounds=30,
        verbose=0,
    )

    # Probability of the positive class for every validation row.
    y_valid_pred=model_xgb.predict_proba(x_valid)[:,1]
    fold_auc=roc_auc_score(y_valid,y_valid_pred)
    print("Original Score:",fold_auc)
    valid_auc.append(fold_auc)

print(f"Mean ROC AUC :{np.mean(valid_auc)}") 


In [None]:
# Feature importance of `model_xgb` (the model left over from the last fold of
# the XGBoost loop), using importance_type='weight'.
xgb.plot_importance(model_xgb,importance_type='weight')

In [None]:
# Same last-fold model, this time using importance_type='cover'.
xgb.plot_importance(model_xgb,importance_type='cover')

In [None]:
# Same last-fold model, this time using importance_type='gain'.
xgb.plot_importance(model_xgb,importance_type='gain')

In [None]:
# Tabulate the last-fold XGBoost model's feature importances next to the
# feature names, ordered from least to most important.
model_input_columns=train_data.drop(['id','kfold','target'],axis=1).columns

# Sorting (value, name) tuples orders by importance first, then by name on ties.
importance_pairs=sorted(zip(model_xgb.feature_importances_,model_input_columns))

feature_imp_xgb=pd.DataFrame(importance_pairs, columns=['Value','Feature'])
feature_imp_xgb

In [None]:
# Predict the positive-class probability for every test row and write the
# submission file.
# NOTE(review): `model_lgb` is assigned by several earlier cells; the model used
# here is whichever of them was fitted last. Confirm it is the intended one.
test_features=test_data.drop(['id'],axis=1)
test_pred=pd.Series(
    model_lgb.predict_proba(test_features)[:,1],
    index=test_data['id'].values,
)

submission = pd.read_csv("../input/tabular-playground-series-mar-2021/sample_submission.csv", index_col='id')

# Assigning a Series aligns on the 'id' index, so each prediction lands on its
# own id even if the sample submission and test.csv list ids in different order
# (a raw array would be written positionally).
submission['target'] = test_pred
assert submission['target'].notna().all(), 'sample_submission ids do not all appear in test.csv'

submission.to_csv('model_lgb_high.csv')

In [None]:
# Render the submission frame (indexed by id, with the predicted 'target'
# column) as a final visual check.
display(submission)