In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import optuna
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the competition train and test tables from the relative ../input directory.
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')

In [None]:
train.head()

# EDA

In [None]:
train.isnull().sum()

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train['target'].value_counts()

In [None]:
train.describe()

In [None]:
# Feature-only copy of the training data, flagged with 1 so its rows can be
# told apart from test rows (flagged 0) once the two frames are stacked.
df_train_label = train.drop(columns='target').assign(**{'train-test': 1})
# NOTE: this writes the flag column onto `test` itself (in place).
test['train-test'] = 0

In [None]:
# Stack the train features on top of the test rows for joint EDA.
# Both frames come from read_csv with the default 0-based index, so without
# ignore_index=True every label below len(test) would appear twice in `df`
# and any label-based lookup would return two rows.
df = pd.concat([df_train_label, test], ignore_index=True)

In [None]:
# One box per float-typed column of the combined train+test frame.
numerical_col = [
    name for name, dtype in df.dtypes.items()
    if pd.api.types.is_float_dtype(dtype)
]

fig, ax = plt.subplots()
ax.boxplot(df[numerical_col])
ax.set_title('Numerical Boxplot', fontsize=24, fontweight='bold')
ax.set_xlabel('Features');

In [None]:
# Split the training column names into categorical ("cat*") and continuous
# ("cont*") feature lists by prefix.
train_col_names = train.columns
cat_feats = train_col_names[train_col_names.str.startswith("cat")].tolist()
num_feats = train_col_names[train_col_names.str.startswith("cont")].tolist()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# variance_inflation_factor regresses column i on the remaining columns of the
# matrix it is handed and does not add an intercept on its own. Without a
# constant column the auxiliary regressions use the uncentred R^2, which
# inflates the VIF of any feature whose mean is not zero. Prepend a column of
# ones so each VIF is computed against a model with an intercept.
num_matrix = train[num_feats].values  # built once instead of once per feature
design_matrix = np.column_stack([np.ones(len(num_matrix)), num_matrix])

vif = pd.DataFrame()
vif["variables"] = num_feats
# Column 0 is the intercept, so feature j sits at column j + 1.
vif["VIF"] = [
    variance_inflation_factor(design_matrix, j + 1)
    for j in range(len(num_feats))
]

In [None]:
# Order the table from highest to lowest VIF and display it with a colour gradient.
vif = vif.sort_values(by=["VIF"], ascending=False)
vif.style.background_gradient(cmap="magma")

In [None]:
# Labels as a plain array; run_rskf slices it with the positional fold indices.
target = train['target'].values

# Every column of `test` except the first one.
# NOTE(review): presumably the first column is `id` — confirm. `test` also
# carries the 'train-test' flag added earlier, so that name is included here.
# `columns` is rebound to the model feature list further down and is read as a
# global inside run_rskf.
columns = test.columns[1:]
columns

In [None]:
# Derive the raw feature names from `test` itself rather than from `columns`.
# This cell rebinds `columns` to the model feature list (which contains the
# '<name>_le' columns) at its end, so reading `columns` here would, on a
# re-run, treat 'cat0_le', ... as raw categoricals and encode them again.
raw_cols = [col for col in test.columns if not col.endswith('_le')]
cont_cols = [col for col in raw_cols if 'cont' in col]
cat_cols = [col for col in raw_cols if 'cat' in col]

def label_encode(train_df, test_df, column):
    """Integer-encode one column consistently across two frames.

    A single LabelEncoder is fitted on the distinct values found in either
    frame, and the encoded values are stored in a new ``<column>_le`` column
    on both ``train_df`` and ``test_df`` (both frames are modified in place).

    Returns:
        The name of the newly added column.
    """
    encoded_name = f"{column}_le"
    seen_values = [
        *train_df[column].unique().tolist(),
        *test_df[column].unique().tolist(),
    ]
    encoder = LabelEncoder()
    encoder.fit(seen_values)
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

# Encode every categorical column; label_encode adds a '<name>_le' column to
# both train and test and returns that new column's name.
le_cols = [label_encode(train, test, feature) for feature in cat_cols]

# Model inputs: the continuous columns followed by the encoded categoricals.
columns = cont_cols + le_cols

In [None]:
def run_rskf(train, target, clf, params, n_splits=5, n_repeats=2,
             random_state=34, early_stopping_rounds=500):
    """Train ``clf`` with repeated stratified K-fold CV and average test predictions.

    A fresh ``clf(**params)`` is fitted on every fold with the held-out part
    as the early-stopping evaluation set. Out-of-fold probabilities are stored
    per repeat, and the test-set probabilities of all fits are averaged.

    Reads the module-level names ``columns`` (feature list) and ``test``
    (frame to predict on).

    Args:
        train: DataFrame containing at least the ``columns`` features.
        target: array of labels, indexable by positional index arrays.
        clf: classifier class (not an instance) accepting ``**params``.
        params: keyword arguments for ``clf``.
        n_splits: folds per repeat.
        n_repeats: number of times the K-fold split is repeated.
        random_state: seed for the splitter.
        early_stopping_rounds: forwarded to ``model.fit``.

    Returns:
        ``(model, test_preds)``: the model fitted on the last fold and the
        mean positive-class probability for every row of ``test``.
    """
    # The fold bookkeeping below used to hard-code 2, 5, 10 and [4, 9]; it is
    # now derived from the splitter settings so the two cannot drift apart.
    total_fits = n_splits * n_repeats
    train_preds = np.zeros((train.shape[0], n_repeats))
    test_preds = 0
    test_features = test[columns]  # loop-invariant, so select it once
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                   random_state=random_state)
    for fold, (train_index, val_index) in enumerate(rskf.split(train, target)):
        print("-> Fold {}".format(fold + 1))
        # Folds are yielded repeat by repeat, so this is the repeat number.
        repeat = fold // n_splits

        x_train, x_valid = train.iloc[train_index][columns], train.iloc[val_index][columns]
        y_train, y_valid = target[train_index], target[val_index]

        model = clf(**params)
        # NOTE(review): recent XGBoost/LightGBM releases reportedly no longer
        # accept `verbose` / `early_stopping_rounds` in fit() — confirm against
        # the installed versions.
        model.fit(x_train, y_train,
                    eval_set=[(x_valid, y_valid)],
                    verbose=0,
                    early_stopping_rounds=early_stopping_rounds)

        train_oof_preds = model.predict_proba(x_valid)[:,1]
        train_preds[val_index, repeat] = train_oof_preds
        test_oof_preds = model.predict_proba(test_features)[:,1]
        test_preds += test_oof_preds / total_fits
        print("ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds)))
        # After the last fold of a repeat every row has exactly one
        # out-of-fold prediction in that repeat's column.
        if (fold + 1) % n_splits == 0:
            print("=> Overall ROC AUC Score = {}".format(roc_auc_score(target, train_preds[:, repeat])))
    return model, test_preds

# XGBoost

In [None]:
# Keyword arguments for the XGBoost classifier. run_rskf unpacks this dict
# into a fresh model for every fold (model = clf(**params)).
params_xgb = {'seed':2021,
            # Upper bound on boosting rounds; run_rskf fits with early stopping.
            'n_estimators':10000,
            'verbosity':1,
            'objective': 'binary:logistic',
            'eval_metric':"auc",
            # NOTE(review): GPU histogram training on device 0 — presumably
            # requires a GPU-enabled XGBoost build; confirm for the target runtime.
            'tree_method':"gpu_hist",
            'use_label_encoder':False,
            'gpu_id':0,
            'alpha':7.105038963844129,
            'colsample_bytree':0.25505629740052566,
            'gamma':0.4999381950212869,
            'reg_lambda':1.7256912198205319,
            'learning_rate':0.011823142071967673,
            'max_bin':338,
            'max_depth':8,
            'min_child_weight':2.286836198630466,
            'subsample':0.618417952155855}

# The class itself (not an instance) is passed on, so run_rskf can build a new
# model per fold.
clf_xgb = XGBClassifier

In [None]:
model_xgb, test_preds_xgb = run_rskf(train, target, clf_xgb , params_xgb)

In [None]:
# Submission table for XGBoost: test ids next to the fold-averaged probabilities.
xgb = test[['id']].assign(target=test_preds_xgb)
xgb.to_csv('XGB.csv', index=False)

# Parameter tuning for LightGBM via Optuna

def fun(trial,data=X,target=y):
    """Optuna objective: hold-out ROC AUC of a LightGBM classifier.

    Samples one hyper-parameter set from ``trial``, fits an LGBMClassifier on
    80% of ``data`` with early stopping on the remaining 20%, and returns the
    ROC AUC on that 20%.

    NOTE(review): the defaults bind the module-level names ``X`` and ``y`` at
    the moment this ``def`` is executed; neither is assigned in the visible
    notebook, so they must be defined before this cell is run.

    Args:
        trial: optuna trial used to sample the hyper-parameters.
        data: feature table.
        target: binary labels aligned with ``data``.

    Returns:
        ROC AUC on the held-out split (higher is better).
    """
    # Use the same stratified, seeded split for every trial. An unseeded split
    # scores each trial on different rows, so score differences would partly
    # reflect the split rather than the hyper-parameters.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, stratify=target, random_state=13)
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 1000,10000),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6]),
        # LightGBM requires 0 < subsample <= 1, so the range must not start at 0.
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        # Bagging is only performed when subsample_freq > 0; without it the
        # sampled `subsample` has no effect. Matches params_lgb.
        'subsample_freq': 1,
        # LightGBM requires learning_rate > 0.
        'learning_rate': trial.suggest_uniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        # LightGBM requires num_leaves > 1.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100),
        'cat_l2': trial.suggest_int('cat_l2',1,20),
        'device_type': 'gpu',
        'metric': 'auc',
        'random_state': 13,

    }
    model = LGBMClassifier(**param)

    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=200,verbose=False)

    preds = model.predict_proba(test_x)[:,1]

    auc = roc_auc_score(test_y, preds)

    return auc

# Search for the hyper-parameters that maximise the AUC returned by `fun`,
# running 50 trials, then report how many trials ran and the best parameter set.
# NOTE(review): no sampler or seed is passed to create_study, so the sequence
# of sampled trials is presumably not reproducible between runs — confirm.
study = optuna.create_study(direction='maximize')
study.optimize(fun, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

plot_optimization_history: shows the scores from all trials as well as the best score so far at each point.


optuna.visualization.plot_optimization_history(study)

Visualize parameter importances.


optuna.visualization.plot_param_importances(study)

best_params = study.best_params

In [None]:
# Keyword arguments for the LightGBM classifier. run_rskf unpacks this dict
# into a fresh model for every fold (model = clf(**params)).
params_lgb = {
            # NOTE(review): cat_smooth and min_data_per_group look like
            # categorical-feature settings; run_rskf feeds label-encoded
            # integer columns, so confirm these two have any effect.
            'cat_smooth':89.2699690675538,
            'colsample_bytree':0.2557260109926193,
            'learning_rate':0.00918685483594994,
            'max_bin':788,
            'max_depth':81,
            'metric':"auc",
            'min_child_samples':292,
            'min_data_per_group':177,
            # Upper bound on boosting rounds; run_rskf fits with early stopping.
            'n_estimators':16000,
            'n_jobs':-1,
            'num_leaves':171,
            'reg_alpha':0.7115353581785044,
            'reg_lambda':5.658115293998945,
            # Row subsampling ratio, paired with subsample_freq below.
            'subsample':0.9262904583735796,
            'subsample_freq':1,
            'verbose':-1
            }

# The class itself (not an instance) is passed on, so run_rskf can build a new
# model per fold.
clf_lgb = LGBMClassifier

In [None]:
model_lgb, test_preds_lgb = run_rskf(train, target, clf_lgb , params_lgb)

In [None]:
# Submission table for LightGBM: test ids next to the fold-averaged probabilities.
lgb = test[['id']].assign(target=test_preds_lgb)
lgb.to_csv('LGB.csv', index=False)

In [None]:
# Blend: unweighted mean of the XGBoost and LightGBM test probabilities.
blended_preds = (test_preds_xgb + test_preds_lgb) / 2
comb = test[['id']].assign(target=blended_preds)
comb.to_csv('comb.csv', index=False)