In [None]:
# Directory holding the competition CSV files (train, test and sample
# submission are read from it below), given relative to the notebook.
data_path = "../input/tabular-playground-series-oct-2021"

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Import de la donnée

In [None]:
# Load the three competition files from `data_path`.
train_full = pd.read_csv(data_path + '/train.csv')
test_full = pd.read_csv(data_path + '/test.csv')
sample_submission_full = pd.read_csv(data_path + '/sample_submission.csv')

# Exploration and model comparison run on a subsample of at most 10,000 rows.
# The seed is fixed so that "Restart & Run All" always draws the same rows
# (an unseeded sample makes every downstream number change between runs),
# and the size is capped so a smaller input file does not raise.
train = train_full.sample(n=min(10000, len(train_full)), random_state=2021)

# Traitement de la donnée


In [None]:
# Dimensions (rows, columns) of the full training frame.
train_full.shape

In [None]:
# Preview the first rows of the sampled training frame.
train.head()

In [None]:
# Total number of missing cells in the sample (per-column counts summed again).
train.isna().sum().sum()

### Identification des variables numériques / catégorielles


In [None]:
import matplotlib.pyplot as plt

# Number of distinct values of every column of the sample, label excluded.
unique_values = train.drop(columns=['target']).nunique()

# Histogram of those per-column cardinalities; the title is set on the axes
# returned by the plotting call.
ax = unique_values.hist(bins=100, figsize=(15, 7))
ax.set_title('Distribution du nombre de valeurs uniques par variable')

On souhaite isoler la barre basse de cette distribution pour voir s'il existe un seuil trivial pour différencier les variables numériques (continues) et catégorielles (discrètes). D'autre part, on observe un outlier qui correspond à l'id unique.

In [None]:
# Split the per-column cardinalities at the 1000-distinct-values mark.
is_low_cardinality = unique_values < 1000
low_cardinality = unique_values[is_low_cardinality]
high_cardinality = unique_values[~is_low_cardinality]

# Zoom on the low-cardinality group, then print the gap between both groups.
low_cardinality.hist(bins=10, figsize=(5, 5))
plt.title('Distribution du nombre de valeurs uniques par variable (moins de 1000 valeurs uniques)')
print(
    "Nombre de valeurs uniques maximal pour les séries contenant moins de 1000 valeurs uniques :",
    low_cardinality.max(),
)
print(
    "Nombre de valeurs uniques minimal pour les séries contenant plus de 1000 valeurs uniques :",
    high_cardinality.min(),
)

On observe donc une différence nette entre les variables numériques, présentant au moins 50632 valeurs uniques, et les variables catégorielles, qui possèdent au maximum 2 valeurs uniques.

In [None]:
# The row identifier is unique per row, so a pure cardinality threshold would
# classify it as a numerical feature; it would then be scaled and fed to every
# model. Drop it before splitting (errors='ignore' keeps this safe if the
# column is absent).
feature_cardinality = unique_values.drop(labels=['id'], errors='ignore')

# Columns with at most 2 distinct values are treated as categorical (binary),
# everything else as numerical.
cat_cols = feature_cardinality[feature_cardinality < 3].index
num_cols = feature_cardinality[feature_cardinality >= 3].index

print("Categorical columns : ", len(cat_cols))
print("Numerical columns : ", len(num_cols))

### Corrélations


#### Variables catégorielles


In [None]:
# One summary row per categorical column: cardinality, mean, min, max and
# correlation with the target, computed on the sample.
recap_cat = pd.DataFrame(
    [
        (
            name,
            train[name].nunique(),
            train[name].mean(),
            train[name].min(),
            train[name].max(),
            train[name].corr(train['target']),
        )
        for name in cat_cols
    ],
    columns=['Col', 'Num Unique', 'Mean', 'Min', 'Max', 'Corr'],
)

# Strongest correlations first, whatever their sign.
recap_cat = recap_cat.sort_values(by=['Corr'], key=abs, ascending=False)

recap_cat.head(5)

In [None]:
# Range check over the categorical columns: the largest of the per-column
# minima, then the smallest of the per-column maxima.
print(recap_cat['Min'].max())
print(recap_cat['Max'].min())

Les variables sont déjà normalisées entre 0 et 1.

In [None]:
# Histograms of four columns selected by 0-based position (3, 7, 10 and 23).
# NOTE(review): positional selection depends on the column order of the CSV;
# presumably position 23 is the f22 column discussed below — confirm, and
# prefer selecting by column name.
train.iloc[:, [3, 7, 10, 23]].hist()

On note la présence d'une variable catégorielle très anti-corrélée avec la target.

In [None]:
# Share (in %) of sampled rows where f22 equals 1 while the target equals 0.
# Explicit comparisons replace the bitwise NOT of an integer column (which only
# gave the intended mask because both columns hold 0/1), and rounding is done
# after scaling to percent so the result carries no floating-point noise.
f22_without_target = train['f22'].eq(1) & train['target'].eq(0)
round(100 * f22_without_target.mean(), 1)

#### Variables numériques

In [None]:
# One summary record per numerical column: cardinality, mean, min, max and
# correlation with the target, computed on the sample.
summary_columns = ['Col', 'Num Unique', 'Mean', 'Min', 'Max', 'Corr']
recap_num = pd.DataFrame(
    [
        {
            'Col': name,
            'Num Unique': train[name].nunique(),
            'Mean': train[name].mean(),
            'Min': train[name].min(),
            'Max': train[name].max(),
            'Corr': train[name].corr(train['target']),
        }
        for name in num_cols
    ],
    columns=summary_columns,
)

# Strongest correlations first, whatever their sign.
recap_num = recap_num.sort_values(by=['Corr'], key=abs, ascending=False)

recap_num.head(5)

In [None]:
from sklearn.preprocessing import RobustScaler

# Learn the scaling statistics on the sampled training rows only, then apply
# the same fitted transform to the sample and to the test set.
# Both frames are overwritten in place, so running this cell twice rescales
# already-scaled data.
scaler = RobustScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])
test_full[num_cols] = scaler.transform(test_full[num_cols])

# Modèle Standard


In [None]:
# `time` is imported here; the following timing cells call time.time()
# without importing the module themselves, so this cell must run first.
import time
# Sanity check of the timing pattern used below: record a start timestamp,
# sleep for one second, and print the elapsed wall-clock time.
time_ = time.time()
time.sleep(1)
print('Time to run this cell :', round(time.time()-time_, 2), 's')

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression evaluated with stratified cross-validation on
# the sampled training set. Produces `total_auc_lr` (mean validation AUC) and
# `t_lr` (wall-clock seconds), both reused by the model-comparison plot.
time_ = time.time()

kf = StratifiedKFold(n_splits=3, random_state=1998, shuffle=True)
cols = list(num_cols) + list(cat_cols)

fold_aucs_lr = []  # validation AUC of each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(train[cols], train['target']), start=1):
    X_tr, X_val = train[cols].iloc[train_idx], train[cols].iloc[val_idx]
    y_tr, y_val = train['target'].iloc[train_idx], train['target'].iloc[val_idx]

    # `lr` and `X_tr` stay bound to the last fold after the loop; the next
    # cell reads them.
    lr = LogisticRegression(n_jobs=-1, random_state=42, C=5, max_iter=2000)
    lr.fit(X_tr, y_tr)

    valid_pred_lo = lr.predict_proba(X_val)[:, 1]
    fold_auc = roc_auc_score(y_val, valid_pred_lo)
    fold_aucs_lr.append(fold_auc)
    print('Fold', fold, 'AUC :', fold_auc)

# Average over the folds actually run instead of a hard-coded 3, so changing
# n_splits cannot silently skew the reported score.
total_auc_lr = sum(fold_aucs_lr) / len(fold_aucs_lr)
print('Total AUC score :', total_auc_lr)
t_lr = round(time.time() - time_, 2)
print('Time to run this cell :', t_lr, 's')

In [None]:
# Second column of predict_proba on X_tr, using the `lr` estimator and the
# `X_tr` frame left bound by the last fold of the cross-validation loop above
# (i.e. predictions on data that estimator was fitted on).
lr.predict_proba(X_tr)[:,1]

# Méthodes ensemblistes


## LGBMClassifier

In [None]:
# LightGBM evaluated with stratified cross-validation on the sampled training
# set. Produces `preds_lgbm` (fold-averaged test probabilities),
# `total_auc_lgbm` (mean validation AUC), `t_lgbm` (wall-clock seconds) and
# leaves `model` bound to the last fold's estimator for the plotting cell.
time_ = time.time()

# The former 'eval_metric' entry was dropped from this dict: it is not a
# LightGBM constructor parameter (the metric aliases are metric / metrics /
# metric_types), so it was ignored; the metrics are passed to fit() below.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is non-zero, so as written no row subsampling is expected
# to happen — confirm whether that is intended.
params = {
    'objective': 'binary',
    'max_depth': 3,
    'num_leaves': 7,
    'n_estimators': 5000,
    'colsample_bytree': 0.3,
    'subsample': 0.5,
    'reg_alpha': 18,
    'reg_lambda': 17,
    'learning_rate': 0.095,
    'random_state': 2021,
}

# Metrics recorded on both eval sets (loop-invariant, so defined once).
eval_metric = ["auc", "logloss"]

cols = list(num_cols) + list(cat_cols)

preds_lgbm = np.zeros(test_full.shape[0])
kf = StratifiedKFold(n_splits=3, random_state=1998, shuffle=True)
auc = []  # validation AUC of each fold
for n, (train_idx, val_idx) in enumerate(kf.split(train[cols], train['target'])):
    X_tr, X_val = train[cols].iloc[train_idx], train[cols].iloc[val_idx]
    y_tr, y_val = train['target'].iloc[train_idx], train['target'].iloc[val_idx]

    model = LGBMClassifier(**params)
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword
    # arguments were removed in LightGBM 4.0 in favour of callbacks; this call
    # targets the 3.x API.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric=eval_metric,
        verbose=False,
    )
    # Each fold contributes an equal share of the final test prediction.
    preds_lgbm += model.predict_proba(test_full[cols])[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    gc.collect()
    print('Fold', n + 1, 'AUC :', auc[n])

# Average over the folds actually run rather than a hard-coded 3.
total_auc_lgbm = sum(auc) / len(auc)
print('Total AUC score :', total_auc_lgbm)

t_lgbm = round(time.time() - time_, 2)
print('Time to run this cell :', t_lgbm, 's')

In [None]:
# Display the fold-averaged LightGBM test probabilities accumulated above.
preds_lgbm

In [None]:
# Learning curves of the estimator currently bound to `model` (the last
# cross-validation fold when run right after the LightGBM cell).
results = model.evals_result_
epochs = len(results['valid_1']['auc'])
x_axis = range(0, epochs)
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: AUC on the training and validation eval sets.
ax[0].plot(x_axis, results['training']['auc'], label='Train')
ax[0].plot(x_axis, results['valid_1']['auc'], label='Test')
ax[0].legend()
ax[0].set_title('LGBM AUC-ROC')
ax[0].set_ylabel('AUC-ROC')
ax[0].set_xlabel('N estimators')

# Right panel: binary log-loss (the axis label previously said
# "Classification Error", which is not the plotted quantity).
ax[1].plot(x_axis, results['training']['binary_logloss'], label='Train')
ax[1].plot(x_axis, results['valid_1']['binary_logloss'], label='Test')
ax[1].legend()
ax[1].set_title('LGBM Binary Logloss')
ax[1].set_ylabel('Binary Logloss')
ax[1].set_xlabel('N estimators')

# The layout must be adjusted before the figure is rendered; calling it after
# show() has no effect on the displayed figure.
fig.tight_layout()
plt.show()

## XGBClassifier

In [None]:
# XGBoost evaluated with stratified cross-validation on the sampled training
# set. Produces `total_auc_xgb` (mean validation AUC), `t_xgb` (wall-clock
# seconds), `preds` (fold-averaged test probabilities) and leaves `model`
# bound to the last fold's estimator for the plotting cell.
time_ = time.time()

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'use_label_encoder': False,
    'n_jobs': -1,
    'n_estimators': 10000,
    'max_depth': 3,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'learning_rate': 0.01187,
    'random_state': 2021
}

# Metrics recorded on both eval sets. XGBoost early-stops on the LAST metric
# of this list (and the last eval set), so "auc" is placed last: the reported
# score is AUC, and the previous ["auc", "error"] order stopped on error.
eval_metric = ["error", "auc"]

cols = list(num_cols) + list(cat_cols)

preds = np.zeros(test_full.shape[0])
kf = StratifiedKFold(n_splits=3, random_state=1998, shuffle=True)
auc = []  # validation AUC of each fold
for n, (train_idx, val_idx) in enumerate(kf.split(train[cols], train['target'])):
    X_tr, X_val = train[cols].iloc[train_idx], train[cols].iloc[val_idx]
    y_tr, y_val = train['target'].iloc[train_idx], train['target'].iloc[val_idx]

    model = XGBClassifier(**xgb_params)

    # NOTE(review): passing `early_stopping_rounds` / `eval_metric` to fit()
    # is deprecated in recent XGBoost releases (they moved to the
    # constructor); this call targets the older API.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=200,
        eval_metric=eval_metric,
        verbose=False,
    )
    # Each fold contributes an equal share of the final test prediction.
    preds += model.predict_proba(test_full[cols])[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    gc.collect()
    print('Fold', n + 1, 'AUC :', auc[n])

# Average over the folds actually run rather than a hard-coded 3.
total_auc_xgb = sum(auc) / len(auc)
print('Total AUC score :', total_auc_xgb)

t_xgb = round(time.time() - time_, 2)
print('Time to run this cell :', t_xgb, 's')

In [None]:
# Learning curves of the estimator currently bound to `model` (the last
# cross-validation fold when run right after the XGBoost cell).
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: AUC on the first (training) and second (validation) eval sets.
ax[0].plot(x_axis, results['validation_0']['auc'], label='Train')
ax[0].plot(x_axis, results['validation_1']['auc'], label='Test')
ax[0].legend()
ax[0].set_title('XGBoost AUC-ROC')
ax[0].set_ylabel('AUC-ROC')
ax[0].set_xlabel('N estimators')

# Right panel: classification error on the same two eval sets.
ax[1].plot(x_axis, results['validation_0']['error'], label='Train')
ax[1].plot(x_axis, results['validation_1']['error'], label='Test')
ax[1].legend()
ax[1].set_title('XGBoost Classification Error')
ax[1].set_ylabel('Classification Error')
ax[1].set_xlabel('N estimators')

# The layout must be adjusted before the figure is rendered; calling it after
# show() has no effect on the displayed figure.
fig.tight_layout()
plt.show()

## CatBoostClassifier

In [None]:
# CatBoost evaluated with stratified cross-validation on the sampled training
# set. Produces `total_auc_cb` (mean validation AUC), `t_cb` (wall-clock
# seconds), `preds` (fold-averaged test probabilities) and leaves `model`
# bound to the last fold's estimator for the plotting cell.
time_ = time.time()

cb_params = {
    'loss_function' : 'CrossEntropy',
    'eval_metric' : 'AUC',
    'iterations' : 10000,
    'grow_policy' : 'SymmetricTree',
    'use_best_model' : True,
    'depth' : 5,
    'l2_leaf_reg' : 3.0,
    'random_strength' : 1.0,
    'learning_rate' : 0.1,
    'verbose' : 0,
    'random_state': 2021
}

cols = list(num_cols) + list(cat_cols)

preds = np.zeros(test_full.shape[0])
kf = StratifiedKFold(n_splits=3, random_state=1998, shuffle=True)
auc = []  # validation AUC of each fold
for n, (train_idx, val_idx) in enumerate(kf.split(train[cols], train['target'])):
    X_tr, X_val = train[cols].iloc[train_idx], train[cols].iloc[val_idx]
    y_tr, y_val = train['target'].iloc[train_idx], train['target'].iloc[val_idx]

    model = CatBoostClassifier(**cb_params)

    # The metric comes from cb_params['eval_metric']; the unused local
    # `eval_metric` list copied from the XGBoost cell was removed.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=200,
    )
    # Each fold contributes an equal share of the final test prediction.
    preds += model.predict_proba(test_full[cols])[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    gc.collect()
    print('Fold', n + 1, 'AUC :', auc[n])

# Average over the folds actually run rather than a hard-coded 3.
total_auc_cb = sum(auc) / len(auc)
print('Total AUC score :', total_auc_cb)

t_cb = round(time.time() - time_, 2)
print('Time to run this cell :', t_cb, 's')

In [None]:
# Learning curves of the estimator currently bound to `model` (the last
# cross-validation fold when run right after the CatBoost cell).
results = model.evals_result_
epochs = len(results['validation_0']['CrossEntropy'])
x_axis = range(0, epochs)
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: AUC on the first (training) and second (validation) eval sets.
ax[0].plot(x_axis, results['validation_0']['AUC'], label='Train')
ax[0].plot(x_axis, results['validation_1']['AUC'], label='Test')
ax[0].legend()
ax[0].set_title('CatBoost AUC-ROC')
ax[0].set_ylabel('AUC-ROC')
ax[0].set_xlabel('N estimators')

# Right panel: cross-entropy loss on the same two eval sets.
ax[1].plot(x_axis, results['validation_0']['CrossEntropy'], label='Train')
ax[1].plot(x_axis, results['validation_1']['CrossEntropy'], label='Test')
ax[1].legend()
ax[1].set_title('CatBoost CrossEntropy')
ax[1].set_ylabel('CrossEntropy Error')
ax[1].set_xlabel('N estimators')

# The layout must be adjusted before the figure is rendered; calling it after
# show() has no effect on the displayed figure.
fig.tight_layout()
plt.show()

In [None]:
# Scatter of mean cross-validation AUC against wall-clock training time, one
# point per model family, using the totals and timings computed above.
fig, ax = plt.subplots()
ax.set_title('AUC / Training Time Ratio')

models = ['Log Reg', 'LGBM', 'XGBoost', 'CatBoost']
times = [t_lr, t_lgbm, t_xgb, t_cb]
aucs = [total_auc_lr, total_auc_lgbm, total_auc_xgb, total_auc_cb]
color = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red']

# The loop variable is deliberately not called `model`: that name holds the
# last fitted estimator, and overwriting it with a label string would break
# any learning-curve cell re-run after this one.
for model_name, fit_seconds, mean_auc, point_color in zip(models, times, aucs, color):
    ax.scatter(fit_seconds, mean_auc, c=point_color, label=model_name,
               alpha=1, edgecolors='none')

ax.set_xlabel('Training Time (s)')
ax.set_ylabel('AUC')
ax.legend()
ax.grid(True)

plt.show()

# Modèle final

In [None]:
from sklearn.preprocessing import RobustScaler

# Reload the raw files: `test_full` was rescaled in place earlier with a
# scaler fitted on the 10,000-row sample, so the final model starts again
# from untouched data.
train_full = pd.read_csv(data_path + '/train.csv')
test_full = pd.read_csv(data_path + '/test.csv')
sample_submission_full = pd.read_csv(data_path + '/sample_submission.csv')

# Per-column cardinality on the full training set, label excluded.
unique_values = train_full.drop(['target'], axis=1).nunique()

# The row identifier is unique per row, so the cardinality threshold alone
# would classify it as a numerical feature and feed it to the model; drop it
# first (errors='ignore' keeps this safe if the column is absent).
feature_cardinality = unique_values.drop(labels=['id'], errors='ignore')

cat_cols = feature_cardinality[feature_cardinality < 3].index
num_cols = feature_cardinality[feature_cardinality >= 3].index

# Fit the scaler on the full training set and apply it to both frames.
scaler = RobustScaler()
train_full[num_cols] = scaler.fit_transform(train_full[num_cols])
test_full[num_cols] = scaler.transform(test_full[num_cols])

In [None]:
import time

# Final model: LightGBM with stratified cross-validation on the FULL training
# set. `preds_lgbm` accumulates the fold-averaged test probabilities that are
# written to the submission file.
time_ = time.time()

# The former 'eval_metric' entry was dropped from this dict: it is not a
# LightGBM constructor parameter (the metric aliases are metric / metrics /
# metric_types), so it was ignored; the metrics are passed to fit() below.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is non-zero, so as written no row subsampling is expected
# to happen — confirm whether that is intended.
params = {
    'objective': 'binary',
    'max_depth': 3,
    'num_leaves': 7,
    'n_estimators': 5000,
    'colsample_bytree': 0.3,
    'subsample': 0.5,
    'reg_alpha': 18,
    'reg_lambda': 17,
    'learning_rate': 0.095,
    'random_state': 2021,
}

# Metrics recorded on both eval sets (loop-invariant, so defined once).
eval_metric = ["auc", "logloss"]

cols = list(num_cols) + list(cat_cols)

preds_lgbm = np.zeros(test_full.shape[0])
kf = StratifiedKFold(n_splits=3, random_state=1998, shuffle=True)
auc = []  # validation AUC of each fold
for n, (train_idx, val_idx) in enumerate(kf.split(train_full[cols], train_full['target'])):
    X_tr, X_val = train_full[cols].iloc[train_idx], train_full[cols].iloc[val_idx]
    y_tr, y_val = train_full['target'].iloc[train_idx], train_full['target'].iloc[val_idx]

    model = LGBMClassifier(**params)
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword
    # arguments were removed in LightGBM 4.0 in favour of callbacks; this call
    # targets the 3.x API.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric=eval_metric,
        verbose=False,
    )
    # Each fold contributes an equal share of the final test prediction.
    preds_lgbm += model.predict_proba(test_full[cols])[:, 1] / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    gc.collect()
    print('Fold', n + 1, 'AUC :', auc[n])

# Average over the folds actually run rather than a hard-coded 3.
total_auc_lgbm = sum(auc) / len(auc)
print('Total AUC score :', total_auc_lgbm)

t_lgbm = round(time.time() - time_, 2)
print('Time to run this cell :', t_lgbm, 's')

In [None]:
# Build the submission from a copy of the sample-submission template so the
# loaded template frame is not modified through an alias, fill in the
# fold-averaged LightGBM probabilities, and write the file.
submit_df = sample_submission_full.copy()
submit_df['target'] = preds_lgbm.ravel()
submit_df.to_csv("submission.csv", index=False)
submit_df.head()