# Imports

In [None]:
import numpy as np 
import pandas as pd 

from IPython.display import display
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
import lightgbm as lgbm

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.metrics import classification_report
import pickle


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
# Read the zipped training CSV, then show its shape and first rows.
data = pd.read_csv('../input/bnp-paribas-cardif-claims-management/train.csv.zip')
display(data.shape, data.head())

In [None]:
# Feature columns: everything after the first two columns of the frame.
# NOTE(review): presumably the first two columns are the row id and the
# target -- confirm against the CSV header.
N_LEADING_NON_FEATURE_COLS = 2
X_cols = data.columns[N_LEADING_NON_FEATURE_COLS:]

# Split dataset

In [None]:
# Step 1: hold out 20% of the rows as the test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    data[X_cols],
    data['target'],
    test_size=0.2,
    shuffle=True,
    random_state=8,
    stratify=data['target'],
)

# Step 2: carve 25% of the remaining rows out as the validation set
# (0.25 * 0.8 = 20% of the full data), again stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=8,
    stratify=y_train,
)

# Boolean row masks over `data`, so later cells can slice the single frame
# instead of carrying three separate copies around.
train_mask = data.index.isin(X_train.index)
test_mask = data.index.isin(X_test.index)
valid_mask = data.index.isin(X_val.index)

print("X_train shape: {}".format(data.loc[train_mask, X_cols].shape))
print("X_test shape: {}".format(data.loc[test_mask, X_cols].shape))
print("X_val shape: {}".format(data.loc[valid_mask, X_cols].shape))


In [None]:
# Names of the label column and of the two columns added after scoring.
COL_TARGET, COL_PRED, COL_PROB_1 = "target", "pred", "prob"

## Feature encoding

In [None]:
# Split the columns into numeric and categorical ones.
# select_dtypes is the public pandas API for this (the previous code used the
# private _get_numeric_data helper), and the list comprehension keeps the
# categorical columns in frame order. The earlier set difference produced an
# arbitrary order that could change between kernel restarts.
train_rows = data[train_mask]
cols = train_rows.columns
num_cols = train_rows.select_dtypes(include=['number', 'bool']).columns
cols_cat = [col for col in cols if col not in num_cols]

train_rows[cols_cat].head()



In [None]:
# Dtypes and non-null counts of the categorical columns over all rows.
data.loc[:, cols_cat].info()

In [None]:
# Label-encode every categorical column in place.
#
# Each encoder is fitted on the training rows only. Values that occur only
# outside the training rows are replaced by the extra category 'unknown', so
# transform() never meets an unseen label.
#
# Values are cast to str *before* the membership check. The encoder classes
# are strings, so comparing raw values used to send every missing value (NaN)
# to 'unknown' even though the encoder had learned a 'nan' class for it.
encoders = dict()
for cat in cols_cat:
    encoder = preprocessing.LabelEncoder()
    # fit() returns the encoder itself; there is nothing to assign back into
    # the frame (the old chained assignment wrote to a temporary copy).
    encoder.fit(data.loc[train_mask, cat].astype(str))

    # Set lookup instead of scanning the classes_ array once per row.
    known = set(encoder.classes_)
    values = data[cat].astype(str)
    values = values.where(values.isin(known), 'unknown')

    # Register the fallback category unless the training data already has it.
    if 'unknown' not in known:
        encoder.classes_ = np.append(encoder.classes_, 'unknown')

    data[cat] = encoder.transform(values)
    encoders[cat] = encoder

## Feature selection

In [None]:
# Recursive feature elimination with a default LightGBM classifier,
# fitted on the training rows only.
lgbm_rfe = lgbm.LGBMClassifier()
train_features = data.loc[train_mask, X_cols]
train_target = data.loc[train_mask, COL_TARGET].to_numpy().ravel()

rfe = RFE(lgbm_rfe).fit(train_features, train_target)
print(rfe.support_)
print(rfe.ranking_)

# Keep only the columns RFE selected (positions of the retained features).
f = rfe.get_support(indices=True)
X_cols = train_features.columns[f]

## Training

In [None]:
def gini(y_true, y_pred):
    """Return the Gini coefficient, computed as ``2 * ROC AUC - 1``.

    Both arguments are passed unchanged to ``roc_auc_score``.
    """
    auc = roc_auc_score(y_true, y_pred)
    return 2 * auc - 1

def get_stat(data, col_target, col_pred,):
    """Display a one-column table with the Gini of each data sample.

    Rows of ``data`` are selected with the notebook-level boolean masks
    ``train_mask``, ``test_mask`` and ``valid_mask``.

    Args:
        data: frame holding both the target and the score column.
        col_target: name of the column with the true labels.
        col_pred: name of the column with the scores passed to ``gini``.

    Returns:
        None. The table is rendered with ``display``.
    """
    sample_masks = (
        ('train', train_mask),
        ('test', test_mask),
        ('valid', valid_mask),
    )

    gini_values = [
        gini(data[mask][col_target], data[mask][col_pred])
        for _, mask in sample_masks
    ]
    sample_names = [name for name, _ in sample_masks]

    perf = pd.DataFrame({'sample': sample_names, 'gini': gini_values})
    perf = perf.set_index('sample')

    display(perf)

In [None]:
def brutal_cv_search(clf, params, X_train, y_train, X_valid, y_valid,
                     cv_iterations=None, silent=False):
    """Randomly search a parameter grid, scoring each candidate by Gini.

    Despite the name, no cross-validation is performed: every candidate is
    fitted once on ``(X_train, y_train)`` and scored on both the training
    data and the single hold-out set ``(X_valid, y_valid)``.

    Candidates are drawn *without replacement* from the full grid, so no
    combination is evaluated twice. The draw uses NumPy's global random
    state; call ``np.random.seed`` beforehand for a reproducible order.

    Args:
        clf: estimator exposing ``set_params``, ``fit`` and
            ``predict_proba``. It is modified in place and is left holding
            the parameters of the last evaluated candidate.
        params: mapping of parameter name to a sequence of candidate values.
        X_train, y_train: data the estimator is fitted on.
        X_valid, y_valid: hold-out data used for the validation score.
        cv_iterations: number of candidates to evaluate. ``None`` evaluates
            the whole grid. Values larger than the grid are capped at the
            grid size; values <= 0 evaluate nothing.
        silent: when true, suppress the per-iteration printout.

    Returns:
        A list with one dict per evaluated candidate, containing
        ``gini_train``, ``gini_valid`` and the candidate's parameter values.
    """
    from itertools import product

    # Full cartesian grid; the last parameter varies fastest.
    param_names = list(params)
    mesh = [dict(zip(param_names, combo)) for combo in product(*params.values())]
    params_combinations = len(mesh)

    if cv_iterations is None:
        n_iterations = params_combinations
    else:
        # Cap at the grid size: there are no more unique candidates than that.
        n_iterations = max(0, min(cv_iterations, params_combinations))

    # A shuffled prefix is a sample without replacement. The previous retry
    # loop never recorded the indices it had used, so duplicates slipped in.
    sampled_idxs = np.random.permutation(params_combinations)[:n_iterations]

    train_log = []
    for cv_iter, mesh_idx in enumerate(sampled_idxs, start=1):
        mesh_params = mesh[mesh_idx]
        if not silent:
            print(mesh_params)

        clf.set_params(**mesh_params)
        clf.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the score for the Gini.
        scores_train = clf.predict_proba(X_train)[:, 1]
        gini_train = gini(y_train, scores_train)

        scores_valid = clf.predict_proba(X_valid)[:, 1]
        gini_valid = gini(y_valid, scores_valid)

        scores = {
            'gini_train': gini_train,
            'gini_valid': gini_valid,
        }

        if not silent:
            print('iter:', cv_iter, end=' ')
            for score_name, score in scores.items():
                print(f"{score_name}: {score:.3f} ", end='')
            print()
        train_log.append({**scores, **mesh_params})
    return train_log

In [None]:
# Relative class frequencies of the target on the training rows.
counts = data.loc[train_mask, 'target'].value_counts(normalize=True)

# Class 1 is weighted by the class-0/class-1 frequency ratio, so both classes
# carry the same total weight during training.
class_weight = {0: 1, 1: float(counts[0] / counts[1])}
print(class_weight)

# subsample_freq=1 turns row subsampling on at every boosting iteration.
# LightGBM ignores `subsample` while subsample_freq is 0 (the default), which
# made the `subsample` values in the search grid have no effect.
model = lgbm.LGBMClassifier(
    class_weight=class_weight,
    subsample_freq=1,
    verbosity=-1,
    n_jobs=-1,
)

In [None]:
# Search grid: each keyword is a LightGBM parameter, each list its candidates.
params = dict(
    max_depth=[2, 4, 6],
    n_estimators=[50, 100],
    subsample=[0.25, 0.5, 0.75],
    min_gain_to_split=[60, 80, 100],
    reg_alpha=[0.5, 0.7, 1.0],
    learning_rate=[0.1, 0.2, 0.3],
    colsample_bytree=[0.2, 0.4, 0.6],
)

In [None]:
# Evaluate 20 randomly drawn grid points: fit on train, score on validation.
train_log = brutal_cv_search(
    model,
    params,
    data[train_mask][X_cols],
    data[train_mask][COL_TARGET],
    data[valid_mask][X_cols],
    data[valid_mask][COL_TARGET],
    cv_iterations=20,
)

In [None]:
# One row per evaluated candidate, best validation Gini first.
train_log_df = pd.DataFrame(train_log)
train_log_df = train_log_df.sort_values(by=['gini_valid'], ascending=False)
train_log_df

In [None]:
# Row of train_log_df to use; the frame is sorted so 0 is the best candidate.
idx = 0

# Apply *every* searched parameter of the chosen row. The earlier hand-written
# list left out `min_gain_to_split`, so the final model silently kept whatever
# value the last search iteration had set on `model`.
score_cols = {'gini_train', 'gini_valid'}
best_params = {}
for name in train_log_df.columns:
    if name in score_cols:
        continue
    value = train_log_df[name].iloc[idx]
    # Convert NumPy scalars to plain Python numbers before handing them over.
    best_params[name] = value.item() if hasattr(value, 'item') else value

model.set_params(**best_params)

model.fit(data.loc[train_mask, X_cols], data.loc[train_mask, 'target'])

In [None]:
# Score every row (train, validation and test alike) with the final model:
# the hard class label and the probability taken from column 1.
all_features = data[X_cols]
data[COL_PRED] = model.predict(all_features)
data[COL_PROB_1] = model.predict_proba(all_features)[:, 1]

In [None]:
# Preview the first ten rows, now including the prediction columns.
data.head(n=10)

## Evaluation

In [None]:
# Per-class precision / recall / F1 on the held-out test rows.
test_report = classification_report(
    data.loc[test_mask, COL_TARGET],
    data.loc[test_mask, COL_PRED],
)
print(test_report)

In [None]:
# Gini per sample, computed from the predicted probabilities.
get_stat(data, col_target=COL_TARGET, col_pred=COL_PROB_1)

In [None]:
# ROC curve of the final model on the held-out test rows.
y_test_true = data.loc[test_mask, COL_TARGET]
y_test_prob = data.loc[test_mask, COL_PROB_1]

roc_auc = roc_auc_score(y_test_true, y_test_prob)
fpr, tpr, thresholds = roc_curve(y_test_true, y_test_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='LGBM (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
plt.show()

## Feature importance

In [None]:
# Plot at most 30% of the selected features (count rounded down).
max_num_features = int(0.3 * len(X_cols))

fig, ax = plt.subplots(figsize=(8, 5))
lgbm.plot_importance(
    model,
    ax=ax,
    max_num_features=max_num_features,
)
plt.show()