# Imports

In [None]:
import numpy as np 
import pandas as pd 

from IPython.display import display
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.feature_selection import RFE
import lightgbm as lgbm

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.metrics import classification_report


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Read the zipped training CSV, then show its dimensions and its first rows.
train_csv_path = '../input/bnp-paribas-cardif-claims-management/train.csv.zip'
df = pd.read_csv(train_csv_path)
display(df.shape, df.head())

# Split dataset

In [None]:
# Feature columns: everything except the first two columns of the frame.
X_cols = df.columns[2:]

# Stratified 80/20 split on the 'target' column with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[X_cols],
    df['target'],
    test_size=0.2,
    random_state=0,
    stratify=df['target'],
)

In [None]:
# Print a per-column summary (dtype and non-null count) of the training features.
X_train.info()

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Count how many training rows fall into each target value (class balance check).
y_train.value_counts()

In [None]:
# Preview the first rows of the training features (same call as an earlier preview cell).
X_train.head()

# Feature encoding

In [None]:
# Split the feature columns into numeric and non-numeric (categorical) groups.
cols = X_train.columns

# Public API instead of the private DataFrame._get_numeric_data(); bool is
# listed explicitly so boolean columns keep being treated as numeric.
num_cols = X_train.select_dtypes(include=['number', 'bool']).columns

# Keep the original column order so the list is reproducible between runs
# (a set difference yields a hash-dependent order for string column names).
cols_cat = [col for col in cols if col not in num_cols]

In [None]:
# Preview the first rows of the columns listed in cols_cat (the non-numeric features).
X_train[cols_cat].head()

In [None]:
# Label-encode every categorical column.
#
# Each encoder is fitted on the training split only. Test values that were
# never seen during fitting are mapped to a dedicated 'unknown' class so that
# transform() does not raise on unseen labels.

# Work on explicit copies so the column assignments below do not write into
# slices of the original frame (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

encoders = dict()
for cat in cols_cat:
    encoder = preprocessing.LabelEncoder()
    X_train[cat] = encoder.fit_transform(X_train[cat].astype(str))

    # Cast the test column to str BEFORE checking membership, exactly as was
    # done for the training column. Otherwise a missing value (float NaN) is
    # not recognised as the fitted class 'nan' and is sent to 'unknown',
    # giving train and test different codes for the same raw value.
    known_classes = set(encoder.classes_)
    test_values = X_test[cat].astype(str)
    test_values = test_values.where(test_values.isin(known_classes), 'unknown')

    # Register the fallback label once; skip if the data already contains it
    # so classes_ never holds a duplicate entry.
    if 'unknown' not in known_classes:
        encoder.classes_ = np.append(encoder.classes_, 'unknown')

    X_test[cat] = encoder.transform(test_values)
    encoders[cat] = encoder

In [None]:
# Preview the cols_cat columns again; after the encoding cell they should hold integer codes.
X_train[cols_cat].head()

# Feature selection

In [None]:
# Recursive feature elimination with a default LightGBM classifier as the
# estimator and RFE's default settings.
lgbm_rfe = lgbm.LGBMClassifier()
rfe = RFE(lgbm_rfe).fit(X_train, y_train.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

# Integer positions of the features RFE decided to keep.
f = rfe.get_support(indices=True)

# Restrict both splits to the selected columns (same positions in each frame).
X_train = X_train.iloc[:, f]
X_test = X_test.iloc[:, f]

# Training

In [None]:
# Final classifier: quiet logging (verbosity=-1), n_jobs=-1, and
# is_unbalance=True to account for the skewed target distribution.
lgbm_params = dict(verbosity=-1, n_jobs=-1, is_unbalance=True)
model = lgbm.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

# Prediction and evaluation

In [None]:
# Predict hard class labels for the hold-out split and print the
# classification report comparing them with the true labels.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# ROC curve and AUC for the hold-out split.
#
# Both the curve and the AUC must be computed from continuous scores. Feeding
# hard 0/1 predictions to roc_auc_score collapses the curve to a single
# operating point and reports an area that does not match the plotted curve.
y_score = model.predict_proba(X_test)[:, 1]  # probability of the positive class

roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='LGBM (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

## Gini

In [None]:
def gini(y_true, y_pred):
    """Return the Gini coefficient, defined here as ``2 * ROC AUC - 1``.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_pred : array-like
        Scores for the positive class. Pass continuous scores (for example
        predicted probabilities); hard class labels reduce the ROC curve to
        a single point and distort the result.

    Returns
    -------
    float
        ``2 * roc_auc_score(y_true, y_pred) - 1``.
    """
    return 2 * roc_auc_score(y_true, y_pred) - 1


# Score both splits with positive-class probabilities. Computing the test
# scores here also removes the dependency on a variable from an earlier cell.
train_scores = model.predict_proba(X_train)[:, 1]
test_scores = model.predict_proba(X_test)[:, 1]

display(gini(y_train, train_scores)) # for train data
display(gini(y_test, test_scores)) # for test data

## Feature importance

In [None]:
# Number of features to show: 10% of the length of X_cols, truncated to an int.
max_num_features = int(0.1 * len(X_cols))

# Draw LightGBM's importance chart on an explicitly sized axes.
fig, ax = plt.subplots(figsize=(8, 10))
lgbm.plot_importance(booster=model, ax=ax, max_num_features=max_num_features)
plt.show()