In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV


# Read the scoring dataset from the Excel workbook, then preview its first rows.
df = pd.read_excel(io="Scoring_9.xlsx")
df.head(n=5)

In [None]:
# Data preprocessing
# Columns whose missing values are filled with the column median.
median_columns = [
    'gender', 'dependants',
    'maritalstatus',
    'incomes', 'work_experience',
    'incomes_bdki_int',
    'incomes_bdki_int_indxd',
]
# Columns whose missing values are filled with the column mean.
mean_columns = [
    'sum_outstandingamount',
    'max_dpd_before_startdate',
]

# Keep only the calendar year of the start date, as a numeric feature.
# NOTE(review): this overwrites the column in place, so re-running the cell on
# already-converted years would likely re-parse them as timestamps; restart
# the kernel before re-running.
df['startdate'] = pd.to_numeric(pd.DatetimeIndex(df['startdate']).year)

# Replace the -9999.0 placeholder with 0 in this column only.
# A bare boolean row mask (df[mask] = 0) would overwrite every column of the
# matching rows, including the GB_flag target, so the column is named explicitly.
df.loc[df['max_dpd_before_startdate'] == -9999.0, 'max_dpd_before_startdate'] = 0

# errors='ignore' makes the drop a no-op when the column is absent.
df = df.drop(['incomes_bdki'], axis=1, errors='ignore')

for col in median_columns:
    df[col] = df[col].fillna(df[col].median())

# The mean of max_dpd_before_startdate is taken after the placeholder was
# replaced above, so -9999.0 values do not skew it.
for col in mean_columns:
    df[col] = df[col].fillna(df[col].mean())

# Only the last expression of a cell is displayed automatically, so the
# remaining-NaN counts are printed explicitly.
print(df.isna().sum())
df.head()

In [None]:
# Target vector: the GB_flag column.
y = df['GB_flag']
# Feature matrix: every column except GB_flag.
X = df.drop('GB_flag', axis=1)

In [None]:
from sklearn.metrics import make_scorer, roc_auc_score
def gini_normalized(y_actual, y_pred):
    """Return the Gini of ``y_pred`` divided by the Gini of ``y_actual`` against itself.

    Each Gini value is computed as ``2 * roc_auc_score(...) - 1``.

    Parameters:
        y_actual: true labels, passed to ``roc_auc_score`` as the ground truth.
        y_pred: predicted scores to evaluate against ``y_actual``.

    Returns:
        The ratio of the prediction Gini to the self-scored Gini.
    """
    prediction_gini = 2 * roc_auc_score(y_actual, y_pred) - 1
    # Denominator: the labels scored against themselves.
    reference_gini = 2 * roc_auc_score(y_actual, y_actual) - 1
    return prediction_gini / reference_gini

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RepeatedKFold

# Elastic-net regressor; its continuous predictions are ranked by the Gini scorer.
model = ElasticNet(alpha=1.0, l1_ratio=0.5)

# 10-fold cross-validation repeated 3 times, seeded so the splits are reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# One normalized Gini value per (repeat, fold) split.
gini = cross_val_score(model, X, y, scoring=make_scorer(gini_normalized), cv=cv, n_jobs=-1)

# Average the signed per-fold scores. A negative Gini means the ranking is
# worse than random, so the sign is kept rather than removed with an absolute
# value, which would overstate the mean.
print('Gini score: %.3f ' % (np.mean(gini)))