In [1]:
import pandas as pd
import sklearn
import numpy as np
from scipy import optimize
from scipy.optimize import brute
from scipy.optimize import minimize
from scipy.optimize import minimize_scalar
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn2pmml.pipeline import PMMLPipeline
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/application_record.csv')
df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [3]:
df.describe()

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS
count,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0,438557.0
mean,6022176.0,0.42739,187524.3,-15997.904649,60563.675328,1.0,0.206133,0.287771,0.108207,2.194465
std,571637.0,0.724882,110086.9,4185.030007,138767.799647,0.0,0.404527,0.452724,0.310642,0.897207
min,5008804.0,0.0,26100.0,-25201.0,-17531.0,1.0,0.0,0.0,0.0,1.0
25%,5609375.0,0.0,121500.0,-19483.0,-3103.0,1.0,0.0,0.0,0.0,2.0
50%,6047745.0,0.0,160780.5,-15630.0,-1467.0,1.0,0.0,0.0,0.0,2.0
75%,6456971.0,1.0,225000.0,-12514.0,-371.0,1.0,0.0,1.0,0.0,3.0
max,7999952.0,19.0,6750000.0,-7489.0,365243.0,1.0,1.0,1.0,1.0,20.0


In [4]:
df['AGE'] = -df['DAYS_BIRTH']/365.0
df['DAYS_EMPLOYED'] = -df['DAYS_EMPLOYED']

In [5]:
# drop bad columns
df = df.dropna()

In [6]:
from random import random
def calculate_approval(row):
    p = 1.0

    if row['AGE'] < 30:
        p = p - 0.3
    elif row['AGE'] >= 30 and row['AGE'] < 50:
        p = p - 0.2
    elif row['AGE'] >= 50:
        p = p -  0.1

    if row['CNT_CHILDREN'] == 0:
        p = p -  0.1
    elif row['CNT_CHILDREN'] > 0 and row['CNT_CHILDREN'] < 2:
        p = p - 0.2
    else:
        p = p - 0.3

    if row['AMT_INCOME_TOTAL'] < 50000:
        p = p - 0.3
    elif row['AMT_INCOME_TOTAL'] >= 50000 and row['AMT_INCOME_TOTAL'] < 100000:
        p = p - 0.2
    elif row['AMT_INCOME_TOTAL'] >= 100000 and row['AMT_INCOME_TOTAL'] < 200000:
        p = p - 0.1

    if row['DAYS_EMPLOYED'] < 365:
        p = p -  0.2
    elif row['DAYS_EMPLOYED'] >= 365 and row['DAYS_EMPLOYED'] < 2000:
        p = p - 0.1

    if row['FLAG_OWN_REALTY'] == 0.0:
        p = p -  0.1

    if row['FLAG_OWN_CAR'] == 0.0:
        p = p - 0.05

    return random() < p



df['APPROVED'] = df.apply(calculate_approval, axis=1)
df['APPROVED'].describe()

count     304354
unique         2
top         True
freq      158752
Name: APPROVED, dtype: object

In [7]:
inputs = df[['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AGE', 'DAYS_EMPLOYED', 'FLAG_WORK_PHONE']]
outputs = df['APPROVED'].eq(True).mul(1)
inputs['FLAG_OWN_CAR'] = inputs['FLAG_OWN_CAR'].eq('Y').mul(1)
inputs['FLAG_OWN_REALTY'] = inputs['FLAG_OWN_REALTY'].eq('Y').mul(1)

In [8]:
# split dataset
X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.4, random_state=23)

In [10]:
X_train

Unnamed: 0,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AGE,DAYS_EMPLOYED,FLAG_WORK_PHONE
101910,0,1,0,135000.0,61.304110,1279,0
325039,1,1,0,315000.0,32.665753,1205,0
396383,0,1,0,112500.0,47.161644,942,1
422056,0,0,0,157500.0,23.797260,239,1
350142,0,1,0,180000.0,54.443836,453,0
...,...,...,...,...,...,...,...
292096,0,0,1,112500.0,30.597260,4548,0
391884,0,1,0,225000.0,36.550685,3893,0
321053,0,0,1,157500.0,33.989041,480,0
391182,0,1,0,292500.0,36.726027,4777,1


In [11]:
from sklearn_pandas import DataFrameMapper

def build_RF_pipeline(inputs, outputs, rf=None):
    if not rf:
        rf = RandomForestClassifier()
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper([
            (['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_WORK_PHONE'], preprocessing.OrdinalEncoder()),
            (['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AGE', 'DAYS_EMPLOYED'], None)
        ])),
        ("classifier", rf)
    ])
    pipeline.fit(inputs, outputs)
    return pipeline

In [12]:

def RF_estimation(inputs, outputs,
                  estimator_steps=10,
                  depth_steps=10,
                  min_samples_split=None,
                  min_samples_leaf=None):
    # hyper-parameter estimation
    n_estimators = [int(x) for x in np.linspace(start=50, stop=100, num=estimator_steps)]
    max_depth = [int(x) for x in np.linspace(3, 10, num=depth_steps)]
    max_depth.append(None)
    if not min_samples_split:
        min_samples_split = [1, 2, 4]
    if not min_samples_leaf:
        min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=random_grid,
                                   n_iter=100, scoring='neg_mean_absolute_error',
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)
    rf_random.fit(inputs, outputs)
    best_random = rf_random.best_estimator_
    print(best_random)
    return best_random

In [13]:
rf = RF_estimation(X_train, y_train, estimator_steps=4, depth_steps=4)
random_forest_pipeline = build_RF_pipeline(X_train, y_train, rf)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.1min finished
RandomForestClassifier(bootstrap=False, max_depth=7, min_samples_leaf=4,
                       min_samples_split=4, n_estimators=83)


In [14]:
rf_predictions = random_forest_pipeline.predict(X_test)
print(f"MSE: {random_forest_pipeline.score(X_test, y_test)*100}%")

MSE: 61.845542212219286%


In [15]:
# skl_to_pmml(random_forest_pipeline, ['age', 'income', 'response'], 'segment',"models/rf_segmentation_small.pmml")
from sklearn2pmml import sklearn2pmml

sklearn2pmml(random_forest_pipeline, "../models/loan.pmml", with_repr = True)