# Baseline model

## Set-up

In [45]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler

In [2]:
# Work from the parent directory so relative paths such as 'data/interim/...' resolve.
# Guarded so that re-running this cell does not climb a further directory level.
if not os.path.isdir(os.path.join('data', 'interim')):
    os.chdir('..')

In [3]:
# Never truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [4]:
# Load the interim train and dev parquet files (paths are relative to the
# current working directory) and print each frame's (rows, columns) shape.
train = pd.read_parquet('data/interim/train.parquet')
print(f"Train size: {train.shape}")
dev = pd.read_parquet('data/interim/dev.parquet')
print(f"Dev size: {dev.shape}")

Train size: (184506, 122)
Dev size: (61502, 122)


## Data processing

In [5]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_1
8,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,297783,0,Cash loans,F,N,Y,0,74250.0,112500.0,6282.0,112500.0,Unaccompanied,Working,Incomplete higher,Married,House / apartment,0.01885,-15077,-7915,-2853.0,-8,,1,1,1,1,0,0,Core staff,2.0,2,2,SATURDAY,10,0,0,0,0,0,0,School,,0.598495,0.728141,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,-793.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
1,390572,0,Revolving loans,F,Y,Y,2,225000.0,810000.0,40500.0,810000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.035792,-14311,-1858,-2306.0,-5196,4.0,1,1,0,1,0,0,High skill tech staff,4.0,2,2,FRIDAY,13,0,0,0,0,0,0,Business Entity Type 3,0.578538,0.491977,0.200926,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-312.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,212363,0,Cash loans,M,Y,N,0,225000.0,1546020.0,42642.0,1350000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,Municipal apartment,0.032561,-15236,-5751,-7483.0,-4492,8.0,1,1,0,1,1,0,Drivers,2.0,1,1,TUESDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.467365,0.591815,0.762336,0.1103,0.0857,0.9831,0.7688,,0.12,0.1034,0.3333,0.375,,,0.1214,,,0.1124,0.089,0.9831,0.7779,,0.1208,0.1034,0.3333,0.375,,,0.1265,,,0.1114,0.0857,0.9831,0.7719,,0.12,0.1034,0.3333,0.375,,,0.1236,,,reg oper account,block of flats,0.0975,Panel,No,0.0,0.0,0.0,0.0,-1767.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,116368,1,Cash loans,M,Y,Y,1,202500.0,444420.0,30195.0,337500.0,Family,Working,Secondary / secondary special,Married,House / apartment,0.01885,-17688,-754,-4307.0,-1243,22.0,1,1,0,1,0,0,Drivers,3.0,2,2,TUESDAY,14,0,0,0,0,1,1,Business Entity Type 3,,0.077471,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1882.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,137783,0,Cash loans,F,N,Y,2,126000.0,1216201.5,35689.5,1062000.0,Unaccompanied,State servant,Secondary / secondary special,Married,House / apartment,0.018029,-14030,-368,-3400.0,-4447,,1,1,1,1,0,0,,4.0,3,3,THURSDAY,6,0,0,0,0,0,0,Government,0.628605,0.313052,0.715103,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,0.0,3.0,0.0,-36.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Fraction of missing values per column of `train`, as a two-column frame
# with one row per feature.
null_fraction = train.isnull().sum() / len(train)
prop_nulls_df = null_fraction.rename_axis('feature').reset_index(name='proportion_of_nulls')
prop_nulls_df.head()

Unnamed: 0,feature,proportion_of_nulls
0,SK_ID_CURR,0.0
1,TARGET,0.0
2,NAME_CONTRACT_TYPE,0.0
3,CODE_GENDER,0.0
4,FLAG_OWN_CAR,0.0


In [7]:
len(prop_nulls_df)

122

### For now we will drop features with more than 20% nulls

In [8]:
# Names of the features whose proportion of nulls in `train` is above 0.2.
exceeds_null_limit = prop_nulls_df['proportion_of_nulls'] > 0.2
FEATURES_TO_DROP = prop_nulls_df.loc[exceeds_null_limit, 'feature'].tolist()

In [9]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_1
8,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,297783,0,Cash loans,F,N,Y,0,74250.0,112500.0,6282.0,112500.0,Unaccompanied,Working,Incomplete higher,Married,House / apartment,0.01885,-15077,-7915,-2853.0,-8,,1,1,1,1,0,0,Core staff,2.0,2,2,SATURDAY,10,0,0,0,0,0,0,School,,0.598495,0.728141,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,-793.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
1,390572,0,Revolving loans,F,Y,Y,2,225000.0,810000.0,40500.0,810000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.035792,-14311,-1858,-2306.0,-5196,4.0,1,1,0,1,0,0,High skill tech staff,4.0,2,2,FRIDAY,13,0,0,0,0,0,0,Business Entity Type 3,0.578538,0.491977,0.200926,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-312.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,212363,0,Cash loans,M,Y,N,0,225000.0,1546020.0,42642.0,1350000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,Municipal apartment,0.032561,-15236,-5751,-7483.0,-4492,8.0,1,1,0,1,1,0,Drivers,2.0,1,1,TUESDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.467365,0.591815,0.762336,0.1103,0.0857,0.9831,0.7688,,0.12,0.1034,0.3333,0.375,,,0.1214,,,0.1124,0.089,0.9831,0.7779,,0.1208,0.1034,0.3333,0.375,,,0.1265,,,0.1114,0.0857,0.9831,0.7719,,0.12,0.1034,0.3333,0.375,,,0.1236,,,reg oper account,block of flats,0.0975,Panel,No,0.0,0.0,0.0,0.0,-1767.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,116368,1,Cash loans,M,Y,Y,1,202500.0,444420.0,30195.0,337500.0,Family,Working,Secondary / secondary special,Married,House / apartment,0.01885,-17688,-754,-4307.0,-1243,22.0,1,1,0,1,0,0,Drivers,3.0,2,2,TUESDAY,14,0,0,0,0,1,1,Business Entity Type 3,,0.077471,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1882.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,137783,0,Cash loans,F,N,Y,2,126000.0,1216201.5,35689.5,1062000.0,Unaccompanied,State servant,Secondary / secondary special,Married,House / apartment,0.018029,-14030,-368,-3400.0,-4447,,1,1,1,1,0,0,,4.0,3,3,THURSDAY,6,0,0,0,0,0,0,Government,0.628605,0.313052,0.715103,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,0.0,3.0,0.0,-36.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Labels come from TARGET; TARGET and SK_ID_CURR are excluded from the feature matrix.
y_train = train['TARGET']
X_train = train.drop(columns=['TARGET', 'SK_ID_CURR'])

In [11]:
# Same split for the dev set: TARGET is the label, TARGET and SK_ID_CURR are not features.
y_dev = dev['TARGET']
X_dev = dev.drop(columns=['TARGET', 'SK_ID_CURR'])

## Build baseline models

__Preprocessing steps__:
- Drop columns not needed
- For categorical columns:
    - Impute missing values with the most common value
    - One hot encode
- For numeric columns:
    - Impute missing values with the mean/median
    - Normalise data

__Model training__:
- Models to train:
    1. Logistic Regression
    2. SVM
    3. Random Forest
    4. LightGBM
- For each model train on `train` and calculate ROCAUC for `dev`
- Pickle model which performs best on `dev`
- Make submission using this model

`FunctionTransformer` automatically converts a Python function into a sklearn transformer

In [12]:
def drop_columns(X, columns_to_drop):
    """Return a new DataFrame equal to ``X`` without ``columns_to_drop``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_drop : list-like of column labels
        Columns to remove. With pandas' default behaviour a label that is
        not present in ``X`` raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the requested columns removed.
    """
    return X.drop(columns=columns_to_drop)

In [13]:
def convert_object_columns_to_string(X):
    """Return a copy of ``X`` with every object-dtype column cast to ``str``.

    The cast is applied to a copy so the caller's DataFrame is left untouched
    (the previous implementation overwrote the columns of ``X`` in place).

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the columns selected by
        ``select_dtypes(include=object)`` have been passed through
        ``astype(str)``; all other columns are unchanged.

    Notes
    -----
    NOTE(review): with classic object columns ``astype(str)`` renders missing
    values as the literal string ``'nan'``, so a downstream imputer may no
    longer see them as missing -- confirm this is the intended behaviour.
    """
    converted = X.copy()
    object_columns = converted.select_dtypes(include=object).columns
    converted[object_columns] = converted[object_columns].astype(str)
    return converted

In [14]:
# Clean-up applied to the raw feature frame before imputation/encoding:
#   1. remove the columns listed in FEATURES_TO_DROP
#   2. cast object columns to str (otherwise imputing object data raises an error)
drop_columns_step = FunctionTransformer(drop_columns, kw_args={'columns_to_drop': FEATURES_TO_DROP})
to_string_step = FunctionTransformer(convert_object_columns_to_string)
preprocessing_pipeline = Pipeline(
    steps=[
        ('drop_columns', drop_columns_step),
        ('convert_object_columns_to_string', to_string_step),
    ]
)

In [15]:
# Apply the column-drop / string-cast steps to the training features.
# NOTE: this rebinds X_train to the preprocessed frame, so the cell is not
# re-runnable on its own -- a second run would try to drop columns that are
# already gone. Re-run the cell that builds X_train from `train` first.
X_train = preprocessing_pipeline.fit_transform(X_train)
print(X_train.shape)
X_train.head()

(184506, 70)


Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,F,N,Y,0,74250.0,112500.0,6282.0,112500.0,Unaccompanied,Working,Incomplete higher,Married,House / apartment,0.01885,-15077,-7915,-2853.0,-8,1,1,1,1,0,0,2.0,2,2,SATURDAY,10,0,0,0,0,0,0,School,0.598495,0.728141,1.0,1.0,1.0,0.0,-793.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
1,Revolving loans,F,Y,Y,2,225000.0,810000.0,40500.0,810000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.035792,-14311,-1858,-2306.0,-5196,1,1,0,1,0,0,4.0,2,2,FRIDAY,13,0,0,0,0,0,0,Business Entity Type 3,0.491977,0.200926,0.0,0.0,0.0,0.0,-312.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cash loans,M,Y,N,0,225000.0,1546020.0,42642.0,1350000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,Municipal apartment,0.032561,-15236,-5751,-7483.0,-4492,1,1,0,1,1,0,2.0,1,1,TUESDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.591815,0.762336,0.0,0.0,0.0,0.0,-1767.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,Cash loans,M,Y,Y,1,202500.0,444420.0,30195.0,337500.0,Family,Working,Secondary / secondary special,Married,House / apartment,0.01885,-17688,-754,-4307.0,-1243,1,1,0,1,0,0,3.0,2,2,TUESDAY,14,0,0,0,0,1,1,Business Entity Type 3,0.077471,,0.0,0.0,0.0,0.0,-1882.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,Cash loans,F,N,Y,2,126000.0,1216201.5,35689.5,1062000.0,Unaccompanied,State servant,Secondary / secondary special,Married,House / apartment,0.018029,-14030,-368,-3400.0,-4447,1,1,1,1,0,0,4.0,3,3,THURSDAY,6,0,0,0,0,0,0,Government,0.313052,0.715103,3.0,0.0,3.0,0.0,-36.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


We create two separate `Pipeline`s for operations performed on numeric and categorical data. We then use `ColumnTransformer` to apply these pipelines to the correct columns of our DataFrame. `make_column_selector` is a special function which works with `ColumnTransformer` and allows columns to be selected by dtype.

In [16]:
# Pipeline of operations to perform on any object columns in DataFrame:
# fill missing entries with the most frequent value, then one-hot encode.
object_pipeline = Pipeline(
    [
        ('most_frequent_imputer', SimpleImputer(strategy='most_frequent')),  # Very slow (see https://datascience.stackexchange.com/questions/66034/sklearn-simpleimputer-too-slow-for-categorical-data-represented-as-string-values), may need to change
        # handle_unknown='ignore': a category that was not seen during fit (e.g. one that
        # only occurs in dev/test data) is encoded as all zeros instead of raising an
        # error at transform time.
        ('ohe', OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [17]:
# Numeric columns: fill missing values with the column mean, then rescale
# each column with MinMaxScaler.
numeric_pipeline = Pipeline(
    steps=[
        ('mean_imputer', SimpleImputer(strategy='mean')),
        ('min_max_scalar', MinMaxScaler()),
    ]
)

In [18]:
# Route each column to the pipeline matching its dtype: numeric columns go
# through `numeric_pipeline`, object columns through `object_pipeline`.
column_processing = ColumnTransformer(
    transformers=[
        ('numeric_processing', numeric_pipeline, make_column_selector(dtype_include=np.number)),
        ('object_processing', object_pipeline, make_column_selector(dtype_include=object)),
    ]
)
full_pipeline = Pipeline(steps=[('process_data', column_processing)])

In [19]:
%%time
X_train_processed_as_array = full_pipeline.fit_transform(X_train)

CPU times: user 2min 8s, sys: 29.6 s, total: 2min 37s
Wall time: 2min 37s


In [20]:
X_train_processed_as_array

array([[0.        , 0.00360685, 0.01685393, ..., 0.        , 0.        ,
        0.        ],
       [0.10526316, 0.01479478, 0.19101124, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01479478, 0.37478652, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.05263158, 0.0114551 , 0.06741573, ..., 0.        , 0.        ,
        0.        ],
       [0.10526316, 0.00811542, 0.07397753, ..., 0.        , 0.        ,
        0.        ],
       [0.05263158, 0.01479478, 0.24324719, ..., 0.        , 0.        ,
        0.        ]])

In [27]:
import lightgbm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

In [22]:
# Apply the already-fitted column-drop / string-cast steps to the dev features.
# NOTE: this rebinds X_dev to the preprocessed frame, so the cell is not
# re-runnable on its own -- a second run would try to drop columns that are
# already gone. Re-run the cell that builds X_dev from `dev` first.
X_dev = preprocessing_pipeline.transform(X_dev)
print(X_dev.shape)
X_dev.head()

(61502, 70)


Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,M,N,N,0,135000.0,254700.0,17149.5,225000.0,Unaccompanied,Working,Incomplete higher,Single / not married,Municipal apartment,0.009334,-10566,-197,-694.0,-3210,1,1,1,1,1,0,1.0,2,2,SUNDAY,19,0,0,0,0,1,1,Business Entity Type 3,0.189613,0.600658,0.0,0.0,0.0,0.0,-8.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
1,Cash loans,F,Y,Y,0,153000.0,1339884.0,39307.5,1170000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.025164,-17312,-4045,-3751.0,-833,1,1,0,1,0,0,2.0,2,2,THURSDAY,10,0,0,0,0,1,1,Business Entity Type 3,0.606122,,7.0,1.0,7.0,0.0,-1373.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
2,Cash loans,F,N,Y,2,135000.0,808650.0,26086.5,675000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.030755,-11819,-363,-4132.0,-4396,1,1,0,1,0,0,4.0,2,2,THURSDAY,13,0,0,0,0,1,1,Industry: type 11,0.649083,0.621226,3.0,1.0,3.0,1.0,-1939.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
3,Cash loans,F,Y,Y,0,180000.0,1113840.0,47322.0,900000.0,Unaccompanied,Commercial associate,Higher education,Widow,House / apartment,0.035792,-19489,-5329,-9861.0,-3044,1,1,0,1,1,0,1.0,2,2,MONDAY,14,0,0,0,0,0,0,Business Entity Type 3,0.695969,0.260856,0.0,0.0,0.0,0.0,-803.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
4,Revolving loans,F,N,Y,0,180000.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.001276,-11710,-184,-5659.0,-4177,1,1,0,1,0,0,2.0,2,2,MONDAY,7,0,0,0,0,0,0,Services,0.426832,,4.0,0.0,4.0,0.0,-1241.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0


In [23]:
%%time
X_dev_processed_as_array = full_pipeline.transform(X_dev)

CPU times: user 224 ms, sys: 66.6 ms, total: 291 ms
Wall time: 290 ms


In [32]:
# Candidate baseline models, keyed by the name used when reporting results.
# RandomForestClassifier draws bootstrap samples and feature subsets at random,
# so its seed is fixed to make the reported scores reproducible across runs.
RANDOM_STATE = 42
models = {
    'LGBM': lightgbm.LGBMClassifier(),
    'random_forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'logistic': LogisticRegression(max_iter=1000)
}

In [33]:
# Fit every candidate on the processed training array and record its ROC AUC
# on both the training and dev arrays, together with the fitted estimator.
model_outputs = {}
for model_name, model in models.items():
    print(f"Training {model_name}")
    model.fit(X_train_processed_as_array, y_train)

    # Column 1 of predict_proba is used as the score for ROC AUC.
    scores = {}
    for score_name, features, labels in (
        ('training_rocauc', X_train_processed_as_array, y_train),
        ('dev_rocauc', X_dev_processed_as_array, y_dev),
    ):
        scores[score_name] = roc_auc_score(labels, model.predict_proba(features)[:, 1])

    model_outputs[model_name] = {'fitted_model': model, **scores}

Training LGBM
Training random_forest
Training logistic


In [34]:
model_outputs

{'LGBM': {'fitted_model': LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                 importance_type='split', learning_rate=0.1, max_depth=-1,
                 min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                 n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
                 random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                 subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
  'training_rocauc': 0.8028348943854875,
  'dev_rocauc': 0.7446343122063395},
 'random_forest': {'fitted_model': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_we

LGBM does best. Create an end-to-end pipeline for this.

## End-to-end pipeline

In [35]:
# Chain raw-frame clean-up, dtype-specific processing and the LightGBM model
# into a single estimator that accepts the raw feature frame.
# clone() gives this pipeline its own unfitted copies of the two preprocessing
# stages; passing the original objects would make `end_to_end_pipeline.fit`
# silently refit `preprocessing_pipeline` and `full_pipeline` in place.
end_to_end_pipeline = Pipeline(
    [
        ('preprocessing', clone(preprocessing_pipeline)),
        ('processing', clone(full_pipeline)),
        ('model', lightgbm.LGBMClassifier())
    ]
)

In [38]:
# Rebuild the raw feature/label splits from `train` and `dev`: X_train and
# X_dev were rebound to preprocessed frames earlier in the notebook, whereas
# the end-to-end pipeline expects the raw columns.
NON_FEATURE_COLUMNS = ['TARGET', 'SK_ID_CURR']
X_train, y_train = train.drop(columns=NON_FEATURE_COLUMNS), train['TARGET']
X_dev, y_dev = dev.drop(columns=NON_FEATURE_COLUMNS), dev['TARGET']

In [39]:
end_to_end_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 Pipeline(memory=None,
                          steps=[('drop_columns',
                                  FunctionTransformer(accept_sparse=False,
                                                      check_inverse=True,
                                                      func=<function drop_columns at 0x10d876950>,
                                                      inv_kw_args=None,
                                                      inverse_func=None,
                                                      kw_args={'columns_to_drop': ['OWN_CAR_AGE',
                                                                                   'OCCUPATION_TYPE',
                                                                                   'EXT_SOURCE_1',
                                                                                   'APARTMENTS_AVG',
                                                              

In [43]:
# ROC AUC of the end-to-end pipeline on the data it was fitted on
# (column 1 of predict_proba is used as the score).
training_predictions = end_to_end_pipeline.predict_proba(X_train)[:,1]
roc_auc_score(y_train, training_predictions)

0.8028348943854875

In [44]:
# ROC AUC of the end-to-end pipeline on the held-out dev set
# (column 1 of predict_proba is used as the score).
dev_predictions = end_to_end_pipeline.predict_proba(X_dev)[:,1]
roc_auc_score(y_dev, dev_predictions)

0.7446343122063395

In [59]:
#imputation_pipeline['impute'].named_transformers_['most_frequent_imputer'].statistics_

## Save pipeline

We cannot simply save the `Pipeline` as a pickle. When we unpickle the object in a different script the classes used to construct the pipeline are no longer available. See https://www.stefaanlippens.net/python-pickling-and-dealing-with-attributeerror-module-object-has-no-attribute-thing.html for an explanation of the problem. https://stackoverflow.com/questions/46077793/how-to-save-a-custom-transformer-in-sklearn provides a solution.

We need to create a class which constructs and generates the fitted pipeline. We then import this class in any script where we need to unpickle the fitted pipeline. We create this class in `src.models.baseline_model`.

The pipeline above can be created and saved by running `python src/models/baseline_model.py`

## Submit predictions

In [53]:
test = pd.read_parquet('data/interim/test_for_submission.parquet')
print(test.shape)
test.head()

(48744, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_
DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.01885,-19241,-2329,-5170.0,-812,,1,1,0,1,0,1,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752614,0.789654,0.15952,0.066,0.059,0.9732,,,,0.1379,0.125,,,,0.0505,,,0.0672,0.0612,0.9732,,,,0.1379,0.125,,,,0.0526,,,0.0666,0.059,0.9732,,,,0.1379,0.125,,,,0.0514,,,,block of flats,0.0392,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,-18064,-4469,-9118.0,-1623,,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.56499,0.291656,0.432962,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,Working,Higher education,Married,House / apartment,0.019101,-20038,-4458,-2175.0,-3503,5.0,1,1,0,1,0,0,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,,0.699787,0.610991,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026392,-13976,-1866,-2000.0,-4208,,1,1,0,1,1,0,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.525734,0.509677,0.612704,0.3052,0.1974,0.997,0.9592,0.1165,0.32,0.2759,0.375,0.0417,0.2042,0.2404,0.3673,0.0386,0.08,0.3109,0.2049,0.997,0.9608,0.1176,0.3222,0.2759,0.375,0.0417,0.2089,0.2626,0.3827,0.0389,0.0847,0.3081,0.1974,0.997,0.9597,0.1173,0.32,0.2759,0.375,0.0417,0.2078,0.2446,0.3739,0.0388,0.0817,reg oper account,block of flats,0.37,Panel,No,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010032,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,0.202145,0.425687,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


In [54]:
# Score on the same feature set the pipeline was fitted on: SK_ID_CURR was
# excluded from X_train, so it is excluded here as well. TARGET is also dropped
# when present, so this cell can be re-run after the predictions have been
# written back onto `test`.
X_test = test.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore')
test_predictions = end_to_end_pipeline.predict_proba(X_test)[:, 1]



In [55]:
test_predictions

array([0.05798832, 0.20793293, 0.01935206, ..., 0.08450974, 0.07383805,
       0.14903297])

In [56]:
# Store the predicted probabilities on `test` as a TARGET column (this modifies `test` in place).
test['TARGET'] = test_predictions

In [57]:
test[['SK_ID_CURR', 'TARGET']].to_csv('data/processed/baseline_model_pipeline_submission.csv', index=False)

In [58]:
!kaggle competitions submit -c home-credit-default-risk -f data/processed/baseline_model_pipeline_submission.csv -m "Baseline model pipeline"

100%|███████████████████████████████████████| 1.26M/1.26M [00:07<00:00, 169kB/s]
Successfully submitted to Home Credit Default Risk

## Summary

Private score: 0.72801 (5880/7175)

Public score: 0.73225 (5837/7175)

Pretty bad rank for a baseline model. Not a big surprise given the number of features dropped from the application dataset and the use of only one dataset. Next we evaluate this model to understand where it is performing badly.