# Load Dependencies and data

In [1]:
from __future__ import print_function

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
np.random.seed(0)


train_data = pd.read_csv('application_train.csv') #data we have the target class for 
test_data = pd.read_csv('application_test.csv') #data we need to predict target class for, for competition

col_names = train_data.columns.values.tolist()
print(col_names)



['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELE

# Train, Test, Split

In [7]:
train_data = train_data.sample(10000) #data size too large for available compute

In [8]:
X_train_data = train_data.drop('TARGET', axis=1)
y_train_data = train_data['TARGET']


X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

print(f"X train           shape: {X_train.shape}")
print(f"X validation      shape: {X_valid.shape}")
print(f"X test            shape: {X_test.shape}")

X_train.head()

X train           shape: (5780, 121)
X validation      shape: (1200, 121)
X test            shape: (1020, 121)


Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
67421,178187,Cash loans,M,N,N,0,270000.0,486000.0,21537.0,486000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
53904,162456,Cash loans,F,N,N,0,135000.0,1369773.0,54324.0,1260000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
195148,326288,Cash loans,F,N,Y,0,112500.0,708939.0,25591.5,612000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
133832,255217,Cash loans,F,N,Y,0,90000.0,1024290.0,30078.0,855000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
302915,450951,Revolving loans,F,Y,Y,1,67500.0,337500.0,16875.0,337500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


# Pipelines

## Knn pipeline

In [9]:
numeric_features = ['CNT_CHILDREN','AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY','OCCUPATION_TYPE']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

knn_model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', KNeighborsClassifier())])


knn_model.fit(X_train, y_train)
print(knn_model.score(X_test, y_test))
#X_dist_graph = model.fit_transform(X_train)
#X_dist_graph.shape


0.8931372549019608


In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

try:
    del expLog
except:
    pass

exp_name = "knn_baseline"
try:
    expLog
except NameError:
    expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train F1", 
                                   "Valid F1",
                                   "Test  F1"
                                  ])

expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, knn_model.predict(X_train)), 
                accuracy_score(y_valid, knn_model.predict(X_valid)),
                accuracy_score(y_test, knn_model.predict(X_test)),
                f1_score(y_train, knn_model.predict(X_train)),
                f1_score(y_valid, knn_model.predict(X_valid)),
                f1_score(y_test, knn_model.predict(X_test))],
    4)) 
expLog

Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train F1,Valid F1,Test F1
0,knn_baseline,0.9227,0.9225,0.8931,0.1252,0.0211,0.018


In [6]:
X_train.sample(100)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
92581,207505,Cash loans,M,N,N,2,180000.0,675000.0,32602.5,675000.0,...,0,0,0,0,0.0,1.0,1.0,0.0,0.0,2.0
85969,199754,Cash loans,F,N,Y,0,126000.0,247275.0,17716.5,225000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,2.0,3.0
155073,279751,Revolving loans,F,N,Y,1,90000.0,270000.0,13500.0,270000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
233105,369999,Cash loans,F,N,Y,0,144000.0,545040.0,20677.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
19282,122496,Cash loans,M,Y,Y,0,67500.0,193572.0,10116.0,171000.0,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18449,121521,Cash loans,M,Y,Y,0,225000.0,539100.0,22837.5,450000.0,...,0,0,0,0,,,,,,
174180,301851,Cash loans,F,N,Y,0,157500.0,450000.0,22018.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,2.0
224336,359837,Cash loans,F,N,Y,2,90000.0,343800.0,16852.5,225000.0,...,0,0,0,0,,,,,,
113725,231891,Cash loans,F,N,Y,0,99000.0,277969.5,17892.0,229500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


## XGB_ pipeline

In [14]:
from xgboost import XGBClassifier

numeric_features = ['CNT_CHILDREN','AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY','OCCUPATION_TYPE']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

XGB_model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', XGBClassifier())])


XGB_model.fit(X_train, y_train)
print(XGB_model.score(X_test, y_test))
#X_dist_graph = model.fit_transform(X_train)
#X_dist_graph.shape




0.888235294117647


In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


exp_name = "XGB_baseline"
try:
    expLog
except NameError:
   expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train F1", 
                                   "Valid F1",
                                   "Test  F1"
                                  ])

expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, XGB_model.predict(X_train)), 
                accuracy_score(y_valid, XGB_model.predict(X_valid)),
                accuracy_score(y_test, XGB_model.predict(X_test)),
                f1_score(y_train, XGB_model.predict(X_train)),
                f1_score(y_valid, XGB_model.predict(X_valid)),
                f1_score(y_test, XGB_model.predict(X_test))],
    4)) 
expLog

Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train F1,Valid F1,Test F1
0,knn_baseline,0.9227,0.9225,0.8931,0.1252,0.0211,0.018
1,XGB_baseline,0.9227,0.9225,0.8931,0.1252,0.0211,0.018
2,XGB_baseline,0.974,0.9242,0.8882,0.8077,0.0,0.0339


## Logistic Regression Pipeline

In [20]:
from xgboost import XGBClassifier

numeric_features = ['CNT_CHILDREN','AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY','OCCUPATION_TYPE']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

logreg_model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=10000))])


logreg_model.fit(X_train, y_train)
print(logreg_model.score(X_test, y_test))
#X_dist_graph = model.fit_transform(X_train)
#X_dist_graph.shape


0.8970588235294118


In [21]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


exp_name = "logreg_baseline"
try:
    expLog
except NameError:
   expLog = pd.DataFrame(columns=["exp_name", 
                                   "Train Acc", 
                                   "Valid Acc",
                                   "Test  Acc",
                                   "Train F1", 
                                   "Valid F1",
                                   "Test  F1"
                                  ])

expLog.loc[len(expLog)] = [f"{exp_name}"] + list(np.round(
               [accuracy_score(y_train, logreg_model.predict(X_train)), 
                accuracy_score(y_valid, logreg_model.predict(X_valid)),
                accuracy_score(y_test, logreg_model.predict(X_test)),
                f1_score(y_train, logreg_model.predict(X_train)),
                f1_score(y_valid, logreg_model.predict(X_valid)),
                f1_score(y_test, logreg_model.predict(X_test))],
    4)) 
expLog

Unnamed: 0,exp_name,Train Acc,Valid Acc,Test Acc,Train F1,Valid F1,Test F1
0,knn_baseline,0.9227,0.9225,0.8931,0.1252,0.0211,0.018
1,XGB_baseline,0.9227,0.9225,0.8931,0.1252,0.0211,0.018
2,XGB_baseline,0.974,0.9242,0.8882,0.8077,0.0,0.0339
3,logreg_baseline,0.9194,0.9275,0.8971,0.0,0.0,0.0
