# Home Credit Default Risk Prediction
**Xin Zhao**
<br>
This is a case study on Home Credit default risk prediction, using data from Home Credit Group and several machine learning algorithms. Feature selection and principal component analysis are also considered. In the end, the XGBoost algorithm is selected; its prediction AUC score is 0.75.

In [None]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import matplotlib, matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb

# Explore and process bureau data

In [None]:
# Load bureau.csv from the home-credit-default-risk input folder.
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')

## Shape

In [None]:
# Display the (rows, columns) tuple of the raw bureau table.
bureau.shape

## Categorical features in bureau

In [None]:
# Collect the names of all object-dtype columns in bureau; these are the
# columns treated as categorical below.
object_columns = bureau.select_dtypes(include='object').columns
cat_features = list(object_columns)
cat_features

## Subset categorical features from bureau data

In [None]:
# Keep every row of bureau, but only the categorical columns.
bureau_cat = bureau.loc[:, cat_features]
bureau_cat

## Onehot encode the categorical features

In [None]:
# One-hot encode the categorical bureau columns into a dense DataFrame.
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 (and `sparse` was removed in 1.4), so
# neither spelling works on every release. Keep the encoder's default sparse
# output instead and densify it explicitly with `.toarray()`.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
bureau_cat_encode = pd.DataFrame(onehot_encoder.fit_transform(bureau_cat).toarray())
bureau_cat_encode

In [None]:
# Label the encoded columns with the names generated by the fitted encoder.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out` (added in 1.0), so prefer the new method and fall
# back to the old one only on releases that predate it.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    feature_encode = onehot_encoder.get_feature_names_out(cat_features)
else:
    feature_encode = onehot_encoder.get_feature_names(cat_features)
bureau_cat_encode.columns = feature_encode
bureau_cat_encode

## Extract numerical features and combine them with the encoded categorical data

In [None]:
# Numeric bureau columns to aggregate, plus the SK_ID_CURR key used for
# grouping. Each name appears exactly once: the list previously repeated
# 'DAYS_CREDIT', which selected that column twice and carried a duplicated
# (redundant) feature through the aggregation and merge steps.
num_features = [
    'SK_ID_CURR',
    'DAYS_CREDIT',
    'CREDIT_DAY_OVERDUE',
    'DAYS_CREDIT_ENDDATE',
    'CNT_CREDIT_PROLONG',
    'AMT_CREDIT_SUM',
    'AMT_CREDIT_SUM_OVERDUE',
    'DAYS_CREDIT_UPDATE',
]
bureau_num = bureau[num_features]

# Place the numeric columns and the one-hot columns side by side; axis=1
# concatenation aligns the two frames on their row index.
bureau_proc = pd.concat([bureau_num, bureau_cat_encode], axis=1)

# Sanity check: the combined width should equal the sum of the two parts.
print(bureau_proc.shape)
print(bureau_num.shape)
print(bureau_cat_encode.shape)

## Aggregate bureau data (sum and mean) by loan ID

In [None]:
# Aggregate every remaining column per SK_ID_CURR twice: once as a total and
# once as an average. The grouping is built once and reused for both.
grouped_by_id = bureau_proc.groupby(['SK_ID_CURR'])
bureau_proc_sum = grouped_by_id.sum()
bureau_proc_mean = grouped_by_id.mean()

In [None]:
# Join the per-ID sums with the per-ID means on SK_ID_CURR. The two frames
# share the same column names, so pandas applies its default suffixes:
# `_x` marks a column from the sum frame and `_y` one from the mean frame.
bureau_proc_final = bureau_proc_sum.merge(bureau_proc_mean, how='left', on='SK_ID_CURR')

In [None]:
# Save the aggregated bureau features to disk. index=True writes the row
# index (expected to be SK_ID_CURR after the groupby) as a CSV column.
bureau_proc_final.to_csv('bureau_proc_final.csv', index=True)

In [None]:
# Reload the aggregated bureau features from disk. read_csv is called without
# index_col, so every CSV field comes back as an ordinary column.
bureau_proc_final = pd.read_csv('bureau_proc_final.csv')

# Process application_train/test

In [None]:
# Load the application tables for the train and test splits.
application_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
application_test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

## Check whether 'SK_ID_CURR' is in the training data

In [None]:
 # Evaluates to True when the join key is a column of the training table.
 'SK_ID_CURR' in application_train.columns

## Merge bureau_proc_final to application data

In [None]:
# Attach the aggregated bureau features to each application row. A left join
# keeps every application row even when no bureau row matches its ID.
train_merged = application_train.merge(bureau_proc_final, how='left', on='SK_ID_CURR')
test_merged = application_test.merge(bureau_proc_final, how='left', on='SK_ID_CURR')

# Print the shape of each table before and after the join, train first.
for frame in (application_train, train_merged, application_test, test_merged):
    print(frame.shape)

## Drop ID columns for training

In [None]:
# Remove the SK_ID_CURR join key from both frames so that it is not picked up
# as a model input by the dtype-based feature selection that follows.
train_merged = train_merged.drop(columns='SK_ID_CURR')
test_merged = test_merged.drop(columns='SK_ID_CURR')

## Extract numerical and categorical features for further processing: impute, scaling, one-hot encoding

In [None]:
# Names of the int64 columns, minus TARGET, which is read separately as the
# label (y_train) and must not be used as an input feature.
int_columns = train_merged.select_dtypes(include='int64').columns
num_features_1 = int_columns.tolist()
num_features_1.remove('TARGET')

In [None]:
# Names of the float64 columns of the merged training frame.
num_features_2 = train_merged.select_dtypes(include='float64').columns.tolist()

In [None]:
# Names of the object-dtype columns of the merged training frame. This rebinds
# cat_features, which previously held the bureau table's categorical columns.
cat_features = train_merged.select_dtypes(include='object').columns.tolist()

In [None]:
# Full list of model inputs: integer and float columns first, then the
# categorical ones.
num_features = num_features_1 + num_features_2
features = num_features + cat_features

# Numeric columns: fill gaps with the column mean, then standardize.
num_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=num_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. Categories not seen during fitting are ignored at transform time.
cat_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=cat_steps)

# Route each group of columns through its own pipeline.
column_routes = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Learn the preprocessing statistics from the training rows only and apply
# them to both splits, so nothing is fitted on the test data.
X_train = preprocessor.fit_transform(train_merged[features])
X_test = preprocessor.transform(test_merged[features])

# Labels as a plain array.
y_train = train_merged['TARGET'].values

for label, array in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_test shape: ', X_test),
):
    print(label, array.shape)

# Train XGB model

In [None]:
# Hyperparameters of the gradient-boosted classifier, gathered in one place.
xgb_params = {
    'objective': "binary:logistic",
    'random_state': 42,
    'eval_metric': "auc",
    'max_depth': 4,
    'learning_rate': 0.277,
    'gamma': 0.382,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the full preprocessed training set.
xgb_model.fit(X_train, y_train)

# Calculate training AUC

In [None]:
# ROC AUC measured on the same rows the model was fitted on, so this is an
# in-sample score. Column 1 of predict_proba is the positive-class probability.
train_proba = xgb_model.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, train_proba)
print(train_auc)

# Get prediction

In [None]:
# Fill the sample submission's TARGET column with the predicted probability
# of the positive class for each test row.
# NOTE(review): this assumes sample_submission.csv lists its rows in the same
# order as application_test.csv — confirm.
submission = pd.read_csv('../input/home-credit-default-risk/sample_submission.csv')
test_proba = xgb_model.predict_proba(X_test)[:, 1]
submission['TARGET'] = test_proba
submission.head()

# Save the submission 

In [None]:
# Write the predictions to disk; index=False leaves the row index out of the CSV.
submission.to_csv('my_submission.csv', index=False)