In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb 

from sklearn.metrics import accuracy_score
from xgboost import plot_importance

import joblib

In [None]:
# Read the application table and key it by the loan identifier SK_ID_CURR.
train = (
    pd.read_csv('../input/home-credit-default-risk/application_train.csv')
      .set_index('SK_ID_CURR')
)
train.shape

In [None]:
# Preview the first rows of the loaded training frame.
train.head()

In [None]:
# Summarise the columns of `train`; max_cols is raised to 200 so that pandas does not
# fall back to its truncated summary for this wide table.
train.info(max_cols = 200)

### Check label distribution

In [None]:
# Class balance of the TARGET label: absolute counts next to the percentage share
s = train['TARGET']
cts = s.value_counts()
pct = (s.value_counts(normalize=True) * 100).round(1)
pd.DataFrame({'counts': cts, 'percent': pct})

### Data Preparation & Feature Engineering

In [None]:
# Flag / rating columns that should be handled as categorical features
cat_cols = [
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
    'FLAG_PHONE', 'FLAG_EMAIL',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
]

# Cast every listed column to the pandas 'category' dtype
train = train.astype({name: 'category' for name in cat_cols})

In [None]:
# Credit-to-annuity ratio feature
train['CRED_ANNUITY'] = train['AMT_CREDIT'] / train['AMT_ANNUITY']

# Keep only rows with income below 2.5 million. The explicit .copy() makes the result an
# independent frame, so the column assignments below are not writes into a slice of the
# previous `train` (avoids SettingWithCopyWarning / silent no-ops).
train = train[train['AMT_INCOME_TOTAL'] < 2500000].copy()

# Replace the sentinel value 365243 in DAYS_EMPLOYED with NaN.
# Assign the result back instead of chained `inplace=True`, which is not guaranteed to
# modify the frame (and never does under pandas Copy-on-Write).
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace(365243, np.nan)

# Convert DAYS_BIRTH to years (sign flipped by dividing by -365)
train['AGE'] = train['DAYS_BIRTH'] / -365

# Row-wise average of the available EXT_SOURCE values (NaNs are skipped; a row with all
# three missing stays NaN). Columns are selected by name; the previous positional slice
# 40:43 is assumed to have targeted these same three columns.
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
train['AVG_EXT'] = train[ext_cols].mean(axis=1)

# Fill each missing EXT_SOURCE_n with the row average computed above
for ext_col in ext_cols:
    train[ext_col] = train[ext_col].fillna(train['AVG_EXT'])

In [None]:
# Columns excluded from modelling: the *_MODE / *_MEDI building statistics, several
# building *_AVG columns, the FLAG_DOCUMENT_* indicators, and a few others
# (DAYS_BIRTH, CNT_FAM_MEMBERS, OBS_30_CNT_SOCIAL_CIRCLE).
dels = ['DAYS_BIRTH', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'CNT_FAM_MEMBERS',  'OBS_30_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', 
        'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 
        'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 
        'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 
        'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 
        'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 
        'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 
        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
        'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 
        'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

# Drop by column label. Passing the list of names (rather than the sub-frame train[dels])
# states the intent directly and avoids materialising a throw-away copy of those columns.
train1 = train.drop(columns=dels)
train1.head()

In [None]:
# Summarise the columns that remain in `train1` after the drop.
train1.info(max_cols = 100)

### Build Pipelines

In [None]:
# Numeric model inputs (the order here fixes the column order seen by the numeric pipeline)
num_feat = [
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
    'REGION_POPULATION_RELATIVE',
    'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
    'OWN_CAR_AGE', 'HOUR_APPR_PROCESS_START',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG',
    'CRED_ANNUITY', 'AGE',
]

# Categorical model inputs (one-hot encoded downstream)
cat_feat = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'WEEKDAY_APPR_PROCESS_START',
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
    'ORGANIZATION_TYPE',
]

print(num_feat)
print(cat_feat)

In [None]:
# Full list of model input columns: numeric names first, categorical names after
features = [*num_feat, *cat_feat]
print(features)

In [None]:
# Numeric branch: fill gaps with the column median, then standardise
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode;
# categories not seen during fit are ignored at transform time
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch and concatenate the results
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_feat),
    ('cat', cat_pipe, cat_feat),
])

In [None]:
# Fit the preprocessor on the `features` columns of the cleaned frame train1.
# NOTE(review): every row of train1 is used here, so the imputation/scaling statistics
# also come from rows that later land in cross-validation test folds — confirm this is acceptable.
preprocessor.fit(train1[features])

In [None]:
# Reproducible 10% sample (without replacement) of `train`, used to keep the grid searches fast
train30k = train.sample(frac=0.1, random_state=1)


In [None]:
# Run the sampled rows through the fitted preprocessor and pull out the matching label vector
X_train = preprocessor.transform(train30k[features])
y_train = train30k['TARGET'].values

In [None]:
# Sanity check: print the shapes of the transformed feature matrix and the label vector

print('Shape of features: ', X_train.shape)
print('Shape of target: ', y_train.shape)

## Project Week 2 Models

**XGBoost**

In [None]:
%%time

XGB_clf = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

XGB_parameters = {
    'max_depth': range (1, 2, 3),
    'n_estimators': range(25, 100, 200),
    'learning_rate': [0.05, 0.01, 1]
}

XGB_grid = GridSearchCV(XGB_clf, XGB_parameters, cv=10, n_jobs=10, verbose=True, scoring= 'roc_auc')
XGB_grid.fit(X_train, y_train)


XGB_model = XGB_grid.best_estimator_

print('Best Parameters:', XGB_grid.best_params_)
print('Best CV Score:  ', XGB_grid.best_score_)
print('Training Acc:   ', XGB_model.score(X_train, y_train))

**LightGBM**

In [None]:
%%time

LGBM_clf = lgb.LGBMClassifier(boosting_type='gbdt',n_estimators= 5000, 
                              class_weight='balanced', subsample=0.8, colsample_bytree= 0.7, n_jobs=-1)

LGBM_parameters = {
    'max_depth': range (1, 2, 3),
    'learning_rate': [0.05, 0.03, 0.01], 
    'metric' : ['auc', 'binary_logloss']
}


LGBM_grid = GridSearchCV(LGBM_clf, LGBM_parameters, cv=10, n_jobs=10, verbose=True, scoring= 'roc_auc')
LGBM_grid.fit(X_train, y_train)


LGBM_model = LGBM_grid.best_estimator_

print('Best Parameters:', LGBM_grid.best_params_)
print('Best CV Score:  ', LGBM_grid.best_score_)
print('Training Acc:   ', LGBM_model.score(X_train, y_train))

#### Save pipeline and model for submission

In [None]:
# Persist the fitted ColumnTransformer so the same preprocessing can be re-applied at scoring time.
joblib.dump(preprocessor, 'wk2default_preprocessor.joblib')

In [None]:
# Save the best XGBoost model found by the grid search.
# GridSearchCV (refit left at its default) has already retrained the winning parameter
# combination on all of X_train, so best_estimator_ is used directly instead of
# re-typing the hyper-parameters by hand, which can silently drift from the search result.
XGB_model = XGB_grid.best_estimator_
joblib.dump(XGB_model, 'XGB_default_model.joblib')

In [None]:
# Save the best LightGBM model found by the grid search.
# best_estimator_ is already refitted on all of X_train by GridSearchCV, so it is saved
# as-is rather than rebuilding the classifier from hand-copied hyper-parameters.
LGBM_model = LGBM_grid.best_estimator_
joblib.dump(LGBM_model, 'LGBM_default_model.joblib')

## Project Week 1 Models

In [None]:
%%time 

lr_clf = LogisticRegression(max_iter= 250, solver='saga', penalty='elasticnet')

lr_parameters = {
    'l1_ratio':[0, 0.5, 1],
    'C': [0.01, 0.1, 1]
}

lr_grid = GridSearchCV(lr_clf, lr_parameters, cv=10, refit='True', n_jobs=-1, verbose=10, scoring= 'roc_auc')
lr_grid.fit(X_train_10K, y_train_10K)

lr_model = lr_grid.best_estimator_

print('Best Parameters:', lr_grid.best_params_)
print('Best CV Score:  ', lr_grid.best_score_)
print('Training Acc:   ', lr_model.score(X_train_10K, y_train_10K))

In [None]:
# Tabulate the logistic-regression grid: one row per parameter combination plus its mean CV score
lr_summary = pd.DataFrame(lr_grid.cv_results_['params']).assign(
    cv_score=lr_grid.cv_results_['mean_test_score']
)

# One curve per l1_ratio: CV score against C, with C on a log axis
for r in lr_parameters['l1_ratio']:
    temp = lr_summary[lr_summary['l1_ratio'] == r]
    plt.plot(temp['C'], temp['cv_score'], label=r)
plt.xscale('log')
plt.xlabel('Regularization Parameter (C)')
plt.ylabel('CV Score')
plt.legend(title='L1 Ratio', loc='lower right')
plt.grid()
plt.show()

print(lr_summary.to_string(index=False))

#### Decision Tree

In [None]:
%%time 

dt_clf = DecisionTreeClassifier(random_state=1)

dt_parameters = {
    'max_depth': [1, 5, 7, 10],
    'min_samples_leaf': [2, 3, 4, 5]
}

dt_grid = GridSearchCV(dt_clf, dt_parameters, cv=75, refit='True', n_jobs=-1, verbose=0, scoring='roc_auc')
dt_grid.fit(X_train_10K, y_train_10K)

dt_model = dt_grid.best_estimator_

print('Best Parameters:', dt_grid.best_params_)
print('Best CV Score:  ', dt_grid.best_score_)
print('Training Acc:   ', dt_model.score(X_train_10K, y_train_10K))

In [None]:
# Tabulate the decision-tree grid: parameter combinations alongside their mean CV score
dt_summary = pd.DataFrame(dt_grid.cv_results_['params']).assign(
    cv_score=dt_grid.cv_results_['mean_test_score']
)

# One depth-vs-score curve for each min_samples_leaf setting
for ms in dt_parameters['min_samples_leaf']:
    temp = dt_summary[dt_summary['min_samples_leaf'] == ms]
    plt.plot(temp['max_depth'], temp['cv_score'], label=ms)
plt.xlabel('Maximum Depth')
plt.ylabel('CV Score')
plt.legend(title='Min Samples')
plt.grid()
plt.show()

print(dt_summary.to_string(index=False))

### Random Forest

In [None]:
%%time 

rf_clf = RandomForestClassifier(random_state = 1, n_estimators = 25)

rf_parameters = {
    'max_depth': [1, 3, 5, 7, 11],
    'min_samples_leaf': [4, 6, 10, 14]
}

rf_grid = GridSearchCV(rf_clf, rf_parameters, cv=10, refit='True', n_jobs=-1, verbose=0, scoring='roc_auc')
rf_grid.fit(X_train_10K, y_train_10K)

rf_model = rf_grid.best_estimator_

print('Best Parameters:', rf_grid.best_params_)
print('Best CV Score:  ', rf_grid.best_score_)
print('Training Acc:   ', rf_model.score(X_train_10K, y_train_10K))

In [None]:
# Tabulate the random-forest grid: parameter combinations alongside their mean CV score
rf_summary = pd.DataFrame(rf_grid.cv_results_['params']).assign(
    cv_score=rf_grid.cv_results_['mean_test_score']
)

# One depth-vs-score curve for each min_samples_leaf setting
for ms in rf_parameters['min_samples_leaf']:
    temp = rf_summary[rf_summary['min_samples_leaf'] == ms]
    plt.plot(temp['max_depth'], temp['cv_score'], label=ms)
plt.xlabel('Maximum Depth')
plt.ylabel('CV Score')
plt.legend(title='Min Samples')
plt.grid()
plt.show()

print(rf_summary.to_string(index=False))

In [None]:
#determine the best model found above and save that to a file. 
#rf_model = RandomForestClassifier(random_state=1, n_estimators=25, max_depth = 3, min_samples_leaf = 14)
#rf_model.fit(X_train_10K, y_train_10K)

#joblib.dump(rf_model, 'rf_default_model.joblib')

### LGBM Classifier

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier


# LightGBM on the cleaned frame directly (no one-hot matrix), with a hold-out split
# and early stopping on the validation AUC.

# SK_ID_CURR is the index of train1 (set when the CSV was loaded), not a column, so only
# TARGET has to be removed; listing SK_ID_CURR here raised KeyError.
feat = train1.drop(columns=['TARGET'])
label = train1['TARGET']

# LightGBM accepts pandas 'category' columns but rejects object (string) columns,
# so the remaining text columns are cast before splitting.
obj_cols = feat.select_dtypes(include='object').columns
feat[obj_cols] = feat[obj_cols].astype('category')

# The split is bound to train_x / valid_x / train_y / valid_y — the names the fit call
# below already used — which also keeps the preprocessed X_train / y_train arrays from
# the earlier cells intact.
train_x, valid_x, train_y, valid_y = train_test_split(feat, label, test_size=0.2, random_state=3)
print('Shape of train/valid features: ', train_x.shape, valid_x.shape)


# `silent` is not a supported constructor argument in current LightGBM; verbose=-1 alone
# quietens the library output.
clf = LGBMClassifier(
        n_jobs = -1,
        n_estimators = 2000,
        learning_rate = 0.1,
        num_leaves = 16,
        max_depth = 10,
        verbose = -1
        )

# Early stopping and periodic logging are passed as callbacks: the `early_stopping_rounds`
# and `verbose` keywords of fit() were removed in LightGBM 4.
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc',
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=100)])
        
from lightgbm import plot_importance

# Plot importances of the fitted model (the DataFrame train1 was passed here before)
plot_importance(clf, figsize=(18, 40))