## Home Credit Default Risk - Team 3 (Kahsai, Nichols, Pellerito)

### Import packages

In [None]:
# standard Python tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss   # need this for chi-squared function

# special tools for working in Kaggle
import joblib   # save and load ML models
import gc       # garbage collection
import os 

# preprocessing steps
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# machine learning models and tools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# surely there will be a lot more packages loaded by the time we are done!

# First look at training data set

### Read the training data

In [None]:
# Directory holding the competition CSV files; every table below is read relative to it.
MainDir = "../input/../input/home-credit-default-risk"

# Show which files are available in that directory.
available_files = os.listdir(MainDir)
print(available_files)

# Main table - the training applications.
train = pd.read_csv(f'{MainDir}/application_train.csv')

# Supplemental table - source of the extra, aggregated feature sets built further down.
bureau = pd.read_csv(f'{MainDir}/bureau.csv')


### Reminder - these are the columns in train

In [None]:
# Preview the first five rows of the training table (head() returns 5 rows by default).
train.head()

# Bureau table

### First look - head of the bureau table, its size and shape, etc.

In [None]:
# Size of the bureau table and how many distinct applicants it covers.
print(bureau.shape)                                        # size of table - 17 columns x 1.72 million rows
print(bureau.SK_ID_CURR.nunique(), "unique SK_ID_CURR")    # number of unique SK_ID_CURR is 305,811, similar to size of train.csv

# Identify column types.
# Ask pandas whether each dtype is numeric rather than testing `dtype != 'object'`: that test
# labels every non-object dtype (category, datetime, pandas string dtype, ...) as numeric,
# so text columns stored with a non-object dtype would end up in num_features.
types = bureau.dtypes.to_numpy()                           # one dtype object per column
all_columns = bureau.columns.values                        # array of all column names
is_num = np.array(                                         # boolean mask, True where the column is numeric
    [pd.api.types.is_numeric_dtype(column_type) for column_type in types],
    dtype = bool,                                          # keeps the mask boolean even for a table with no columns
)
num_features = all_columns[is_num].tolist()                # list of all numeric columns
cat_features = all_columns[~is_num].tolist()               # list of all non-numeric (categorical) columns
print(len(num_features), "numeric features")               # looks like we have 14 numeric features (including the two key fields)
print(len(cat_features), "categorical features")           # ... and three categorical features

bureau.head(5)
# SK_ID_CURR key field will let us merge this data into the train.csv table; SK_ID_BUREAU key field will let us merge with bureau_balance.csv.

### Wrangling new features from the bureau.csv table

In [None]:
# Count the missing values per column, largest first.
# (Only the last expression of a notebook cell is rendered, so this table is not shown here.)
bureau.isna().sum().to_frame().sort_values(by = 0, ascending = False)

# No categorical variables have missing values. For now, treat missing numeric values as zero.
# This overwrites `bureau` in place, so every later cell sees the filled table.
bureau.fillna(0, inplace = True)

# Per-applicant summary statistics: which aggregation to apply to which bureau column.
bureau_aggregations = {
    'SK_ID_CURR': 'count',               # number of bureau records for the applicant
    'AMT_CREDIT_SUM': 'sum',
    'AMT_CREDIT_SUM_DEBT': 'sum',
    'AMT_CREDIT_SUM_OVERDUE': 'sum',
    'AMT_ANNUITY' : 'sum',
    'DAYS_CREDIT': 'max',
    'CREDIT_DAY_OVERDUE' : 'max',
    'AMT_CREDIT_MAX_OVERDUE' : 'sum',
}

# One row per SK_ID_CURR. NOTE: the applicant id becomes the *index* of the result, while the
# column that is also called 'SK_ID_CURR' holds the record count, not the id.
Grouped = bureau.groupby('SK_ID_CURR').agg(bureau_aggregations)
Grouped

### Merge our new features into the training data

In [None]:
# Merge the per-applicant bureau aggregates into the training table.
#
# `Grouped` is indexed by SK_ID_CURR, and its 'SK_ID_CURR' *column* holds the number of bureau
# records per applicant (it comes from the 'count' aggregation), not the applicant id.
# Merging on that column compares applicant ids with record counts, which is why every merged
# column came back NA; newer pandas versions reject the merge outright because the label is both
# an index level and a column. So give the count its own name and turn the index back into the key.
bureau_features = (Grouped
                   .rename(columns = {'SK_ID_CURR': 'BUREAU_LOAN_COUNT'})
                   .reset_index()
                  )

# Keys are cast to str on both sides (as before) so they are guaranteed to share a dtype.
train['SK_ID_CURR'] = train['SK_ID_CURR'].astype(str)
bureau_features['SK_ID_CURR'] = bureau_features['SK_ID_CURR'].astype(str)

# how = 'left'     : keep every training row, including applicants with no bureau records.
# suffixes         : a bureau column whose name already exists in train (AMT_ANNUITY is the likely
#                    case - confirm against train.columns) gets '_BUREAU' appended, so the
#                    original train column keeps its name instead of becoming '<name>_x'.
# validate         : fail loudly if the aggregate table ever has more than one row per applicant,
#                    which would silently duplicate training rows.
train = train.merge(bureau_features,
                    on = 'SK_ID_CURR',
                    how = 'left',
                    suffixes = ('', '_BUREAU'),
                    validate = 'many_to_one')

# Remaining NA in the merged columns now marks applicants without any bureau records.
train.isna().sum().to_frame().sort_values(0, ascending = False)


### Some Visualizations

# Appendix - data descriptions

In [None]:
# Show the whole description table: no row limit, and up to 100 characters of each cell.
pd.options.display.max_rows = None
pd.set_option("display.max_colwidth", 100)

# The description file is not valid UTF-8, so it is opened explicitly as ISO-8859-1 and the
# already-decoded text handle is passed to pandas.
with open(f'{MainDir}/HomeCredit_columns_description.csv', 'r', encoding = 'ISO-8859-1') as description_file:
    desc = pd.read_csv(description_file)

desc