In [None]:
import numpy as np
import pandas as pd
from dstk.utils.data_cleaning import clean_columns

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')

%matplotlib inline

In [None]:

# Notebook-wide display defaults.
plt.rc('figure', figsize=(9, 6))            # default figure size (width, height)
np.set_printoptions(suppress=True)          # no scientific notation for small floats
pd.set_option('display.max_columns', 150)   # show up to 150 columns when displaying frames



FILES:

    - application_test.csv.zip
    - application_train.csv.zip

In [None]:
col_des = pd.read_csv('HomeCredit_columns_description.csv', encoding='latin-1')
col_des.shape

In [None]:
col_des.head()

In [None]:
col_des.Table.unique()

## Cleaning
 
 
   - Examine Distribution of Target (check for imbalanced classes)
   - Examine Column Types
   - Remove/Impute Anomalies
     - You can also create a categorical value (0,1) for whether or not the data was anomalous.
   - Fill Missing Variables
   - Label-encode binary features and one-hot encode (OHE) categorical features with more than two values
     - _Make sure to drop categories that are not in the test set!_
     - check out df.align?
   - Look for correlations
   - Feature Engineering

## Applications

Main File

Contains the main ID (`SK_ID_CURR`). One row is one loan.

In [None]:
# Load the train/test application tables straight from the zipped CSVs.
# SK_ID_CURR is read as a string so the ID is kept as text instead of being parsed as a number.
app_train = pd.read_csv('application_train.csv.zip', dtype= {'SK_ID_CURR':str})
app_test = pd.read_csv('application_test.csv.zip', dtype= {'SK_ID_CURR':str})
app_train.shape, app_test.shape

In [None]:
app_train.head()

In [None]:
# (column name, description) pairs for every column of the application tables
list(
    col_des
    .loc[col_des.Table == 'application_{train|test}.csv', ['Row', 'Description']]
    .itertuples(index=False, name=None)
)

In [None]:
# Histogram of TARGET to check how balanced the two classes (0 / 1) are
app_train.TARGET.hist()

In [None]:
app_train.groupby('CODE_GENDER').TARGET.sum()

In [None]:
# Age distribution (in years) of applicants, one density curve per TARGET value.
# DAYS_BIRTH is divided by -365 to convert it to years
# (presumably it is stored as a negative day count — confirm against the column description).
sns.kdeplot(app_train.loc[app_train.TARGET==0, 'DAYS_BIRTH']/-365, label = 'Repayed')
sns.kdeplot(app_train.loc[app_train.TARGET==1, 'DAYS_BIRTH']/-365, label = 'Defaulted')
plt.xlabel('Age Yrs')
plt.ylabel('Density')
plt.title('Age distribution by TARGET')
# Draw the legend explicitly: newer seaborn releases may not add one for `label=`,
# which would leave the two curves indistinguishable.
plt.legend();

### Clean Columns

In [None]:
# Normalise the column names of both frames with the dstk `clean_columns` helper.
# NOTE(review): presumably this lower-cases the names — later cells address columns
# as e.g. 'sk_id_curr', and `col_descrip` upper-cases names again for its lookup. Confirm.
app_train.columns = clean_columns(app_train)
app_test.columns = clean_columns(app_test)
app_train.shape, app_test.shape

### Check Column Types

In [None]:
def col_descrip(table, col):
    """Look up the description of a column in the `col_des` data dictionary.

    Parameters
    ----------
    table : str
        Value matched against `col_des.Table`, e.g. 'application_{train|test}.csv'.
    col : str
        Column name; it is upper-cased before being matched against `col_des.Row`.

    Returns
    -------
    The `Description` value of the first matching row.

    Raises
    ------
    IndexError
        If no row of `col_des` matches the table/column pair.
    """
    # Echo the lookup key before returning the description.
    print(table)
    print(col)
    matches = col_des.loc[(col_des.Table==table) & (col_des.Row==col.upper()),
                          'Description']
    if matches.empty:
        # Same exception type the bare `.values[0]` would raise, but it names what is missing.
        raise IndexError(f'No description found for column {col.upper()!r} in table {table!r}')
    return matches.values[0]

In [None]:
app_train.dtypes.unique()

In [None]:
# Primary key and target are tracked separately from the feature groups.
pk = ['sk_id_curr']
tgt = ['target']

# Group the remaining columns by their pandas dtype.
dtype_by_col = app_train.dtypes
obj_cols = dtype_by_col.index[dtype_by_col == 'object'].drop(pk).tolist()
int_cols = dtype_by_col.index[dtype_by_col == 'int64'].tolist()
float_cols = dtype_by_col.index[dtype_by_col == 'float64'].tolist()
numeric_cols = int_cols + float_cols

# Binary (two-valued) columns are collected here by later cells.
indicator_cols = []

In [None]:
# Numeric columns with exactly two distinct values are treated as already-encoded flags:
# move them out of the int/float groups and into the indicator group.
n_distinct = app_train[numeric_cols].nunique()
indicator_cols.extend(n_distinct.index[n_distinct == 2].tolist())

int_cols = [name for name in int_cols if name not in indicator_cols]
float_cols = [name for name in float_cols if name not in indicator_cols]
numeric_cols = int_cols + float_cols

In [None]:
# Target column separate: take it back out of indicator_cols
# (presumably the two-distinct-values check in the previous cell picked it up).
# NOTE(review): list.remove raises ValueError when the name is absent, so this cell
# fails if it is re-run without re-running the cells above it.
for c in tgt:
    indicator_cols.remove(c)


In [None]:
# Numeric columns with fewer than 10 distinct values — still undecided whether
# these should be handled as categorical or as numeric.
# NOTE: nunique() ignores NaNs, so a missing value is not counted as a level.
app_train[numeric_cols].nunique().loc[lambda counts: counts < 10]

In [None]:
col_descrip('application_{train|test}.csv', 'region_rating_client')

In [None]:
col_descrip('application_{train|test}.csv', 'amt_req_credit_bureau_day')

In [None]:
col_descrip('application_{train|test}.csv', 'amt_req_credit_bureau_hour')

In [None]:
app_train['amt_req_credit_bureau_day'].unique()

In [None]:
# Sanity check: the sizes of the column groups should add up to the number of columns in app_train.
len(pk) + len(tgt) + len(obj_cols) + len(indicator_cols) + len(int_cols) + len(float_cols), app_train.columns.shape

In [None]:
# Reorder both frames: key first (then the target, train only), followed by each
# feature group with its columns in alphabetical order.
ordered_features = [
    name
    for group in (obj_cols, indicator_cols, int_cols, float_cols)
    for name in sorted(group)
]
app_train = app_train[pk + tgt + ordered_features]
app_test = app_test[pk + ordered_features]
app_train.shape, app_test.shape

In [None]:
# Names of the numeric columns that contain at least one negative value (training data)
has_negative = app_train[numeric_cols].lt(0).any(axis=0)
neg_cols = has_negative.index[has_negative]

In [None]:
# Flip the sign of those columns in both frames so the values are easier to interpret.
for frame in (app_train, app_test):
    frame[neg_cols] = frame[neg_cols].mul(-1)

In [None]:
# Days employed has some negative some positive values
app_train[numeric_cols].loc[:,(app_train[numeric_cols] < 0).any(0)].columns

In [None]:
# days_employed still mixes negative and positive values after the sign flip above.
# Take the absolute value (equivalent to multiplying just the negative values by -1)
# in BOTH frames: fixing only the training set would leave the test set's negative
# values out of reach of the `>= cutoff` anomaly filter applied further down.
app_train['days_employed'] = app_train['days_employed'].abs()
app_test['days_employed'] = app_test['days_employed'].abs()

#### Anomalies

In [None]:
def anom_eval(df, col, filename):
    """Print a quick outlier report for one column and draw its histogram.

    Prints the column description (looked up with `col_descrip`), the values that sit
    right after the three largest jumps in the sorted data, and the largest and
    smallest values of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : str
        Name of the column to inspect.
    filename : str
        Table name forwarded to `col_descrip`.

    Returns
    -------
    tuple
        `(max_diffs, nlargest)`: the values located right after the three largest gaps
        of the sorted column, and the result of `df[col].nlargest()`.
    """
    print(col_descrip(filename, col))
    print()

    values = df[col]
    # Sort, difference consecutive values, and keep the row labels that follow
    # the three biggest jumps.
    max_diff_idx = values.sort_values().diff().nlargest(3).index
    max_diffs = df.loc[max_diff_idx, col]
    nlargest = values.nlargest()
    nsmallest = values.nsmallest()

    print('Max Diffs')
    print(max_diffs)
    print()
    print("Largest Vals")
    print(nlargest)
    print()
    print("Smallest Vals")
    print(nsmallest)
    values.hist()
    # (A value_counts().sort_index() call used to sit here; its result was discarded,
    # so it has been dropped.)

    return max_diffs, nlargest

In [None]:
# col = 'amt_income_total'
# app_train[col].hist()

# sns.boxplot(app_train[col], whis=10)

# p25 = np.percentile(app_train[col], 25)
# p75 = np.percentile(app_train[col], 75)
# iqr = p75-p25

# max_val = p75 + 10*iqr
# min_val = p25 - 10*iqr

# app_train.loc[(app_train[col]>max_val) | (app_train[col]<min_val), col].unique()

Identify absurd outliers by looking for large jumps between consecutive sorted values.

This targets distributions in which a subset of values lies WAY outside the normal range, such as `days_employed`.


In [None]:
def _largest_gap_in_stds(ser):
    """Largest jump between consecutive sorted values, in units of the column's std."""
    return ser.sort_values().diff().max() / ser.std()

gaps = app_train[numeric_cols].apply(_largest_gap_in_stds)
gaps.loc[gaps > 2]

In [None]:
# From manual exploration, these are the columns with ridiculous outliers
cols = ['cnt_children',
        'days_employed',
        'amt_income_total',
        'cnt_fam_members',
        'obs_30_cnt_social_circle',
        'def_30_cnt_social_circle',
        'obs_60_cnt_social_circle',
        'def_60_cnt_social_circle',
        'amt_req_credit_bureau_qrt']

In [None]:
anom_eval(app_train, 'cnt_children', 'application_{train|test}.csv')

In [None]:
for col in cols:
    print(col)

    # Value that sits right after the single largest jump in the sorted training data.
    jump_label = app_train[col].sort_values().diff().nlargest(1).index
    jump_value = app_train.loc[jump_label, col].squeeze()
    magnitude = int(np.log10(jump_value))

    if magnitude >= 2:
        # 100 or more: truncate to the leading digit (e.g. 4321 -> 4000)
        scale = 10**magnitude
        cutoff = int(jump_value/scale)*scale
    else:
        # below 100: use the value itself as the cutoff
        cutoff = jump_value

    # Row labels of the anomalous values in each frame (the cutoff comes from train only).
    anom_idx_train = app_train.index[app_train[col] >= cutoff]
    anom_idx_test = app_test.index[app_test[col] >= cutoff]

    # Median of the non-anomalous TRAINING values; used as the replacement in both frames.
    fill_value = app_train[col].drop(anom_idx_train).median()

    # 0/1 flag column marking the rows whose value gets replaced.
    flag = col + '_anom'
    app_train[flag] = app_train.index.isin(anom_idx_train).astype('int64')
    app_test[flag] = app_test.index.isin(anom_idx_test).astype('int64')

    app_train.loc[anom_idx_train, col] = fill_value
    app_test.loc[anom_idx_test, col] = fill_value

    # the flag is a binary feature, so track it with the indicator columns
    indicator_cols.append(flag)

#### Fill NAs

In [None]:
# NOTE(review): `Imputer` is not used in any of the cells visible here, and it was removed
# from sklearn.preprocessing in scikit-learn 0.22 (sklearn.impute.SimpleImputer replaces it),
# so this import fails on current scikit-learn releases.
from sklearn.preprocessing import Imputer

In [None]:
def pct_null(df):
    """Summarise the columns of `df` that contain nulls.

    Returns a frame indexed by column name with `n_null` (number of nulls) and
    `pct_null` (fraction of rows that are null), sorted by `n_null` descending.
    Columns without nulls are left out; returns None when no column has nulls.
    """
    n_missing = df.isnull().sum()
    n_missing = n_missing[n_missing > 0].sort_values(ascending=False)
    if n_missing.empty:
        return None
    return pd.DataFrame({
        'n_null': n_missing,
        'pct_null': n_missing / df.shape[0],
    })

In [None]:
# good that no null primary keys
app_train[pk].isnull().sum(), app_test[pk].isnull().sum()

For columns where less than 20% is null, I'm filling with the most frequent value.

For columns where 20% or more is null, I'm filling with "MISSING", which will get encoded as its own category.

__NOTE__ A _better_ strategy would be to compute how much the target distribution differs for rows where the column is null; if it is distinctly different, label those rows as "MISSING" so that the missingness is predictive. Otherwise, fill with the most frequent value.


_OBJECT COLS_

In [None]:
# Train: split the object columns that contain nulls by how much of each is missing.
obj_nulls = pct_null(app_train[obj_cols])
rarely_null = obj_nulls['pct_null'] < 0.2
fill_most_frequent = obj_nulls.index[rarely_null].tolist()  # under 20% null
fill_missing = obj_nulls.index[~rarely_null]                # everything else

In [None]:
# Most frequent value per sparsely-null column (first row of .mode()), as {column: value}.
modes = app_train[fill_most_frequent].mode().iloc[0].to_dict()
app_train.fillna(value=modes, inplace=True)
# The remaining null-bearing object columns get an explicit 'MISSING' category.
app_train.fillna(value={name: 'MISSING' for name in fill_missing}, inplace=True)

In [None]:
# Test: same split as for train, based on the test set's own null percentages.
obj_nulls = pct_null(app_test[obj_cols])
rarely_null = obj_nulls['pct_null'] < 0.2
fill_most_frequent = obj_nulls.index[rarely_null].tolist()  # under 20% null
fill_missing = obj_nulls.index[~rarely_null]                # everything else

In [None]:
# The test set's nulls are filled with the most frequent values of the TRAINING data,
# so no fill statistic is computed from the test set itself.
modes = app_train[fill_most_frequent].mode().iloc[0].to_dict()
app_test.fillna(value=modes, inplace=True)

In [None]:
app_test.fillna(dict.fromkeys(fill_missing,'MISSING'), inplace=True)

_NUMERIC COLS_

In [None]:
# For numeric cols, fill nulls in both frames with the TRAINING medians.
# The medians stay a Series keyed by column name: calling .squeeze() on it would
# collapse a single-column result to a scalar, and fillna would then fill every
# column of the frame with that one value.
medians = app_train[numeric_cols].median()
app_train.fillna(medians, inplace=True)
app_test.fillna(medians, inplace=True)

### Convert Categorical Variables To Numeric

Label Encode binary categoricals and OHE other categoricals with multiple values

In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# first make sure no columns have only 1 value
app_train.columns[app_train.nunique() < 2]

In [None]:
# Object columns with exactly two distinct values in the training data
binary_cols = [name for name in obj_cols if app_train[name].nunique() == 2]
binary_cols

In [None]:
# Label-encode the binary object columns: fit on train, reuse the fitted mapping on test.
lcoder = LabelEncoder()
for col in binary_cols:
    print(col)
    train_values = set(app_train[col].unique())
    test_values = set(app_test[col].unique())
    # Check to make sure it's binary across both test and train
    if train_values != test_values:
        print(f'{col} is not Binary!')
        print('Values in train:', train_values)
        # Report the TEST values here (the training values used to be printed twice).
        print('Values in test:', test_values)
        # Skipped columns are left un-encoded and stay in obj_cols.
        continue
    app_train[col] = lcoder.fit_transform(app_train[col])
    app_test[col] = lcoder.transform(app_test[col])
    # don't forget to add it to indicator cols
    indicator_cols.append(col)
    obj_cols.remove(col)

In [None]:
# One hot encode every object column that is still un-encoded.
# The label-encoding loop removes each column it encodes from obj_cols, so whatever is
# left needs one-hot encoding — including any column in binary_cols that the loop
# skipped because its train and test values differed. Filtering on binary_cols here
# would leave such a column as raw strings in the output.
# list() takes a copy, because the encoding loop removes from obj_cols while iterating.
non_binary_cats = list(obj_cols)
non_binary_cats

In [None]:
for col in non_binary_cats:
    print(col)
    # One dummy column per category, named "<col>_<category>".
    ohe_train = pd.get_dummies(app_train[col], prefix=col)
    ohe_test = pd.get_dummies(app_test[col], prefix=col)

    # Drop the first TRAIN category as the reference level ...
    ohe_train = ohe_train.iloc[:, 1:]
    # ... then force the test dummies onto exactly the training columns: categories seen
    # only in train are added as all-zero columns, categories seen only in test (and the
    # reference level) are dropped. Dropping `ohe_train.columns[0]` from test AFTER the
    # train drop removed a different column from each frame.
    ohe_test = ohe_test.reindex(columns=ohe_train.columns, fill_value=0)

    app_train = pd.concat([app_train, ohe_train], axis=1)
    app_test = pd.concat([app_test, ohe_test], axis=1)
    print(app_train.shape, app_test.shape)

    # The raw categorical column is replaced by its dummies.
    del app_train[col]
    del app_test[col]
    obj_cols.remove(col)
    indicator_cols.extend(ohe_train.columns)

In [None]:
app_train.dtypes.value_counts()

In [None]:
# Persist the cleaned frames (without the index) for downstream use.
# NOTE(review): to_csv does not create missing directories — 'clean_data/' must already exist.
app_train.to_csv('clean_data/app_train.csv',index=False)
app_test.to_csv('clean_data/app_test.csv',index=False)