In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn import model_selection, linear_model
from sklearn.impute import SimpleImputer
import util

In [None]:
# Path: data-processing.ipynb
# Load the main application table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/application_train.csv')
# Preview the first 15 rows
df.head(n=15)

In [None]:
# Initial cleanup pass (original note: removes the columns first deemed useless).
# The return value is not used, so defaultClean presumably modifies df in place — TODO confirm in util.
util.defaultClean(df)
# Split the label off: pop returns the TARGET column and removes it from df in one step.
target = df.pop("TARGET")

In [None]:
# Aggregate bureau.csv down to one row per SK_ID_CURR and attach it to df.
# Written as a single non-mutating chain: the previous version ran inplace
# drop/fillna and a column assignment on a boolean-filtered frame, which can
# trigger pandas' SettingWithCopyWarning, and contained a mid-cell
# `bureau.head(15)` whose result was never displayed.
bureau = (
    pd.read_csv('data/bureau.csv')
    # Keep only the records whose CREDIT_CURRENCY is "currency 1"
    .loc[lambda records: records["CREDIT_CURRENCY"] == "currency 1"]
    # Discard the columns that are not aggregated below
    .drop(columns=[
        "SK_ID_BUREAU",
        "CREDIT_ACTIVE",
        "CREDIT_CURRENCY",
        "DAYS_CREDIT",
        "CREDIT_DAY_OVERDUE",
        "DAYS_CREDIT_ENDDATE",
        "DAYS_ENDDATE_FACT",
        "CREDIT_TYPE",
        "DAYS_CREDIT_UPDATE",
        "AMT_ANNUITY",
    ])
    # Missing values count as 0 in the sums
    .fillna(0)
    # A constant 1 per record, so that summing it yields the record count per SK_ID_CURR
    .assign(CREDIT_BUREAU_APPLICATION_COUNT=1)
    .groupby("SK_ID_CURR")
    .agg("sum")
)
# Left join: every row of df is kept; rows without a match in bureau get NaN
# in the aggregated columns. SK_ID_CURR is the index of `bureau` after the groupby.
df = df.merge(bureau, how="left", on="SK_ID_CURR")

In [None]:
# Remove the SK_ID_CURR identifier column from the feature frame.
df = df.drop(columns="SK_ID_CURR")

In [None]:
# Restrict df to the columns that contain at least one NaN
has_nan = df.isna().any()
df2 = df.loc[:, has_nan]
# Report how many such columns there are, then list them
print("numCols: ", df2.shape[1])
df2.columns

In [None]:
# Map each column that has missing values to (dtype, fraction of rows that are NaN).
# The fraction is computed for all columns in one vectorised pass; the mean of a
# boolean mask is the share of True entries, so no per-column filtered copy of
# the whole frame is needed.
null_fraction = df.isna().mean()
percentNull = {
    col: (df.dtypes[col], float(fraction))
    for col, fraction in null_fraction.items()
    if fraction > 0
}
percentNull

In [None]:
# Display the rows whose EXT_SOURCE_2 value is missing
missing_ext_source_2 = df['EXT_SOURCE_2'].isna()
df.loc[missing_ext_source_2]

In [None]:
# The data directory includes a column-description CSV; load it for reference.
# The file is read as ISO-8859-1 rather than pandas' default UTF-8.
col_descriptions = pd.read_csv(
    'data/HomeCredit_columns_description.csv',
    encoding="ISO-8859-1",
)
col_descriptions.head(n=20)

# Data Preprocessing

In [None]:
# Rows that actually have an AMT_ANNUITY value
df_annuity = df.loc[df["AMT_ANNUITY"].notnull()]
# Mean AMT_ANNUITY and mean AMT_CREDIT over those rows
numerator = df_annuity["AMT_ANNUITY"].mean()
denominator = df_annuity["AMT_CREDIT"].mean()
# Ratio of the two means (a ratio of averages, not an average of per-row ratios)
ratio = numerator / denominator
# Fill each missing AMT_ANNUITY with ratio times that row's AMT_CREDIT
df["AMT_ANNUITY"] = df["AMT_ANNUITY"].fillna(df["AMT_CREDIT"] * ratio)

# Fill each missing AMT_GOODS_PRICE with that row's AMT_CREDIT
df["AMT_GOODS_PRICE"] = df["AMT_GOODS_PRICE"].fillna(df["AMT_CREDIT"])

# Columns whose missing values are replaced by 0
zero_cols = [
    "OWN_CAR_AGE",
    "DAYS_LAST_PHONE_CHANGE",
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]
# Zero-fill all of them in a single assignment
df[zero_cols] = df[zero_cols].fillna(0)

## Handle Categorical Data

In [None]:
# Columns stored as strings / generic objects are converted to the category dtype
str_columns = df.select_dtypes(include=['string', "object"]).columns
df = df.astype({name: "category" for name in str_columns})
# Every category-typed column is then handed to the project's one-hot helper
cat_columns = df.select_dtypes(include=['category']).columns
df = util.onehot_categorical_columns(df, cat_columns)

In [None]:
# Preview the first rows of df (head() defaults to five rows).
df.head()

In [None]:
# Replace every remaining NaN in df with the constant 0.
# np.nan is used as the missing-value marker: the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
# Slice assignment writes the imputed values back into the existing frame,
# so df keeps its index and column labels.
df[:] = imputer.fit_transform(df)

In [None]:
# List the column labels currently present in df.
df.columns

In [None]:
from get_feature_importance import get_feature_imp
# The call below is currently disabled; uncomment it to run get_feature_imp on
# the prepared features (df) and labels (target).
# NOTE(review): what get_feature_imp computes or displays is defined in
# get_feature_importance.py and is not visible here.
# get_feature_imp(df, target)

In [None]:
# The PCA step is currently disabled; when enabled it replaces the features
# with the result of util.run_pca(df, 60).
# output = util.run_pca(df, 60)
# With PCA disabled, `output` is the same DataFrame object as `df` (an alias, not a copy).
output = df

In [None]:


# Fixed seed so the shuffled split is identical on every run; without it each
# execution produces a different train/test partition and results cannot be reproduced.
RANDOM_STATE = 42

# Labels as a NumPy array, used both as y and as the stratification key
labels = target.to_numpy()

# 75% / 25% split, stratified on the labels so both parts keep the same class proportions
train_X, test_X, train_y, test_y = model_selection.train_test_split(
    output.to_numpy(),
    labels,
    test_size=0.25,
    shuffle=True,
    stratify=labels,
    random_state=RANDOM_STATE,
)
print("training size:", len(train_X))
print("testing size:", len(test_X))

In [None]:
from classifier import run_and_compare
# Pass the train/test feature and label arrays to the project's classifier module.
# NOTE(review): which models run_and_compare fits and what it reports is defined
# in classifier.py and is not visible here.
run_and_compare(train_X, train_y, test_X, test_y)