In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn import model_selection, linear_model
from sklearn.impute import SimpleImputer
import util

In [None]:
# Notebook: data-processing.ipynb
# Load the application training table and preview its first 15 rows.
APPLICATION_CSV = 'data/application_train.csv'
df = pd.read_csv(APPLICATION_CSV)
df.head(n=15)

In [None]:
# Drop the columns we initially judged to be useless.
# NOTE(review): the return value is ignored, so defaultClean presumably edits df in place — confirm in util.
util.defaultClean(df)
# Pull the label out of the feature frame: pop() returns the column and removes it from df.
target = df.pop("TARGET")

In [None]:
# Collapse the credit-bureau table to one row per applicant (SK_ID_CURR) and
# attach it to the application frame.
bureau = pd.read_csv('data/bureau.csv')
bureau = (
    # Keep only records reported in "currency 1".
    bureau[bureau["CREDIT_CURRENCY"] == "currency 1"]
    .drop(columns=["SK_ID_BUREAU", "CREDIT_ACTIVE", "CREDIT_CURRENCY", "DAYS_CREDIT", "CREDIT_DAY_OVERDUE",
                   "DAYS_CREDIT_ENDDATE", "DAYS_ENDDATE_FACT", "CREDIT_TYPE", "DAYS_CREDIT_UPDATE", "AMT_ANNUITY"])
    .fillna(0)
)

# Each bureau record contributes 1, so SUMMING this column per applicant yields
# the number of records. (Averaging it, as was done before, always gives 1.0.)
bureau_count_col = "CREDIT_BUREAU_APPLICATION_COUNT"
bureau[bureau_count_col] = 1

# Every other remaining column is averaged per applicant; only the count is summed.
bureau_agg = {
    col: "mean"
    for col in bureau.columns
    if col not in ("SK_ID_CURR", bureau_count_col)
}
bureau_agg[bureau_count_col] = "sum"

# groupby sorts by key on its own, so no explicit sort is needed beforehand.
bureau = bureau.groupby("SK_ID_CURR").agg(bureau_agg)

# Left join: applicants with no bureau history keep NaN in the new columns.
df = df.merge(bureau, how="left", on="SK_ID_CURR")

In [None]:
# Collapse the previous-application table to one row per applicant (SK_ID_CURR)
# and attach it to the application frame.
prev = pd.read_csv('data/previous_application.csv')
prev = prev.drop(columns=["SK_ID_PREV", "NAME_CONTRACT_TYPE", "WEEKDAY_APPR_PROCESS_START", "HOUR_APPR_PROCESS_START", "FLAG_LAST_APPL_PER_CONTRACT", "NFLAG_LAST_APPL_IN_DAY",
                          "AMT_DOWN_PAYMENT", "AMT_GOODS_PRICE", "RATE_INTEREST_PRIMARY", "RATE_INTEREST_PRIVILEGED",
                          "NAME_CASH_LOAN_PURPOSE", "DAYS_DECISION", "NAME_TYPE_SUITE", "NAME_GOODS_CATEGORY", "NAME_PORTFOLIO", "NAME_PRODUCT_TYPE", "CHANNEL_TYPE", "SELLERPLACE_AREA", "NAME_SELLER_INDUSTRY", "CNT_PAYMENT",
                          "PRODUCT_COMBINATION", "DAYS_FIRST_DRAWING", "DAYS_FIRST_DUE", "DAYS_LAST_DUE_1ST_VERSION", "DAYS_LAST_DUE", "DAYS_TERMINATION"])
# Ignore canceled applications, then zero-fill missing values. fillna returns a
# new frame, so the column assignments below do not write into a filtered view.
prev = prev[prev["NAME_CONTRACT_STATUS"] != "Canceled"].fillna(0)

# Credit divided by annuity. A zero annuity (including missing values that were
# just zero-filled) is treated as missing so the division yields NaN rather than
# +/-inf; NaN is skipped by the per-applicant mean below.
prev["TIME_TO_PAY"] = prev["AMT_CREDIT"] / prev["AMT_ANNUITY"].replace(0, np.nan)
# Refused applications count as zero credit. This is done after TIME_TO_PAY so
# that the ratio still uses the original credit amount.
prev["AMT_CREDIT"] = np.where(prev["NAME_CONTRACT_STATUS"] == "Refused", 0, prev["AMT_CREDIT"])
prev["CREDIT_DIFFERENCE"] = prev["AMT_CREDIT"] - prev["AMT_APPLICATION"]
prev = prev.drop(columns=["AMT_ANNUITY", "AMT_APPLICATION", "AMT_CREDIT"])

# One-hot encode the remaining string columns and turn boolean flags into 0/1.
str_columns = prev.select_dtypes(['string', "object"]).columns
prev[str_columns] = prev[str_columns].astype("category")
cat_columns = prev.select_dtypes(['category']).columns
prev = util.onehot_categorical_columns(prev, cat_columns)
prev = prev.replace({False: 0, True: 1})

# The numeric features get a specific aggregation each; everything else
# (the encoded indicator columns) is summed per applicant.
prev_numeric_cols = ["RATE_DOWN_PAYMENT", "NFLAG_INSURED_ON_APPROVAL", "TIME_TO_PAY", "CREDIT_DIFFERENCE"]
prev1 = prev[["SK_ID_CURR"] + prev_numeric_cols]
prev2 = prev.drop(columns=prev_numeric_cols)
prev1 = prev1.groupby("SK_ID_CURR").agg(
    AVG_DOWN_PAYMENT=pd.NamedAgg(column="RATE_DOWN_PAYMENT", aggfunc="mean"),
    NUM_INSURED=pd.NamedAgg(column="NFLAG_INSURED_ON_APPROVAL", aggfunc="sum"),
    AVG_MONTHS_TO_PAY=pd.NamedAgg(column="TIME_TO_PAY", aggfunc="mean"),
    TOTAL_CREDIT_DIFFERENCE=pd.NamedAgg(column="CREDIT_DIFFERENCE", aggfunc="sum"),
)
prev2 = prev2.groupby("SK_ID_CURR").sum()
prev = prev1.merge(prev2, how="inner", on="SK_ID_CURR")

# Left join: applicants with no previous applications keep NaN in the new columns.
df = df.merge(prev, how="left", on="SK_ID_CURR")
df.head(15)

In [None]:
# The applicant id was only needed as a join key; it is not a model feature.
df = df.drop(columns=["SK_ID_CURR"])

In [None]:
# Sub-frame holding only the columns that contain at least one NaN.
has_nan = df.isna().any()
df2 = df.loc[:, has_nan]
print("numCols: ", df2.shape[1])
df2.columns

In [None]:
# For every column that has missing values, record (dtype, fraction of rows that are NaN).
n_rows = len(df)
null_counts = df.isnull().sum()
percentNull = {}
for col, n_null in null_counts.items():
    fraction = int(n_null) / n_rows
    if fraction > 0:
        percentNull[col] = (df.dtypes[col], fraction)
percentNull

In [None]:
# Inspect the rows that have no EXT_SOURCE_2 value.
missing_ext_source_2 = df['EXT_SOURCE_2'].isna()
df.loc[missing_ext_source_2]

In [None]:
# The dataset ships with a column-description file; read it before guessing
# what a column means.
col_descriptions = pd.read_csv(
    'data/HomeCredit_columns_description.csv',
    encoding="ISO-8859-1",
)
col_descriptions.head(20)

# Data Preprocessing

In [None]:
# Rows where AMT_ANNUITY is present (not NaN).
df_annuity = df.loc[df["AMT_ANNUITY"].notna()]
# Mean AMT_ANNUITY and mean AMT_CREDIT over those rows.
numerator = df_annuity["AMT_ANNUITY"].mean()
denominator = df_annuity["AMT_CREDIT"].mean()
# Ratio of the two means (a ratio of averages, not an average of per-row ratios).
ratio = numerator / denominator
# A missing AMT_ANNUITY is estimated as ratio * that row's AMT_CREDIT.
estimated_annuity = ratio * df["AMT_CREDIT"]
df["AMT_ANNUITY"] = df["AMT_ANNUITY"].fillna(estimated_annuity)

# A missing AMT_GOODS_PRICE falls back to that row's AMT_CREDIT.
df["AMT_GOODS_PRICE"] = df["AMT_GOODS_PRICE"].fillna(df["AMT_CREDIT"])

# Columns whose missing values are replaced by 0.
zero_cols = [
    "OWN_CAR_AGE",
    "DAYS_LAST_PHONE_CHANGE",
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]
df[zero_cols] = df[zero_cols].fillna(0)

## Handle Categorical Data

In [None]:
# Convert every string/object column to the pandas category dtype ...
str_columns = df.select_dtypes(include=["string", "object"]).columns
df = df.astype({col: "category" for col in str_columns})
# ... then one-hot encode all categorical columns with the project helper.
# (Alternative that was tried: ordinal codes via `x.cat.codes + 1`.)
cat_columns = df.select_dtypes(include=["category"]).columns
df = util.onehot_categorical_columns(df, cat_columns)

In [None]:
# Preview the first five rows of the encoded frame.
df.head(5)

In [None]:
# Replace every remaining NaN with the constant 0.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0)
# Assigning through df[:] writes the imputed values back into the existing
# frame, so the column labels and index are kept.
df[:] = imputer.fit_transform(df)

In [None]:
# List the column labels of the feature frame at this point in the notebook.
df.columns

In [None]:
# Helper from the project-local get_feature_importance module.
from get_feature_importance import get_feature_imp
# The call is left disabled; uncomment it to run the helper on the features and target.
# get_feature_imp(df, target)

In [None]:
# Reduce the feature frame with the project's PCA helper.
# NOTE(review): the second argument is presumably the number of components to keep — confirm in util.
# To skip PCA entirely, use `output = df` instead.
n_components = 200
output = util.run_pca(df, n_components)

In [None]:
# Stratified 75/25 train/test split of the reduced features and the labels.
# A fixed seed makes the split (and any accuracy recorded from it) reproducible
# across kernel restarts; change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42
labels = target.to_numpy()
train_X, test_X, train_y, test_y = model_selection.train_test_split(
    output.to_numpy(),
    labels,
    test_size=0.25,
    shuffle=True,
    stratify=labels,  # keep the class ratio the same in both partitions
    random_state=SPLIT_SEED,
)
print("training size:", len(train_X))
print("testing size:", len(test_X))

In [None]:
# Call the project's comparison helper on the train/test split with model='svm'.
# NOTE(review): presumably this fits and scores several SVM loss variants — confirm in classifier.py.
from classifier import run_and_compare
run_and_compare(train_X, train_y, test_X, test_y, model='svm')

# SVM Accuracies

Note that these numbers will probably differ slightly from run to run (for example, whenever the train/test split changes). I recorded them only to give a general idea of the best parameters.

200 components: 
- Hinge — 0.6432888166607337
- log_loss — 0.6371600030709288
- huber — 0.6045798030541624
- epsilon_insensitive — 0.6243449925510459
- modified_huber — 0.6607661267904723
- squared_hinge — 0.6525500834455048
- perceptron — 0.5963679298868061 (very good on class 0, but probably only because it predicts 0 most of the time and therefore gets a high hit rate)

175 components:
- Hinge — 0.6468060525314879
- log_loss — 0.6172351771686619
- huber — 0.6247846510249453
- epsilon_insensitive — 0.6245843411376562
    - Did a great job predicting defaults (0.84) 
- modified_huber — 0.6557343207158701
- squared_hinge — 0.5492138711315983
- perceptron — 0.5041903878103752 (at roughly 50% accuracy, this is almost certainly just predicting 1 for everything)

150 components: 
- Hinge — 0.6453322424507427
- log_loss — 0.591766431876927
- huber — 0.61793966358117
- epsilon_insensitive — 0.6354215862146304
    - Did a great job predicting defaults (0.78) 
- modified_huber — 0.6425852080603528
- squared_hinge — 0.5466302351985645
- perceptron — 0.6012018087070353


In [None]:
# Call the project's comparison helper on the train/test split with model='lr'.
# NOTE(review): presumably this fits and scores logistic regression for several C values — confirm in classifier.py.
from classifier import run_and_compare
run_and_compare(train_X, train_y, test_X, test_y, model='lr')

# LR Accuracies:

Note that these numbers will probably differ slightly from run to run (for example, whenever the train/test split changes). I recorded them only to give a general idea of the best parameters.

150 Components:
- C 0.25 — 0.6628742530343206
- C 0.5 — 0.6681524393126752
- C 0.75 — 0.6702277034625324
- C 1 — 0.6721468834960813
- C 1.25 — 0.6723835348054287
- C 1.5 — 0.6737241591262073
- C 1.75 — 0.6741726230817547
- C 2 — 0.6736864770456279
- C 5 — 0.6760685536341112
- C 10 — 0.6752884385767906
- C 50 — 0.6750325871221924

200 Components:
- C 0.25 — 0.6663838348393336
- C 0.5 — 0.6710626760777125
- C 0.75 — 0.6734488020021376
- C 1 — 0.6737931665595478
- C 1.25 — 0.6733738733269949
- C 1.5 — 0.6753873349430477
- C 1.75 — 0.6755968401973712
- C 2 — 0.6757949384541065
- C 5 — 0.6754746943499215
- C 10 — 0.6759695045057436
- C 50 — 0.6751665754532465
