In [8]:
import pandas as pd
import altair as alt
# Turn off Altair's max-rows check on chart data so the full credit_df frame
# can be passed directly to alt.Chart in the plotting cells below.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [108]:
# Read the raw credit-card data; the CSV path is relative to the notebook's
# working directory.
credit_df = pd.read_csv(filepath_or_buffer="UCI_Credit_Card.csv")

In [125]:
# Display every column name as a plain Python list.
list(credit_df.columns)

['ID',
 'LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'MARRIAGE',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6',
 'default_payment_next_month']

In [109]:
# Clean the frame in one chain:
#   1. drop rows containing any missing value,
#   2. rename the dotted target column to a snake_case name,
#   3. store the target as strings so it can be relabelled afterwards.
credit_df = (
    credit_df
    .dropna()
    .rename(columns={"default.payment.next.month": "default_payment_next_month"})
    .astype({"default_payment_next_month": str})
)


In [110]:
# Recode the string target "1"/"0" into readable "Yes"/"No" labels.
# `replace` leaves values it does not match untouched, so re-running this cell
# keeps the existing "Yes"/"No" labels. The previous `map` call turned every
# already-recoded value into NaN on a second run, silently wiping the target.
credit_df['default_payment_next_month'] = credit_df['default_payment_next_month'].replace(
    {"1": "Yes", "0": "No"}
)

In [111]:
# Preview the first five rows of the cleaned frame.
credit_df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,Yes
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,Yes
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,No
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,No
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,No


In [126]:
# Histogram of LIMIT_BAL: values are binned (at most 30 bins) on the x-axis and
# the number of rows per bin is drawn as semi-transparent bars.
balance_limit_dist = (
    alt.Chart(credit_df)
    .mark_bar(opacity=0.7)
    .encode(
        x=alt.X('LIMIT_BAL', title="Balance Limit", bin=alt.BinParams(maxbins=30)),
        y=alt.Y('count()', title='Distribution'),
    )
    .properties(title='Distribution of Balance Limits')
)

# Write the chart to disk as a PNG (presumably the img/ directory must already exist).
balance_limit_dist.save("img/balance_limit_dist.png")

In [127]:
# Histogram of PAY_AMT1: values are binned (at most 30 bins) on the x-axis and
# the number of rows per bin is plotted on a log-scaled y-axis.
# The x-axis title previously read "Balance Limit" (copied from the LIMIT_BAL
# chart); it now names the field that is actually plotted.
previous_amt_dist = (
    alt.Chart(credit_df)
    .mark_bar(opacity=0.7)
    .encode(
        x=alt.X('PAY_AMT1', title="Previous Payment Amount", bin=alt.BinParams(maxbins=30)),
        y=alt.Y('count()', title='Distribution', scale=alt.Scale(type='log')),
    )
    .properties(title='Distribution of Previous Payment Amount')
)
# Write the chart to disk as a PNG (presumably the img/ directory must already exist).
previous_amt_dist.save("img/previous_amt_dist.png")

In [129]:
# Histogram of AGE (at most 30 bins), with bars coloured by the
# default_payment_next_month label.
# A chart title is set so the exported PNG stands on its own, consistent with
# the other charts saved by this notebook.
age_by_class = (
    alt.Chart(credit_df)
    .mark_bar(opacity=0.7)
    .encode(
        x=alt.X('AGE:Q', bin=alt.BinParams(maxbins=30), title="Age of Client"),
        y=alt.Y("count():Q", title="Distribution"),
        color=alt.Color('default_payment_next_month:N', title="Default on Payment"),
    )
    .properties(title="Distribution of Client Age by Default Status")
)
# Write the chart to disk as a PNG (presumably the img/ directory must already exist).
age_by_class.save("img/age_by_class.png")

In [128]:
# Bar chart with one bar per value of default_payment_next_month, showing how
# many rows carry each label.
class_imbalance = (
    alt.Chart(credit_df)
    .mark_bar()
    .encode(
        x=alt.X('default_payment_next_month', title="Whether Client Defaults"),
        y=alt.Y('count():Q', title='Count'),
    )
    .properties(title="Distribution of Class Labels")
)

# Write the chart to disk as a PNG (presumably the img/ directory must already exist).
class_imbalance.save("img/class_imbalance.png")

In [133]:
!pip install tabulate
results_dict = {
    "Model":            ["Dummy", "Logistic Regression", "Selected Logistic Regression", "Decision Tree", "Tuned Decision Tree", 
                        "Random Forest", "Tuned Random Forest", "Gradient Boosting", "Tuned Gradient Boosting"],   
    "CV ROC-AUC":       [0.5000, 0.7247, 0.7238,0.6149, 0.7538, 0.7633, 0.7788, 0.7792, 0.7778],  
    "Fit Time":         [0.0129, 0.0763, 0.0342, 2.544, 0.801, 25.963, 117.108, 41.515, 85.951], 
    "Notes":            ["Good baseline, low predictive power", 
                         "Significant improvement from Dummy, but still room for improvement",
                         "Not a significant difference from the model without feature selection",
                         "Poor performance, worse than Logistic Regression",
                         "Significant improvement compared to un-tuned version",
                         "Good performance, but notably longer fit time",
                         "Slightly better performance compared to un-tuned version, but much longer fit time",
                         "Best performance seen so far",
                         "Notable increase in fit time, very similar yet decreased score compared to un-tuned version"]    
}
results_df = pd.DataFrame(results_dict)
md_table = results_df.to_markdown()
print(md_table)

|    | Model                        |   CV ROC-AUC |   Fit Time | Notes                                                                                       |
|---:|:-----------------------------|-------------:|-----------:|:--------------------------------------------------------------------------------------------|
|  0 | Dummy                        |       0.5    |     0.0129 | Good baseline, low predictive power                                                         |
|  1 | Logistic Regression          |       0.7247 |     0.0763 | Significant improvement from Dummy, but still room for improvement                          |
|  2 | Selected Logistic Regression |       0.7238 |     0.0342 | Not a significant difference from the model without feature selection                       |
|  3 | Decision Tree                |       0.6149 |     2.544  | Poor performance, worse than Logistic Regression                                            |
|  4 | Tuned Decision Tree          |   