In [22]:
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [23]:
file_path_ad_clean = "../Raw_data/application_data_no_nulls.csv"
df_loans = pd.read_csv(file_path_ad_clean)
df_loans.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100083,0,Cash loans,M,Y,Y,0,103500.0,573628.5,24435.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
1,100145,0,Cash loans,F,Y,Y,1,202500.0,260725.5,16789.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100179,0,Cash loans,F,Y,N,0,202500.0,675000.0,53329.5,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,4.0
3,100190,0,Cash loans,M,Y,N,0,162000.0,263686.5,24781.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100295,1,Cash loans,M,Y,N,1,225000.0,1019205.0,31032.0,...,1,0,0,0,0.0,0.0,0.0,6.0,0.0,1.0


In [24]:
# convert text to numbers for model
df_loans = pd.get_dummies(df_loans)

In [25]:
# Define features set
X = df_loans.copy()
X.drop("TARGET", axis=1, inplace=True)
X.head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100083,0,103500.0,573628.5,24435.0,463500.0,0.009657,-15406,-892,-341.0,...,0,0,0,0,0,0,1,0,1,0
1,100145,1,202500.0,260725.5,16789.5,198000.0,0.01885,-16282,-4375,-762.0,...,0,0,0,0,0,1,0,0,1,0
2,100179,0,202500.0,675000.0,53329.5,675000.0,0.031329,-11375,-2311,-180.0,...,0,0,0,1,0,0,0,0,1,0
3,100190,0,162000.0,263686.5,24781.5,238500.0,0.022625,-13972,-4472,-464.0,...,0,0,0,0,0,1,0,0,1,0
4,100295,1,225000.0,1019205.0,31032.0,774000.0,0.072508,-11356,-602,-335.0,...,0,0,0,0,0,1,0,0,1,0


In [26]:
# Define target vector
y = df_loans["TARGET"].values.reshape(-1, 1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [1]], dtype=int64)

In [27]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24)

In [28]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [29]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [30]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [31]:
# Create the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [32]:
# Fit the model
model = model.fit(X_train_scaled, y_train)

In [33]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [34]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [35]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1892,142
Actual 1,101,16


Accuracy Score : 0.8870292887029289
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2034
           1       0.10      0.14      0.12       117

    accuracy                           0.89      2151
   macro avg       0.53      0.53      0.53      2151
weighted avg       0.90      0.89      0.89      2151

