In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

import seaborn as sns
#import maplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 


In [2]:


def tts_scale(x, y, scaler, encoding=None, test_size=None, random_state=None):
    '''
    Helper function to correctly train-test-split, scale numerical columns, and one-hot-encode categorical features.
    Will return x_train, x_test, y_train, y_test.
    
    Parameters:
    x         ==> input variables from dataframe.
    y         ==> target variable from dataframe.
    encoding  ==> iterable of columns from dataframe to be one-hot-encoded.
    ts        ==> test size from train-test-split.
    rs        ==> random_state for train_test_split function.
    scaler    ==> scaling object used to scale numeric data for x_train and x_test.
    
    '''
    
    if(type(x) != type(pd.DataFrame()) and type(x) != type(pd.Series())):
        raise TypeError('X must be a Pandas DataFrame or Pandas Series.')

    if(type(y) != type(pd.DataFrame()) and type(y) != type(pd.Series())):
        raise TypeError('X must be a Pandas DataFrame or Pandas Series.')
    

    num_cols = x.corr().columns
    cat_cols = []
    
    for col in x.columns:
        if col not in num_cols:
            cat_cols.append(col)
    

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)
    

    
    try:
        x_train_s = pd.DataFrame(scaler.fit_transform(x_train[num_cols]), columns=num_cols)
        x_test_s = pd.DataFrame(scaler.fit_transform(x_test[num_cols]), columns=num_cols)

        x_train_s = x_train_s.set_index(x_train.index)
        x_test_s = x_test_s.set_index(x_test.index)
    except AttributeError:
        raise TypeError('The type of this scaler is not allowed.')

    if encoding is not None:
        
        x_train_cat = pd.get_dummies(x_train[cat_cols],columns=encoding)
        x_test_cat = pd.get_dummies(x_test[cat_cols],columns=encoding)
        x_train_final = pd.merge(x_train_s, x_train_cat, left_index=True, right_index=True)
        x_test_final = pd.merge(x_test_s, x_test_cat, left_index=True, right_index=True)
        
        return x_train_final, x_test_final, y_train, y_test

    else:

        return x_train_s, x_test_s, y_train, y_test
    

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns',45)

In [5]:
df = pd.read_csv(r'final.csv',index_col=0)
df = df[df['Credit Risk Code'].notna()]
df['Credit Risk Code'] = df['Credit Risk Code'].astype(str)


df.head()

Unnamed: 0,PK,Date,Qdate,Balance,Days Past Due,Past_Due30_59,Past_Due60_89,Past_Due90+,Credit Risk Code,Adversity,QTR_CLOSE,REAL_GDP_GROWTH,NOMINAL_GDP_GROWTH,REAL_DISPOSABLE_INCOME_GROWTH,NOMINAL_DISPOSABLE_INCOME_GROWTH,UNEMPLOYMENT_RATE,CPI_INFLATION_RATE,THREE_MONTH_TREASURY_RATE,FIVE_YEAR_TREASURY_YIELD,TEN_YEAR_TREASURY_YIELD,BBB_CORPORATE_YEILD,MORTGAGE_RATE,PRIME_RATE,DOWJONES_TOTAL_STOCK_MARKET_INDEX,HOUSE_PRICE_INDEX,COMERCIAL_REAL_ESTATE_PRICE_INDEX,MARKET_VOLATILITY_INDEX,EURO_AREA_REAL_GDP_GROWTH,EURO_AREA_INFLATION,EURO_AREA_BILATERAL_DOLLAR_EXCHANGE_RATE,DEVELOPING_ASIA_REAL_GDP_GROWTH,DEVELOPING_ASIA_INFLATION,DEV_ASIA_BILATERAL_DOLLAR_EXCHANGE_RATE,JAPAN_REAL_GDP_GROWTH,JAPAN_INFLATION,JAPAN_BILATERAL_DOLLAR_EXCHANGE_RATE,UK_REAL_GDP_GROWTH,UK_INFLATION,UK_BILATERAL_DOLLAR_EXCHANGE_RATE,Default
0,2,2015-01-31,Q1 2015,7685.03,15,10,5,1,5.0,Historical,2015-03-01,3.2,3.0,4.6,2.8,5.5,-2.6,0.0,1.5,2.0,3.9,3.7,3.3,21708,168,241,22.4,3.0,-0.7,1.074,5.7,0.9,88.1,5.5,0.5,120.0,2.1,-1.1,1.485,1.0
1,3,2015-01-31,Q1 2015,181479.0,0,0,0,0,3.0,Historical,2015-03-01,3.2,3.0,4.6,2.8,5.5,-2.6,0.0,1.5,2.0,3.9,3.7,3.3,21708,168,241,22.4,3.0,-0.7,1.074,5.7,0.9,88.1,5.5,0.5,120.0,2.1,-1.1,1.485,0.0
2,4,2015-01-31,Q1 2015,67382.89,0,0,0,0,3.0,Historical,2015-03-01,3.2,3.0,4.6,2.8,5.5,-2.6,0.0,1.5,2.0,3.9,3.7,3.3,21708,168,241,22.4,3.0,-0.7,1.074,5.7,0.9,88.1,5.5,0.5,120.0,2.1,-1.1,1.485,0.0
3,5,2015-01-31,Q1 2015,115580.0,0,0,0,0,1.0,Historical,2015-03-01,3.2,3.0,4.6,2.8,5.5,-2.6,0.0,1.5,2.0,3.9,3.7,3.3,21708,168,241,22.4,3.0,-0.7,1.074,5.7,0.9,88.1,5.5,0.5,120.0,2.1,-1.1,1.485,0.0
4,6,2015-01-31,Q1 2015,111445.0,0,0,0,0,1.0,Historical,2015-03-01,3.2,3.0,4.6,2.8,5.5,-2.6,0.0,1.5,2.0,3.9,3.7,3.3,21708,168,241,22.4,3.0,-0.7,1.074,5.7,0.9,88.1,5.5,0.5,120.0,2.1,-1.1,1.485,0.0


In [6]:
X = df.drop(columns=['PK','Default'])
Y = df['Default']

In [7]:
x_train, x_test, y_train, y_test = tts_scale(X, Y, StandardScaler(), encoding=['Credit Risk Code'], test_size=0.3, random_state=42)



## Decision Tree

In [8]:
x_train = x_train.drop(columns=['Adversity','QTR_CLOSE','Qdate','Past_Due90+']) #have to drop pastdue90+ - getting 100%
x_test = x_test.drop(columns=['Adversity','QTR_CLOSE','Qdate','Past_Due90+'])

In [9]:
x_train = x_train.set_index(['Date'])
x_test = x_test.set_index(['Date'])

In [10]:
X = pd.concat([x_train,x_test])

In [12]:
dtree = DecisionTreeClassifier()

In [13]:
dtree.fit(x_train, y_train)

DecisionTreeClassifier()

In [14]:
predictions = dtree.predict(x_test)

In [15]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     73346
         1.0       0.89      0.89      0.89      6064

    accuracy                           0.98     79410
   macro avg       0.94      0.94      0.94     79410
weighted avg       0.98      0.98      0.98     79410



In [16]:
print(confusion_matrix(y_test, predictions))

[[72670   676]
 [  687  5377]]


## Decision Tree Bagging

In [19]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=500, #500 decision trees
                           max_samples=100, bootstrap=True, n_jobs=-1, random_state=42) #100 samples for each decision tree, bootstrap: with replacement, n_jobs- how many cores to use

In [20]:
bag_clf.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  max_samples=100, n_estimators=500, n_jobs=-1,
                  random_state=42)

In [21]:
y_pred=bag_clf.predict(x_test)

In [22]:
print(accuracy_score(y_test, y_pred))

0.9629895479158797


In [24]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98     73346
         1.0       0.77      0.73      0.75      6064

    accuracy                           0.96     79410
   macro avg       0.88      0.86      0.87     79410
weighted avg       0.96      0.96      0.96     79410



## Random Forest

In [26]:
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)

In [27]:
y_pred_rf = rnd_clf.predict(x_test)

In [29]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     73346
         1.0       0.89      0.90      0.89      6064

    accuracy                           0.98     79410
   macro avg       0.94      0.95      0.94     79410
weighted avg       0.98      0.98      0.98     79410



In [31]:
print(accuracy_score(y_test, y_pred_rf))

0.9837929731771817


## Random Forest Adaboost 

In [32]:
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, algorithm="SAMME.R", learning_rate=0.25, random_state=42)
ada_clf.fit(x_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.25, n_estimators=100, random_state=42)

In [33]:
y_pred_ada = rnd_clf.predict(x_test)

In [34]:
print(classification_report(y_test, y_pred_ada))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     73346
         1.0       0.89      0.90      0.89      6064

    accuracy                           0.98     79410
   macro avg       0.94      0.95      0.94     79410
weighted avg       0.98      0.98      0.98     79410



## Linear Discriminant Analysis 

In [43]:
clf = LinearDiscriminantAnalysis(priors=[.4,.6])
clf.fit(x_train, y_train)

LinearDiscriminantAnalysis(priors=[0.4, 0.6])

In [44]:
y_pred= clf.predict(x_test)

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97     73346
         1.0       0.59      0.90      0.71      6064

    accuracy                           0.94     79410
   macro avg       0.79      0.92      0.84     79410
weighted avg       0.96      0.94      0.95     79410



In [None]:
#78% recall on default when don't adjust the priors 

In [46]:
print(confusion_matrix(y_test, y_pred))

[[69540  3806]
 [  625  5439]]
