In [None]:
import math
import pydot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.random import seed
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD, NMF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout 
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.random import set_seed
pd.set_option('display.max_columns', None)

In [None]:
data = pd.read_csv('../input/should-this-loan-be-approved-or-denied/SBAnational.csv')
data.head()

I think it will be interesting to predict the MIS_Status value (loan status: charged off = `CHGOFF`, paid in full = `P I F`).

In [None]:
# Rows without a target label cannot be used for training or evaluation.
data = data.dropna(subset=['MIS_Status'])

In [None]:
data

# Splits

In [None]:
# Hold out 35% of the rows as the final validation set; the remaining 65% (`df`)
# is split again below into train and test. A fixed random_state keeps the split
# reproducible.
df, validation_df  = train_test_split(data,
                                test_size=0.35,
                                random_state = 101)

In [None]:
# 75% / 25% split of `df`. Since `df` holds 65% of the rows, this gives
# roughly 48.75% of all rows for training and 16.25% for testing.
df_train, df_test  = train_test_split(df,
                                test_size=0.25,
                                random_state = 101)

In [None]:
del data

# EDA

In [None]:
df_train.info()

In [None]:
df_train.describe()

N/A values

In [None]:
# Count missing values on the training split only, consistent with the rest of
# the EDA (`df` also contains the rows later used as the test set).
df_train.isnull().sum()

In [None]:
plt.figure(figsize=(12,8))
sns.countplot(x="NewExist", hue="MIS_Status", data=df_train)
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.countplot(x="RevLineCr", hue="MIS_Status", data=df_train)
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.countplot(x="LowDoc", hue="MIS_Status", data=df_train)
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(df_train, hue='MIS_Status', height = 7, aspect = 2)
g.map(sns.kdeplot, 'Term')
plt.legend()
plt.title('Term factor')
plt.show()

In [None]:
plt.figure(figsize = (12, 8))
# Take an explicit copy: assigning into a column selection of df_train triggers
# pandas' SettingWithCopyWarning and may or may not write through to df_train.
job_survey_data = df_train[['CreateJob', 'RetainedJob', 'MIS_Status']].copy()
# Square-root transform compresses the long right tail so the bulk of the points is visible.
job_survey_data[['CreateJob', 'RetainedJob']] = np.sqrt(job_survey_data[['CreateJob', 'RetainedJob']])
sns.scatterplot(data = job_survey_data, x = 'CreateJob', y = 'RetainedJob', hue = 'MIS_Status', palette = 'magma')
# The plotted values are square roots, so say so on the axes.
plt.xlabel('sqrt(CreateJob)')
plt.ylabel('sqrt(RetainedJob)')
plt.show()
del job_survey_data

In [None]:
# Frequency of each city, computed on the training split only.
count = df_train['City'].value_counts()
print(f'Unique values: {len(count)}')
# Keep cities with more than 500 training rows; proc_col_City maps every
# other city to 'other'.
count_f = count[count>500]
more_popular_Cities = set(count_f.index)
print(f'Unique values after values grouped: {len(count_f)}')
count_f

In [None]:
# Frequency of each bank, computed on the training split only.
count = df_train['Bank'].value_counts()
print(f'Unique values: {len(count)}')
# Keep banks with more than 500 training rows; proc_col_Bank maps every
# other bank to 'other'.
count_f = count[count>500]
more_popular_Banks = set(count_f.index)
print(f'Unique values after values grouped: {len(count_f)}')
count_f

In [None]:
df_train['ChgOffDate'].isnull().value_counts()

It looks like this column is better ignored altogether.

In [None]:
# Frequency of each NAICS code, computed on the training split only.
count = df_train['NAICS'].value_counts()
print(f'Unique values: {len(count)}')
# Keep codes with more than 500 training rows as-is; proc_col_NAICS truncates
# every other code to its first three characters.
count_f = count[count>500]
more_popular_NAICS = set(count_f.index)
print(f'Unique values after values grouped: {len(count_f)}')
count_f

# Data preparation

In [None]:
def unknown_filling_text(val):
    """Return ``'no data'`` for a missing value, otherwise ``str(val)``."""
    return 'no data' if pd.isna(val) else str(val)
    
def proc_col_City(val):
    """Keep ``val`` if it is in ``more_popular_Cities``; otherwise return ``'other'``."""
    return val if val in more_popular_Cities else 'other'
    
def proc_col_Bank(val):
    """Keep ``val`` if it is in ``more_popular_Banks``; otherwise return ``'other'``."""
    return val if val in more_popular_Banks else 'other'
    
def proc_col_NAICS(val):
    """Return the NAICS code as text, shortened for infrequent codes.

    Codes present in ``more_popular_NAICS`` are returned in full as ``str(val)``;
    any other code is reduced to the first three characters of ``str(val)``.
    """
    code = str(val)
    return code if val in more_popular_NAICS else code[:3]

def proc_col_MIS_Status(val):
    """Encode the loan status label as a binary target.

    Returns 1 for ``'CHGOFF'`` and 0 for ``'P I F'``.

    Raises
    ------
    ValueError
        If ``val`` is neither of the two known labels.
    """
    if val == 'P I F':
        return 0
    if val == 'CHGOFF':
        return 1
    raise ValueError('Incorrect MIS_Status value')
    
def check_na(df):
    """Raise if ``df`` contains any missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to validate.

    Raises
    ------
    ValueError
        If at least one cell of ``df`` is null.
    """
    # `.values.any()` answers the yes/no question directly instead of
    # materialising a filtered copy of the frame just to take its length.
    if df.isnull().values.any():
        # The backslash is escaped explicitly: '\A' is not a valid escape
        # sequence and emits a SyntaxWarning on recent Python versions.
        # The resulting message text is unchanged.
        raise ValueError('N\\A in data')
    
def pre_dumm_proc(df):
    """Clean a raw loan frame so that it is ready for one-hot encoding.

    Works on a new frame (the input is left untouched): drops the columns that
    are not used as features, normalises the categorical columns with the
    ``proc_col_*`` / ``unknown_filling_text`` helpers, encodes ``MIS_Status``
    as 0/1 and finally verifies that no missing values remain.

    Raises
    ------
    ValueError
        From ``proc_col_MIS_Status`` on an unknown label, or from ``check_na``
        if any null value is left after processing.
    """
    dropped_columns = [
        'LoanNr_ChkDgt', 'ChgOffDate', 'Name', 'Zip', 'ApprovalDate',
        'ApprovalFY', 'DisbursementDate', 'DisbursementGross',
        'BalanceGross', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv'
    ]
    out = df.drop(columns = dropped_columns)

    # Columns whose values are grouped using the frequency sets.
    out['City'] = out['City'].apply(proc_col_City)
    out['Bank'] = out['Bank'].apply(proc_col_Bank)
    out['NAICS'] = out['NAICS'].apply(proc_col_NAICS)

    # Columns that only need missing values replaced and a cast to text.
    for text_col in ('State', 'BankState', 'NewExist', 'RevLineCr', 'LowDoc'):
        out[text_col] = out[text_col].apply(unknown_filling_text)

    out['MIS_Status'] = out['MIS_Status'].apply(proc_col_MIS_Status)
    check_na(out)
    return out

def dummification(df):
    """One-hot encode every object-dtype column of ``df``.

    Each object column is replaced by the dummy columns produced by
    ``create_dummy``; all other columns are kept unchanged. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    pandas.DataFrame
        Encoded frame with its columns sorted by label in descending order.
    """
    object_cols = df.columns[df.dtypes == object]
    # Collect all pieces and concatenate once: growing a frame with pd.concat
    # inside the loop re-copies the accumulated data on every iteration.
    pieces = [df.drop(columns = object_cols)]
    pieces.extend(create_dummy(col, df) for col in object_cols)
    df_out = pd.concat(pieces, axis = 1)
    return df_out.sort_index(ascending=False, axis=1)
        

def create_dummy(col, df):
    """Return dummy columns for ``df[col]`` with the first level dropped.

    The resulting columns are named ``'dum: <col>: <level>'``.
    """
    dummies = pd.get_dummies(df[col], drop_first = True)
    prefix = 'dum: ' + col + ': '
    dummies.columns = [prefix + str(level) for level in dummies.columns]
    return dummies

def data_preparation(df):
    """Run the full preprocessing: clean with ``pre_dumm_proc``, then encode with ``dummification``."""
    cleaned = pre_dumm_proc(df)
    return dummification(cleaned)

# Clean the training split once and reuse the cleaned frame for the encoding,
# instead of running pre_dumm_proc twice on the same data
# (data_preparation(df) is dummification(pre_dumm_proc(df))).
df_train_c = pre_dumm_proc(df_train)
df_train_d = dummification(df_train_c)
df_train_d

In [None]:
df_train_c

In [None]:
columns_needed = set(df_train_d.columns)

def columns_standardization(df, columns=None):
    """Align ``df`` to a fixed set of columns, sorted in descending label order.

    Expected columns that are missing from ``df`` are added and filled with 0;
    columns of ``df`` that are not expected are dropped. The input frame is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to align.
    columns : iterable of column labels, optional
        The columns the result must have. Defaults to the notebook-level
        ``columns_needed`` (the columns of the encoded training frame).

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly ``columns``, sorted descending by label.
    """
    if columns is None:
        columns = columns_needed
    # One reindex adds the missing columns (filled with 0), drops the
    # unexpected ones and fixes the order. This avoids inserting and dropping
    # columns one at a time while iterating over unordered sets.
    target_order = sorted(columns, reverse=True)
    return df.reindex(columns=target_order, fill_value=0)

In [None]:
# Clean the test split once and reuse the cleaned frame for the encoding,
# instead of running pre_dumm_proc twice on the same data.
df_test_c = pre_dumm_proc(df_test)
# Align the encoded test columns with the training columns.
df_test_d = columns_standardization(dummification(df_test_c))
df_test_d

In [None]:
df_test_c.head()

In [None]:
df_test_d[df_test_d.isnull().any(axis=1)]

In [None]:
df_train_d[df_train_d.isnull().any(axis=1)]

No null values left.

In [None]:
X_train = df_train_d.drop('MIS_Status', axis = 1)
y_train = df_train_d['MIS_Status']
X_test = df_test_d.drop('MIS_Status', axis = 1)
y_test = df_test_d['MIS_Status']

In [None]:
del df, df_train_d, df_train_c, df_test_d, df_test_c

In [None]:
scaler = MinMaxScaler()
# Fit and transform on the same kind of input (bare arrays). Fitting on the
# DataFrame and then transforming `.values` makes scikit-learn warn that the
# input lacks the feature names seen during fit.
# The scaler is fitted on the training data only and then applied to the test data.
X_train_sc = scaler.fit_transform(X_train.values)
X_test_sc = scaler.transform(X_test.values)

# Dimensionality reduction using PCA

In [None]:
pca = PCA(n_components=2, random_state = 1)
df_pca_train = pca.fit_transform(X_train_sc)
df_pca_test = pca.transform(X_test_sc)

In [None]:
df_pca_vis = pd.DataFrame(df_pca_train)
df_pca_vis['y'] = y_train.values

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_pca_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

In [None]:
# The axis is labelled as a ratio, so plot the fraction of total variance
# explained by each component rather than the absolute variance.
pca_variance = pca.explained_variance_ratio_

plt.figure(figsize=(6, 6))
plt.bar(['0', '1'], pca_variance, align='center', label='individual variance')
plt.legend()
plt.ylabel('Variance ratio')
plt.xlabel('Principal components')
plt.show()

In [None]:
lgbr_pca = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_pca.fit(df_pca_train, y_train)
pred = lgbr_pca.predict(df_pca_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

# Dimensionality reduction using Singular Value Decomposition

In [None]:
svd = TruncatedSVD(n_components=2, random_state = 1)
df_svd_train = svd.fit_transform(X_train_sc)
df_svd_test = svd.transform(X_test_sc)

In [None]:
df_svd_vis = pd.DataFrame(df_svd_train)
df_svd_vis['y'] = y_train.values

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_svd_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

In [None]:
lgbr_svd = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_svd.fit(df_svd_train, y_train)
pred = lgbr_svd.predict(df_svd_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

# Dimensionality reduction using Non-Negative Matrix Factorization (NMF)

In [None]:
nmf = NMF(n_components=2, random_state = 1)
df_nmf_train = nmf.fit_transform(X_train_sc, y_train)
df_nmf_test = nmf.transform(X_test_sc)

In [None]:
df_nmf_vis = pd.DataFrame(df_nmf_train)
df_nmf_vis['y'] = y_train.values

plt.figure(figsize = (12, 8))
sns.scatterplot(data = df_nmf_vis, x = 0, y = 1, hue = 'y',  palette = 'magma')
plt.show()

In [None]:
lgbr_nmf = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_nmf.fit(df_nmf_train, y_train)
pred = lgbr_nmf.predict(df_nmf_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

# Dimensionality reduction using Linear Discriminant Analysis

In [None]:
lda = LinearDiscriminantAnalysis(n_components=1)
df_lda_train = lda.fit_transform(X_train_sc, y_train)
df_lda_test = lda.transform(X_test_sc)

In [None]:
# LDA was fitted with n_components=1, so the projection has a single column.
# A 2-D scatter against a non-existent column 1 is not meaningful; show the
# per-class density of the single discriminant instead (same FacetGrid/kdeplot
# pattern as the 'Term' plot in the EDA section).
df_lda_vis = pd.DataFrame(df_lda_train, columns=['LD1'])
df_lda_vis['y'] = y_train.values

g = sns.FacetGrid(df_lda_vis, hue='y', height = 7, aspect = 2, palette = 'magma')
g.map(sns.kdeplot, 'LD1')
plt.legend()
plt.title('LDA projection by class')
plt.show()

In [None]:
lgbr_lda = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_lda.fit(df_lda_train, y_train)
pred = lgbr_lda.predict(df_lda_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

# Dimensionality reduction using autoencoder

In [None]:
input_width = len(X_train.columns)
input_width

In [None]:
def dim_red_analysis(n_epochs = None):
    """Train a dense autoencoder with a 2-unit bottleneck and plot the encoding.

    Uses the notebook-level ``input_width``, ``X_train_sc``, ``X_test_sc`` and
    ``y_train``. The autoencoder is trained to reconstruct ``X_train_sc``, with
    ``X_test_sc`` as validation data. After training, the encoded training
    data is shown as a scatter plot coloured by ``y_train``.

    Parameters
    ----------
    n_epochs : int, optional
        Number of training epochs. When ``None``, training runs for up to 100
        epochs with early stopping on ``val_loss`` (patience 3).

    Returns
    -------
    The encoder half of the autoencoder (a Keras ``Sequential`` model).
    """
    # Seed NumPy and TensorFlow so repeated calls start from the same state.
    seed(101)
    set_seed(101)

    encoder = Sequential()
    encoder.add(Dense(units = 256, activation = 'relu', input_shape = [input_width]))
    encoder.add(Dropout(0.2))
    encoder.add(Dense(units = 16, activation = 'relu'))
    encoder.add(Dense(units = 2, activation = 'relu'))

    decoder = Sequential()
    decoder.add(Dense(units = 16, activation = 'relu', input_shape = [2]))
    decoder.add(Dense(units = 256, activation = 'relu'))
    decoder.add(Dense(units = input_width, activation = 'relu'))

    autoencoder = Sequential([encoder, decoder])

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    # NOTE(review): a learning rate of 12 is far above typical SGD values -
    # confirm it is intentional.
    autoencoder.compile(loss = 'mse', optimizer = SGD(learning_rate = 12))
    
    autoencoder.summary()
    
    if n_epochs is None:
        callbacks = [EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)]
        n_epochs = 100
    else:
        callbacks = []
    
    # `callbacks` is already a list; pass it as-is rather than wrapping it in
    # another list.
    history = autoencoder.fit(
            X_train_sc,
            X_train_sc,
            epochs = n_epochs,
            validation_data=(X_test_sc, X_test_sc), 
            callbacks=callbacks
             )
    
    # A loss curve needs at least two points to be informative.
    if n_epochs > 1:
        histo = pd.DataFrame(history.history)
        for metric in ['loss', 'val_loss']:
            plt.title(metric)
            histo[metric].plot()
            plt.show()
        
    encoded_2dim = encoder.predict(X_train_sc)
    encoded_2dim = pd.DataFrame(encoded_2dim)
    encoded_2dim['y'] = y_train.values

    plt.figure(figsize = (12, 8))
    sns.scatterplot(data = encoded_2dim, x = 0, y = 1, hue = 'y', palette = 'magma')
    plt.show()
    
    return encoder

In [None]:
encoder = dim_red_analysis(0)

enc_train = encoder.predict(X_train_sc)
enc_test = encoder.predict(X_test_sc)

lgbr_enc = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_enc.fit(enc_train, y_train)
pred = lgbr_enc.predict(enc_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))


In [None]:
encoder = dim_red_analysis(1)

enc_train = encoder.predict(X_train_sc)
enc_test = encoder.predict(X_test_sc)

lgbr_enc = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_enc.fit(enc_train, y_train)
pred = lgbr_enc.predict(enc_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

In [None]:
encoder = dim_red_analysis(2)

enc_train = encoder.predict(X_train_sc)
enc_test = encoder.predict(X_test_sc)

lgbr_enc = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_enc.fit(enc_train, y_train)
pred = lgbr_enc.predict(enc_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

In [None]:
encoder = dim_red_analysis(5)

enc_train = encoder.predict(X_train_sc)
enc_test = encoder.predict(X_test_sc)

lgbr_enc = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_enc.fit(enc_train, y_train)
pred = lgbr_enc.predict(enc_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
del encoder, enc_train, enc_test, lgbr_enc, pred

In [None]:
encoder = dim_red_analysis()

enc_train = encoder.predict(X_train_sc)
enc_test = encoder.predict(X_test_sc)

lgbr_enc = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr_enc.fit(enc_train, y_train)
pred = lgbr_enc.predict(enc_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

# Modelling without DR

In [None]:
dtc = DecisionTreeClassifier(random_state = 101)
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

In [None]:
rfc = RandomForestClassifier(random_state = 101, n_jobs = -1)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
display(pd.DataFrame({'Variable':X_train.columns,
              'Importance':rfc.feature_importances_}).sort_values('Importance', ascending=False).head(10))

In [None]:
gbr = GradientBoostingClassifier(random_state = 101)
gbr.fit(X_train, y_train)
pred = gbr.predict(X_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
display(pd.DataFrame({'Variable':X_train.columns,
              'Importance':gbr.feature_importances_}).sort_values('Importance', ascending=False).head(10))

In [None]:
lgbr = LGBMClassifier(random_state = 1, n_jobs=- 1)
lgbr.fit(X_train.values, y_train)
pred = lgbr.predict(X_test.values)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
display(pd.DataFrame({'Variable':X_train.columns,
              'Importance':lgbr.feature_importances_}).sort_values('Importance', ascending=False).head(10))

In [None]:
xgbr = XGBClassifier(random_state = 1, n_jobs=- 1)
xgbr.fit(X_train, y_train)
pred = xgbr.predict(X_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))
display(pd.DataFrame({'Variable':X_train.columns,
              'Importance':xgbr.feature_importances_}).sort_values('Importance', ascending=False).head(10))

In [None]:
def ANN_model_classification(model, X_train_sc, y_train, X_test_sc, y_test):
    """Compile, train and evaluate a binary-classification Keras model.

    The model is compiled with Adam and binary cross-entropy, then trained for
    up to 100 epochs (batch size 128) with early stopping on ``val_loss``
    (patience 3), using the test data as validation data. Loss and accuracy
    curves are plotted, and a classification report and confusion matrix for
    the test data are printed.

    Parameters
    ----------
    model : Keras model whose output is a single sigmoid unit.
    X_train_sc, y_train : training features and labels.
    X_test_sc, y_test : features and labels used for validation and for the
        printed evaluation.

    Returns
    -------
    The same ``model`` instance, after training.
    """
    
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    
    history = model.fit(
        x = X_train_sc,
        y = y_train,
        epochs = 100,
        validation_data=(X_test_sc, y_test), 
        batch_size = 128,
        callbacks=[es]
             )


    histo = pd.DataFrame(history.history)
    
    for metric in ['loss', 'val_loss', 'accuracy', 'val_accuracy']:
        plt.title(metric)
        histo[metric].plot()
        plt.show()
    
    # `Sequential.predict_classes` has been removed from TensorFlow. For a
    # single sigmoid output the equivalent is thresholding the predicted
    # probability at 0.5. `ravel()` gives the metrics a flat label vector.
    pred_test_values = (model.predict(X_test_sc) > 0.5).astype('int32').ravel()

    print('test')
    print(classification_report(y_test,pred_test_values))
    print(confusion_matrix(y_test,pred_test_values))
    
    return model

In [None]:
seed(101)
set_seed(101)

ann_model1 = Sequential()

ann_model1.add(Dense(units=128, activation = 'relu'))
ann_model1.add(Dropout(0.3))
ann_model1.add(Dense(units=1,activation='sigmoid'))
ann_model1 = ANN_model_classification(ann_model1, X_train_sc, y_train, X_test_sc, y_test)

In [None]:
seed(101)
set_seed(101)

ann_model2 = Sequential()

ann_model2.add(Dense(units=128, activation = 'relu'))
ann_model2.add(Dropout(0.3))
ann_model2.add(Dense(units=16, activation = 'relu'))
ann_model2.add(Dense(units=1,activation='sigmoid'))
ann_model2 = ANN_model_classification(ann_model2, X_train_sc, y_train, X_test_sc, y_test)

It looks like the XGB model performs better than the others.

# Validation

In [None]:
val_data_for_model = columns_standardization(data_preparation(validation_df))
val_data_for_model

In [None]:
val_data_for_model[val_data_for_model.isnull().any(axis=1)]

In [None]:
X_validation = val_data_for_model.drop('MIS_Status', axis = 1)
y_validation = val_data_for_model['MIS_Status']

In [None]:
pred = xgbr.predict(X_validation)
print(classification_report(y_validation, pred))
print(confusion_matrix(y_validation, pred))

The model showed quite good results on the validation data.

In [None]:
accuracy_score(y_validation,pred)