In [35]:
# Importing needed libraries

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# read the CSV file into a DataFrame
loan_data = pd.read_csv('Loan_Default.csv')

In [36]:
# Insert both numerical and categorical features

numerical_columns = [
 'loan_amount',
 'rate_of_interest',
 'Interest_rate_spread',
 'Upfront_charges',
 'term',
 'property_value',
 'income',
 'Credit_Score',
 'LTV',
 'dtir1'
]

categorical_columns = [
 'loan_limit',
 'Gender',
 'approv_in_adv',
 'loan_type',
 'loan_purpose',
 'Credit_Worthiness',
 'open_credit',
 'business_or_commercial',
 'Neg_ammortization',
 'interest_only',
 'lump_sum_payment',
 'construction_type',
 'occupancy_type',
 'Secured_by',
 'total_units',
 'credit_type',
 'age',
 'submission_of_application',
 'Region'
]

In [37]:
# Impute missing values in the categorical features using the 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')
loan_data[categorical_columns] = imputer.fit_transform(loan_data[categorical_columns])

# Impute missing values in the numerical features using the column mean
loan_data[numerical_columns] = loan_data[numerical_columns].fillna(loan_data[numerical_columns].mean())

In [38]:
# Create a boolean mask indicating which values are NaN
mask = loan_data[categorical_columns].isna()

# Checking wether any of the values in the mask has missing value
if mask.any().any():
    print('There are NaN values in the columns.')
else:
    print('There are no NaN values in the columns.')

There are no NaN values in the columns.


In [39]:
# One-hot encoding for categorical features only

cat_feat = pd.get_dummies(loan_data[categorical_columns])

df_no_status = loan_data[numerical_columns].join(cat_feat)
df_no_status

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,dtir1,...,age_55-64,age_65-74,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south
0,116500,4.045476,0.441656,3224.996127,360.0,118000.000000,1740.0,758,98.728814,45.000000,...,0,0,0,0,0,1,0,0,0,1
1,206500,4.045476,0.441656,3224.996127,360.0,497893.465696,4980.0,552,72.746457,37.732932,...,1,0,0,0,0,1,1,0,0,0
2,406500,4.560000,0.200000,595.000000,360.0,508000.000000,9480.0,834,80.019685,46.000000,...,0,0,0,0,0,1,0,0,0,1
3,456500,4.250000,0.681000,3224.996127,360.0,658000.000000,11880.0,587,69.376900,42.000000,...,0,0,0,0,1,0,1,0,0,0
4,696500,4.000000,0.304200,0.000000,360.0,758000.000000,10440.0,602,91.886544,39.000000,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,436500,3.125000,0.257100,9960.000000,180.0,608000.000000,7860.0,659,71.792763,48.000000,...,1,0,0,0,0,1,0,0,0,1
148666,586500,5.190000,0.854400,0.000000,360.0,788000.000000,7140.0,569,74.428934,15.000000,...,0,0,0,0,1,0,0,0,0,1
148667,446500,3.125000,0.081600,1226.640000,180.0,728000.000000,6900.0,702,61.332418,49.000000,...,0,0,0,0,1,0,1,0,0,0
148668,196500,3.500000,0.582400,4323.330000,180.0,278000.000000,7140.0,737,70.683453,29.000000,...,1,0,0,0,0,1,1,0,0,0


In [40]:
# Converting all cat. lables to the ordinar array

key_list = df_no_status.columns.tolist()

# Creating a pandas DataFrame with both cat. and num. features
new_loan_data = df_no_status.join(loan_data[['Status']])
new_loan_data

Unnamed: 0,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,dtir1,...,age_65-74,age_<25,age_>74,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Status
0,116500,4.045476,0.441656,3224.996127,360.0,118000.000000,1740.0,758,98.728814,45.000000,...,0,0,0,0,1,0,0,0,1,1
1,206500,4.045476,0.441656,3224.996127,360.0,497893.465696,4980.0,552,72.746457,37.732932,...,0,0,0,0,1,1,0,0,0,1
2,406500,4.560000,0.200000,595.000000,360.0,508000.000000,9480.0,834,80.019685,46.000000,...,0,0,0,0,1,0,0,0,1,0
3,456500,4.250000,0.681000,3224.996127,360.0,658000.000000,11880.0,587,69.376900,42.000000,...,0,0,0,1,0,1,0,0,0,0
4,696500,4.000000,0.304200,0.000000,360.0,758000.000000,10440.0,602,91.886544,39.000000,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,436500,3.125000,0.257100,9960.000000,180.0,608000.000000,7860.0,659,71.792763,48.000000,...,0,0,0,0,1,0,0,0,1,0
148666,586500,5.190000,0.854400,0.000000,360.0,788000.000000,7140.0,569,74.428934,15.000000,...,0,0,0,1,0,0,0,0,1,0
148667,446500,3.125000,0.081600,1226.640000,180.0,728000.000000,6900.0,702,61.332418,49.000000,...,0,0,0,1,0,1,0,0,0,0
148668,196500,3.500000,0.582400,4323.330000,180.0,278000.000000,7140.0,737,70.683453,29.000000,...,0,0,0,0,1,1,0,0,0,0


In [41]:
new_loan_data = new_loan_data.head(2000)

In [42]:
import numpy as np

def one_hot_feature_names(df, original_names, target_feature):
    one_hot_names = []
    for name in original_names:
        if target_feature in name:
            unique_values = np.unique(df[name])
            for value in unique_values:
                one_hot_names.append(name)
    return list(set(one_hot_names))

In [43]:
one_hot_feature_names(new_loan_data, new_loan_data.columns, 'age')

['age_35-44',
 'age_45-54',
 'age_55-64',
 'age_>74',
 'age_<25',
 'age_25-34',
 'age_65-74']

In [44]:
import pandas as pd

def orig_features_enumerated(df, x):
    x_features = one_hot_feature_names(df, df.columns, x)
    
    new_df = pd.DataFrame()
    new_df[x] = df[x_features].idxmax(axis=1)
    
    new_df[x] = new_df[x].apply(lambda x: x_features.index(x))
    
    return new_df

In [45]:
feature_df = orig_features_enumerated(new_loan_data, 'age')
feature_df

Unnamed: 0,age
0,5
1,2
2,0
3,1
4,5
...,...
1995,1
1996,0
1997,1
1998,5


In [46]:
import matplotlib.pyplot as plt

def plot_3d(df, x, y, z, color):
    plt.figure(figsize=(10,10))
    ax = plt.axes(projection='3d')
    
    x_features = one_hot_feature_names(df, df.columns, x)
    x_values = df[x_features].idxmax(axis=1)
    x_labels = [x_feature.split('_', 1)[1] for x_feature in x_features]
    
    x_enumerated = orig_features_enumerated(df, x)
    
    ax.scatter3D(x_enumerated[x], df[y], df[z], c=df[color])
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_zlabel(z)
    
    ax.set_xticks(range(len(x_labels)))
    ax.set_xticklabels(x_labels)

In [50]:
plot_3d(new_loan_data, 'age', 'loan_amount', 'income', 'Status')

<IPython.core.display.Javascript object>

In [22]:
import matplotlib.pyplot as plt

def y_plot_3d(df, x, y, z, color):
    plt.figure(figsize=(10,10))
    ax = plt.axes(projection='3d')
    
    y_features = one_hot_feature_names(df, df.columns, y)
    y_values = df[y_features].idxmax(axis=1)
    y_labels = [y_feature.split('_', 1)[1] for y_feature in y_features]
    
    y_enumerated = orig_features_enumerated(df, y)
    
    colors = np.where(df['Status'] == 1, 'red', df[color])
    
    ax.scatter3D(df[x], y_enumerated[y], df[z], c=colors)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_zlabel(z)
    
    ax.set_yticks(range(len(y_labels)))
    ax.set_yticklabels(y_labels)

In [23]:
%matplotlib notebook

y_plot_3d(new_loan_data, 'income', 'age', 'loan_amount', 'Status')
plot_3d(new_loan_data, 'age', 'Upfront_charges', 'Credit_Score', 'Status')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [144]:
# Import necessary modules
import numpy as np


def plot_3d_funct(df, x, y, z, color):
    
    y_features = one_hot_feature_names(df, df.columns, y)
    y_values = df[y_features].idxmax(axis=1)
    y_labels = [y_feature.split('_', 1)[1] for y_feature in y_features]
    
    y_enumerated = orig_features_enumerated(df, y)

    # Get the data points for the x, y, and z axis
    x = df[x]
    y = y_enumerated[y]
    z = df[z]

    # Fit a polynomial curve through the data points
    coeffs = np.polyfit(x, y, deg=3, w=z)

    # Create a function from the fitted curve
    f = np.poly1d(coeffs)

    # Generate a set of x values to evaluate the function on
    x_values = np.linspace(x.min(), x.max(), 100)

    # Evaluate the function on the x values
    y_valuess = f(x_values)

    # Plot the data points and the fitted curve
    plt.figure(figsize=(10,10))
    ax = plt.axes(projection='3d')
    ax.scatter3D(x, y, z, c=df[color])
    ax.plot3D(x_values, y_valuess, z_values, 'r')
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_zlabel(z)
    
    ax.set_yticks(range(len(y_labels)))
    ax.set_yticklabels(y_labels)

In [145]:
%matplotlib notebook

plot_3d_funct(new_loan_data, 'rate_of_interest', 'age', 'income', 'Status')

<IPython.core.display.Javascript object>

NameError: name 'z_values' is not defined