In [1]:
# Imports
import sys
import os
import platform

import pandas as pd
import numpy as np
import sklearn as sk
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from pandas.tseries.offsets import DateOffset

from sklearn import svm                                 #  support vector machine (SVM) learning method
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (classification_report, 
                             balanced_accuracy_score, 
                             confusion_matrix, 
                             f1_score)

from imblearn.metrics import classification_report_imbalanced

# import hvplot.pandas
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import pickle     # method for save trained/fit model/s
import joblib     # method for save trained/fit model/s

In [2]:
# Report the platform, interpreter and library versions used for this run
environment_report = [
    f'Python Platform: {platform.platform()}',
    f'Python {sys.version}',
    '',                                   # blank separator line
    f'Pandas {pd.__version__}',
    f'Numpy {np.__version__}',
    f'Scikit-Learn {sk.__version__}',
    f'Seaborn {sns.__version__}',
]
print('\n'.join(environment_report))

Python Platform: macOS-13.0.1-arm64-arm-64bit
Python 3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:52:10) 
[Clang 14.0.6 ]

Pandas 1.5.1
Numpy 1.21.5
Scikit-Learn 1.1.3
Seaborn 0.12.0


# Read in original complete dataset

In [3]:
# Load the complete accepted-loans dataset from its gzip-compressed CSV
accepted_csv_gz = Path('/Users/lokiskylizard/Desktop/proj2data/archive/accepted_2007_to_2018Q4.csv.gz')
two_clients_df = pd.read_csv(accepted_csv_gz, compression='gzip', low_memory=False)

# Review the DataFrame: dimensions first, then the first three rows
display(two_clients_df.shape, two_clients_df.head(3))


(2260701, 151)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,


In [None]:
# drop duplicate rows
# `drop_duplicates()` returns a new DataFrame rather than modifying the frame
# in place, so the result has to be assigned back for the drop to take effect.
two_clients_df = two_clients_df.drop_duplicates()
display(two_clients_df.shape)

In [None]:
# check for duplicated columns
# The result is a boolean array with one entry per column label; True marks a
# label that repeats an earlier one.
two_clients_df.columns.duplicated()

In [None]:
# identify keys
# Show every column name of the full dataset, one display call per column.
for col in two_clients_df.columns:
    display(col)

# Delete / drop these columns:

['id',

'member_id',

'issue_d',

'url',

'zip_code',

'initial_list_status',

'hardship_flag',

'hardship_type',

'hardship_reason',

'hardship_status',

'deferral_term',

'hardship_amount',

'hardship_start_date',

'hardship_end_date',

'payment_plan_start_date',

'hardship_length',

'hardship_dpd',

'hardship_loan_status',

'orig_projected_additional_accrued_interest',

'hardship_payoff_balance_amount',

'hardship_last_payment_amount',

'disbursement_method',

'debt_settlement_flag',

'debt_settlement_flag_date',

'settlement_status',

'settlement_date',

'settlement_amount',

'settlement_percentage',

'settlement_term',

'pymnt_plan',

'desc',

'purpose',

'title',

'last_pymnt_d',

'next_pymnt_d',

'earliest_cr_line',

'last_credit_pull_d']

# Split accepted loan data (original full dataset) into two CSV files to simulate two clients' data
> bank_1

> bank_2

In [None]:
# Takes original full csv and splits it into ..csv1.csv and ..csv2.csv
# Each output file starts with the original header row so it can be read on its own.
SOURCE_CSV = '/Users/lokiskylizard/Desktop/proj2data/archive/accepted_2007_to_2018q4.csv/accepted_2007_to_2018Q4.csv'
chunk_size = 1200000   # ~ 1/2 size of original dataset


def write_chunk(part, lines, header_line=None):
    """Write one chunk of data lines to '<SOURCE_CSV><part>.csv'.

    Parameters
    ----------
    part : int
        1-based chunk number, appended to the source path to form the output name.
    lines : list of str
        Raw data lines (including their line terminators) belonging to this chunk.
    header_line : str, optional
        Header row written before the data. When omitted, the module-level
        `header` read by the splitting loop below is used, which keeps the
        original two-argument call working.
    """
    if header_line is None:
        header_line = header
    with open(SOURCE_CSV + str(part) + '.csv', 'w') as f_out:
        f_out.write(header_line)
        f_out.writelines(lines)


with open(SOURCE_CSV, 'r') as f:
    count = 0
    header = f.readline()
    lines = []
    for line in f:
        count += 1
        lines.append(line)
        # A full chunk has been collected: flush it and start the next one
        if count % chunk_size == 0:
            write_chunk(count // chunk_size, lines, header)
            lines = []
    # write remainder
    if len(lines) > 0:
        write_chunk((count // chunk_size) + 1, lines, header)

In [None]:
# Load the first half of the split dataset as the simulated "Bank 1" client
bank1_csv = Path('/Users/lokiskylizard/Desktop/proj2data/archive/accepted_2007_to_2018q4.csv/accepted_2007_to_2018Q4.csv1.csv')
bank1_df = pd.read_csv(bank1_csv, low_memory=False)

# Review the DataFrame: dimensions first, then the first three rows
display(bank1_df.shape, bank1_df.head(3))

In [None]:
# Columns removed from the Bank 1 frame before analysis
bank1_columns_to_drop = [
    'id', 'member_id', 'issue_d', 'url', 'zip_code', 'initial_list_status',
    'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status',
    'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date',
    'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardship_loan_status',
    'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount',
    'hardship_last_payment_amount', 'disbursement_method', 'debt_settlement_flag',
    'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
    'settlement_amount', 'settlement_percentage', 'settlement_term',
    'pymnt_plan', 'desc', 'purpose', 'title',
    'last_pymnt_d', 'next_pymnt_d', 'earliest_cr_line', 'last_credit_pull_d',
    'sec_app_earliest_cr_line',
]
bank1_df = bank1_df.drop(columns=bank1_columns_to_drop)
display(bank1_df.shape)

In [None]:
# Load the second half of the split dataset as the simulated "Bank 2" client
bank2_csv = Path('/Users/lokiskylizard/Desktop/proj2data/archive/accepted_2007_to_2018q4.csv/accepted_2007_to_2018Q4.csv2.csv')
bank2_df = pd.read_csv(bank2_csv, low_memory=False)

# Review the DataFrame: dimensions first, then the first three rows
display(bank2_df.shape, bank2_df.head(3))

In [None]:
# Columns removed from the Bank 2 frame before analysis
bank2_columns_to_drop = [
    'id', 'member_id', 'issue_d', 'url', 'zip_code', 'initial_list_status',
    'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status',
    'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date',
    'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardship_loan_status',
    'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount',
    'hardship_last_payment_amount', 'disbursement_method', 'debt_settlement_flag',
    'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
    'settlement_amount', 'settlement_percentage', 'settlement_term',
    'pymnt_plan', 'desc', 'purpose', 'title',
    'last_pymnt_d', 'next_pymnt_d', 'earliest_cr_line', 'last_credit_pull_d',
    'sec_app_earliest_cr_line',
]
bank2_df = bank2_df.drop(columns=bank2_columns_to_drop)
display(bank2_df.shape)

In [None]:
# Summary statistics (DataFrame.describe) for each bank's frame, labelled and
# separated by blank lines.
print('bank1')
display(bank1_df.describe())
print('\n')
print('bank2')
display(bank2_df.describe())

In [None]:
# Column dtypes and non-null counts for each bank's frame.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
print('bank1')
bank1_df.info()
print('\n')
print('bank2')
bank2_df.info()

In [None]:

# Bar chart of the number of loans per loan_status value for Bank 1.
# NOTE(review): the `.hvplot` accessor comes from `import hvplot.pandas`, which is
# commented out in the imports cell — confirm it is imported before running this.
# The plot is only assigned here; it is rendered by the following cell.
bank1_status_plot = bank1_df['loan_status'].value_counts().hvplot.bar(
    title='Bank 1, Loan Status', 
    xlabel='Loan Status', 
    ylabel='Count',
    hover_color = 'blue',
    rot = 45,
    width=800, 
    height=600
)

In [None]:
# Bar chart of the number of loans per loan_status value for Bank 2.
# NOTE(review): requires the `.hvplot` accessor (`import hvplot.pandas`) and
# `bank1_status_plot` from the previous cell.
bank2_status_plot = bank2_df['loan_status'].value_counts().hvplot.bar(
    title='Bank 2, Loan Status', 
    xlabel='Loan Status', 
    ylabel='Count',
    hover_color = 'blue',
    rot = 45,
    width=800, 
    height=600
)
# `+` lays the two plots out together as the cell's output
bank1_status_plot + bank2_status_plot

In [None]:
# Bank 1: overlaid histograms (one series per loan_status) of the installment
# and loan_amnt columns.
bank1_installment = bank1_df.hvplot.hist(
    y='installment', by='loan_status', subplots=False, 
    width=800, height=600, bins=50, alpha=0.4, 
    title='Bank 1, Installment by Loan Status', 
    xlabel='Installment', ylabel='Counts', legend='top'
)

bank1_loan_amnt = bank1_df.hvplot.hist(
    y='loan_amnt', by='loan_status', subplots=False, 
    width=800, height=600, bins=30, alpha=0.4, 
    title='Bank 1, Loan Amount by Status', 
    xlabel='Loan Amount', ylabel='Counts', legend='top'
)

# Combine both histograms into one output
bank1_installment + bank1_loan_amnt

In [None]:
# Bank 2: overlaid histograms (one series per loan_status) of the installment
# and loan_amnt columns.
bank2_installment = bank2_df.hvplot.hist(
    y='installment', by='loan_status', subplots=False, 
    width=800, height=600, bins=50, alpha=0.4, 
    title='Bank 2, Installment by Loan Status', 
    xlabel='Installment', ylabel='Counts', legend='top'
)

bank2_loan_amnt = bank2_df.hvplot.hist(
    y='loan_amnt', by='loan_status', subplots=False, 
    width=800, height=600, bins=30, alpha=0.4, 
    title='Bank 2, Loan Amount by Status', 
    xlabel='Loan Amount', ylabel='Counts', legend='top'
)

# Combine both histograms into one output
bank2_installment + bank2_loan_amnt

In [None]:
# Bank 1: descriptive statistics of loan_amnt within each loan_status group
bank1_df.groupby(by='loan_status')['loan_amnt'].describe()

In [None]:
# Bank 2: descriptive statistics of loan_amnt within each loan_status group
bank2_df.groupby(by='loan_status')['loan_amnt'].describe()

In [None]:
# Bank 1: distinct grade / sub_grade values before the numeric encoding below
print(f'GRADE unique: {bank1_df.grade.unique()}')
print(f'SUB_GRADE unique: {bank1_df.sub_grade.unique()}')

In [None]:
# Per-column lookup tables that turn ordered text categories into floats.
term_codes = {' 36 months': 1.0, ' 60 months': 2.0}

# Each grade letter occupies its own block of ten; the digit is the sub-grade step
sub_grade_codes = {
    'A1': 1.0, 'A2': 2.0, 'A3': 3.0, 'A4': 4.0, 'A5': 5.0,
    'B1': 11.0, 'B2': 12.0, 'B3': 13.0, 'B4': 14.0, 'B5': 15.0,
    'C1': 21.0, 'C2': 22.0, 'C3': 23.0, 'C4': 24.0, 'C5': 25.0,
    'D1': 31.0, 'D2': 32.0, 'D3': 33.0, 'D4': 34.0, 'D5': 35.0,
    'E1': 41.0, 'E2': 42.0, 'E3': 43.0, 'E4': 44.0, 'E5': 45.0,
    'F1': 51.0, 'F2': 52.0, 'F3': 53.0, 'F4': 54.0, 'F5': 55.0,
    'G1': 61.0, 'G2': 62.0, 'G3': 63.0, 'G4': 64.0, 'G5': 65.0,
}

emp_length_codes = {
    '< 1 year': 0.0, '1 year': 1.0, '2 years': 2.0, '3 years': 3.0,
    '4 years': 4.0, '5 years': 5.0, '6 years': 6.0, '7 years': 7.0,
    '8 years': 8.0, '9 years': 9.0, '10+ years': 10.0,
}

# Nested {column: {old: new}} mapping understood by DataFrame.replace
cleaner_app_type = {
    'term': term_codes,
    'sub_grade': sub_grade_codes,
    'emp_length': emp_length_codes,
}
bank1_df = bank1_df.replace(cleaner_app_type)

In [None]:
# Bank 1: distinct grade / sub_grade values after the numeric encoding above
# (only sub_grade is in the mapping; grade is not re-coded).
print(f'GRADE unique: {bank1_df.grade.unique()}')
print(f'SUB_GRADE unique: {bank1_df.sub_grade.unique()}')

In [None]:
# Bank 1: number of loans per loan_status value
bank1_df['loan_status'].value_counts() 

In [None]:
# Bank 2: distinct grade / sub_grade values before the numeric encoding below
print(f'GRADE unique: {bank2_df.grade.unique()}')
print(f'SUB_GRADE unique: {bank2_df.sub_grade.unique()}')

In [None]:
# Per-column lookup tables that turn ordered text categories into floats.
term_codes = {' 36 months': 1.0, ' 60 months': 2.0}

# Each grade letter occupies its own block of ten; the digit is the sub-grade step
sub_grade_codes = {
    'A1': 1.0, 'A2': 2.0, 'A3': 3.0, 'A4': 4.0, 'A5': 5.0,
    'B1': 11.0, 'B2': 12.0, 'B3': 13.0, 'B4': 14.0, 'B5': 15.0,
    'C1': 21.0, 'C2': 22.0, 'C3': 23.0, 'C4': 24.0, 'C5': 25.0,
    'D1': 31.0, 'D2': 32.0, 'D3': 33.0, 'D4': 34.0, 'D5': 35.0,
    'E1': 41.0, 'E2': 42.0, 'E3': 43.0, 'E4': 44.0, 'E5': 45.0,
    'F1': 51.0, 'F2': 52.0, 'F3': 53.0, 'F4': 54.0, 'F5': 55.0,
    'G1': 61.0, 'G2': 62.0, 'G3': 63.0, 'G4': 64.0, 'G5': 65.0,
}

emp_length_codes = {
    '< 1 year': 0.0, '1 year': 1.0, '2 years': 2.0, '3 years': 3.0,
    '4 years': 4.0, '5 years': 5.0, '6 years': 6.0, '7 years': 7.0,
    '8 years': 8.0, '9 years': 9.0, '10+ years': 10.0,
}

# Nested {column: {old: new}} mapping understood by DataFrame.replace
cleaner_app_type = {
    'term': term_codes,
    'sub_grade': sub_grade_codes,
    'emp_length': emp_length_codes,
}
bank2_df = bank2_df.replace(cleaner_app_type)

In [None]:
# Bank 2: distinct grade / sub_grade values after the numeric encoding above
# (only sub_grade is in the mapping; grade is not re-coded).
print(f'GRADE unique: {bank2_df.grade.unique()}')
print(f'SUB_GRADE unique: {bank2_df.sub_grade.unique()}')

In [None]:
# Bank 2: number of loans per loan_status value
bank2_df['loan_status'].value_counts() 

In [None]:
# fully_paid = data.loc[data['loan_status']=='Fully Paid', 'grade'].value_counts().hvplot.bar() 
# charged_off = data.loc[data['loan_status']=='Charged Off', 'grade'].value_counts().hvplot.bar() 

# grades = (fully_paid * charged_off).opts(
#     title="Loan Status by Grade", xlabel='Grades', ylabel='Count',
#     width=500, height=450, legend_cols=2, legend_position='top_right'
# )

# fully_paid = data.loc[data['loan_status']=='Fully Paid', 'sub_grade'].value_counts().hvplot.bar() 
# charged_off = data.loc[data['loan_status']=='Charged Off', 'sub_grade'].value_counts().hvplot.bar() 

# sub_grades = (fully_paid * charged_off).opts(
#     title="Loan Status by Grade", xlabel='Grades', ylabel='Count',
#     width=500, height=450, legend_cols=2, legend_position='top_right', 
#     shared_axes=False
# )

# grades + sub_grades

In [None]:
# Bank 1: horizontal bar charts of grade counts for 'Fully Paid' and
# 'Charged Off' loans, overlaid with `*` into a single plot.
bank1_fully_paid = bank1_df.loc[bank1_df['loan_status']=='Fully Paid', 'grade'].value_counts().hvplot.barh() 
bank1_charged_off = bank1_df.loc[bank1_df['loan_status']=='Charged Off', 'grade'].value_counts().hvplot.barh() 

(bank1_fully_paid * bank1_charged_off).opts(
    title="Bank 1, Loan Status Grade", 
    xlabel='Grades', ylabel='Count',
    width=800, height=600,
    legend_cols=2,
    legend_position='top_right',
    xrotation=45
)

In [None]:
# Bank 2: horizontal bar charts of grade counts for 'Fully Paid' and
# 'Charged Off' loans, overlaid with `*` into a single plot.
bank2_fully_paid = bank2_df.loc[bank2_df['loan_status']=='Fully Paid', 'grade'].value_counts().hvplot.barh() 
bank2_charged_off = bank2_df.loc[bank2_df['loan_status']=='Charged Off', 'grade'].value_counts().hvplot.barh() 

(bank2_fully_paid * bank2_charged_off).opts(
    title="Bank 2, Loan Status Grade", 
    xlabel='Grades', ylabel='Count',
    width=800, height=600, 
    legend_cols=2,
    legend_position='top_right',
    xrotation=45
)

In [None]:
# Bank 1: horizontal bar charts of sub_grade counts for 'Fully Paid' and
# 'Charged Off' loans, overlaid with `*` into a single plot.
# Note: this rebinds bank1_fully_paid / bank1_charged_off from the grade plot cell.
bank1_fully_paid = bank1_df.loc[bank1_df['loan_status']=='Fully Paid', 'sub_grade'].value_counts().hvplot.barh() 
bank1_charged_off = bank1_df.loc[bank1_df['loan_status']=='Charged Off', 'sub_grade'].value_counts().hvplot.barh() 

(bank1_fully_paid * bank1_charged_off).opts(
    title='Bank 1, Loan Status SubGrade', 
    xlabel='Grades', 
    ylabel='Count',
    width=800, height=600, 
    legend_cols=2, 
    legend_position='top_right', 
    xrotation=45
)

In [None]:
# Bank 2: horizontal bar charts of sub_grade counts for 'Fully Paid' and
# 'Charged Off' loans, overlaid with `*` into a single plot.
bank2_fully_paid = bank2_df.loc[bank2_df['loan_status']=='Fully Paid', 'sub_grade'].value_counts().hvplot.barh() 
bank2_charged_off = bank2_df.loc[bank2_df['loan_status']=='Charged Off', 'sub_grade'].value_counts().hvplot.barh() 

(bank2_fully_paid * bank2_charged_off).opts(
    # The data plotted here is Bank 2's, so the title must say so
    title='Bank 2, Loan Status SubGrade', 
    xlabel='Grades', 
    ylabel='Count',
    width=800, height=600, 
    legend_cols=2, 
    legend_position='top_right', 
    xrotation=45
)

In [None]:
# Bank 1: annotated heatmap of the pairwise correlations between numeric columns,
# with the colour scale pinned to the full [-1, 1] range.
plt.figure(figsize=(12, 8))
sns.heatmap(bank1_df.corr(numeric_only = True), annot=True, vmin=-1, vmax=1, fmt='.2f', cmap='Spectral')

In [None]:
# Bank 2: annotated heatmap of the pairwise correlations between numeric columns,
# with the colour scale pinned to the full [-1, 1] range.
plt.figure(figsize=(12, 8))
sns.heatmap(bank2_df.corr(numeric_only = True), annot=True, vmin=-1, vmax=1, fmt='.2f', cmap='Spectral')

# Visualize and minimize outliers

In [None]:
# Bank 1, before outlier filtering: box plots of installment and loan_amnt,
# one box per loan_status.
bank1_installment_box = bank1_df.hvplot.box(
    y='installment', 
    subplots=True, 
    by='loan_status',
    width=800, height=600,
    rot = 45,
    title='Bank 1, Status by Installment', 
    xlabel='Loan Status', 
    ylabel='Installment'
)

bank1_amnt_box = bank1_df.hvplot.box(
    y='loan_amnt', 
    subplots=True, 
    by='loan_status', 
    width=800, height=600,
    rot = 45,
    title='Bank 1, Loan Status by Amount', 
    xlabel='Loan Status', 
    ylabel='Loan Amount'
)

# Combine both box plots into one output
bank1_installment_box + bank1_amnt_box 

In [None]:
# minimize outliers (Bank 1)
# Every filter below reads from and writes to bank1_df only; bank2_df has its
# own equivalent cell and must not be touched here.

# Keep rows whose annual income lies strictly between the 8th and 92nd percentiles
q_low = bank1_df['annual_inc'].quantile(0.08)
q_hi  = bank1_df['annual_inc'].quantile(0.92)
bank1_df = bank1_df[(bank1_df['annual_inc'] < q_hi) & (bank1_df['annual_inc'] > q_low)]
bank1_df = bank1_df[(bank1_df['dti'] <=45)]
# The 95th percentile is computed on the already-filtered frame
q_hi  = bank1_df['bc_open_to_buy'].quantile(0.95)
bank1_df = bank1_df[(bank1_df['bc_open_to_buy'] < q_hi)]
bank1_df = bank1_df[(bank1_df['bc_util'] <=160)]
bank1_df = bank1_df[(bank1_df['revol_util'] <=150)]
bank1_df = bank1_df[(bank1_df['num_op_rev_tl'] <=35)]


In [None]:
# Bank 1, after outlier filtering: the same installment and loan_amnt box plots
# as before the filter, re-drawn for comparison.
bank1_installment_box = bank1_df.hvplot.box(
    y='installment', 
    subplots=True, 
    by='loan_status',
    width=800, height=600,
    rot = 45,
    title='Bank 1, Status by Installment', 
    xlabel='Loan Status', 
    ylabel='Installment'
)

bank1_amnt_box = bank1_df.hvplot.box(
    y='loan_amnt', 
    subplots=True, 
    by='loan_status', 
    width=800, height=600,
    rot = 45,
    title='Bank 1, Loan Status by Amount', 
    xlabel='Loan Status', 
    ylabel='Loan Amount'
)

# Combine both box plots into one output
bank1_installment_box + bank1_amnt_box 

In [None]:
# Bank 2, before its outlier filtering cell: box plots of installment and
# loan_amnt, one box per loan_status.
bank2_installment_box = bank2_df.hvplot.box(
    y='installment', 
    subplots=True, 
    by='loan_status', 
    width=800, height=600,
    rot = 45,
    title='Bank 2, Status by Installment', 
    xlabel='Loan Status', 
    ylabel='Installment'
)

bank2_amnt_box = bank2_df.hvplot.box(
    y='loan_amnt', 
    subplots=True, 
    by='loan_status', 
    width=800, height=600,
    rot = 45,
    title='Bank 2, Loan Status by Amount', 
    xlabel='Loan Status', 
    ylabel='Loan Amount'
)

# Combine both box plots into one output
bank2_installment_box + bank2_amnt_box

In [None]:
# minimize outliers 
# Filters are applied one after another, so each threshold (including the
# bc_open_to_buy quantile) is evaluated on the rows that survived the previous step.
# Keep rows whose annual income lies strictly between the 8th and 92nd percentiles.
q_low = bank2_df['annual_inc'].quantile(0.08)
q_hi  = bank2_df['annual_inc'].quantile(0.92)
bank2_df = bank2_df[(bank2_df['annual_inc'] < q_hi) & (bank2_df['annual_inc'] > q_low)]
bank2_df = bank2_df[(bank2_df['dti'] <=45)]
# q_hi is reused: it now holds the 95th percentile of bc_open_to_buy
q_hi  = bank2_df['bc_open_to_buy'].quantile(0.95)
bank2_df = bank2_df[(bank2_df['bc_open_to_buy'] < q_hi)]
bank2_df = bank2_df[(bank2_df['bc_util'] <=160)]
bank2_df = bank2_df[(bank2_df['revol_util'] <=150)]
bank2_df = bank2_df[(bank2_df['num_op_rev_tl'] <=35)]


In [None]:
# Bank 2, after outlier filtering: the same installment and loan_amnt box plots
# as before the filter, re-drawn for comparison.
bank2_installment_box = bank2_df.hvplot.box(
    y='installment', 
    subplots=True, 
    by='loan_status', 
    width=800, height=600,
    rot = 45,
    title='Bank 2, Status by Installment', 
    xlabel='Loan Status', 
    ylabel='Installment'
)

bank2_amnt_box = bank2_df.hvplot.box(
    y='loan_amnt', 
    subplots=True, 
    by='loan_status', 
    width=800, height=600,
    rot = 45,
    title='Bank 2, Loan Status by Amount', 
    xlabel='Loan Status', 
    ylabel='Loan Amount'
)

# Combine both box plots into one output
bank2_installment_box + bank2_amnt_box

In [None]:
# Collect the names of bank1_df's object-dtype columns as the categorical variables
bank1_object_mask = bank1_df.dtypes == 'object'
bank1_df_categorical_variables = bank1_df.dtypes[bank1_object_mask].index.tolist()

# Show the resulting list
bank1_df_categorical_variables

In [None]:
# Create a list of bank2_df categorical variables 
# The object-dtype mask must come from bank2_df itself, not from bank1_df.
bank2_df_categorical_variables = list(bank2_df.dtypes[bank2_df.dtypes == 'object'].index)

# Display the categorical variables list
bank2_df_categorical_variables

In [None]:
# Instantiate OneHotEncoder
# sparse=False makes fit_transform return a dense array instead of a sparse matrix.
# NOTE(review): if the categorical list contains high-cardinality text columns
# (e.g. emp_title), the dense result may be very large — confirm memory use.
bank1_enc = OneHotEncoder(sparse=False)
# Encode the categorical variables using OneHotEncoder
bank1_encoded_data = bank1_enc.fit_transform(bank1_df[bank1_df_categorical_variables])


In [None]:
# Create a DataFrame with the bank1 encoded variables
# bank1_df keeps its original (gappy) row labels after the outlier filters, so the
# encoded frame reuses that index; with a fresh 0..n-1 index the later
# pd.concat(axis=1) would align on labels and pair rows incorrectly.
bank1_encoded_df = pd.DataFrame(
    bank1_encoded_data,
    columns = bank1_enc.get_feature_names_out(bank1_df_categorical_variables),
    index = bank1_df.index
)

# confirm 'keys' and review the DataFrame
display(bank1_encoded_df.columns)
print('\n')
display(bank1_encoded_df.head(3))

In [None]:
# Add the numerical variables from bank1_df DataFrame to the one-hot encoding bank1_encoded_df
# NOTE(review): pd.concat(axis=1) aligns rows by index label, so bank1_encoded_df
# must carry the same index as bank1_df for the rows to line up — confirm.
full_b1_encoded_df = pd.concat(
    [
        bank1_df.drop(columns=bank1_df_categorical_variables), 
        bank1_encoded_df
    ], 
        axis=1
)

# confirm 'keys' and review the DataFrame
display(full_b1_encoded_df.head(3))



In [None]:
# Bank 1: value counts for each remaining categorical column, each preceded by
# its column name as a label.
print('grade')
display(bank1_df['grade'].value_counts())
print('\nemp_title')
display(bank1_df['emp_title'].value_counts()) 
print('\nhome_ownership')
display(bank1_df['home_ownership'].value_counts())
print('\nverification_status')
display(bank1_df['verification_status'].value_counts())
print('\nloan_status')
display(bank1_df['loan_status'].value_counts())
# print('\naddr_state')
# display(bank1_df['addr_state'].value_counts())
print('\napplication_type')
display(bank1_df['application_type'].value_counts())
print('\nverification_status_joint')
display(bank1_df['verification_status_joint'].value_counts()) 

In [None]:
# Instantiate OneHotEncoder
# sparse=False makes fit_transform return a dense array instead of a sparse matrix.
# NOTE(review): if the categorical list contains high-cardinality text columns
# (e.g. emp_title), the dense result may be very large — confirm memory use.
bank2_enc = OneHotEncoder(sparse=False)
# Encode the categorical variables using OneHotEncoder
bank2_encoded_data = bank2_enc.fit_transform(bank2_df[bank2_df_categorical_variables])


In [None]:
# Create a DataFrame with the bank2 encoded variables
# bank2_df keeps its original (gappy) row labels after the outlier filters, so the
# encoded frame reuses that index; this keeps it row-aligned with bank2_df for
# any later label-based join or concat.
bank2_encoded_df = pd.DataFrame(
    bank2_encoded_data,
    columns = bank2_enc.get_feature_names_out(bank2_df_categorical_variables),
    index = bank2_df.index
)

# confirm 'keys' and review the DataFrame
display(bank2_encoded_df.columns)
print('\n')
display(bank2_encoded_df.head(3))

In [None]:
# Bank 2: value counts for each remaining categorical column, each preceded by
# its column name as a label.
print('grade')
display(bank2_df['grade'].value_counts())
print('\nemp_title')
display(bank2_df['emp_title'].value_counts()) 
print('\nhome_ownership')
display(bank2_df['home_ownership'].value_counts())
print('\nverification_status')
display(bank2_df['verification_status'].value_counts())
print('\nloan_status')
display(bank2_df['loan_status'].value_counts())
# print('\naddr_state')
# display(bank2_df['addr_state'].value_counts())
print('\napplication_type')
display(bank2_df['application_type'].value_counts())
print('\nverification_status_joint')
display(bank2_df['verification_status_joint'].value_counts()) 

In [None]:
# Create a DataFrame with the bank2 encoded variables
# Uses the bank2 encoder, encoded array and categorical list defined in earlier
# cells (the generic names `enc`, `encoded_data` and `categorical_variables` are
# never defined in this notebook), and `get_feature_names_out`, which replaces
# the deprecated `get_feature_names`.
encoded_df = pd.DataFrame(
    bank2_encoded_data,
    columns = bank2_enc.get_feature_names_out(bank2_df_categorical_variables),
    index = bank2_df.index   # keep rows aligned with bank2_df's labels
)

# confirm 'keys' and review the DataFrame
display(encoded_df.columns)
print('\n')
display(encoded_df.head(3))

# Identify NaN and replace with 0

In [None]:
# identify NaN
# First a single True/False for "any missing value at all", then the number of
# missing values per column.
display(bank1_df.isnull().values.any())
print('\n')
count_nan = bank1_df.isnull().sum()
display(count_nan)

In [None]:
# replace or fill NaN with 0 and confirm
# fillna(0) is applied to every column of the frame; the checks that follow
# should report no remaining missing values.
bank1_df = bank1_df.fillna(0)
display(bank1_df.isnull().values.any())
print('\n')
count_nan = bank1_df.isnull().sum()
display(count_nan)

In [None]:
# Review the DataFrame
display(bank1_df.head(3))
print('\n')
display(bank1_df.shape)
print('\n')
display(bank1_df.dtypes)
print('\n')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
bank1_df.info()

In [None]:
# identify NaN
# First a single True/False for "any missing value at all", then the number of
# missing values per column.
display(bank2_df.isnull().values.any())
print('\n')
count_nan = bank2_df.isnull().sum()
display(count_nan)

In [None]:
# replace or fill NaN with 0 and confirm
# fillna(0) is applied to every column of the frame; the checks that follow
# should report no remaining missing values.
bank2_df = bank2_df.fillna(0)
display(bank2_df.isnull().values.any())
print('\n')
count_nan = bank2_df.isnull().sum()
display(count_nan)

In [None]:
# Review the DataFrame
display(bank2_df.head(3))
print('\n')
display(bank2_df.shape)
print('\n')
display(bank2_df.dtypes)
print('\n')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
bank2_df.info()

In [None]:
# Create a list of bank2_df categorical variables 
# This rebinds bank2_df_categorical_variables from the current bank2_df dtypes.
bank2_df_categorical_variables = list(bank2_df.dtypes[bank2_df.dtypes == 'object'].index)

# Display the categorical variables list
bank2_df_categorical_variables

In [None]:
# Show every remaining bank1_df column name, one display call per column.
for col in bank1_df.columns:
    display(col)

In [None]:
# Use the `StandardScaler()` module from scikit-learn to normalize the numeric Bank 1 data
# bank1_df still holds string columns (e.g. grade, emp_title), which StandardScaler
# cannot convert to float, so only the numeric columns are scaled.
scaled_data = StandardScaler().fit_transform(bank1_df.select_dtypes(include='number'))

In [None]:
# use standard scaler to scale across all numeric column values
# StandardScaler cannot handle string columns, so restrict the input to numeric dtypes.
bank1_numeric_df = bank1_df.select_dtypes(include='number')
bank1_df_scaled = StandardScaler().fit_transform(bank1_numeric_df)

# create a dataframe with the scaled data, keeping bank1's own column names and
# row labels (the earlier version referenced names from an unrelated crypto
# notebook that are never defined here)
bank1_scaled_df = pd.DataFrame(
    bank1_df_scaled,
    columns=bank1_numeric_df.columns,
    index=bank1_numeric_df.index
)
bank1_scaled_df.head()

# Loan accepted: If the company approves the loan, there are three primary scenarios, described below:
> * Fully paid: Applicant has fully paid the loan (the principal and the interest rate)
> * Current: Applicant is in the process of paying the instalments, i.e. the tenure of the loan is not yet completed. These candidates are not labelled as 'defaulted'.
>> * Grace period
>> * Late (16-30)
>> * Late (31-120)
>> * Default
> * Charged-off: Applicant has not paid the instalments in due time for a long period of time, i.e. he/she has defaulted on the loan

In [None]:
# Import the full accepted-loans dataset (uncompressed CSV) into a Pandas DataFrame
# `parse_dates=True` only attempts to parse the index and no index column is set,
# so it and `infer_datetime_format` had no effect and are omitted.
accepted_df = pd.read_csv(
    Path('/Users/lokiskylizard/Desktop/proj2data/archive/accepted_2007_to_2018q4.csv/accepted_2007_to_2018Q4.csv'),
    low_memory=False
)

# Review the DataFrame
display(accepted_df.shape)
display(accepted_df.head(3))