In [None]:
import pandas as pd
import numpy as np
from functions import *

In [None]:
# Load the loan dataset: 8 categorical attributes (counting the Loan_Status
# label), 4 numerical attributes, and a Loan_ID identifier column.
# The label column is renamed to 'target' straight away.
loan_df = pd.read_csv('datasets/loan.csv', sep=',').rename(columns={'Loan_Status': 'target'})

# Take the label out and assign it back so that 'target' ends up as the
# last column of the frame.
column = loan_df.pop('target')
loan_df['target'] = column

# Discard the identifier column.
loan_df = loan_df.drop(columns=['Loan_ID'])

# Loan_ID is only an identifier and carries no predictive information, so it is dropped before modelling.
# Candidate algorithms for this dataset: KNN, decision tree, naive Bayes, random forest, neural networks.

In [None]:
# Preview the first rows of the frame after 'target' was moved to the end and Loan_ID was dropped.
loan_df.head()

In [None]:
# Column-type lookup keyed by column name. A value of 1 flags a column that
# the later preprocessing cell puts in `to_normalize`; a value of 0 flags one
# that goes to `to_encode` ('target' is skipped there).
type_of_col = {
    name: int(name in ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'))
    for name in (
        'Gender',
        'Married',
        'Dependents',
        'Education',
        'Self_Employed',
        'ApplicantIncome',
        'CoapplicantIncome',
        'LoanAmount',
        'Loan_Amount_Term',
        'Credit_History',
        'Property_Area',
        'target',
    )
}

In [None]:
# Summary statistics of the frame before rows with missing values are removed.
loan_df.describe()

In [None]:
# Keep only the rows that have no missing value in any column.
loan_df = loan_df.dropna()

In [None]:
# Summary statistics again, now that rows with missing values have been dropped.
loan_df.describe()

In [None]:
# Number of records per value of the 'target' label (class balance check).
loan_df['target'].value_counts()

In [None]:
# Split the feature columns (everything except 'target') into two lists
# according to the flags in type_of_col: truthy -> to_normalize, falsy -> to_encode.
feature_cols = [name for name in loan_df.columns if name != 'target']
to_normalize = [name for name in feature_cols if type_of_col[name]]
to_encode = [name for name in feature_cols if not type_of_col[name]]

In [None]:
# Preprocessing pipeline built from helpers imported via `from functions import *`.
# NOTE(review): the helpers are not visible here; the descriptions below are
# inferred from their names and should be confirmed against functions.py.
# Presumably rescales the columns listed in to_normalize.
norm_loan_df = normalise(loan_df, to_normalize)
# Presumably one-hot encodes the columns listed in to_encode.
encoded_loan_df = encode_one_hot(norm_loan_df, to_encode)
# Presumably returns the records in shuffled order.
shuffled_df = shuffle_dt(encoded_loan_df)

In [None]:
# Split the preprocessed records into 10 pieces that the cross-validation loop
# uses as folds. The shuffled frame is passed here: the original code handed
# over encoded_loan_df, which left shuffled_df unused and discarded the shuffle.
# NOTE(review): assumes shuffle_dt returns the shuffled records rather than
# shuffling in place — confirm against functions.py.
smaller_dfs = distribute_records(shuffled_df, 10)

# Despite its name, max_k is used as the *step* of range(1, 52, max_k) when the
# notebook sweeps over k (k = 1, 7, 13, ...).
max_k = 6

In [None]:
# 10-fold cross-validation sweep over k = range(1, 52, max_k).
# For every k, each fold is held out once as the test set while the remaining
# folds are concatenated into the training set. The outer lists end up with one
# entry per k, each entry being the list of per-fold scores.
accuracies_test, f1_test = [], []
accuracies_train, f1_train = [], []

for k_val in range(1, 52, max_k):
    print('k_val: ', k_val)
    per_fold = []  # one (test acc, test f1, train acc, train f1) tuple per fold
    for fold in range(10):
        print('Fold: ', fold)
        held_out = smaller_dfs[fold]
        remainder = pd.concat(smaller_dfs[:fold] + smaller_dfs[fold + 1:])
        # Both frames get a fresh 0..n-1 index; the held-out fold is reset in place.
        held_out.reset_index(inplace=True, drop=True)
        remainder.reset_index(inplace=True, drop=True)
        test_acc, test_f1 = accuracy_on_test(remainder, held_out, k_val)
        train_acc, train_f1 = accuracy_on_train(remainder, k_val)
        per_fold.append((test_acc, test_f1, train_acc, train_f1))

    # Transpose the per-fold tuples into four per-metric lists.
    acc_ts, f1_ts, acc_tr, f1_tr = (list(series) for series in zip(*per_fold))
    accuracies_test.append(acc_ts)
    f1_test.append(f1_ts)
    accuracies_train.append(acc_tr)
    f1_train.append(f1_tr)

In [None]:
# Collect the cross-validation scores into four tables: one column per k value,
# one row per fold.
results = []
k_values = list(range(1, 52, max_k))

final_df_acc = pd.DataFrame()
final_df_f1 = pd.DataFrame()
final_df_acc_tr = pd.DataFrame()
final_df_f1_tr = pd.DataFrame()

# Each table is paired with the list of per-k score lists that fills it.
score_tables = (
    (final_df_acc_tr, accuracies_train),
    (final_df_f1_tr, f1_train),
    (final_df_acc, accuracies_test),
    (final_df_f1, f1_test),
)
for position, k_val in enumerate(k_values):
    for table, per_k_scores in score_tables:
        table[k_val] = per_k_scores[position]

In [None]:
# Label the columns of the two test-score tables as 'k_val_<k>'.
final_df_acc, final_df_f1 = (
    table.add_prefix('k_val_') for table in (final_df_acc, final_df_f1)
)

In [None]:
# Append a 'Mean' row holding the per-k average test accuracy across folds.
# A 'Mean' row left over from an earlier run of this cell is removed first, so
# re-running the cell does not stack duplicate 'Mean' rows.
acc_per_fold = final_df_acc.drop(index='Mean', errors='ignore')
mean_row = acc_per_fold.mean(axis=0)
mean_row.name = 'Mean'
final_df_acc = pd.concat([acc_per_fold, mean_row.to_frame().T])
final_df_acc

In [None]:
# Append a 'Mean' row holding the per-k average test F1 score across folds.
# A 'Mean' row left over from an earlier run of this cell is removed first, so
# re-running the cell does not stack duplicate 'Mean' rows.
f1_per_fold = final_df_f1.drop(index='Mean', errors='ignore')
mean_row = f1_per_fold.mean(axis=0)
mean_row.name = 'Mean'
final_df_f1 = pd.concat([f1_per_fold, mean_row.to_frame().T])
final_df_f1

In [None]:
# Training accuracy vs k: per-k mean across folds, with the across-fold
# standard deviation drawn as error bars.
train_cols = final_df_acc_tr.columns
train_accuracies = [final_df_acc_tr[name].mean() for name in train_cols]

k_values = list(range(1, 52, max_k))

train_std_dev = [final_df_acc_tr[name].std() for name in train_cols]

# Connected line through the means, then the same points again with error bars.
plt.plot(k_values, train_accuracies, marker='o', linestyle='-')
plt.errorbar(k_values, train_accuracies, yerr=train_std_dev, fmt='o', capsize=5)
plt.xlabel('k values')
plt.ylabel('Accuracies over training data')
plt.title('Accuracy vs k values')
plt.grid(True)
plt.show()

In [None]:
# Test accuracy vs k: per-k mean across folds, with the across-fold standard
# deviation drawn as error bars.
#
# final_df_acc may already carry the appended 'Mean' summary row. That row is
# excluded here: including it leaves the mean unchanged but adds a
# zero-deviation sample, which shrinks the standard deviation and makes the
# error bars inconsistent with the training-accuracy plot.
test_acc_per_fold = final_df_acc.drop(index='Mean', errors='ignore')

test_accuracies = [test_acc_per_fold[name].mean() for name in test_acc_per_fold.columns]

k_values = list(range(1, 52, max_k))

test_std_dev = [test_acc_per_fold[name].std() for name in test_acc_per_fold.columns]

# Connected line through the means, then the same points again with error bars.
plt.plot(k_values, test_accuracies, marker='o', linestyle='-')
plt.errorbar(k_values, test_accuracies, yerr=test_std_dev, fmt='o', capsize=5)
plt.xlabel('k values')
plt.ylabel('Accuracies over testing data')
plt.title('Accuracy vs k values')
plt.grid(True)
plt.show()