# **<span style="color:#121CB6;">Credit Risk Analysis using KNN</span>**

- We start with importing all libraries that are needed.

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# a path under this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Latex
sns.set_style('whitegrid')
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from joblib import Parallel, delayed
from sklearn.metrics import f1_score

# **<span style="color:#121CB6;">1.Loading and Understanding the dataset</span>**

In [None]:
# Read the loan dataset from the mounted Drive and preview its first rows.
loan_csv_path = "/content/drive/MyDrive/loan/loan.csv"
df_loan = pd.read_csv(loan_csv_path, encoding='utf-8')
df_loan.head(7)

In [None]:
# Summary of the frame as loaded: column names, dtypes and non-null counts.
df_loan.info()

# **<span style="color:#121CB6;">2. Removing Irrelevant Columns</span>**

In [None]:
# Keep only the columns used in this analysis; every other column is dropped.
# The surviving columns keep their original order in the frame.
columns_to_keep = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'emp_length',
    'home_ownership', 'annual_inc', 'verification_status', 'loan_status', 'purpose',
]
df_loan = df_loan.drop(columns=df_loan.columns.difference(columns_to_keep))

In [None]:
# Count missing values per remaining column.
df_loan.isnull().sum()

There are many "Missing Values" in Column "emp_length" and few in "annual_inc".

In [None]:
# Dtypes and non-null counts of the reduced frame.
df_loan.info()

In [None]:
# Preview the first ten rows of the reduced frame.
df_loan.head(10)

In [None]:
# Replace missing annual incomes with 0, then re-check the per-column null counts.
df_loan['annual_inc'] = df_loan['annual_inc'].fillna(0)
df_loan.isna().sum()

- To eliminate null values, missing entries in the annual income column are filled with 0.

# **<span style="color:#121CB6;">3. Create Label Column: Description of Loan Status</span>**

- In this column, the value 0 marks loans in good standing: 'Fully Paid', 'Does not meet the credit policy. Status:Fully Paid', 'Current'
- The value 1 marks loans that are late, in their grace period, charged off, or in default: 'Late (31-120 days)', 'Late (16-30 days)', 'In Grace Period', 'Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off'

In [None]:
# Binary target definition: each entry is (label, loan_status fragments that map to it).
# Entries are tried in order, so label 0 takes precedence over label 1.
label_categories = [
    (0, ['Fully Paid', 'Does not meet the credit policy. Status:Fully Paid', 'Current']),
    (1, ['Late (31-120 days)', 'Late (16-30 days)', 'In Grace Period',
         'Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off'])
]


def classify_label(text):
    """Return the binary label for a loan_status string.

    A status belongs to a category when any of that category's fragments
    occurs as a substring of ``text``. Returns None for missing values and
    for statuses that match no category.
    """
    if pd.isna(text):
        return None
    return next(
        (
            category
            for category, matches in label_categories
            if any(match in text for match in matches)
        ),
        None,
    )

# Derive the binary target from loan_status, then discard the raw status text.
df_loan = (
    df_loan
    .assign(label=df_loan['loan_status'].apply(classify_label))
    .drop(columns='loan_status')
)

In [None]:
# Hand-written ordinal encoders for columns with a natural ranking.

# Score for each loan grade; any value not listed here scores 0.
_GRADE_SCORES = {"E": 1, "D": 2, "C": 3, "B": 4, "A": 5}


def SC_LabelEncoder1(text):
    """Map a loan grade letter to an ordinal score.

    "A" scores highest (5) down to "E" (1). Every other value, including
    missing values and grades outside A-E, returns 0.
    """
    return _GRADE_SCORES.get(text, 0)


# Employment-length strings in ascending order; the score is the 1-based position.
_EMP_LENGTH_ORDER = (
    "< 1 year", "1 year", "2 years", "3 years", "4 years", "5 years",
    "6 years", "7 years", "8 years", "9 years", "10 years", "10+ years",
)
_EMP_LENGTH_SCORES = {
    label: rank for rank, label in enumerate(_EMP_LENGTH_ORDER, start=1)
}


def SC_LabelEncoder2(text):
    """Map an employment-length string to an ordinal score.

    "< 1 year" scores 1 and "10+ years" scores 12. Every other value,
    including missing values, returns 0.
    """
    return _EMP_LENGTH_SCORES.get(text, 0)

# Score for each home-ownership category; any value not listed here scores 0.
_HOME_OWNERSHIP_SCORES = {"RENT": 1, "MORTGAGE": 2, "OWN": 3}


def SC_LabelEncoder3(text):
    """Map a home-ownership category to its score.

    "RENT" -> 1, "MORTGAGE" -> 2, "OWN" -> 3. Every other value, including
    missing values, returns 0.
    """
    return _HOME_OWNERSHIP_SCORES.get(text, 0)

# Replace each ranked text column with the score from its hand-written encoder.
for column, encoder in (
    ("grade", SC_LabelEncoder1),
    ("emp_length", SC_LabelEncoder2),
    ("home_ownership", SC_LabelEncoder3),
):
    df_loan[column] = df_loan[column].apply(encoder)

In [None]:
# Preview the frame after the grade / emp_length / home_ownership encoding.
df_loan.head(10)

In [None]:
# Dtypes and non-null counts after encoding.
df_loan.info()

# **<span style="color:#121CB6;">4. Exploratory Data Analysis</span>**

In [None]:
# Grade vs. home ownership, viewed from both directions.
fig, ax = plt.subplots(1,2,figsize=(15,5))
sns.countplot(data=df_loan, x='grade', hue="home_ownership", ax=ax[0]).set_title("Grade/Home Ownership distribution");
sns.countplot(data=df_loan, x='home_ownership', hue='grade', ax=ax[1]).set_title("Grade/Home Ownership distribution");

# Label vs. purpose (left) and grade vs. label (right).
# The left title previously described grade and verification_status, which this
# panel does not plot; it now names the variables actually shown.
fig, ax = plt.subplots(1,2,figsize=(15,5))
sns.countplot(data=df_loan, x='label', hue='purpose', ax=ax[0]).set_title("Label distribution by purpose");
sns.countplot(data=df_loan, x='grade', hue='label', ax=ax[1]).set_title("Grade Distribution with loan_status");

## Analysis :
1. Borrowers with a high grade are fewer than borrowers with a low grade
2. For both label 0 and label 1, the most common loan purpose is debt consolidation
3. Grade 4 (encoded from "B") has the highest number of loans in good standing (label 0), while grade 3 (encoded from "C") has the highest number of problem loans (label 1)

In [None]:
# Distribution of loan amounts for each loan purpose.
plt.figure(figsize=(12,6))
sns.boxplot(x='purpose', y='loan_amnt', data=df_loan)
plt.xticks(rotation=30)  # purpose names are long; rotate so they do not overlap
# The trailing semicolon keeps the Text(...) repr out of the cell output,
# matching the other plotting cells in this notebook.
plt.title('Loan amounts grouped by purpose');

## Analysis :
There are 5 highest categories for the amount of credit with the following purposes: Credit card, MSME business, debt consolidation, home improvement, and buying a house

In [None]:
# Loan amount histogram and term counts, both split by label.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
amount_ax = sns.histplot(df_loan, x='loan_amnt', hue="label", bins=30, ax=ax[0])
amount_ax.set_title("Loan Ammount distribution")
term_ax = sns.countplot(data=df_loan, x='term', hue="label", ax=ax[1])
term_ax.set_title("Term distribution")

# Verification status (left) and home ownership (right) against label.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
verification_ax = sns.countplot(data=df_loan, x='verification_status', hue='label', ax=ax[0])
verification_ax.set_title("Verification Status Distribution with loan_status")
ownership_ax = sns.countplot(data=df_loan, hue='home_ownership', x='label', ax=ax[1])
ownership_ax.set_title("Home ownership with loan_status");

## Analysis :
1. The most common loan amount is around 10,000 USD
2. The most common term is 36 months, while 60-month loans amount to almost a third
3. Most of the loans that are paid in full come from borrowers with the "Verified" verification status

- Seeing the correlation between variables:

In [None]:
# Pearson correlations between the numeric features and the label.
corr_columns = ['loan_amnt', 'int_rate', 'grade', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'label']
# 'purpose' is still a text column at this point (it is integer-encoded in a
# later cell). DataFrame.corr() raises on non-numeric columns in pandas >= 2.0,
# so restrict the selection to numeric dtypes before computing correlations.
corr = df_loan[corr_columns].select_dtypes(include='number').corr()
sns.set(rc={'figure.figsize':(11,7)})
# Mask the upper triangle (diagonal included) so each pair is shown only once.
sns.heatmap(corr, linewidths=.5, annot=True, cmap="YlGnBu", mask=np.triu(np.ones_like(corr, dtype=bool)))\
    .set_title("Pearson Correlations Heatmap");

## Analysis :
The amount of credit is very dependent on the annual income of the borrower

# **<span style="color:#121CB6;">5. Pre-processing Data for Discrete Columns</span>**

In [None]:
# Integer-encode the remaining categorical columns, one fresh LabelEncoder per column.
for col in ("verification_status", "purpose", "term"):
    le = LabelEncoder()
    df_loan[col] = le.fit_transform(df_loan[col])
df_loan.head()

In [None]:
# Re-check missing values per column after all encoding steps.
df_loan.isnull().sum()

In [None]:
# Rows with no label (classify_label returned None for them) are counted as class 1.
df_loan['label'] = df_loan['label'].fillna(1)

# **<span style="color:#121CB6;">6. Clustering</span>**

In [None]:
# Elbow search: fit K-Means for k = 2..15 on the full frame and record each inertia.
cluster_range = range(2, 16)
inertias = []

for i in cluster_range:
    kmeans = KMeans(n_clusters=i, n_init='auto', random_state=0)
    kmeans.fit(df_loan)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(10, 5))
plt.title('Inertias v.s. N_Clusters')
plt.plot(list(cluster_range), inertias, marker='o', lw=2);

## Analysis:
"Elbow" on the chart above is at 4. The number of clusters must be 4.

In [None]:
sample_size = 1000  # Adjust the sample size based on your data size
# Cap the request at the number of available rows: DataFrame.sample draws
# without replacement by default and raises ValueError when asked for more
# rows than the frame contains.
df_sample = df_loan.sample(min(sample_size, len(df_loan)), random_state=0)

# Cluster the sampled rows (all columns of df_sample) into 4 groups.
km = KMeans(n_clusters=4, n_init=10, random_state=0)
clusters = km.fit_predict(df_sample)

# Creating a DataFrame for clustered data: the features to plot plus the cluster id
pairplot_columns = ['loan_amnt', 'int_rate', 'grade', 'emp_length', 'home_ownership', 'annual_inc', 'purpose']
df_clustered = df_sample[pairplot_columns].copy()
df_clustered["Cluster"] = clusters

# Visualizing the clustered data using pairplot
sns.pairplot(df_clustered, hue="Cluster", markers='.', plot_kws={'alpha':0.5});

# **<span style="color:#121CB6;">7. Predicting Risk: Using the K-Nearest Neighbors Classification Model</span>**

In [None]:
# Separate features from the target, then hold out 20% of the rows for testing.
y = df_loan["label"]
X = df_loan.drop(columns="label")
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [None]:
def calculate_score(k, X_train, y_train, X_test, y_test):
    """Fit a k-nearest-neighbours classifier and score it on the given test split.

    Returns a ``(k, score)`` tuple, where ``score`` is the micro-averaged F1
    of the predictions for ``X_test`` measured against ``y_test``.
    """
    classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return k, f1_score(y_test, predictions, average='micro')


# Evaluate k = 1..99 in parallel on all available cores.
# NOTE(review): k is selected using the test split itself, so the best score
# reported below is an optimistic estimate - consider a validation split.
results = Parallel(n_jobs=-1)(
    delayed(calculate_score)(k, X_train, y_train, X_test, y_test)
    for k in range(1, 100)
)

# Pick the first k with the strictly highest score. The leading (0, 0) entry is
# the fallback that stays selected when no score exceeds 0.
max_k, max_score = max([(0, 0), *results], key=lambda pair: pair[1])

In [None]:
# Report the best k and the score it achieved (message text is in Indonesian).
print(
    'Jika kita menggunakan K-Nearest Neighbors Classification, maka nilai K yang terbaik adalah',
    max_k,
    'untuk mendapatkan prediksi terbaik, dengan akurasi rata-rata sebesar',
    max_score,
)

Classification with other ML models

Since the KNN (K-Nearest Neighbors) Classification takes a lot of time and memory to predict, it is possible to use other ML models such as SVC, DecisionTree, RandomForest, and GaussianNaiveBayes.

However, in this notebook we use only the KNN model, which reaches a good accuracy of 85.27%.