# Task for Today  

***

## Loan Risk Prediction  
  
Given *data about German loans*, let's try to detect **high-risk loans** in the data.  
  
We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the credit CSV into a DataFrame; every later cell reads from `data`.
data = pd.read_csv('../input/german-credit-data-with-risk/german_credit_data.csv')

In [None]:
# Display the raw DataFrame as the cell output.
data

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes); the
# preprocessing below fills the missing account values it reveals.
data.info()

# Helper Functions

In [None]:
def binary_encode(df, columns_with_positive_values):
    """Return a copy of ``df`` with the listed columns collapsed to 0/1 flags.

    ``columns_with_positive_values`` is an iterable of
    ``(column, positive_value)`` pairs. In each named column, entries equal
    to ``positive_value`` become 1 and every other entry becomes 0. The
    input frame is not modified.
    """
    encoded = df.copy()
    for name, target in columns_with_positive_values:
        # Bind `target` as a default so the lambda does not rely on the loop
        # variable still holding this iteration's value.
        encoded[name] = encoded[name].apply(
            lambda cell, target=target: int(cell == target)
        )
    return encoded

def ordinal_encode(df, columns_with_orderings):
    """Return a copy of ``df`` with the listed columns replaced by their rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is not modified.
    columns_with_orderings : iterable of (column, ordering)
        ``ordering`` is a list of the column's possible values from lowest
        to highest. Each value is replaced by its position in that list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with integer ranks in the encoded columns.

    Raises
    ------
    ValueError
        If a column contains a value (including a missing value) that does
        not appear in its ordering.
    """
    df = df.copy()
    for column, ordering in columns_with_orderings:
        # Build the value -> rank lookup once per column instead of scanning
        # the list for every row. `setdefault` keeps the first position of a
        # repeated value, matching `list.index`.
        ranks = {}
        for position, value in enumerate(ordering):
            ranks.setdefault(value, position)

        encoded = df[column].map(ranks)

        # `map` yields NaN for anything missing from the lookup; report those
        # values with the column name rather than a bare "x is not in list".
        unknown = encoded.isna()
        if unknown.any():
            unexpected = df[column][unknown].unique().tolist()
            raise ValueError(
                f"Column {column!r} contains values missing from its "
                f"ordering: {unexpected!r}"
            )

        df[column] = encoded.astype('int64')
    return df

def onehot_encode(df, columns_with_prefixes):
    """Return a copy of ``df`` with the listed columns expanded to indicators.

    ``columns_with_prefixes`` is an iterable of ``(column, prefix)`` pairs.
    Each named column is replaced by one indicator column per distinct
    value, named with the given prefix, and appended at the end of the
    frame. The input frame is not modified.
    """
    result = df.copy()
    for name, prefix in columns_with_prefixes:
        indicators = pd.get_dummies(result[name], prefix=prefix)
        # Append the indicator columns, then remove the source column.
        result = pd.concat([result, indicators], axis=1).drop(columns=name)
    return result

In [None]:
def preprocess_inputs(df):
    """Turn the raw credit DataFrame into scaled train/test splits.

    Steps, in order:

    1. Drop the ``'Unnamed: 0'`` id column.
    2. Replace missing ``'Saving accounts'`` / ``'Checking account'`` values
       with the category ``'none'``.
    3. Binary-encode ``'Sex'`` (male -> 1) and ``'Risk'`` (bad -> 1).
    4. Ordinal-encode the two account columns.
    5. One-hot encode ``'Housing'`` and ``'Purpose'``.
    6. Split 70/30 with ``random_state=123``.
    7. Standardize the features with a scaler fitted on the training rows
       only, so no test-set statistics leak into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns named above. It is not modified.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features. They keep the row labels produced by the split, so
        they stay index-aligned with the matching label series.
    y_train, y_test : pandas.Series
        0/1 risk labels (1 = 'bad').
    """
    df = df.copy()

    # Drop duplicate id column
    df = df.drop('Unnamed: 0', axis=1)

    # Encode missing values as 'none'
    for column in ['Saving accounts', 'Checking account']:
        df[column] = df[column].fillna('none')

    # Binary encode the Sex and Risk columns
    df = binary_encode(
        df,
        columns_with_positive_values=[
            ('Sex', 'male'),
            ('Risk', 'bad'),
        ],
    )

    # Ordinal encode the Saving accounts and Checking account columns.
    # NOTE(review): the dataset description lists the savings levels as
    # "little, moderate, quite rich, rich", which suggests 'quite rich'
    # should rank below 'rich'. Left unchanged here -- confirm against the
    # data dictionary before swapping the last two entries.
    df = ordinal_encode(
        df,
        columns_with_orderings=[
            ('Saving accounts', ['none', 'little', 'moderate', 'rich', 'quite rich']),
            ('Checking account', ['none', 'little', 'moderate', 'rich']),
        ],
    )

    # One-hot encode the Housing and Purpose columns
    df = onehot_encode(
        df,
        columns_with_prefixes=[
            ('Housing', 'H'),
            ('Purpose', 'P'),
        ],
    )

    # Split df into X and y
    y = df['Risk'].copy()
    X = df.drop('Risk', axis=1).copy()

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

    # Scale X with a standard scaler fitted on the training rows only
    scaler = StandardScaler()
    scaler.fit(X_train)

    # Rebuild the frames with the split's own row labels. Without `index=`
    # the features would get a fresh 0..n-1 index while y keeps the shuffled
    # original labels, so index-aligned operations between X and y would
    # silently pair up the wrong rows.
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
def evaluate_model(model, X_test, y_test, classification_threshold=0.5):
    """Print and plot test-set metrics for a fitted binary classifier.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``. Column 1 of its output
        is used as the probability of the positive ("Risky") class.
    X_test
        Features to score.
    y_test
        True 0/1 labels for ``X_test``.
    classification_threshold : float, default 0.5
        A sample is predicted positive when its positive-class probability
        is greater than or equal to this value.

    Returns
    -------
    None
        The accuracy and classification report are printed and the
        confusion matrix is shown as a heatmap.
    """
    y_true = np.array(y_test)

    # Threshold the positive-class column directly. The builtin `int` is used
    # because the `np.int` alias was deprecated in NumPy 1.20 and removed in
    # 1.24, where it raises AttributeError.
    positive_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (positive_proba >= classification_threshold).astype(int)

    print("Test Accuracy: {:.2f}%".format(accuracy_score(y_true, y_pred) * 100))

    # Pin the label set so the matrix is always 2x2 and the report always has
    # two rows, matching the two tick labels / target names used below even
    # when one class is absent from both y_true and y_pred.
    class_labels = [0, 1]
    class_names = ["Not Risky", "Risky"]
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    clr = classification_report(y_true, y_pred, labels=class_labels, target_names=class_names)

    plt.figure(figsize=(8, 8))
    sns.heatmap(cm, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    # Offset by 0.5 so each tick sits in the middle of its heatmap cell.
    plt.xticks(np.arange(2) + 0.5, class_names)
    plt.yticks(np.arange(2) + 0.5, class_names)
    plt.title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n----------------------\n", clr)

# Preprocessing

In [None]:
# Encode, split (70/30) and scale the raw data into train/test features and
# their 0/1 risk labels.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Display the training labels (1 = 'bad' risk, 0 otherwise).
y_train

# Training

In [None]:
# Count the training examples in each class to see how balanced the labels are.
y_train.value_counts()

In [None]:
# Give the positive class (1 = 'bad' risk) a weight of 1.5 against 1 for
# class 0 when fitting the logistic regression.
risk_class_weights = {0: 1, 1: 1.5}
model = LogisticRegression(class_weight=risk_class_weights)

model.fit(X_train, y_train)

# Results

In [None]:
# Evaluate on the held-out split. With the threshold at 0.4, a loan is
# flagged risky when its predicted probability of class 1 is at least 0.4.
evaluate_model(model, X_test, y_test, classification_threshold=0.4)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/AWmsXeIcI_E