# Imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import plotly
from plotly.offline import iplot
import plotly.express as px
import plotly.figure_factory as ff

import cufflinks as cf

cf.go_offline()
plotly.offline.init_notebook_mode()
cf.set_config_file(world_readable=True, theme='space', offline=True)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier

import optuna

# Data

In [None]:
# Load the training data; the 'Id' column is used as the DataFrame index.
df = pd.read_csv("../input/loan-prediction-based-on-customer-behavior/Training Data.csv", index_col='Id')
# Show (n_rows, n_columns) as the cell output.
df.shape

# Exploratory Data Analysis

In [None]:
# Every column except the last one is treated as a feature
# (assumes the target is the last column of the file — TODO confirm).
feats = df.columns[:-1]
# Boolean mask over `feats`: True where the column holds object (string) values.
is_cat = (df.dtypes.loc[feats] == 'object').to_numpy()
cat_feats = [name for name, categorical in zip(feats, is_cat) if categorical]
num_feats = [name for name, categorical in zip(feats, is_cat) if not categorical]
print(cat_feats)
print(num_feats)

In [None]:
def show_hists(df: pd.DataFrame) -> None:
    """Draw one histogram per column of ``df`` on a two-column subplot grid.

    The grid keeps the original minimum of three rows and grows when ``df``
    has more than six columns, so wider frames no longer make ``add_subplot``
    raise because the subplot index exceeds a fixed 3x2 layout.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are plotted; each subplot is titled with the
        column name.
    """
    n_grid_cols = 2
    # Ceiling division without importing math; never fewer than 3 rows so the
    # layout for up to six columns is unchanged.
    n_grid_rows = max(3, -(-len(df.columns) // n_grid_cols))

    fig = plt.figure(figsize=(12, 8))
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    for i, f in enumerate(df.columns):
        axis = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
        axis.hist(df[f])
        if df[f].dtype == 'object' and df[f].nunique() > 3:
            # String columns with more than three distinct values get an
            # explicitly empty x-axis label.
            axis.set(xlabel=None, title=f)
        else:
            axis.set(title=f)
    # plt.show() instead of fig.show(): the latter is meant for GUI backends,
    # and plt.show() is what the other plotting helper in this notebook uses.
    plt.show()

In [None]:
# Histograms of all numeric features, then of the first three categorical ones.
show_hists(df[num_feats])
show_hists(df[cat_feats[:3]])

In [None]:
def feat_likelihoods(
    df: pd.DataFrame,
    feat: str,
    target: str,
    is_num=False
) -> pd.Series:
    """Return, per value of ``feat``, the mean of ``target`` minus its overall mean.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both the ``feat`` and ``target`` columns.
    feat : str
        Column to group by.
    target : str
        Column whose group means are compared with the overall mean.
    is_num : bool, default False
        When True the result is never re-sorted and keeps the groupby order.

    Returns
    -------
    pd.Series
        Indexed by the distinct values of ``feat`` and named ``target``.
        If there are more than 10 distinct values and ``is_num`` is False,
        the series is sorted by deviation in descending order.
    """
    overall_mean = df[target].mean()
    group_means = df.groupby(feat)[target].mean()
    deviation = group_means - overall_mean
    if is_num or deviation.size <= 10:
        return deviation
    return deviation.sort_values(ascending=False)

In [None]:
# For every feature, compute the per-value deviation of the mean Risk_Flag from the
# overall mean, and collect the resulting Series in an outer Series keyed by feature name.
lh_by_cat_feats = pd.Series({f: feat_likelihoods(df, f, 'Risk_Flag') for f in cat_feats})
# num_feats[1:] skips the first numeric feature; is_num=True keeps the values in
# groupby order instead of sorting them by deviation.
lh_by_num_feats = pd.Series({f: feat_likelihoods(df, f, 'Risk_Flag', is_num=True) for f in num_feats[1:]})
likelihoods = pd.concat([lh_by_cat_feats, lh_by_num_feats])

# Features with more than 10 distinct values use iplot's default plot kind;
# the others are drawn as bar charts.
for f, lh in likelihoods.items():
    if lh.size > 10:
        lh.iplot(orientation='h', title=f)
    else:
        lh.iplot(kind='bar', orientation='h', title=f)

# Categorical feats encoding

**Married/Single** and **Car_Ownership** are easy to encode.    
For **CITY** and **STATE** we will try target encoding.  
For **Profession** we can use one-hot encoding after replacing each profession with its category (software, medicine, ...).

In [None]:
# Maps each raw Profession value to one or more coarse categories.
# Several categories for one profession are joined with '|' so that they can
# later be split into separate indicator columns with str.get_dummies('|').
prof_categories = {
    'Mechanical_engineer'       : 'Tech',
    'Software_Developer'        : 'Soft',
    'Technical_writer'          : 'Tech',
    'Civil_servant'             : 'Gov',
    'Economist'                 : 'Math|Sc',
    'Flight_attendant'          : 'Avia',
    'Architect'                 : 'Design|Tech|Draw',
    'Designer'                  : 'Design',
    'Physician'                 : 'Sc|Tech',
    'Financial_Analyst'         : 'Fin|Math|Sc',
    'Air_traffic_controller'    : 'Avia',
    'Politician'                : 'Gov',
    'Police_officer'            : 'Law|Force',
    'Artist'                    : 'Art|Draw',
    'Surveyor'                  : 'Geo',
    'Design_Engineer'           : 'Design|Tech',
    'Chemical_engineer'         : 'Chem|Tech',
    'Hotel_Manager'             : 'Fin|Manage',
    'Dentist'                   : 'Med',
    'Comedian'                  : 'Art',
    'Biomedical_Engineer'       : 'Bio|Tech',
    'Graphic_Designer'          : 'Design',
    'Computer_hardware_engineer': 'Soft|Tech',
    'Petroleum_Engineer'        : 'Tech|Chem|Geo',
    'Computer_operator'         : 'Soft|Tech',
    'Chartered_Accountant'      : 'Fin|Staff|Office',
    'Microbiologist'            : 'Bio|Sc',
    'Fashion_Designer'          : 'Design',
    'Technician'                : 'Tech',
    'Aviator'                   : 'Avia',
    'Psychologist'              : 'Med|Sc',
    'Magistrate'                : 'Law|Gov',
    'Lawyer'                    : 'Law',
    'Engineer'                  : 'Tech',
    'Official'                  : 'Gov',
    'Analyst'                   : 'Math|Sc|Office',
    'Geologist'                 : 'Geo|Sc',
    'Drafter'                   : 'Tech|Soft|Draw',
    'Statistician'              : 'Math|Sc',
    'Web_designer'              : 'Design',
    'Army_officer'              : 'Gov|Force',
    'Surgeon'                   : 'Med',
    'Scientist'                 : 'Sc',
    'Civil_engineer'            : 'Tech',
    'Industrial_Engineer'       : 'Tech',
    'Technology_specialist'     : 'Tech',
    'Firefighter'               : 'Rescue',
    'Consultant'                : 'Staff',
    'Chef'                      : 'Food',
    'Secretary'                 : 'Fin|Staff|Office',
    'Librarian'                 : 'Staff'
}

# Swap every profession for its '|'-joined category string. The result is assigned
# back to the column: calling replace(inplace=True) on the df['Profession'] selection
# is chained assignment, which recent pandas warns about and which leaves df
# unchanged once Copy-on-Write is active.
df['Profession'] = df['Profession'].replace(prof_categories)
# One 0/1 indicator column per category; a profession with several categories
# sets several indicators.
df = pd.concat([df, df['Profession'].str.get_dummies('|')], axis=1)
# The raw column is no longer needed once the indicators exist.
df = df.drop(columns=['Profession'])

In [None]:
# Replace House_Ownership with two boolean indicators; any value other than
# 'owned' / 'rented' ends up False in both.
df = df.assign(
    Own_House=df['House_Ownership'].eq('owned'),
    Rent_House=df['House_Ownership'].eq('rented'),
).drop(columns=['House_Ownership'])

In [None]:
# Two-valued columns that are turned into integer codes.
to_binary = ['Married/Single', 'Car_Ownership']
# High-cardinality columns that are replaced by a per-value numeric encoding.
to_target_encoding = ['STATE', 'CITY']

def encode_categories(data: pd.DataFrame, target_encodings=None) -> pd.DataFrame:
    """Encode the categorical columns of ``data`` in place and return it.

    * Columns in ``to_binary`` are mapped to integer codes assigned in order of
      first appearance (the first distinct value becomes 0, the next 1, ...).
    * Columns in ``to_target_encoding`` are mapped through the matching entry of
      ``target_encodings`` and renamed with an ``_lh`` suffix.

    Results are assigned back to the columns rather than using
    ``data[col].replace(..., inplace=True)``: that form is chained assignment and
    does not modify ``data`` when pandas Copy-on-Write is active.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding every column named in ``to_binary`` and
        ``to_target_encoding``. It is modified in place.
    target_encodings : mapping of column name -> (dict or pd.Series), optional
        Per-column value -> number lookup. Defaults to the notebook-level
        ``likelihoods``. Values missing from a lookup become NaN.

    Returns
    -------
    pd.DataFrame
        The same object as ``data``.

    Notes
    -----
    NOTE(review): the default ``likelihoods`` is computed from the full frame
    before the train/test split, so the encoding has seen test-set targets.
    """
    if target_encodings is None:
        # Backward-compatible default: the encodings built earlier in the notebook.
        target_encodings = likelihoods
    for bf in to_binary:
        encoding = {name: code for code, name in enumerate(data[bf].unique())}
        data[bf] = data[bf].map(encoding)
    for tef in to_target_encoding:
        data[tef] = data[tef].map(target_encodings[tef])
    data.rename(columns={f: f + "_lh" for f in to_target_encoding}, inplace=True)
    return data

In [None]:
# Encode the remaining categorical columns. This cell is not re-runnable on the
# same df: the STATE / CITY columns are renamed to *_lh by the first run.
df = encode_categories(df)
df.head()

# Model selection

As you can see, the dataset is imbalanced. Because of this, we will try to solve the task both as anomaly detection and as imbalanced classification with SMOTE oversampling.

In [None]:
def scale(train_X: pd.DataFrame, test_X: pd.DataFrame) -> None:
    """Standardize ``train_X`` and ``test_X`` in place.

    The scaler is fitted on ``train_X`` only and then applied to both frames,
    so no test-set statistics are used.

    The scaled values are written back into the frames that were passed in.
    Previously the results were only bound to the local parameter names and
    discarded on return, so callers such as ``scale(X_train, X_test)`` kept
    working with unscaled data.

    Parameters
    ----------
    train_X : pd.DataFrame
        Training features; used to fit the scaler. Modified in place.
    test_X : pd.DataFrame
        Test features with the same columns. Modified in place.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_X)
    test_scaled = scaler.transform(test_X)
    # Assign whole columns (instead of writing through .iloc) so integer and
    # boolean columns can take the scaled float values.
    train_X[list(train_X.columns)] = train_scaled
    test_X[list(test_X.columns)] = test_scaled

In [None]:
def fit_predict(model, train_X, train_y, test_X):
    """Fit ``model`` on the training data and return its predictions for ``test_X``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train_X, train_y
        Training features and targets passed to ``model.fit``.
    test_X
        Features passed to ``model.predict``.

    Returns
    -------
    pd.Series
        The predictions wrapped in a Series with a fresh default index.
    """
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    return pd.Series(predictions)

In [None]:
def show_confusion_matrix(actual, predict):
    """Plot the confusion matrix of ``predict`` against ``actual`` as a heatmap.

    Each cell is annotated with its name (TN, FP, FN, TP) and its share of all
    samples. The 2x2 annotation grid means a binary problem is expected.

    Parameters
    ----------
    actual
        True labels.
    predict
        Predicted labels.
    """
    matrix = confusion_matrix(actual, predict)
    # Fraction of all samples that falls into each cell, in row-major order.
    shares = matrix.flatten() / np.sum(matrix)
    cell_names = ('TN', 'FP', 'FN', 'TP')
    annotations = np.asarray([
        name + "\n" + '{0:.2%}'.format(share)
        for name, share in zip(cell_names, shares)
    ]).reshape(2, 2)
    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(matrix, annot=annotations, fmt='', cmap='inferno')
    plt.show()

def try_models(clfs, X_train, y_train, X_test, fit_predict=fit_predict, test_y=None):
    """Fit every model in ``clfs`` and report its scores on the test data.

    For each model this prints accuracy and ROC-AUC and draws the confusion
    matrix.

    Parameters
    ----------
    clfs : dict
        Display name -> estimator.
    X_train, y_train
        Training features and targets.
    X_test
        Test features.
    fit_predict : callable, optional
        ``(model, train_X, train_y, test_X) -> predictions``. Defaults to the
        notebook's ``fit_predict``.
    test_y : optional
        True test labels. When omitted, the notebook-level ``y_test`` is used,
        which is what this function always read before the parameter existed.

    Notes
    -----
    ROC-AUC is computed from hard class predictions, not from scores or
    probabilities.
    """
    if test_y is None:
        # Backward compatibility for existing calls that rely on the global.
        test_y = y_test
    for clf_name, clf_model in clfs.items():
        pred_y = fit_predict(clf_model, X_train, y_train, X_test)
        print(f"{clf_name}:")
        print(f"Accuracy: {round(accuracy_score(test_y, pred_y) * 100, 1)}%")
        print(f"ROC-AUC: {round(roc_auc_score(test_y, pred_y), 3)}\n")
        show_confusion_matrix(test_y, pred_y)

## Task - anomaly detection.

In [None]:
# Separate the features from the Risk_Flag target, then hold out a test split
# with a fixed random_state.
X, y = df.drop(columns=['Risk_Flag']), df['Risk_Flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# The return value of scale() is not used here.
scale(X_train, X_test)

In [None]:
def anomalies_fp(model, train_X, train_y, test_X):
    """Run ``fit_predict`` and recode the detector output into 0/1 labels.

    A prediction of -1 becomes 1 and a prediction of 1 becomes 0; the arguments
    are forwarded unchanged to ``fit_predict``.
    """
    recode = {-1: 1, 1: 0}
    raw_labels = fit_predict(model, train_X, train_y, test_X)
    return raw_labels.replace(recode)

# Outlier detectors to compare, keyed by display name.
clfs = {
    'OneClassSVM': OneClassSVM(),
    'IsolationForest': IsolationForest(random_state=0)
}
# anomalies_fp is passed as the fit_predict callback so the -1/1 predictions
# are recoded to 1/0 before scoring.
try_models(clfs, X_train, y_train, X_test, anomalies_fp)

#### Conclusion.
* Both models make too many type I errors, so they can't be used.

## Task - imbalanced classification.

In [None]:
def smote_oversampling(train_X: pd.DataFrame, train_y: pd.Series) -> tuple:
    """Return the training data resampled with SMOTE.

    The resampled data is returned instead of being bound to the local
    parameter names: the earlier version discarded the result on return, so
    the caller's training set was never oversampled.

    Parameters
    ----------
    train_X : pd.DataFrame
        Training features.
    train_y : pd.Series
        Training targets.

    Returns
    -------
    tuple
        ``(resampled_X, resampled_y)`` as produced by ``SMOTE.fit_resample``.
    """
    smote = SMOTE(random_state=0)
    return smote.fit_resample(train_X, train_y)

X, y = df.drop(columns=['Risk_Flag']), df['Risk_Flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Only the training split is oversampled; the test split keeps its original
# class distribution.
X_train, y_train = smote_oversampling(X_train, y_train)
scale(X_train, X_test)

In [None]:
# Supervised classifiers to compare, keyed by display name.
clfs = {
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Logistic Regression": LogisticRegression(solver='liblinear'),
    "Random Forest": RandomForestClassifier(random_state=0),
    "XGBoost Classifier": XGBClassifier(
        n_estimators=5000, eval_metric='auc',
        use_label_encoder=False)
}

# No callback is passed, so try_models uses its default fit_predict.
try_models(clfs, X_train, y_train, X_test)

#### Conclusion.
* Logistic Regression never detects loan defaults, so it can't be used
* Precisions:
    - Decision Tree ~ 0.52
    - Random Forest ~ 0.61
    - XGBoost       ~ 0.59
* Random Forest is the model with the best precision
* Decision Tree is the model with the best roc-auc score

# Hyperparameters tuning

In [None]:
def rf_objective(trial):
    """Optuna objective for tuning a ``RandomForestClassifier``.

    Samples hyperparameters from ``trial``, fits the forest on the notebook-level
    ``X_train`` / ``y_train`` and scores it on ``X_test`` / ``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Negative ROC-AUC of the hard class predictions, so that a minimizing
        study maximizes ROC-AUC.

    Notes
    -----
    NOTE(review): the score is computed on the test split, so the tuned
    parameters are selected on the same data used for the final evaluation.
    """
    params = {
        "n_estimators": trial.suggest_int('n_estimators', 10, 100),
        "max_depth": trial.suggest_categorical("max_depth", [7, 8, 9, 10, 11, 12, None]),
        "criterion": trial.suggest_categorical('criterion', ["gini", "entropy"]),
        "min_samples_split": trial.suggest_int('min_samples_split', 2, 5),
        "min_samples_leaf": trial.suggest_categorical('min_samples_leaf', [1, 2]),
        # "auto" is no longer offered: for classifiers it was an alias of "sqrt",
        # and scikit-learn >= 1.3 rejects it.
        "max_features": trial.suggest_categorical('max_features', ["sqrt", "log2"]),
        # Single-choice categoricals keep the fixed settings in study.best_params.
        "class_weight": trial.suggest_categorical('class_weight', ["balanced"]),
        "random_state": trial.suggest_categorical('random_state', [0]),
        "n_jobs": trial.suggest_categorical('n_jobs', [-1]),
    }
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    return -roc_auc_score(y_test, model.predict(X_test))

In [None]:
def dt_objective(trial):
    """Optuna objective for tuning a ``DecisionTreeClassifier``.

    Samples hyperparameters from ``trial``, fits the tree on the notebook-level
    ``X_train`` / ``y_train`` and scores it on ``X_test`` / ``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Negative ROC-AUC of the hard class predictions, so that a minimizing
        study maximizes ROC-AUC.

    Notes
    -----
    NOTE(review): the score is computed on the test split, so the tuned
    parameters are selected on the same data used for the final evaluation.
    """
    params = {
        "criterion": trial.suggest_categorical('criterion', ["gini", "entropy"]),
        "min_samples_split": trial.suggest_int('min_samples_split', 2, 5),
        "min_samples_leaf": trial.suggest_categorical('min_samples_leaf', [1, 2]),
        # "auto" is no longer offered: for classifiers it was an alias of "sqrt",
        # and scikit-learn >= 1.3 rejects it.
        "max_features": trial.suggest_categorical('max_features', ["sqrt", "log2"]),
        # Single-choice categoricals keep the fixed settings in study.best_params.
        "class_weight": trial.suggest_categorical('class_weight', ["balanced"]),
        "random_state": trial.suggest_categorical('random_state', [0])
    }
    # This objective tunes the decision tree; it previously built a
    # RandomForestClassifier, so the search never evaluated the intended model.
    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)
    return -roc_auc_score(y_test, model.predict(X_test))

In [None]:
# Best score is 0.8412
# NOTE(review): the line above says 0.8412 but rf_best_score below is 0.8438 — confirm
# which value is current.
# The Optuna search (up to 200 trials / 2 hours) is left commented out; the
# parameters below are hard-coded instead, presumably copied from an earlier run.
# study = optuna.create_study()
# study.optimize(rf_objective, n_trials=200, timeout=3600 * 2)
# print(f"Best RandomForest ROC-AUC: {-round(study.best_value, 4)} with parameters {study.best_params}\n\n")

rf_best_score = 0.8438
# max_depth is not listed here, so the forest uses the estimator's default for it.
rf_best_params = {
    'n_estimators': 10, 'criterion': 'entropy',
    'min_samples_split': 2, 'min_samples_leaf': 2,
    'max_features': 'log2', 'class_weight': 'balanced',
    'random_state': 0, 'n_jobs': -1}
rf = RandomForestClassifier(**rf_best_params)
# Trailing ';' suppresses the estimator repr in the cell output.
rf.fit(X_train, y_train);

In [None]:
# Best score is 0.8409
# The Optuna search (up to 200 trials / 2 hours) is left commented out; the
# parameters below are hard-coded instead, presumably copied from an earlier run.
# study = optuna.create_study()
# study.optimize(dt_objective, n_trials=200, timeout=3600 * 2)
# print(f"Best DecisionTree ROC-AUC: {-round(study.best_value, 4)} with parameters {study.best_params}\n\n")

dt_best_score = 0.8409
dt_best_params = {
    'criterion': 'gini', 'min_samples_split': 2,
    'min_samples_leaf': 2, 'max_features': 'sqrt',
    'class_weight': 'balanced', 'random_state': 0}
dt = DecisionTreeClassifier(**dt_best_params)
# Trailing ';' suppresses the estimator repr in the cell output.
dt.fit(X_train, y_train);

In [None]:
# Print each tuned model's recorded best score and draw its confusion matrix
# on the test split.
for label, best_score, fitted_model in (
    ("Random Forest", rf_best_score, rf),
    ("Decision Tree", dt_best_score, dt),
):
    print(f"{label} Best: {best_score}")
    show_confusion_matrix(y_test, fitted_model.predict(X_test))

# Results

***Random Forest gives more accurate predictions, and its precision is greater. But Decision Tree can also be used, since it has greater recall.***