In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
import pickle
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, cross_validate


In [None]:
## Safe to run on every kernel restart: the experiment is created only if it does not exist yet

# mlflow.create_experiment raises when the name is already taken, so look the
# experiment up first and reuse its id when it is already there.
existing_experiment = mlflow.get_experiment_by_name("loan_approval_predictions")

if existing_experiment is None:
    # Create a new experiment
    experiment_id = mlflow.create_experiment(
        name="loan_approval_predictions",
        artifact_location="loan_approval_predictions_artifacts",
        tags={"env": "dev", "version": "1.0.0"},
    )
else:
    experiment_id = existing_experiment.experiment_id

print(experiment_id)


In [3]:
# Retrieve the experiment id via the experiment name so that later runs reuse the same experiment

experiment_id = mlflow.set_experiment("loan_approval_predictions").experiment_id

In [None]:
# Read data
# Forward slashes resolve on Windows, Linux and macOS alike, and avoid backslash
# escape sequences (such as "\d") inside a non-raw string literal.
df = pd.read_csv("artifacts/data/loan_data.csv")

# Investigate data
df.head()

**person_age**	Age of the person	

**person_gender**	Gender of the person	

**person_education**	Highest education level	

**person_income**	Annual income	

**person_emp_exp**	Years of employment experience	

**person_home_ownership**	Home ownership status (e.g., rent, own, mortgage)	

**loan_amnt**	Loan amount requested	

**loan_intent**	Purpose of the loan	

**loan_int_rate**	Loan interest rate	

**loan_percent_income**	Loan amount as a percentage of annual income	

**cb_person_cred_hist_length**	Length of credit history in years	

**credit_score**	Credit score of the person	

**previous_loan_defaults_on_file**	Indicator of previous loan defaults	

**loan_status (target variable)**	Loan approval status: 1 = approved; 0 = rejected

In [None]:
# Data exploration: list every column with its dtype and non-null count
df.info()

We can see that there are no missing or null values.
 
 We have to convert integers into floats, so ML algorithms process them better.

In [None]:
# Summary statistics, transposed so that each described column becomes one row
df.describe().T

There are wrong values in [person_age]: the maximum value is 144! We are not dinosaurs, so we have to check for outliers in the [person_age] column.

 There are wrong values in [person_emp_exp], the maximum value is 125! so we have to check for outliers in [person_emp_exp] column.

 There could be outliers in [person_income], [loan_amnt], [cb_person_cred_hist_length].

[loan_percent_income] repeats information already carried by two other columns, [loan_amnt] and [person_income], so we can look at removing one of these to reduce the correlation between features.

 we have to check for any correlation between the target variable [loan_status] and [[credit_score],[previous_loan_defaults_on_file], [loan_int_rate]]

First: remove wrong values from the [person_age] and [person_emp_exp] columns
    1. Show a boxplot for each column to decide the border value.
    2. Remove values greater than the border value.


In [None]:
# Boxen (letter-value) plot of [person_age] to see where the implausible ages start
sns.boxenplot(data=df, y='person_age')
plt.show()

In [None]:
# Boxen (letter-value) plot of [person_emp_exp] to see where the implausible values start
sns.boxenplot(data=df, y='person_emp_exp')
plt.show()

In [9]:
# Remove wrong values from the [person_age] and [person_emp_exp] columns:
# keep only rows with an age below 80 and fewer than 60 years of experience.
age_ok = df["person_age"] < 80
experience_ok = df["person_emp_exp"] < 60
df = df[age_ok & experience_ok]

In [None]:
# Re-plot [person_age] to inspect its distribution once the filter has been applied
sns.boxenplot(data=df, y='person_age')
plt.show()

In [None]:
# Re-plot [person_emp_exp] to inspect its distribution once the filter has been applied
sns.boxenplot(data=df, y='person_emp_exp')
plt.show()

In [None]:
# Re-check the summary statistics on the filtered data
df.describe().T

It is true that there are still some outliers in the two columns, but these values are real information, so we will keep them.

Second: Check outliers for [person_income], [loan_amnt], [cb_person_cred_hist_length] columns.

In [None]:
# One boxen (letter-value) plot per column suspected of containing outliers
for column in ('person_income', 'loan_amnt', 'cb_person_cred_hist_length'):
    sns.boxenplot(data=df, y=column)
    plt.show()

# Investigating [person_income] (annual income) in more detail

In [None]:
# Income distribution per education level
sns.boxenplot(data=df, y='person_income', x='person_education')
plt.show()


In [None]:
# Income distribution per home-ownership status
sns.boxenplot(data=df, y='person_income', x='person_home_ownership')
plt.show()

In [None]:
# Income plotted against the requested loan amount
sns.scatterplot(data=df,  y='person_income', x='loan_amnt')
plt.show()

In [None]:
# Same scatter plot, with the points coloured by loan purpose
sns.scatterplot(data=df,  y='person_income', x='loan_amnt', hue='loan_intent')
plt.show()

[person_income] has a problem, so we have to know more about incomes in the US.
1. It's clear now that there are wrong values in the dataset, as it's unnatural for someone with an annual income of more than \$1M to borrow less than \$15K.

2.  A minimum income of \$8,000 is definitely wrong; it could be a monthly income, because the average annual income in the US in 2022 for ages 25+ is more than \$30,000.

In [117]:
# Remove rows whose income looks wrong:
#   * annual income below 15,000
#   * annual income above 1,000,000 combined with a loan below 15,000
# Filtering with a boolean mask (instead of an in-place drop by index label)
# removes exactly the matching rows even if index labels are not unique.
income_too_low = df['person_income'] < 15_000
rich_with_small_loan = (df['person_income'] > 1_000_000) & (df['loan_amnt'] < 15_000)
df = df[~(income_too_low | rich_with_small_loan)]


In [None]:
# Summary statistics after removing the implausible income rows
df.describe().T

For the [loan_amnt] column we will remove values less than 1,000, because "Most banks, credit unions, and online lenders don't offer personal loans for less than \$1,000" ([Source](https://www.investopedia.com/can-you-get-a-usd500-personal-loan-7852432)).

In [119]:
# Remove loans below 1,000. The mask is negated (rather than written as ">=")
# so that only rows matching "< 1_000" are removed. A boolean mask, unlike an
# in-place drop by index label, does not depend on the index labels being unique.
df = df[~(df['loan_amnt'] < 1_000)]

In [None]:
# Credit-history length plotted against age
sns.scatterplot(data=df, x = 'person_age', y='cb_person_cred_hist_length')
plt.show()

For ['cb_person_cred_hist_length'], there are no problems.

Third: [loan_percent_income] repeats information already carried by two other columns, [loan_amnt] and [person_income], so we can look at removing one of these to reduce the correlation between features.

We can see that [loan_percent_income] and [loan_amnt] are correlated (~0.60), but not highly; even the negative correlation with [person_income] is not that high, which calls the correctness of the data in this column into question and gives a stronger reason to remove it.

In [None]:
# Preview the first rows of the data at this stage
df.head()

In [None]:
# Drop [loan_percent_income]. Rebinding `df` with errors='ignore' keeps the cell
# idempotent: running it a second time no longer raises KeyError because the
# column is already gone.
df = df.drop(columns=['loan_percent_income'], errors='ignore')

Fourth: check for any correlation between the target variable [loan_status] and [[credit_score],[previous_loan_defaults_on_file], [loan_int_rate]]

In [24]:
# Working frame for the correlation check. It is modified afterwards, so take an
# explicit copy: writing into a column subset of `df` triggers pandas'
# SettingWithCopyWarning.
data_correlation = df[['credit_score', 'previous_loan_defaults_on_file', 'loan_int_rate', 'loan_status']].copy()

In [None]:

le = LabelEncoder()

# Columns that are turned into integer codes before computing correlations
cat = ['previous_loan_defaults_on_file', 'loan_status']

# assign() builds a new frame instead of writing column by column into
# `data_correlation`, which avoids chained-assignment (SettingWithCopy) warnings.
# The encoder is re-fitted for each column, in the order listed in `cat`.
data_correlation = data_correlation.assign(
    **{col: le.fit_transform(data_correlation[col]) for col in cat}
)

In [None]:
# Correlation of each selected column with the target [loan_status]
data_correlation.corrwith(data_correlation['loan_status'])

[previous_loan_defaults_on_file] is correlated with [loan_status] at -0.540675, which is considered a moderate negative correlation, so we can treat it as useful information and keep it.

In [None]:
# Summary statistics of the data before it is split into features and target
df.describe().T

In [None]:
# Separate the target vector from the feature matrix
y = df['loan_status']
X = df.drop(columns=['loan_status'])

# Class counts of the target
y.value_counts()

In [None]:
# Names of the object-dtype columns, i.e. the ones that need encoding
cat = df.select_dtypes("object").columns
cat

In [30]:
# Column transformer that encodes the categorical columns:
#   * one-hot (first level dropped) for gender and loan intent
#   * ordinal codes, in the listed category order, for education, home ownership
#     and previous defaults
# Every other column is passed through unchanged.
cat_transformer = ColumnTransformer(
    transformers=[
        (
            'one_hot',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            ['person_gender', 'loan_intent'],
        ),
        (
            'ordinal',
            OrdinalEncoder(
                categories=[
                    ["High School", "Associate", "Bachelor", "Master", "Doctorate"],
                    ['OTHER', 'MORTGAGE', 'RENT', 'OWN'],
                    ['No', 'Yes'],
                ],
                handle_unknown='error',
            ),
            ['person_education', 'person_home_ownership', 'previous_loan_defaults_on_file'],
        ),
    ],
    remainder='passthrough',
)

# Standard-scales every column it receives (slice(0, None) selects them all)
scale_transformer = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), slice(0, None))]
)

In [31]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Class counts in the training split
y_train.value_counts()

We have imbalanced training data, so we need to experiment with multiple approaches and multiple models to find the combination that handles the imbalance best.

In [33]:
# Metrics computed for every cross-validation fold; each key is also used as
# the name of the scorer it maps to.
scoring = {metric: metric for metric in ('f1', 'roc_auc', 'precision', 'recall')}

In [37]:
from imblearn.over_sampling import SMOTE


In [53]:

# The run name records what distinguishes this experiment from the others
run_name = " LogisticRegression with SMOTE"

# NOTE(review): SMOTE runs before the scaler, so its nearest-neighbour search
# works on unscaled features — confirm this ordering is intended.
pipe = Pipeline(steps=[
    ("CategoricalFeatures", cat_transformer),
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    # alternative resampler: ('undersampler', RandomUnderSampler(sampling_strategy='auto', random_state=42)),
    ("Scaler", scale_transformer),
    ('clf', LogisticRegression()),
])

# 10-fold cross-validation on the training split, scored with every metric in `scoring`
scores = cross_validate(
    pipe,
    X_train,
    y_train,
    scoring=scoring,
    cv=10,
    n_jobs=-1,
)


In [None]:
# Average each cross-validation metric over the folds. Direct indexing raises a
# clear KeyError if a metric is missing from `scores`.
f1_mean = scores['test_f1'].mean()
roc_auc_mean = scores['test_roc_auc'].mean()
precision_mean = scores['test_precision'].mean()
recall_mean = scores['test_recall'].mean()

# cross_validate fits clones of the estimator, so `pipe` itself is still
# unfitted at this point. Fit it on the training split so that the model stored
# in MLflow can actually predict.
pipe.fit(X_train, y_train)

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
    # Log parameters of the classifier in the pipeline
    mlflow.log_params(pipe.named_steps['clf'].get_params())

    # Log metrics
    mlflow.log_metric("f1_mean", f1_mean)
    mlflow.log_metric("roc_auc_mean", roc_auc_mean)
    mlflow.log_metric("precision_mean", precision_mean)
    mlflow.log_metric("recall_mean", recall_mean)

    # Log the fitted pipeline model
    mlflow.sklearn.log_model(pipe, "model")


![Results](artifacts/images/image.png)

We can see after comparing all the experiments that **RandomForestClassifier with SMOTE** gives the most robust performance, so we will test it.

In [None]:

# Selected candidate: same preprocessing and SMOTE resampling, with a random forest classifier
pipe = Pipeline([
    ("CategoricalFeatures", cat_transformer),
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ("Scaler", scale_transformer),
    # Seeded like SMOTE and the train/test split, so that the reported metrics
    # and the saved model are reproducible across runs.
    ('clf', RandomForestClassifier(random_state=42))
])

# Fit on the training split and evaluate once on the held-out test split
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

As we care about the two classes equally, this performance is very good,
so we will register the model and use it.

In [84]:

# Persist the fitted pipeline. Forward slashes resolve on Windows, Linux and
# macOS alike, and avoid backslash escape sequences (such as "\m") inside a
# non-raw string literal.
with open('artifacts/model/model_v1.pkl', 'wb') as f:
    pickle.dump(pipe, f)
