In [37]:
import mlflow
experiment_id = mlflow.create_experiment(
        name="loan_approval_predictions",
        artifact_location="loan_approval_predictions_artifacts",
        tags={"env": "dev", "version": "1.0.0"},
    )

print(experiment_id)    


226907290133801603


In [1]:
#Read data
import pandas as pd
df = pd.read_csv("artifacts\data\loan_data.csv")

#investigate data
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [2]:
#Data exploration
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
person_age,45000.0,27.764178,6.045108,20.0,24.0,26.0,30.0,144.0
person_income,45000.0,80319.053222,80422.498632,8000.0,47204.0,67048.0,95789.25,7200766.0
person_emp_exp,45000.0,5.410333,6.063532,0.0,1.0,4.0,8.0,125.0
loan_amnt,45000.0,9583.157556,6314.886691,500.0,5000.0,8000.0,12237.25,35000.0
loan_int_rate,45000.0,11.006606,2.978808,5.42,8.59,11.01,12.99,20.0
loan_percent_income,45000.0,0.139725,0.087212,0.0,0.07,0.12,0.19,0.66
cb_person_cred_hist_length,45000.0,5.867489,3.879702,2.0,3.0,4.0,8.0,30.0
credit_score,45000.0,632.608756,50.435865,390.0,601.0,640.0,670.0,850.0
loan_status,45000.0,0.222222,0.415744,0.0,0.0,0.0,0.0,1.0


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [9]:
#Create a column transformer to encode categorical columns

cat_transform = ColumnTransformer([('one_hot',OneHotEncoder(drop='first',handle_unknown='ignore'),[1,7]),
                         ('ordinal',OrdinalEncoder(categories=[['OTHER','MORTGAGE','RENT','OWN'],[ "High School","Associate","Bachelor","Master","Doctorate"],['No','Yes']],handle_unknown='error'),[5,2,12])],
                       remainder='passthrough')

scale_transform = ColumnTransformer([
    ('scale', StandardScaler(), slice(0, None))  
])

In [30]:
X = df.drop(columns='loan_status',axis=1)
y = df['loan_status']
y.value_counts()

loan_status
0    35000
1    10000
Name: count, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

X_Train, X_test, y_Train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [34]:
X_train, X_val, y_train, y_val = train_test_split(X_Train, y_Train, test_size=0.2, random_state=42)

In [44]:
from sklearn import svm

In [45]:
pipe = Pipeline([("CategoricalFeatures", cat_transform),
                ("Scaler", scale_transform),
                ("Model", svm.SVC())])


In [46]:
pipe.fit(X_train, y_train)

with mlflow.start_run(experiment_id=experiment_id, run_name="Loan Approval Predictions"):
    mlflow.log_params(pipe.named_steps['Model'].get_params())
    mlflow.log_metric("train_score", pipe.score(X_train, y_train))
    mlflow.log_metric("val_score", pipe.score(X_val, y_val))
    mlflow.log_metric("test_score", pipe.score(X_test, y_test))
    mlflow.sklearn.log_model(pipe, "model")

2025/01/12 00:20:27 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '49e1e14d0bec40c18951334e827904c3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
1. Set the MLFLOW_TRACKING_URI environment variable to the desired tracking URI. `export MLFLOW_TRACKING_URI=http://localhost:5000`
2. Set the tracking URI programmatically by calling `mlflow.set_tracking_uri`. `mlflow.set_tracking_uri('http://localhost:5000')`


In [None]:
#Best model until now is within the following run

model_uri = f"runs:/<run_id>/model"
model = mlflow.sklearn.load_model(model_uri)