In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the loan training set from the local copy under data/.
# Alternative source with the same file name, hosted on GitHub:
# data = pd.read_csv('https://raw.githubusercontent.com/sdeni/IMLLesson4/main/data/loan-train.csv')
data = pd.read_csv('data/loan-train.csv')

In [3]:
def process_dataframe(data):
    """Build the numeric modelling frame from the raw loan table.

    The input frame is left untouched, so the function can be called
    repeatedly on the same object and always yields the same result.

    Steps:
      * encode ``Loan_Status`` (Y/N -> 1/0) and ``Property_Area``
        (Rural/Semiurban/Urban -> -1/0/1) as integers;
      * replace the income, loan amount and loan term columns by their
        natural logarithm;
      * drop rows where any of those columns or ``Loan_Status`` is missing;
      * keep only numeric columns;
      * fill missing ``Credit_History`` with 0.0;
      * replace infinite ``CoapplicantIncome`` values (log of a zero income)
        with the sentinel -1.

    Args:
        data: DataFrame containing at least the columns ``Loan_Status``,
            ``Property_Area``, ``CoapplicantIncome``, ``LoanAmount``,
            ``ApplicantIncome``, ``Loan_Amount_Term`` and ``Credit_History``.

    Returns:
        A new DataFrame holding only the numeric columns of the cleaned data.
    """
    log_columns = ['CoapplicantIncome', 'LoanAmount', 'ApplicantIncome', 'Loan_Amount_Term']
    required_columns = log_columns + ['Loan_Status']

    # Work on a copy: overwriting the caller's columns would make a second
    # call take the logarithm of already-logged values.
    frame = data.copy()
    frame['Loan_Status'] = frame['Loan_Status'].replace({'Y': 1, 'N': 0}).astype(int)
    frame['Property_Area'] = (
        frame['Property_Area'].replace({'Rural': -1, 'Semiurban': 0, 'Urban': 1}).astype(int)
    )

    # log(0) yields -inf; that case is expected (it is mapped to the -1
    # sentinel below for CoapplicantIncome), so silence only the
    # divide-by-zero warning. Warnings about negative inputs stay visible.
    with np.errstate(divide='ignore'):
        for column in log_columns:
            frame[column] = np.log(frame[column])

    frame = frame.dropna(subset=required_columns)

    # Explicit copy so the column assignments below write to an independent frame.
    num_data = frame.select_dtypes(include=[np.number]).copy()
    # Assign the filled column back instead of a chained in-place fill, which
    # acts on a temporary under pandas Copy-on-Write and would leave the NaNs.
    num_data['Credit_History'] = num_data['Credit_History'].fillna(0.0)
    num_data['CoapplicantIncome'] = num_data['CoapplicantIncome'].replace({np.inf: -1, -np.inf: -1})
    return num_data

In [4]:
# Derive the numeric modelling frame from the raw table.
num_data = process_dataframe(data)
# Sanity check: evaluates to True if any cell is still missing.
num_data.isna().to_numpy().any()

  result = getattr(ufunc, method)(*inputs, **kwargs)


False

In [5]:
# Target vector: the encoded loan decision.
y = num_data['Loan_Status']
# Feature matrix: every remaining numeric column.
X = num_data.drop('Loan_Status', axis=1)

In [6]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,8.430109,7.31854,4.85203,5.886104,1.0,-1
2,8.006368,-1.0,4.189655,5.886104,1.0,1
3,7.856707,7.765569,4.787492,5.886104,1.0,1
4,8.699515,-1.0,4.94876,5.886104,1.0,1
5,8.597297,8.341887,5.587249,5.886104,1.0,1


In [7]:
# Show column dtypes and non-null counts of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 578 entries, 1 to 613
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ApplicantIncome    578 non-null    float64
 1   CoapplicantIncome  578 non-null    float64
 2   LoanAmount         578 non-null    float64
 3   Loan_Amount_Term   578 non-null    float64
 4   Credit_History     578 non-null    float64
 5   Property_Area      578 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 31.6 KB


In [8]:
import mlflow

In [9]:
# Point MLflow at a SQLite tracking store (mlflow.db, relative path).
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select the experiment by name; the returned Experiment is the cell output.
mlflow.set_experiment("loan-check-data-experiment")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='loan-check-data-experiment', tags={}>

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [11]:
# One sub-space per model family. The 'type' entry names the family; the
# remaining entries are the hyperparameters sampled for that family.
svm_space = {
    'type': 'svm',
    'C': hp.lognormal('SVM_C', 0, 1.0),
    'kernel': hp.choice('kernel', ['linear', 'rbf']),
}

random_forest_space = {
    'type': 'rf',
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
}

logreg_space = {
    'type': 'logreg',
    'C': hp.lognormal('LR_C', 0, 1.0),
    'solver': hp.choice('solver', ['liblinear', 'lbfgs']),
}

# Top-level choice between the three families (order kept: svm, rf, logreg).
search_space = hp.choice(
    'classifier_type',
    [svm_space, random_forest_space, logreg_space],
)

def f(params):
    """Hyperopt objective: cross-validate one sampled classifier and log it to MLflow.

    Args:
        params: dict sampled from ``search_space``. Its ``'type'`` entry
            selects the estimator ('svm', 'rf' or 'logreg'); every other
            entry is passed to the estimator constructor as a keyword
            argument. The dict is not modified.

    Returns:
        dict with ``'loss'`` set to the negated mean cross-validation score
        on the module-level ``X`` / ``y`` and ``'status'`` set to
        ``STATUS_OK``.

    Raises:
        ValueError: if ``params['type']`` is not a supported classifier name.
    """
    estimators = {
        'svm': SVC,
        'rf': RandomForestClassifier,
        'logreg': LogisticRegression,
    }

    classifier_type = params['type']
    # Reject unknown types before opening a run, so no empty MLflow run is
    # recorded and the problem is not hidden behind a fake loss value.
    if classifier_type not in estimators:
        raise ValueError(f'unsupported classifier type: {classifier_type!r}')

    # Build the constructor kwargs separately instead of popping 'type' from
    # the caller's dict.
    model_params = {name: value for name, value in params.items() if name != 'type'}

    with mlflow.start_run():
        mlflow.log_params(model_params)
        mlflow.log_param('data', 'data/loan-train.csv')
        mlflow.set_tag('model', classifier_type)

        clf = estimators[classifier_type](**model_params)
        accuracy = cross_val_score(clf, X, y).mean()

        mlflow.log_metric('accuracy', accuracy)

    # The loss is the negated score, so the lowest loss is the highest accuracy.
    return {'loss': -accuracy, 'status': STATUS_OK}

# Trials object handed to fmin to collect the evaluations of the search.
trials = Trials()
# Run up to 100 evaluations of the objective `f` over `search_space` using TPE.
# NOTE(review): no `rstate` is passed, so the sampled configurations presumably
# differ between runs — confirm and pass a seed if reproducibility is needed.
best = fmin(f, search_space, algo=tpe.suggest, max_evals=100, trials=trials)

100%|██████| 100/100 [00:23<00:00,  4.20trial/s, best loss: -0.7682608695652174]


In [12]:
## Save best model

In [13]:
# Winning hyperparameters, copied by hand from the MLflow interface.
best_params = dict(C=1.0489085915266627, solver='lbfgs')

# Hold out 20% of the rows with a fixed seed for the final check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Refit the selected logistic regression on the training portion only.
best_model = LogisticRegression(**best_params)
best_model.fit(X_train, y_train)

In [14]:
import pickle

In [15]:
# Serialise the fitted estimator to models/best_model.bin with pickle.
with open('models/best_model.bin', 'wb') as f_out:
    pickle.dump(best_model, f_out)

In [16]:
# Pickle the preprocessing function to its own file.
# NOTE(review): pickle stores a function as a reference to its qualified name,
# not its code, so this file can only be loaded where `process_dataframe` is
# defined under the same module name — confirm that suits the consumer.
with open('preprocessing/process_dataframe.bin', 'wb') as f_out:
    pickle.dump(process_dataframe, f_out)

In [17]:
## Load model

In [18]:
# Read the pickled estimator back. The context manager closes the file handle,
# which the bare `pickle.load(open(...))` form left open.
# NOTE: unpickling can execute arbitrary code — only load files you produced.
with open('models/best_model.bin', 'rb') as f_in:
    loaded_model = pickle.load(f_in)

In [19]:
# Score the reloaded model on the held-out split.
y_pred = loaded_model.predict(X_test)
# accuracy_score expects the ground truth first and the predictions second.
accuracy = accuracy_score(y_test, y_pred)
print('accuracy on test data: ', accuracy)

accuracy on test data:  0.7931034482758621
