# bank_loan_predictor Notebook

In [1]:
# frontload all of the library imports
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

# single seed shared by the train/test split and the model fit
SEED = 42

# load the semicolon-delimited csv
bank = pd.read_csv('./bank-full.csv', delimiter = ';')

# feature column names by dtype; the target "y" is removed from the categorical set
num_cols = bank.select_dtypes(['integer', 'float']).columns
cat_cols = bank.select_dtypes(['object']).drop(columns = "y").columns

# report which columns fall in which group
print("Numeric columns are {}.".format(", ".join(num_cols)))
print("Categorical columns are {}.".format(", ".join(cat_cols)))

# 90-10 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    bank.drop(columns = "y"),
    bank["y"],
    test_size = 0.10,
    random_state = SEED,
)

# give the feature frames a fresh 0..n-1 index so the featurized pieces line up row by row
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

# report split sizes
print("Training data has {} rows.".format(X_train.shape[0]))
print("Test data has {} rows.".format(X_test.shape[0]))

#------------------------------------------------------------------------------------------#

# fit the one-hot encoder on the training categorical columns only
onehotter = OneHotEncoder(sparse_output = False).fit(X_train[cat_cols])
onehot_cols = onehotter.get_feature_names_out(cat_cols)

# fit the z-score normalizer on the training numeric columns only
znormalizer = StandardScaler().fit(X_train[num_cols])

# featurized frames: one-hot-encoded columns first, then the normalized numeric columns
X_train_featurized = pd.concat(
    [
        pd.DataFrame(onehotter.transform(X_train[cat_cols]), columns = onehot_cols),
        pd.DataFrame(znormalizer.transform(X_train[num_cols]), columns = num_cols),
    ],
    axis = 1,
)
X_test_featurized = pd.concat(
    [
        pd.DataFrame(onehotter.transform(X_test[cat_cols]), columns = onehot_cols),
        pd.DataFrame(znormalizer.transform(X_test[num_cols]), columns = num_cols),
    ],
    axis = 1,
)

# show info about featurized dataframes to compare to non-featurized
print("Featurized training data has {} rows and {} columns.".format(*X_train_featurized.shape))
print("Featurized test data has {} rows and {} columns.".format(*X_test_featurized.shape))

#------------------------------------------------------------------------------------------#

# train the logistic regressor on the featurized training data
logit = LogisticRegression(max_iter = 5000, solver = 'lbfgs', random_state=SEED)
logit.fit(X_train_featurized, y_train)

# predictions for both splits
y_hat_train = logit.predict(X_train_featurized)
y_hat_test = logit.predict(X_test_featurized)

# precision and recall (as percentages) of the 'yes' class on each split
precision_train, recall_train = (
    score(y_train, y_hat_train, pos_label = 'yes') * 100
    for score in (precision_score, recall_score)
)
precision_test, recall_test = (
    score(y_test, y_hat_test, pos_label = 'yes') * 100
    for score in (precision_score, recall_score)
)

print("Precision = {:.0f}% and recall = {:.0f}% on the training data.".format(precision_train, recall_train))
print("Precision = {:.0f}% and recall = {:.0f}% on the validation data.".format(precision_test, recall_test))

Numeric columns are age, balance, day, duration, campaign, pdays, previous.
Categorical columns are job, marital, education, default, housing, loan, contact, month, poutcome.
Training data has 40689 rows.
Test data has 4522 rows.
Featurized training data has 40689 rows and 51 columns.
Featurized test data has 4522 rows and 51 columns.
Precision = 65% and recall = 35% on the training data.
Precision = 63% and recall = 34% on the validation data.


The code above trains a model on data that contains both categorical and numeric features. I normalize the numeric features and one-hot-encode the categorical features as part of pre-processing. In the above code I do this "manually", however as shown [here](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer.html) we can compose data transformations and ML steps to create a **single multi-step pipeline**. The pipeline object (conveniently called `pipeline` in the docs) has `fit` and `predict` methods:
- By calling `fit`, the raw input data is first transformed into the featurized data, and then passed to the ML algorithm to train a model.
- By calling `predict`, the raw input data is first transformed into the featurized data (just like `fit`), and then used to get predictions (using the model trained when we called `fit`).

In [2]:
# create a local data folder (no error if it already exists) to hold the files written below
data_folder = "data"
os.makedirs(data_folder, exist_ok=True)

The following will be a json file containing the new data we will use to predict bank loan approval later.

In [3]:
%%writefile ./data/new_data.json
{"age": {"0": 40, "1": 47},
 "balance": {"0": 580, "1": 3644},
 "campaign": {"0": 1, "1": 2},
 "contact": {"0": "unknown", "1": "unknown"},
 "day": {"0": 16, "1": 9},
 "default": {"0": "no", "1": "no"},
 "duration": {"0": 192, "1": 83},
 "education": {"0": "secondary", "1": "secondary"},
 "housing": {"0": "yes", "1": "no"},
 "job": {"0": "blue-collar", "1": "services"},
 "loan": {"0": "no", "1": "no"},
 "marital": {"0": "married", "1": "single"},
 "month": {"0": "may", "1": "jun"},
 "pdays": {"0": -1, "1": -1},
 "poutcome": {"0": "unknown", "1": "unknown"},
 "previous": {"0": 0, "1": 0}}


Overwriting ./data/new_data.json


Now I will begin building the pipeline version, starting with the imports it needs.

In [4]:
# frontload all of the library imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# set global state variable
SEED = 42

To start I will not include reading the csv into a dataframe and grabbing the numeric and categorical columns as this varies depending on the dataset. I would expect someone who is working with this model to do this manually.

In [5]:
# try to create functions to wrap in pipeline

class PipelineHelper:
    """Stateless helpers for the steps that surround the modelling pipeline.

    Every method is a ``staticmethod``, so it can be called either on the
    class itself (``PipelineHelper.split_data(...)``) or on an instance.
    """

    @staticmethod
    def generate_df_cols(csv_path, target, delimiter=';'):
        """Read a csv and list its categorical and numeric feature columns.

        Parameters
        ----------
        csv_path : path of the csv file to read.
        target : name of the target column; it is never reported as a feature.
        delimiter : field separator of the csv file (default ``';'``).

        Returns
        -------
        (df, cat_columns, num_columns) : the full dataframe (target included),
        the object-dtype feature column names and the integer/float feature
        column names.
        """
        # read the csv into a dataframe
        df = pd.read_csv(csv_path, delimiter = delimiter)

        # drop the target before selecting by dtype so that it is excluded
        # from both groups whatever its dtype is
        features = df.drop(columns = target)
        num_columns = features.select_dtypes(['integer', 'float']).columns
        cat_columns = features.select_dtypes(['object']).columns

        # print column types (the local results, not notebook-level globals)
        print("Numeric columns are {}.".format(", ".join(num_columns)))
        print("Categorical columns are {}.".format(", ".join(cat_columns)))

        return df, cat_columns, num_columns

    @staticmethod
    def split_data(df, target, test_size=0.10, random_state=42):
        """Split a dataframe into train/test features and targets.

        Parameters
        ----------
        df : dataframe holding the features and the target column.
        target : name of the target column.
        test_size : fraction of rows held out for testing (default 0.10,
            i.e. a 90-10 split).
        random_state : seed passed to ``train_test_split`` (default 42).

        Returns
        -------
        (X_train, X_test, y_train, y_test)
        """
        X_train, X_test, y_train, y_test = train_test_split(
            df.drop(columns = target), df[target],
            test_size = test_size, random_state = random_state)
        return X_train, X_test, y_train, y_test

    @staticmethod
    def get_metrics(y_test, y_pred, pos_label='yes'):
        """Print and return precision and recall as percentages.

        Parameters
        ----------
        y_test : true labels.
        y_pred : predicted labels.
        pos_label : label treated as the positive class (default ``'yes'``).

        Returns
        -------
        (precision_test, recall_test) : both scaled to the 0-100 range.
        """
        precision_test = precision_score(y_test, y_pred, pos_label = pos_label) * 100
        recall_test = recall_score(y_test, y_pred, pos_label = pos_label) * 100
        print("Precision = {:.0f}% and recall = {:.0f}% on the validation data.".format(precision_test, recall_test))

        return precision_test, recall_test





I wrapped these helper functions in a `PipelineHelper` class so that they can be reused later.

In [6]:
# generate data needed pre pipeline
# PipelineHelper is bound without being instantiated; its methods are called on the class itself
pipe_help = PipelineHelper

csv_path = "./bank-full.csv"

# load the csv and collect the categorical / numeric column names (target column is 'y')
df, cat_cols, num_cols = pipe_help.generate_df_cols(csv_path=csv_path, target='y')

# get training and testing data
X_train, X_test, y_train, y_test = pipe_help.split_data(df, target='y')


Numeric columns are age, balance, day, duration, campaign, pdays, previous.
Categorical columns are job, marital, education, default, housing, loan, contact, month, poutcome.


In [7]:
# frontload all of the library imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

# set global state variable
SEED = 42

# # read the csv into a dataframe
# bank = pd.read_csv('./bank-full.csv', delimiter = ';')

# # separate numeric, categorical and target columns
# num_columns = bank.select_dtypes(['integer', 'float']).columns
# cat_columns = bank.select_dtypes(['object']).drop(columns = "y").columns

# # print column types
# print("Numeric columns are {}.".format(", ".join(num_cols)))
# print("Categorical columns are {}.".format(", ".join(cat_cols)))

# # split the data into training and testing data as 90-10 split
# X_train, X_test, y_train, y_test = train_test_split(bank.drop(columns = "y"), bank["y"], 
#                                                     test_size = 0.10, random_state = 42)

# full modelling pipeline: column-wise preprocessing followed by the classifier
pipeline = Pipeline(steps = [
    ("preprocess", ColumnTransformer(transformers = [
        ("onehot", OneHotEncoder(sparse_output=False), cat_cols),   # categorical columns
        ("standardize", StandardScaler(), num_cols),                # numeric columns
    ])),
    ("logit", LogisticRegression(max_iter = 5000, solver = 'lbfgs', random_state=SEED)),
])

In [8]:
# fit the whole pipeline (preprocessing + logistic regression) on the raw training data
# NOTE(review): Pipeline.fit returns the fitted pipeline itself, not transformed data,
# so despite its name X_train_processed is just another reference to `pipeline`
X_train_processed = pipeline.fit(X_train, y_train)
X_train_processed

In [9]:
# save the fitted pipeline to the data folder so it can be reloaded later
from joblib import dump, load
import json

dump(pipeline, './data/logit_pipeline.joblib') 

['./data/logit_pipeline.joblib']

In [10]:
# get predictions for X_test (raw features are passed; the pipeline's "preprocess" step runs first)
y_pred = pipeline.predict(X_test)

In [15]:
# precision and recall (as percentages) of the 'yes' class on the test predictions
precision_test = precision_score(y_test, y_pred, pos_label = 'yes') * 100
recall_test = recall_score(y_test, y_pred, pos_label = 'yes') * 100

# use the built-in round() rather than the `.round()` method: the method only exists on
# numpy scalars, so it fails if the metric comes back as a plain Python float
print(f"Precision: {round(precision_test, 0)}", "|", f"Recall: {round(recall_test, 0)}")

Precision: 63.0 | Recall: 34.0


In [16]:
# paths of the new observations (json) and of the saved pipeline (joblib)
new_data_path = "./data/new_data.json"
pipeline_path = "./data/logit_pipeline.joblib"

def load_df(json_path):
    """Read the json file at ``json_path`` and return its content as a dataframe."""
    with open(json_path, "r") as handle:
        records = json.load(handle)
    return pd.DataFrame(records)

# reload the saved pipeline and read the new observations into a dataframe
logit_pipeline = load(pipeline_path)
new_df = load_df(new_data_path)

In [17]:
# Make predictions
# note: the predictions are stored as a new 'y_pred' column on new_df itself
new_df['y_pred'] = logit_pipeline.predict(new_df)
new_df.head()

Unnamed: 0,age,balance,campaign,contact,day,default,duration,education,housing,job,loan,marital,month,pdays,poutcome,previous,y_pred
0,40,580,1,unknown,16,no,192,secondary,yes,blue-collar,no,married,may,-1,unknown,0,no
1,47,3644,2,unknown,9,no,83,secondary,no,services,no,single,jun,-1,unknown,0,no


In [18]:
# save the new data together with its predictions into a new json file (one record per row)
# NOTE(review): DataFrame.to_json returns None when it is given a path, so new_json holds no data
new_json_path = './data/new_preds.json'
new_json = new_df.to_json(new_json_path, orient='records', indent=4)

Write dependencies to data file

In [19]:
# folder name substituted into the %%writefile path of the next cell ($data/conda.yaml)
data="data"

In [20]:
%%writefile $data/conda.yaml
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pip
  - pip:
    - scikit-learn==1.3.0
    - pandas==2.1.0

Overwriting data/conda.yaml
