# This is my custom code:

- I created a Python module with a custom class to help prepare the data for the pipeline.
- My hope is to reuse these helpers in separate projects down the line, especially to ease the process of preparing data for one-hot encoding and standardization.

In [1]:
# create local data folder to store data (inputs, saved pipeline, predictions)
import os
data_folder = "data_helper"
# exist_ok=True keeps this cell safe to re-run when the folder already exists
os.makedirs(data_folder, exist_ok=True)

In [3]:
%%writefile ./data_helper/new_data.json
{"age": {"0": 40, "1": 47},
 "balance": {"0": 580, "1": 3644},
 "campaign": {"0": 1, "1": 2},
 "contact": {"0": "unknown", "1": "unknown"},
 "day": {"0": 16, "1": 9},
 "default": {"0": "no", "1": "no"},
 "duration": {"0": 192, "1": 83},
 "education": {"0": "secondary", "1": "secondary"},
 "housing": {"0": "yes", "1": "no"},
 "job": {"0": "blue-collar", "1": "services"},
 "loan": {"0": "no", "1": "no"},
 "marital": {"0": "married", "1": "single"},
 "month": {"0": "may", "1": "jun"},
 "pdays": {"0": -1, "1": -1},
 "poutcome": {"0": "unknown", "1": "unknown"},
 "previous": {"0": 0, "1": 0}}


Writing ./data_helper/new_data.json


In [5]:
# frontload all of the library imports
import json

import pandas as pd
from joblib import dump, load
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from pipeline_helpers import PipelineHelper

# set global state variable
SEED = 42

Instead of hardcoding all of the helper functions in this notebook, I import them from my helper Python file (`pipeline_helpers`).

In [70]:
# # try to create functions to wrap in pipeline

# class PipelineHelper:
    
        
#     # generate full dataframe from csv, numeric and categorical column names
#     def generate_df_cols(csv_path, target):
        
#         # read the csv into a dataframe
#         df = pd.read_csv(csv_path, delimiter = ';')

#         # separate numeric, categorical and target columns
#         num_columns = df.select_dtypes(['integer', 'float']).columns
#         cat_columns = df.select_dtypes(['object']).drop(columns = target).columns
        
#         # print column types
#         print("Numeric columns are {}.".format(", ".join(num_columns)))
#         print("Categorical columns are {}.".format(", ".join(cat_columns)))
        
#         return df, cat_columns, num_columns

#     # split the data with a 90/10 training vs test
#     def split_data(df, target):
#         # split the data into training and testing data as 90-10 split
#         X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = target), df[target], 
#                                                         test_size = 0.10, random_state = 42)
#         return X_train, X_test, y_train, y_test

#     # after training pipeline get the precision and recall score
#     def get_metrics(y_test, y_pred):
       
#         precision_test = precision_score(y_test, y_pred, pos_label = 'yes') * 100
#         recall_test = recall_score(y_test, y_pred, pos_label = 'yes') * 100
#         print("Precision = {:.0f}% and recall = {:.0f}% on the validation data.".format(precision_test, recall_test))
        
#         return precision_test, recall_test





I moved these functions into a Python submodule so they can be reused in future projects.

In [6]:
# Build the inputs the pipeline needs: the full frame, the categorical and
# numeric column names, and a train/test split.
csv_path = "./bank-full.csv"

# the helper's functions are called directly on the class, so no instance is created
pipe_help = PipelineHelper

df, cat_cols, num_cols = pipe_help.generate_df_cols(target='y', csv_path=csv_path)
X_train, X_test, y_train, y_test = pipe_help.split_data(df, target='y')


Numeric columns are age, balance, day, duration, campaign, pdays, previous.
Categorical columns are job, marital, education, default, housing, loan, contact, month, poutcome.


In [7]:
# create the pipeline

# Preprocessing and the classifier live in one Pipeline so the exact same
# transformations are applied at fit time and at prediction time.
pipeline = Pipeline([
    ("preprocess",
        ColumnTransformer([ # preprocess data at beginning of pipeline
            # handle_unknown="ignore": a category never seen during fit is encoded
            # as all zeros instead of raising when new data is scored
            ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_cols),
            # rescale the numeric columns (zero mean, unit variance by default)
            ("standardize", StandardScaler(), num_cols)
        ])
    ),
    ("logit", LogisticRegression(max_iter = 5000, solver = 'lbfgs', random_state=SEED))
])

In [8]:
# fit the whole pipeline (preprocessing + logistic regression) on the training data
# NOTE: Pipeline.fit returns the fitted pipeline itself, not transformed data,
# so the result is named accordingly; `pipeline` is the same fitted object.
fitted_pipeline = pipeline.fit(X_train, y_train)
fitted_pipeline

In [9]:
# persist the fitted pipeline so it can be reloaded for scoring without retraining
dump(pipeline, './data_helper/logit_pipeline.joblib') 

['./data_helper/logit_pipeline.joblib']

In [10]:
# predict the target label for every row of the held-out test features
y_pred = pipeline.predict(X_test)

In [11]:
# locations of the new records to score and of the pipeline saved earlier
new_data_path = "./data_helper/new_data.json"
pipeline_path = "./data_helper/logit_pipeline.joblib"

def load_df(json_path):
    """Read a JSON file and return its contents as a pandas DataFrame.

    Parameters
    ----------
    json_path : str or path-like
        Path to a JSON file whose top-level object can be passed to
        ``pd.DataFrame`` (e.g. ``{column: {row_label: value}}``).

    Returns
    -------
    pandas.DataFrame
        Frame built from the parsed JSON; for the column-oriented layout the
        inner keys become the (string) row index.
    """
    # JSON is UTF-8 by specification; be explicit so decoding does not depend
    # on the platform's default locale encoding
    with open(json_path, "r", encoding="utf-8") as j:
        data = json.load(j)
    return pd.DataFrame(data)

# reload the persisted pipeline and read the new records into a DataFrame
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source
logit_pipeline = load(pipeline_path)
new_df = load_df(new_data_path)

In [12]:
# Score the new records with the reloaded pipeline, then attach the predicted
# labels to the frame as a `y_pred` column.
new_predictions = logit_pipeline.predict(new_df)
new_df['y_pred'] = new_predictions
new_df.head()

Unnamed: 0,age,balance,campaign,contact,day,default,duration,education,housing,job,loan,marital,month,pdays,poutcome,previous,y_pred
0,40,580,1,unknown,16,no,192,secondary,yes,blue-collar,no,married,may,-1,unknown,0,no
1,47,3644,2,unknown,9,no,83,secondary,no,services,no,single,jun,-1,unknown,0,no


In [13]:
# save the scored records into a new json file, one object per row
# (to_json returns None when it is given a path, so there is nothing to capture)
new_json_path = './data_helper/new_preds.json'
new_df.to_json(new_json_path, orient='records', indent=4)

Write the dependencies to a conda environment file in the data folder.

In [16]:
%%writefile ./data_helper/conda.yaml
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pip
  - pip:
    - scikit-learn==1.3.0
    - pandas==2.1.0

Writing ./data_helper/conda.yaml
