In [1]:
import joblib
import warnings
import numpy as np
import pandas as pd
from io import StringIO
from sklearn import set_config
from src.config import config as cfg
from sklearn.pipeline import Pipeline
from src.pipeline.custom_pipeline import ColumnSelector, ConvertDtypes, GetDummies

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/scikit-learn.
warnings.filterwarnings('ignore')

# Show scikit-learn estimators as a diagram when they are the cell output.
set_config(display='diagram')

In [2]:
# Both files are ';'-separated; column names are supplied explicitly from the
# project config (features followed by the label).
train_data, validation_data = (
    pd.read_csv(csv_path, sep=';', names=cfg.FEATURES + [cfg.LABEL])
    for csv_path in ('data/train.csv', 'data/validation.csv')
)

In [3]:
# Preview three random rows from each split.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two samples the same way and keeps their original index.
pd.concat([train_data.sample(3), validation_data.sample(3)])

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6016,5032,15680517,Sal,769,Germany,Female,34,7,137239.17,1,1,1,71379.92,1
6888,3841,15662884,Naylor,739,Germany,Male,58,1,110597.76,1,0,1,160122.66,1
5738,7070,15692137,Jen,759,France,Female,46,2,0.0,1,1,1,138380.11,0
775,9994,15569266,Rahman,644,France,Male,28,7,155060.41,1,1,0,29179.52,0
535,5174,15705281,Burt,800,Spain,Male,38,9,0.0,1,1,0,78744.39,0
596,4853,15574137,Ch'in,687,Spain,Male,35,3,0.0,2,1,1,176450.19,0


In [4]:
# Three-step preprocessing chain built from the project's custom transformers.
preprocessor = Pipeline([
    (
        'dtypes',
        ConvertDtypes(
            numerical=cfg.NUMERICAL_FEATURES,
            categorical=cfg.CATEGORICAL_FEATURES,
        ),
    ),
    # FEATURES[3:] skips the first three columns (RowNumber, CustomerId and
    # Surname in the previews), which do not appear in the transformed output.
    ('selector', ColumnSelector(columns=cfg.FEATURES[3:])),
    # Categorical columns come out as indicator columns (e.g. Geography_Spain).
    ('ohe', GetDummies(columns=cfg.CATEGORICAL_FEATURES)),
])

# Kept as the last expression so the fitted pipeline is rendered by the cell.
preprocessor.fit(train_data)

In [5]:
# Persist the fitted pipeline; joblib.dump returns the list of files written.
joblib.dump(preprocessor, 'models/preprocessor.joblib')

['models/preprocessor.joblib']

In [6]:
# Sanity check: look at five random rows of the transformed training data.
(
    preprocessor
    .transform(train_data)
    .sample(5)
)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Spain,Geography_Germany,Gender_Male
4521,706,38,8,0.0,2,0,1,46635.11,0,1,0,1
3880,596,57,6,0.0,2,1,1,72402.0,0,1,0,1
5920,757,57,3,89079.41,1,1,1,53179.21,1,0,0,1
2739,626,47,2,103108.8,1,0,1,166475.44,0,0,1,0
2646,691,30,9,0.0,1,1,0,49594.02,1,0,0,1


In [7]:
# Drop the in-memory pipeline. A later cell reloads it from
# 'models/preprocessor.joblib', so the inference path below presumably is meant
# to exercise the saved artifact rather than this fitted object.
preprocessor = None

In [15]:
def input_fn(input_data, content_type):
    """Deserialize a raw request payload into a pandas DataFrame.

    Parameters
    ----------
    input_data : str, bytes or bytearray
        Serialized payload. Binary payloads are decoded as UTF-8 before
        parsing, since ``StringIO`` only accepts text.
    content_type : str
        MIME type of the payload. ``'text/csv'`` and ``'application/json'``
        are supported.

    Returns
    -------
    pandas.DataFrame
        For ``'text/csv'``: the ';'-separated, header-less rows. Columns are
        named ``cfg.FEATURES + [cfg.LABEL]`` or ``cfg.FEATURES`` when the
        column count matches one of them; otherwise the default integer
        column labels are left in place.
        For ``'application/json'``: the result of ``pd.read_json``.

    Raises
    ------
    ValueError
        If ``content_type`` is not one of the supported types.
    """
    # Reject unsupported types before touching the payload so the error is
    # the same ValueError regardless of what the payload contains.
    if content_type not in ('text/csv', 'application/json'):
        raise ValueError(f'{content_type} not supported by script')

    # Request bodies may arrive as raw bytes; StringIO would raise TypeError.
    if isinstance(input_data, (bytes, bytearray)):
        input_data = input_data.decode('utf-8')
    buffer = StringIO(input_data)

    if content_type == 'application/json':
        return pd.read_json(buffer)

    df = pd.read_csv(buffer, sep=';', header=None)
    n_features = len(cfg.FEATURES)
    if len(df.columns) == n_features + 1:
        # Labelled rows: features followed by the target column.
        df.columns = cfg.FEATURES + [cfg.LABEL]
    elif len(df.columns) == n_features:
        # Unlabelled rows: features only.
        df.columns = cfg.FEATURES
    # NOTE(review): any other column count is returned with integer column
    # labels unchanged — confirm whether callers expect an error here instead.
    return df

def predict_fn(input_data, model):
    """Run the fitted preprocessor on a DataFrame and return a NumPy array.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Rows to transform; may or may not contain the ``cfg.LABEL`` column.
    model : object
        Fitted preprocessor whose ``transform`` result exposes ``.values``.

    Returns
    -------
    numpy.ndarray
        The transformed features. When ``input_data`` contains the label
        column, the label values are inserted as the first column.
    """
    transformed = model.transform(input_data).values

    # Unlabelled input: nothing to prepend.
    if cfg.LABEL not in input_data:
        return transformed

    # Labelled input: put the label in column 0, ahead of the features.
    return np.insert(transformed, 0, input_data[cfg.LABEL], axis=1)

In [17]:
# One ';'-separated record with 13 fields and no label column.
input_data = '7897;15727857;Flynn;635;Spain;Male;41;1;0.0;2;1;0;175611.5'

# Parse it the same way an incoming 'text/csv' request would be parsed.
new_data = input_fn(input_data, 'text/csv')
new_data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,7897,15727857,Flynn,635,Spain,Male,41,1,0.0,2,1,0,175611.5


In [18]:
# Restore the pipeline from the artifact written earlier in this notebook.
# joblib files are pickle-based: only load artifacts from a trusted source.
preprocessor = joblib.load('models/preprocessor.joblib')

# Transform the parsed record with the reloaded pipeline.
transformed = predict_fn(new_data, preprocessor)
transformed

array([[6.350000e+02, 4.100000e+01, 1.000000e+00, 0.000000e+00,
        2.000000e+00, 1.000000e+00, 0.000000e+00, 1.756115e+05,
        0.000000e+00, 1.000000e+00, 0.000000e+00, 1.000000e+00]])