In [1]:
import joblib
import warnings
import numpy as np
import pandas as pd
from io import StringIO
from sklearn import set_config
from src.config import config as cfg
from sklearn.pipeline import Pipeline
from src.pipeline.custom_pipeline import ColumnSelector, ConvertDtypes, GetDummies

# Render scikit-learn estimators as diagrams when they are displayed in a cell.
set_config(display="diagram")
# Suppress every warning category from this point on.
warnings.simplefilter("ignore")

In [2]:
# Both splits share one schema: every feature column followed by the label.
# Because `names` is supplied and `header` is left at its default, pandas
# treats every line of the files (including the first) as data.
column_names = cfg.FEATURES + [cfg.LABEL]
csv_options = dict(sep=';', names=column_names)

train_data = pd.read_csv('data/train.csv', **csv_options)
validation_data = pd.read_csv('data/validation.csv', **csv_options)

In [3]:
# Preview three random rows from each split, stacked into one frame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent and keeps the original row index.
pd.concat([train_data.sample(3), validation_data.sample(3)])

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3227,26,15738191,Maclean,577,France,Male,25,3,0.0,2,0,1,124508.29,0
6860,1060,15812197,Kline,850,France,Male,38,7,80293.98,1,0,0,126555.74,0
4443,5592,15607509,Ozerova,539,France,Male,38,5,0.0,2,1,0,47388.41,0
326,8820,15790750,Manfrin,592,Germany,Male,36,10,123187.51,1,0,1,146111.35,0
736,765,15672056,Kenenna,710,Germany,Male,43,2,140080.32,3,1,1,157908.19,1
1097,2534,15631838,Findlay,606,France,Male,61,5,108166.09,2,0,1,8643.21,0


In [4]:
# Project-defined transformers (see src.pipeline.custom_pipeline), applied in
# the order listed below.
dtype_converter = ConvertDtypes(
    numerical=cfg.NUMERICAL_FEATURES,
    categorical=cfg.CATEGORICAL_FEATURES,
)
# FEATURES[3:] skips the first three feature columns; the sample preview above
# shows these to be RowNumber, CustomerId and Surname.
column_selector = ColumnSelector(columns=cfg.FEATURES[3:])
dummy_encoder = GetDummies(columns=cfg.CATEGORICAL_FEATURES)

preprocessor = Pipeline(steps=[
    ('dtypes', dtype_converter),
    ('selector', column_selector),
    ('ohe', dummy_encoder),
])

# Fit on the training split only; the fitted pipeline is the cell's output.
preprocessor.fit(train_data)

In [5]:
# Persist the fitted pipeline to disk; joblib returns the list of files written.
joblib.dump(preprocessor, 'models/preprocessor.joblib')

['models/preprocessor.joblib']

In [6]:
# Run the fitted pipeline over the training split and show five random rows.
train_transformed = preprocessor.transform(train_data)
train_transformed.sample(5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Spain,Geography_Germany,Gender_Male
285,760,34,6,58003.41,1,1,0,90346.1,0,0,1,0
296,673,40,1,121629.22,1,1,1,3258.6,0,1,0,1
6255,655,27,10,0.0,2,1,0,51620.94,1,0,0,1
4847,628,33,3,0.0,1,1,1,188193.25,0,1,0,0
1635,651,25,2,109175.14,2,1,0,114566.47,0,0,1,0


In [7]:
# Discard the in-memory pipeline. A later cell reloads it from
# 'models/preprocessor.joblib' before it is used again.
# NOTE(review): presumably done to prove the saved artifact works on its own.
preprocessor = None

In [8]:
def input_fn(input_data, content_type):
    """Deserialize a raw request payload into a pandas DataFrame.

    Parameters
    ----------
    input_data : str or bytes
        Raw payload. Bytes are decoded as UTF-8 before parsing.
    content_type : str
        Either 'text/csv' (semicolon-separated, no header row) or
        'application/json'.

    Returns
    -------
    pandas.DataFrame
        For CSV, the columns are named ``cfg.FEATURES``, followed by
        ``cfg.LABEL`` when the payload carries exactly one extra column.
        For JSON, whatever ``pd.read_json`` produces from the payload.

    Raises
    ------
    ValueError
        If ``content_type`` is not supported, or if a CSV payload has a
        column count that matches neither the feature-only nor the
        feature-plus-label layout.
    """
    # Reject unknown content types first so the error does not depend on the payload.
    if content_type not in ('text/csv', 'application/json'):
        raise ValueError(f'{content_type} not supported by script')

    # StringIO only accepts text; decode raw request bodies delivered as bytes.
    if isinstance(input_data, (bytes, bytearray)):
        input_data = input_data.decode('utf-8')

    if content_type == 'application/json':
        return pd.read_json(StringIO(input_data))

    df = pd.read_csv(StringIO(input_data), sep=';', header=None)
    n_columns = len(df.columns)
    n_features = len(cfg.FEATURES)
    if n_columns == n_features + 1:
        # Labelled payload: features followed by the label.
        df.columns = cfg.FEATURES + [cfg.LABEL]
    elif n_columns == n_features:
        # Unlabelled payload: features only.
        df.columns = cfg.FEATURES
    else:
        # Previously such a frame was returned with integer column names and
        # only failed later, far from the cause; fail here with a clear message.
        raise ValueError(
            f'CSV payload has {n_columns} columns; expected {n_features} '
            f'(features only) or {n_features + 1} (features plus label)'
        )
    return df

def predict_fn(input_data, model):
    """Apply ``model.transform`` to ``input_data`` and return a NumPy array.

    When ``input_data`` contains the ``cfg.LABEL`` column, its values are
    inserted as the first column of the result; otherwise only the
    transformed features are returned.
    """
    transformed = model.transform(input_data).values
    if cfg.LABEL not in input_data:
        return transformed
    # Labelled input: put the label in column 0, ahead of the features.
    labels = input_data[cfg.LABEL]
    return np.insert(transformed, 0, labels, axis=1)

In [10]:
# A single semicolon-separated record with feature values only (no label).
input_data = '4982;15768137;Bray;667;Spain;Female;23;6;136100.69;2;0;0;169669.33'

# Parse the payload as a 'text/csv' request and display the resulting frame.
new_data = input_fn(input_data, 'text/csv')
new_data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,4982,15768137,Bray,667,Spain,Female,23,6,136100.69,2,0,0,169669.33


In [11]:
# Restore the fitted pipeline from the artifact written earlier in this notebook.
# joblib files are pickle-based, so only load artifacts from a trusted source.
preprocessor = joblib.load('models/preprocessor.joblib')

# Transform the parsed record with the reloaded pipeline and display the array.
transformed = predict_fn(new_data, preprocessor)
transformed

array([[6.6700000e+02, 2.3000000e+01, 6.0000000e+00, 1.3610069e+05,
        2.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6966933e+05,
        0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00]])