In [None]:
# uncomment if xgboost is not installed
# !pip install xgboost

#uncomment if pydotplus not installed; dependency for graphviz
# !pip install pydotplus
# !pip install pydot

# uncomment if never before installed and need to convert to onnx
# !pip install onnx
# !pip install skl2onnx
# !pip install onnxruntime
# !pip install onnxmltools

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, SCORERS

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from six import StringIO
from IPython.display import Image

import pydotplus
import graphviz

from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
from skl2onnx import convert_sklearn, __version__, update_registered_converter
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost  # noqa
import pprint

In [42]:
# The notebooks in this repo are designed to be run locally after cloning the repo.
# If running on Colab, download the dataset from the "../data/interim" folder first
# and adjust the path below accordingly.
train_csv_path = '../data/interim/train.csv'
train = pd.read_csv(train_csv_path)

# Exploring the dataset

In [43]:
# Preview the first five rows of the freshly loaded training data
train.head(n=5)

Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure_Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,Onboard service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Male,Loyal Customer,37,Business travel,Business,3785,4,4,4,...,4,4,4,4,4,5,4,4,1,9
1,dissatisfied,Male,Loyal Customer,60,Personal Travel,Eco,1784,3,4,3,...,2,5,3,2,4,4,4,5,0,0
2,dissatisfied,Male,Loyal Customer,27,Business travel,Business,1825,2,3,4,...,2,2,1,3,3,1,2,2,76,57
3,satisfied,Male,Loyal Customer,50,Business travel,Eco,2024,4,4,2,...,4,4,2,5,4,2,4,4,125,129
4,dissatisfied,Male,Loyal Customer,70,Personal Travel,Eco,1435,4,4,4,...,5,5,5,4,5,4,4,5,0,0


In [44]:
# Plain Python list of the current column labels
col_names = train.columns.tolist()

In [45]:
# Replace spaces with underscores in every column name.
# A comprehension builds the new list in one pass instead of removing from and
# inserting into the list while it is being iterated over.
col_names = [name.replace(" ", "_") for name in col_names]
col_names

['satisfaction',
 'Gender',
 'Customer_Type',
 'Age',
 'Type_of_Travel',
 'Class',
 'Flight_Distance',
 'Seat_comfort',
 'Departure_Arrival_time_convenient',
 'Food_and_drink',
 'Gate_location',
 'Inflight_wifi_service',
 'Inflight_entertainment',
 'Online_support',
 'Ease_of_Online_booking',
 'Onboard_service',
 'Leg_room_service',
 'Baggage_handling',
 'Checkin_service',
 'Cleanliness',
 'Online_boarding',
 'Departure_Delay_in_Minutes',
 'Arrival_Delay_in_Minutes']

In [46]:
# Apply the names held in col_names to the frame, then preview the first five rows
train.columns = col_names
train.head(n=5)

Unnamed: 0,satisfaction,Gender,Customer_Type,Age,Type_of_Travel,Class,Flight_Distance,Seat_comfort,Departure_Arrival_time_convenient,Food_and_drink,...,Online_support,Ease_of_Online_booking,Onboard_service,Leg_room_service,Baggage_handling,Checkin_service,Cleanliness,Online_boarding,Departure_Delay_in_Minutes,Arrival_Delay_in_Minutes
0,satisfied,Male,Loyal Customer,37,Business travel,Business,3785,4,4,4,...,4,4,4,4,4,5,4,4,1,9
1,dissatisfied,Male,Loyal Customer,60,Personal Travel,Eco,1784,3,4,3,...,2,5,3,2,4,4,4,5,0,0
2,dissatisfied,Male,Loyal Customer,27,Business travel,Business,1825,2,3,4,...,2,2,1,3,3,1,2,2,76,57
3,satisfied,Male,Loyal Customer,50,Business travel,Eco,2024,4,4,2,...,4,4,2,5,4,2,4,4,125,129
4,dissatisfied,Male,Loyal Customer,70,Personal Travel,Eco,1435,4,4,4,...,5,5,5,4,5,4,4,5,0,0


In [47]:
# Distinct values present in the Seat_comfort column
np.unique(train["Seat_comfort"])

array([0, 1, 2, 3, 4, 5], dtype=int64)

## Separating categorical and numerical features

In [48]:
# Split the columns by dtype: object-typed columns go to the categorical list,
# every other column goes to the numerical list.
numerical_features = train.select_dtypes(exclude=object).columns.tolist()
categorical_features = train.select_dtypes(include=object).columns.tolist()

In [49]:
# Show the object-typed columns collected above
categorical_features

['satisfaction', 'Gender', 'Customer_Type', 'Type_of_Travel', 'Class']

In [50]:
# 'satisfaction' is the label the model is fitted against, not an input feature.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# remove() call would raise ValueError because the entry is already gone.
categorical_features = [col for col in categorical_features if col != 'satisfaction']
print(numerical_features)
print(categorical_features)

['Age', 'Flight_Distance', 'Seat_comfort', 'Departure_Arrival_time_convenient', 'Food_and_drink', 'Gate_location', 'Inflight_wifi_service', 'Inflight_entertainment', 'Online_support', 'Ease_of_Online_booking', 'Onboard_service', 'Leg_room_service', 'Baggage_handling', 'Checkin_service', 'Cleanliness', 'Online_boarding', 'Departure_Delay_in_Minutes', 'Arrival_Delay_in_Minutes']
['Gender', 'Customer_Type', 'Type_of_Travel', 'Class']


# Creating model pipeline

In [51]:
# Per-column-group preprocessing: numerical columns are standardised,
# categorical columns are one-hot encoded with the first level dropped.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(drop='first', sparse=False))]
)

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numerical_features),
])

# Hyper-parameters of the gradient-boosted classifier, collected in one place
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=20,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=16,
)
xgb_ohe_model = XGBClassifier(**xgb_params)

# Chain preprocessing and classifier, then fit on the feature columns
# against the 'satisfaction' label.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb_classifier', xgb_ohe_model),
])
feature_columns = categorical_features + numerical_features
model_pipeline.fit(train[feature_columns], train.satisfaction)





Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(drop='first',
                                                                                 sparse=False))]),
                                                  ['Gender', 'Customer_Type',
                                                   'Type_of_Travel', 'Class']),
                                                 ('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Flight_Distance',
                                                   'Seat_comfort',
                                                   'Departure_Arrival_time_convenient',
                           

# Converting datatypes to the final type for ONNX and registering & converting the model to ONNX

In [52]:
def convert_dataframe_schema(df, drop=None):
    """Describe each column of ``df`` as a named ONNX input.

    Every column that is not listed in ``drop`` yields one ``(name, type)``
    pair whose tensor shape is ``[None, 1]``. Integer and floating-point
    columns of any width (not only ``int64``/``float64``) are declared as
    ``FloatTensorType``; every other dtype is declared as ``StringTensorType``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns and dtypes are inspected.
    drop : container of column names, optional
        Columns to leave out of the result. ``None`` keeps all columns.

    Returns
    -------
    list of (column name, tensor type) tuples, in column order.
    """
    inputs = []
    for column, dtype in zip(df.columns, df.dtypes):
        if drop is not None and column in drop:
            continue
        # Integers are declared as float tensors as well, matching the original
        # int64 handling. NOTE(review): presumably so all numeric inputs share
        # one tensor type downstream - confirm.
        is_number = (pd.api.types.is_integer_dtype(dtype)
                     or pd.api.types.is_float_dtype(dtype))
        tensor_cls = FloatTensorType if is_number else StringTensorType
        inputs.append((column, tensor_cls([None, 1])))
    return inputs

# Describe exactly the columns the pipeline was fitted on, then show the result
feature_frame = train[categorical_features + numerical_features]
inputs = convert_dataframe_schema(feature_frame)
pprint.pprint(inputs)

[('Gender', StringTensorType(shape=[None, 1])),
 ('Customer_Type', StringTensorType(shape=[None, 1])),
 ('Type_of_Travel', StringTensorType(shape=[None, 1])),
 ('Class', StringTensorType(shape=[None, 1])),
 ('Age', FloatTensorType(shape=[None, 1])),
 ('Flight_Distance', FloatTensorType(shape=[None, 1])),
 ('Seat_comfort', FloatTensorType(shape=[None, 1])),
 ('Departure_Arrival_time_convenient', FloatTensorType(shape=[None, 1])),
 ('Food_and_drink', FloatTensorType(shape=[None, 1])),
 ('Gate_location', FloatTensorType(shape=[None, 1])),
 ('Inflight_wifi_service', FloatTensorType(shape=[None, 1])),
 ('Inflight_entertainment', FloatTensorType(shape=[None, 1])),
 ('Online_support', FloatTensorType(shape=[None, 1])),
 ('Ease_of_Online_booking', FloatTensorType(shape=[None, 1])),
 ('Onboard_service', FloatTensorType(shape=[None, 1])),
 ('Leg_room_service', FloatTensorType(shape=[None, 1])),
 ('Baggage_handling', FloatTensorType(shape=[None, 1])),
 ('Checkin_service', FloatTensorType(shape=[N

In [53]:
# INFO: XGBClassifier comes from the xgboost package rather than from sklearn itself,
# so its converter has to be registered with skl2onnx before the pipeline is converted.
xgb_converter_options = {'nocl': [True, False], 'zipmap': [True, False, 'columns']}
update_registered_converter(
    XGBClassifier,
    'XGBoostXGBClassifier',
    calculate_linear_classifier_output_shapes,
    convert_xgboost,
    options=xgb_converter_options,
)

In [54]:
# Convert the fitted pipeline to ONNX using the input description built earlier
model_onnx = convert_sklearn(
    model_pipeline,
    name='pipeline_xgboost',
    initial_types=inputs,
)

In [55]:
# Directory (relative to this notebook) that receives the exported model
MODEL_DIR = "../models"
# File name of the serialized ONNX model inside MODEL_DIR
MODEL_FILE_NAME = "satisfaction_XGB_1.onnx"

In [58]:
# Create the output directory when it is missing so the cell works on a fresh
# clone; exist_ok=True makes the call a no-op on subsequent runs.
os.makedirs(MODEL_DIR, exist_ok=True)
model_path = os.path.join(MODEL_DIR, MODEL_FILE_NAME)

# Save the serialized ONNX model. Plain binary write mode is sufficient
# because nothing is read back from the handle.
with open(model_path, "wb") as f:
    f.write(model_onnx.SerializeToString())

# Testing predictions and the corresponding prediction probabilities to ensure correctness

In [59]:
# Predicted labels from the fitted pipeline for the first five training rows
model_pipeline.predict(train.iloc[:5])

array(['satisfied', 'dissatisfied', 'dissatisfied', 'satisfied',
       'dissatisfied'], dtype=object)

In [60]:
# Per-class probabilities from the fitted pipeline for the same five rows
model_pipeline.predict_proba(train.iloc[:5])

array([[2.5749207e-05, 9.9997425e-01],
       [9.9995321e-01, 4.6806108e-05],
       [9.9845010e-01, 1.5498846e-03],
       [2.2745073e-02, 9.7725493e-01],
       [9.8689812e-01, 1.3101860e-02]], dtype=float32)

# END OF NOTEBOOK