In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, SCORERS
from sklearn import ensemble

# from category_encoders import OneHotEncoder, TargetEncoder, WOEEncoder

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from six import StringIO
from IPython.display import Image

import pydotplus
import graphviz

import wandb
import os

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators import H2OTargetEncoderEstimator

from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
from skl2onnx import convert_sklearn, __version__, update_registered_converter
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx.common.data_types import Int64TensorType
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost  # noqa
import pprint

In [18]:
train = pd.read_csv('../Data/interim/train.csv')

In [19]:
old_names = list(train.columns)
old_names

['satisfaction',
 'Gender',
 'Customer Type',
 'Age',
 'Type of Travel',
 'Class',
 'Flight Distance',
 'Seat comfort',
 'Departure/Arrival time convenient',
 'Food and drink',
 'Gate location',
 'Inflight wifi service',
 'Inflight entertainment',
 'Online support',
 'Ease of Online booking',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Cleanliness',
 'Online boarding',
 'Departure Delay in Minutes',
 'Arrival Delay in Minutes']

In [20]:
new_names = ['Gender',
 'Customer_Type',
 'Age',
 'Type_of_Travel',
 'Class',
 'Flight_Distance',
 'Seat_comfort',
 'Departure/Arrival_time_convenient',
 'Food_and_drink',
 'Gate_location',
 'Inflight_wifi_service',
 'Inflight_entertainment',
 'Online_support',
 'Ease_of_Online_booking',
 'On-board_service',
 'Leg_room_service',
 'Baggage_handling',
 'Checkin_service',
 'Cleanliness',
 'Online_boarding',
 'Departure_Delay_in_Minutes',
 'Arrival_Delay_in_Minutes']

In [21]:
name_dict = {}
for n in range(1, len(new_names)+1):
    name_dict[old_names[n]] = new_names[n-1]
name_dict

{'Gender': 'Gender',
 'Customer Type': 'Customer_Type',
 'Age': 'Age',
 'Type of Travel': 'Type_of_Travel',
 'Class': 'Class',
 'Flight Distance': 'Flight_Distance',
 'Seat comfort': 'Seat_comfort',
 'Departure/Arrival time convenient': 'Departure/Arrival_time_convenient',
 'Food and drink': 'Food_and_drink',
 'Gate location': 'Gate_location',
 'Inflight wifi service': 'Inflight_wifi_service',
 'Inflight entertainment': 'Inflight_entertainment',
 'Online support': 'Online_support',
 'Ease of Online booking': 'Ease_of_Online_booking',
 'On-board service': 'On-board_service',
 'Leg room service': 'Leg_room_service',
 'Baggage handling': 'Baggage_handling',
 'Checkin service': 'Checkin_service',
 'Cleanliness': 'Cleanliness',
 'Online boarding': 'Online_boarding',
 'Departure Delay in Minutes': 'Departure_Delay_in_Minutes',
 'Arrival Delay in Minutes': 'Arrival_Delay_in_Minutes'}

In [22]:
train.rename(columns=name_dict, inplace=True)
train.head()

Unnamed: 0,satisfaction,Gender,Customer_Type,Age,Type_of_Travel,Class,Flight_Distance,Seat_comfort,Departure/Arrival_time_convenient,Food_and_drink,...,Online_support,Ease_of_Online_booking,On-board_service,Leg_room_service,Baggage_handling,Checkin_service,Cleanliness,Online_boarding,Departure_Delay_in_Minutes,Arrival_Delay_in_Minutes
0,satisfied,Male,Loyal Customer,37,Business travel,Business,3785,4,4,4,...,4,4,4,4,4,5,4,4,1,9
1,dissatisfied,Male,Loyal Customer,60,Personal Travel,Eco,1784,3,4,3,...,2,5,3,2,4,4,4,5,0,0
2,dissatisfied,Male,Loyal Customer,27,Business travel,Business,1825,2,3,4,...,2,2,1,3,3,1,2,2,76,57
3,satisfied,Male,Loyal Customer,50,Business travel,Eco,2024,4,4,2,...,4,4,2,5,4,2,4,4,125,129
4,dissatisfied,Male,Loyal Customer,70,Personal Travel,Eco,1435,4,4,4,...,5,5,5,4,5,4,4,5,0,0


In [23]:
numerical_features = [x for x in train.select_dtypes(exclude = object)]
categorical_features = [x for x in train.select_dtypes(include = object)]

In [30]:
categorical_features = categorical_features[1:]

In [31]:
y_test

34079       satisfied
12596    dissatisfied
6293        satisfied
15214       satisfied
50131       satisfied
             ...     
6188        satisfied
16833       satisfied
58999    dissatisfied
71208       satisfied
6393     dissatisfied
Name: satisfaction, Length: 15539, dtype: object

In [32]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(drop='first', sparse=False))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),                  
        ('cat', categorical_transformer, categorical_features),
    ])

X_train, X_test, y_train, y_test = train_test_split(train.iloc[:, 1:].copy(), train.iloc[:, 0].copy(), test_size=0.2, random_state=16)

params = {'bootstrap': True, 
        'criterion': 'gini',
        'max_depth': 15, 
        'max_features': 'auto',
        'min_impurity_decrease': 0.001,
        'min_samples_leaf': 30, 
        'min_samples_split': 50,
        'class_weight': "balanced_subsample",
        'n_estimators': 200,
        'oob_score': True, 
        'random_state': 16, 
        'verbose': 0,
        'warm_start': False}

xgb_woe_model = XGBClassifier(learning_rate =0.1,
                    n_estimators=500,
                    max_depth=20,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=16)


model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('xgb_classifier', xgb_woe_model)])
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)





In [33]:
y_pred

array(['satisfied', 'dissatisfied', 'satisfied', ..., 'dissatisfied',
       'satisfied', 'dissatisfied'], dtype=object)

In [34]:
def convert_dataframe_schema(df, drop=None):
    inputs = []
    for k, v in zip(df.columns, df.dtypes):
        if drop is not None and k in drop:
            continue
        if v == 'int64':
            t = FloatTensorType([None, 1])
        elif v == 'float64':
            t = FloatTensorType([None, 1])
        else:
            t = StringTensorType([None, 1])
        inputs.append((k, t))
    return inputs


inputs = convert_dataframe_schema(train)

pprint.pprint(inputs)

[('satisfaction', StringTensorType(shape=[None, 1])),
 ('Gender', StringTensorType(shape=[None, 1])),
 ('Customer_Type', StringTensorType(shape=[None, 1])),
 ('Age', FloatTensorType(shape=[None, 1])),
 ('Type_of_Travel', StringTensorType(shape=[None, 1])),
 ('Class', StringTensorType(shape=[None, 1])),
 ('Flight_Distance', FloatTensorType(shape=[None, 1])),
 ('Seat_comfort', FloatTensorType(shape=[None, 1])),
 ('Departure/Arrival_time_convenient', FloatTensorType(shape=[None, 1])),
 ('Food_and_drink', FloatTensorType(shape=[None, 1])),
 ('Gate_location', FloatTensorType(shape=[None, 1])),
 ('Inflight_wifi_service', FloatTensorType(shape=[None, 1])),
 ('Inflight_entertainment', FloatTensorType(shape=[None, 1])),
 ('Online_support', FloatTensorType(shape=[None, 1])),
 ('Ease_of_Online_booking', FloatTensorType(shape=[None, 1])),
 ('On-board_service', FloatTensorType(shape=[None, 1])),
 ('Leg_room_service', FloatTensorType(shape=[None, 1])),
 ('Baggage_handling', FloatTensorType(shape=[No

In [43]:
update_registered_converter(
    XGBClassifier, 'XGBoostXGBClassifier',
    calculate_linear_classifier_output_shapes, convert_xgboost,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})

In [44]:
model_onnx = convert_sklearn(
    xgb_woe_model, 'pipeline_xgboost',
    [('input', FloatTensorType([None, 2]))],
    target_opset=12)

In [46]:
MODEL_DIR = "./airlineSatisfactionProd"
MODEL_FILE_NAME = "satisfaction_XGB.onnx"

In [47]:
os.mkdir(MODEL_DIR)
# And save.
with open(MODEL_DIR + "/" + MODEL_FILE_NAME, "wb") as f:
    f.write(model_onnx.SerializeToString())