# Data Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import category_encoders as ce
import calendar
import pickle

from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load the cleaned shipping data (the csv saved by the 01_data_cleaning ntbk)-
data_path = '../data/ship_clean.csv'
df = pd.read_csv(data_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 41 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type                      180519 non-null  object 
 1   actual_ship_days          180519 non-null  int64  
 2   estimated_ship_days       180519 non-null  int64  
 3   delivery_status           180519 non-null  object 
 4   late_delivery_risk        180519 non-null  int64  
 5   category_id               180519 non-null  int64  
 6   category_name             180519 non-null  object 
 7   customer_city             180519 non-null  object 
 8   customer_id               180519 non-null  int64  
 9   customer_segment          180519 non-null  object 
 10  customer_state            180519 non-null  object 
 11  customer_zipcode          180516 non-null  float64
 12  department_name           180519 non-null  object 
 13  latitude                  180519 non-null  f

In [4]:
# A csv file stores no dtype information, so pd.read_csv re-infers order_weekday_str and order_hour_str as ints
# even though the 01_data_cleaning ntbk set them as objects before saving; cast back to object so the column is
# treated as a categorical feature-
df = df.astype({'order_weekday_str': 'object'})

In [5]:
# read_csv inferred order_hour_str as int; cast it back to object so it is handled as a categorical feature-
df = df.astype({'order_hour_str': 'object'})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 41 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type                      180519 non-null  object 
 1   actual_ship_days          180519 non-null  int64  
 2   estimated_ship_days       180519 non-null  int64  
 3   delivery_status           180519 non-null  object 
 4   late_delivery_risk        180519 non-null  int64  
 5   category_id               180519 non-null  int64  
 6   category_name             180519 non-null  object 
 7   customer_city             180519 non-null  object 
 8   customer_id               180519 non-null  int64  
 9   customer_segment          180519 non-null  object 
 10  customer_state            180519 non-null  object 
 11  customer_zipcode          180516 non-null  float64
 12  department_name           180519 non-null  object 
 13  latitude                  180519 non-null  f

In [7]:
# target vector: the 'ontime' column-

y = df.loc[:, 'ontime']

In [8]:
# get a baseline: always predicting the majority class (ontime == 0) would be right ~57% of the time-

y.value_counts(normalize=True)

0    0.572793
1    0.427207
Name: ontime, dtype: float64

## Model 1:

In [9]:
# categorical columns with too many distinct values to one-hot encode are left out of X-
high_cardinality_cols = [
    'category_name', 'customer_city', 'customer_state', 'order_city',
    'order_country', 'order_state', 'product_name',
]
X = df.drop(columns=high_cardinality_cols)

In [10]:
# remove the target (ontime) together with the other columns correlated with it that can bias the model-
target_related_cols = [
    'actual_ship_days', 'late_delivery_risk', 'delivery_status',
    'order_status', 'ship_performance', 'ontime',
]
X = X.drop(columns=target_related_cols)

In [11]:
# Columns dropped here, and why:
#   order_date       - X already has categorical features (to OHE) extrapolated from it, so it is no longer needed
#   shipping_date    - might bias the model
#   customer_zipcode - too large to scale (throws an error)
#                      NOTE(review): df.info() shows 3 nulls in this column; that may be the real cause - confirm
#   is_fraud         - unnecessary feature for this model
# order_month is kept in X for this model-
unneeded_cols = ['shipping_date', 'order_date', 'customer_zipcode', 'is_fraud']
X = X.drop(columns=unneeded_cols)

In [12]:
X.info()

# X deliberately keeps both versions of each date part: the numeric columns (order_hour, order_weekday, order_month)
# stay as plain int values, while their string counterparts (order_hour_str, order_weekday_str, order_month_name)
# are categorical variables that will be OHE-

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type                      180519 non-null  object 
 1   estimated_ship_days       180519 non-null  int64  
 2   category_id               180519 non-null  int64  
 3   customer_id               180519 non-null  int64  
 4   customer_segment          180519 non-null  object 
 5   department_name           180519 non-null  object 
 6   latitude                  180519 non-null  float64
 7   longitude                 180519 non-null  float64
 8   market                    180519 non-null  object 
 9   order_id                  180519 non-null  int64  
 10  order_item_discount_rate  180519 non-null  float64
 11  order_item_profit_ratio   180519 non-null  float64
 12  order_item_quantity       180519 non-null  int64  
 13  order_region              180519 non-null  o

In [13]:
# TTS: stratified on y so both splits keep the class balance, with a fixed random_state so the split
# (and every score computed from it) is reproducible after a kernel restart-
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [14]:
# initialize smart_encoder object to transform object dtypes and pass numeric columns through untouched-
# The OneHotEncoder `sparse=` argument was deprecated in scikit-learn 1.2 and removed in 1.4, so dense output is
# requested on the column transformer instead: sparse_threshold=0 always returns a dense array, on any version.
smart_encoder = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    sparse_threshold=0,
)

In [15]:
# OHE: fit the encoder on the training split only, then apply that same fitted encoding to both splits-
smart_encoder.fit(X_train)
X_train = smart_encoder.transform(X_train)
X_test = smart_encoder.transform(X_test)

In [16]:
# smart_encoder.get_feature_names_out()

In [17]:
# wrap the encoded arrays in DataFrames again, labelled with the encoder's output feature names-
encoded_cols = smart_encoder.get_feature_names_out()
X_train = pd.DataFrame(X_train, columns=encoded_cols)
X_test = pd.DataFrame(X_test, columns=encoded_cols)

In [18]:
# scale: the scaler is fitted on the training split only and then applied to both splits-
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [19]:
# X_train_sc = pd.DataFrame(X_train_sc.toarray(), columns = smart_encoder.get_feature_names_out())

In [20]:
# turn back into dfs, keeping the encoder's feature names as column labels (StandardScaler returns a bare
# array, so without `columns=` the frames would be labelled 0..N and features could not be identified)-
scaled_cols = smart_encoder.get_feature_names_out()
X_train_sc = pd.DataFrame(X_train_sc, columns=scaled_cols)
X_test_sc = pd.DataFrame(X_test_sc, columns=scaled_cols)

In [21]:
# X_train_sc = pd.DataFrame(X_train_sc, columns=smart_encoder.get_feature_names_out())
# X_test_sc = pd.DataFrame(X_test_sc, columns=smart_encoder.get_feature_names_out())

In [22]:
# define Rachael's function-

def pipe(model, **model_kwargs):
    """Fit a fresh estimator on the scaled training data and print its train/test scores.

    Reads the notebook-level globals X_train_sc, y_train, X_test_sc and y_test as they
    are defined at call time.

    Parameters
    ----------
    model : class
        An estimator class (not an instance), e.g. LogisticRegression; it is instantiated here.
    **model_kwargs
        Optional keyword arguments forwarded to the estimator constructor
        (e.g. random_state=42). With none given, the estimator's defaults are used.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # instantiate model
    model = model(**model_kwargs)
    # fit to scaled data
    model.fit(X_train_sc, y_train)

    # print scores for training and testing groups; .score() is called directly on each split, so the
    # separate .predict() pass (whose result was never used) has been removed
    print(f'{model} training score: {model.score(X_train_sc, y_train)}')
    print(f'{model} testing score: {model.score(X_test_sc, y_test)}')

# from:
# Rachael Friedman
# https://towardsdatascience.com/how-to-make-your-modeling-process-more-efficient-89e70259839d

In [23]:
# run the models through the function-
pipe(LogisticRegression)

LogisticRegression() training score: 0.7166608808691991
LogisticRegression() testing score: 0.7172834035009972


In [24]:
pipe(DecisionTreeClassifier)

DecisionTreeClassifier() training score: 1.0
DecisionTreeClassifier() testing score: 0.9167294482605806


In [25]:
pipe(RandomForestClassifier)

RandomForestClassifier() training score: 1.0
RandomForestClassifier() testing score: 0.8824728561932196


In [26]:
pipe(ExtraTreesClassifier)

ExtraTreesClassifier() training score: 1.0
ExtraTreesClassifier() testing score: 0.9290937292266784


In [27]:
# dt = DecisionTreeClassifier()

In [28]:
# dt.fit(X_train_sc, y_train)

In [29]:
# dt.score(X_test_sc, y_test)

In [30]:
# fixed random_state so the fitted trees (and therefore the score and feature importances) are reproducible-
etc = ExtraTreesClassifier(random_state=42)

In [31]:
etc.fit(X_train_sc, y_train)

ExtraTreesClassifier()

In [32]:
etc.score(X_test_sc, y_test)

0.928074451584312

In [33]:
# one importance value per column of the scaled training data the model was fitted on-
etc_importances = etc.feature_importances_ 

In [34]:
# get feature names out, in the encoder's output column order (one-hot columns are prefixed onehotencoder__,
# passed-through numeric columns are prefixed remainder__)
features = smart_encoder.get_feature_names_out()

In [35]:
# pair each feature name with its importance and rank from most to least important-
etc_importances_df = (
    pd.DataFrame({'feature': features, 'importance': etc_importances})
    .sort_values(by='importance', ascending=False)
)

In [36]:
# keep the 15 highest-ranked rows-
top_features = etc_importances_df.head(15)

In [37]:
top_features

Unnamed: 0,feature,importance
49,onehotencoder__shipping_mode_Standard Class,0.087975
93,remainder__estimated_ship_days,0.062322
46,onehotencoder__shipping_mode_First Class,0.058583
96,remainder__latitude,0.039169
95,remainder__customer_id,0.038972
97,remainder__longitude,0.038294
98,remainder__order_id,0.036205
48,onehotencoder__shipping_mode_Second Class,0.026652
104,remainder__order_hour,0.023899
100,remainder__order_item_profit_ratio,0.020943


In [38]:
# dt_importances = dt.feature_importances_   

In [39]:
# features = X_train_sc.columns

In [40]:
# dt_importances_df = pd.DataFrame({'feature': features, 'importance': dt_importances}).sort_values('importance', ascending = False)

In [41]:
# dt_importances_df

## Best Performing Model 1: Extra Trees Classifier => Test Accuracy 92.91%

## Model 2:

In [42]:
# just to make sure collinearity isn't an issue, I'm trying again with a new X, this time dropping the numeric columns
# order_hour, order_weekday and order_month and keeping the categorical/string versions order_hour_str,
# order_weekday_str and order_month_name

In [43]:
# start Model 2's X from df again, leaving out categorical columns with too many distinct values to OHE-
too_many_levels = [
    'category_name', 'customer_city', 'customer_state', 'order_city',
    'order_country', 'order_state', 'product_name',
]
X = df.drop(columns=too_many_levels)

In [44]:
# remove the target (ontime) together with the other columns correlated with it that can bias the model-
correlated_with_target = [
    'actual_ship_days', 'late_delivery_risk', 'delivery_status',
    'order_status', 'ship_performance', 'ontime',
]
X = X.drop(columns=correlated_with_target)

In [45]:
# besides the date/zipcode/fraud columns, this X also drops the numeric order_hour, order_weekday and
# order_month columns, so only their categorical/string versions remain-
model2_drop_cols = [
    'shipping_date', 'order_date', 'customer_zipcode', 'is_fraud',
    'order_hour', 'order_weekday', 'order_month',
]
X = X.drop(columns=model2_drop_cols)

In [46]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180519 entries, 0 to 180518
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   type                      180519 non-null  object 
 1   estimated_ship_days       180519 non-null  int64  
 2   category_id               180519 non-null  int64  
 3   customer_id               180519 non-null  int64  
 4   customer_segment          180519 non-null  object 
 5   department_name           180519 non-null  object 
 6   latitude                  180519 non-null  float64
 7   longitude                 180519 non-null  float64
 8   market                    180519 non-null  object 
 9   order_id                  180519 non-null  int64  
 10  order_item_discount_rate  180519 non-null  float64
 11  order_item_profit_ratio   180519 non-null  float64
 12  order_item_quantity       180519 non-null  int64  
 13  order_region              180519 non-null  o

In [47]:
X.head()

Unnamed: 0,type,estimated_ship_days,category_id,customer_id,customer_segment,department_name,latitude,longitude,market,order_id,...,order_item_profit_ratio,order_item_quantity,order_region,product_card_id,product_price,shipping_mode,order_before_noon,order_month_name,order_weekday_str,order_hour_str
0,DEBIT,4,73,20755,Consumer,Fitness,18.251453,-66.037056,Pacific Asia,77202,...,0.29,1,Southeast Asia,1360,327.75,Standard Class,0,Jan,2,22
1,TRANSFER,4,73,19492,Consumer,Fitness,18.279451,-66.037064,Pacific Asia,75939,...,-0.8,1,South Asia,1360,327.75,Standard Class,0,Jan,5,12
2,CASH,4,73,19491,Consumer,Fitness,37.292233,-121.881279,Pacific Asia,75938,...,-0.8,1,South Asia,1360,327.75,Standard Class,0,Jan,5,12
3,DEBIT,4,73,19490,Home Office,Fitness,34.125946,-118.291016,Pacific Asia,75937,...,0.08,1,Oceania,1360,327.75,Standard Class,1,Jan,5,11
4,PAYMENT,4,73,19489,Corporate,Fitness,18.253769,-66.037048,Pacific Asia,75936,...,0.45,1,Oceania,1360,327.75,Standard Class,1,Jan,5,11


In [48]:
# TTS: stratified on y, with a fixed random_state so the split (and the scores below) is reproducible-
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [49]:
# one-hot encode object-dtype columns and pass the numeric columns through untouched-
# The OneHotEncoder `sparse=` argument was deprecated in scikit-learn 1.2 and removed in 1.4, so dense output is
# requested on the column transformer instead: sparse_threshold=0 always returns a dense array, on any version.
smart_encoder = make_column_transformer(
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    sparse_threshold=0,
)

In [50]:
# OHE: fit the encoder on the training split only, then apply that same fitted encoding to both splits-
smart_encoder.fit(X_train)
X_train = smart_encoder.transform(X_train)
X_test = smart_encoder.transform(X_test)

In [51]:
# scale with a scaler created in this section, so Model 2 does not depend on (or silently refit) the
# scaler object that was created for Model 1-
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [52]:
# turn back into dfs, keeping the encoder's feature names as column labels (StandardScaler returns a bare
# array, so without `columns=` the frames would be labelled 0..N and features could not be identified)-
scaled_cols = smart_encoder.get_feature_names_out()
X_train_sc = pd.DataFrame(X_train_sc, columns=scaled_cols)
X_test_sc = pd.DataFrame(X_test_sc, columns=scaled_cols)

In [53]:
pipe(LogisticRegression)

LogisticRegression() training score: 0.7200289536077524
LogisticRegression() testing score: 0.7193219587857301


In [54]:
pipe(DecisionTreeClassifier)  

DecisionTreeClassifier() training score: 1.0
DecisionTreeClassifier() testing score: 0.9142920452027476


In [55]:
pipe(ExtraTreesClassifier)

ExtraTreesClassifier() training score: 1.0
ExtraTreesClassifier() testing score: 0.9227121648570795


In [56]:
# so not much of a difference, still good scores

## Best Performing Model 2: Extra Trees Classifier => Test Accuracy 92.27%

In [57]:
# pickle and export-
# with open('./models/extra_trees_best.pkl', 'wb') as pickle_out:
#     pickle_out = pickle.dump(etc2, pickle_out)

In [58]:
# re-wrap the scaled splits as DataFrames (they were already wrapped once in an earlier cell of this section,
# so the contents are unchanged)-
X_train_sc, X_test_sc = pd.DataFrame(X_train_sc), pd.DataFrame(X_test_sc)

In [59]:
# fixed random_state so the fitted trees (and therefore the reported score) are reproducible-
etc2 = ExtraTreesClassifier(random_state=42)

In [60]:
etc2.fit(X_train_sc, y_train)

ExtraTreesClassifier()

In [61]:
etc2.score(X_test_sc, y_test)

0.9192333259472635

In [62]:
# pickle and export-
# with open('./models/extra_trees_demo.pkl', 'wb') as pickle_out:
#     pickle_out = pickle.dump(etc2, pickle_out)