In [1]:
# Attach Google Drive to the Colab filesystem so the dataset can be read from it.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')
# `display` is imported from IPython.display: importing it from
# IPython.core.display is a deprecated path.
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Import data from csv. The path is absolute and matches the '/content/gdrive'
# mount point, so the load does not depend on the current working directory.
import pandas as pd
df = pd.read_csv('/content/gdrive/MyDrive/Reva/SOL_mat_log1.csv', parse_dates=["order_date"])

In [3]:
# Load required libraries
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from geopy.geocoders import Nominatim
from geopy import distance

import sklearn
from sklearn import preprocessing

import seaborn as sns
# Render up to 200 rows and never truncate columns when displaying DataFrames.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)

In [4]:
# Inspect the dtype pandas assigned to each column of the loaded frame.
df.dtypes

order_date          datetime64[ns]
cust_seg                     int64
cust_city                   object
cust_country                object
sm_zone                     object
MOT                         object
facility_id                 object
facility_country            object
prdt_code                    int64
prdt_cat                    object
prdt_seg                    object
prdt_desc                   object
time to deliver              int64
dtype: object

In [5]:
# Derive calendar features (month number and ISO week number) from the order
# timestamp, then discard the raw timestamp column.
order_dt = df['order_date'].dt
df['order_month'] = order_dt.month
df['order_week'] = order_dt.isocalendar().week

df = df.drop(columns=['order_date'])

In [6]:
# The product description column is not used as a feature; drop it.
df = df.drop(columns=['prdt_desc'])

In [7]:
# Three independent deep copies of the prepared frame, one per modelling variant.
df_m1, df_m2, df_m3 = (df.copy(deep=True) for _ in range(3))

In [8]:
# Check the schema of the working copy: order_date and prdt_desc have been
# removed and the derived order_month / order_week columns are present.
df_m1.dtypes

cust_seg             int64
cust_city           object
cust_country        object
sm_zone             object
MOT                 object
facility_id         object
facility_country    object
prdt_code            int64
prdt_cat            object
prdt_seg            object
time to deliver      int64
order_month          int64
order_week          UInt32
dtype: object

In [17]:
# Feature columns handled as categoricals (every column except the target).
cat = [
    'prdt_cat', 'sm_zone', 'cust_seg', 'prdt_seg', 'prdt_code',
    'facility_id', 'facility_country', 'MOT',
    'order_week', 'order_month',
    'cust_city', 'cust_country',
]

In [10]:
# Data Preprocessing and modelling library 
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score,mean_squared_error

In [18]:
# Cast every feature listed in `cat` to the pandas categorical dtype in one pass.
df_m1 = df_m1.astype({column: 'category' for column in cat})

In [21]:
# Separate the regression target from the predictor columns.
Y_m1 = df_m1['time to deliver']
X_m1 = df_m1.drop(columns=['time to deliver'])

In [23]:
# One-hot encode all categorical features. With handle_unknown='ignore',
# categories not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([('cat', categorical_transformer, cat)])

In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_m1tr, x_m1ts, y_m1tr, y_m1ts = train_test_split(
    X_m1,
    Y_m1,
    train_size=0.8,
    random_state=12,
)

In [25]:
# Linear regression on the one-hot encoded features (80/20 split).
lr = Pipeline([
    ('preprocessor', preprocessor),
    ('lgr', LinearRegression()),
])
lr.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
for message, features, target in (
    ("LR 80pc Train score: %.3f", x_m1tr, y_m1tr),
    ("LR 20pc Test score: %.3f", x_m1ts, y_m1ts),
):
    print(message % lr.score(features, target))

LR 80pc Train score: 0.822
LR 20pc Test score: 0.811


In [26]:
# Linear support vector regression on the one-hot encoded features (80/20 split).
# random_state is pinned (same seed as the train/test split) so the solver's
# internal data shuffling, and therefore the reported scores, are repeatable.
sv = Pipeline(steps=[('preprocessor', preprocessor),
                      ('svr', LinearSVR(random_state=12))])
sv.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
print("SVR 80pc Train score: %.3f" % sv.score(x_m1tr, y_m1tr))
print("SVR 20pc Test score: %.3f" % sv.score(x_m1ts, y_m1ts))

SVR 80pc Train score: 0.818
SVR 20pc Test score: 0.814


In [27]:
# Random forest regression on the one-hot encoded features (80/20 split).
# random_state is pinned (same seed as the train/test split) so the bootstrap
# sampling and feature selection, and therefore the scores, are repeatable.
rr = Pipeline(steps=[('preprocessor', preprocessor),
                      ('rfr', RandomForestRegressor(random_state=12))])
rr.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
print("RR 80pc Train score: %.3f" % rr.score(x_m1tr, y_m1tr))
print("RR 20pc Test score: %.3f" % rr.score(x_m1ts, y_m1ts))

RR 80pc Train score: 0.963
RR 20pc Test score: 0.791


In [28]:
# Gradient-boosted trees (XGBoost defaults) on the one-hot encoded features (80/20 split).
xg = Pipeline([
    ('preprocessor', preprocessor),
    ('xgr', xgb.XGBRegressor()),
])
xg.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
for message, features, target in (
    ("XG 80pc Train score: %.3f", x_m1tr, y_m1tr),
    ("XG 20pc Test score: %.3f", x_m1ts, y_m1ts),
):
    print(message % xg.score(features, target))

XG 80pc Train score: 0.827
XG 20pc Test score: 0.817


In [29]:
# Hold-out evaluation of the four fitted pipelines on the 20% test split.
# The printed error is the square root of the mean squared error, so it is
# labelled RMSE. The `squared` keyword is omitted: the default already returns
# the MSE, and the keyword is deprecated in recent scikit-learn releases.
y_test_lr = lr.predict(x_m1ts)
print("RMSE for linear regression", np.sqrt(mean_squared_error(y_m1ts, y_test_lr)))
print('R2 test data: %.3f:' % r2_score(y_m1ts, y_test_lr))
y_test_rr = rr.predict(x_m1ts)
print("RMSE for Random forest regression", np.sqrt(mean_squared_error(y_m1ts, y_test_rr)))
print('R2 test data: %.3f:' % r2_score(y_m1ts, y_test_rr))
y_test_sv = sv.predict(x_m1ts)
print("RMSE for SVR regression", np.sqrt(mean_squared_error(y_m1ts, y_test_sv)))
print('R2 test data: %.3f:' % r2_score(y_m1ts, y_test_sv))
y_test_xg = xg.predict(x_m1ts)
print("RMSE for XGB regression", np.sqrt(mean_squared_error(y_m1ts, y_test_xg)))
print('R2 test data: %.3f:' % r2_score(y_m1ts, y_test_xg))

MSE for linear regression 4.2561113455224655
R2 test data: 0.811:
MSE for Random forest regression 4.482802007458855
R2 test data: 0.791:
MSE for SVR regression 4.221635697508328
R2 test data: 0.814:
MSE for XGB regression 4.188186446424926
R2 test data: 0.817:


In [None]:
# get a list of models to evaluate
def get_models():
    """Return a dict mapping a short model name to an unfitted estimator."""
    # The 'knn' (KNeighborsRegressor) candidate is currently disabled.
    return {'xgb': xgb.XGBRegressor()}

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score ``model`` on ``X``/``y`` with repeated 10-fold cross-validation (3 repeats).

    Returns the per-fold scores produced by ``cross_val_score`` with the
    ``neg_mean_absolute_error`` scorer. Errors raised while fitting a fold are
    propagated (``error_score='raise'``).
    """
    folds = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='neg_mean_absolute_error',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )
# define dataset: 70/30 split, then expand the categorical columns into dummies
x_m1tr, x_m1ts, y_m1tr, y_m1ts = train_test_split(
    X_m1, Y_m1, train_size=0.7, random_state=12
)

x_m1tr = pd.get_dummies(x_m1tr, columns=cat)
x_m1ts = pd.get_dummies(x_m1ts, columns=cat)

# get the models to evaluate
models = get_models()
# evaluate the models and store results
results = []
names = []
for name in models:
    model = models[name]
    scores = evaluate_model(model, x_m1tr, y_m1tr)
    results.append(scores)
    names.append(name)
    # mean and standard deviation of the cross-validation scores
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
#pyplot.boxplot(results, labels=names, showmeans=True)
#pyplot.show()

In [30]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
x_m1tr, x_m1ts, y_m1tr, y_m1ts = train_test_split(
    X_m1,
    Y_m1,
    train_size=0.6,
    random_state=12,
)

In [32]:
# Linear regression on the one-hot encoded features (60/40 split).
lr = Pipeline([
    ('preprocessor', preprocessor),
    ('lgr', LinearRegression()),
])
lr.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
for message, features, target in (
    ("LR 60pc Train score: %.3f", x_m1tr, y_m1tr),
    ("LR 40pc Test score: %.3f", x_m1ts, y_m1ts),
):
    print(message % lr.score(features, target))

LR 60pc Train score: 0.826
LR 40pc Test score: 0.803


In [34]:
# Linear support vector regression on the one-hot encoded features (60/40 split).
# random_state is pinned (same seed as the train/test split) so the solver's
# internal data shuffling, and therefore the reported scores, are repeatable.
sv = Pipeline(steps=[('preprocessor', preprocessor),
                      ('svr', LinearSVR(random_state=12))])
sv.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
print("SVR 60pc Train score: %.3f" % sv.score(x_m1tr, y_m1tr))
print("SVR 40pc Test score: %.3f" % sv.score(x_m1ts, y_m1ts))

SVR 60pc Train score: 0.822
SVR 40pc Test score: 0.808


In [35]:
# Gradient-boosted trees (XGBoost defaults) on the one-hot encoded features (60/40 split).
xg = Pipeline([
    ('preprocessor', preprocessor),
    ('xgr', xgb.XGBRegressor()),
])
xg.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
for message, features, target in (
    ("XG 60pc Train score: %.3f", x_m1tr, y_m1tr),
    ("XG 40pc Test score: %.3f", x_m1ts, y_m1ts),
):
    print(message % xg.score(features, target))

XG 60pc Train score: 0.832
XG 40pc Test score: 0.811


In [37]:
# Hold-out evaluation of the three pipelines refitted on the 60/40 split.
# Fixes relative to the earlier version of this cell:
#   * the first line reports the *linear regression* predictions (it was
#     mislabelled "Random forest regression");
#   * the printed error is sqrt(MSE), so it is labelled RMSE;
#   * the `squared` keyword is omitted: the default already returns the MSE,
#     and the keyword is deprecated in recent scikit-learn releases.
y_test_lr = lr.predict(x_m1ts)
print("RMSE for linear regression", np.sqrt(mean_squared_error(y_m1ts, y_test_lr)))
print('R2 test data: %.3f:' % r2_score(y_m1ts, y_test_lr))
y_test_sv = sv.predict(x_m1ts)
print("RMSE for SVR regression", np.sqrt(mean_squared_error(y_m1ts, y_test_sv)))
print('R2 test data: %.3f:' % r2_score(y_m1ts, y_test_sv))
y_test_xg = xg.predict(x_m1ts)
print("RMSE for XGB regression", np.sqrt(mean_squared_error(y_m1ts, y_test_xg)))
print('R2 test data: %.3f:' % r2_score(y_m1ts, y_test_xg))

MSE for Random forest regression 4.330151030605341
R2 test data: 0.803:
MSE for SVR regression 4.268029935681318
R2 test data: 0.808:
MSE for XGB regression 4.239999428485471
R2 test data: 0.811:


In [None]:
import joblib

# Persist the fitted XGBoost pipeline (preprocessor + regressor) to disk.
joblib.dump(xg, filename="model_1.sav")

['model_1.sav']

In [19]:
# Remove the customer city feature before building the reduced-feature models.
df_m1 = df_m1.drop(columns=['cust_city'])

In [20]:
# Categorical feature set without the customer city column.
cat1 = [
    'prdt_cat', 'sm_zone', 'cust_seg', 'prdt_seg', 'prdt_code',
    'facility_id', 'facility_country', 'MOT',
    'order_week', 'order_month',
    'cust_country',
]
# Cast the listed features to the pandas categorical dtype in one pass.
df_m1 = df_m1.astype(dict.fromkeys(cat1, 'category'))

# Separate the regression target from the predictor columns.
Y_m1 = df_m1['time to deliver']
X_m1 = df_m1.drop(columns=['time to deliver'])

# One-hot encode the categorical features. With handle_unknown='ignore',
# categories not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([('cat', categorical_transformer, cat1)])

# Test train split from the data with 20% reserved for testing
x_m1tr, x_m1ts, y_m1tr, y_m1ts = train_test_split(
    X_m1, Y_m1, train_size=0.8, random_state=12
)

In [22]:
# Linear regression on the reduced (no cust_city) one-hot encoded features.
lr = Pipeline([
    ('preprocessor', preprocessor),
    ('lgr', LinearRegression()),
])
lr.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
for message, features, target in (
    ("LR 80pc Train score: %.3f", x_m1tr, y_m1tr),
    ("LR 20pc Test score: %.3f", x_m1ts, y_m1ts),
):
    print(message % lr.score(features, target))

LR 80pc Train score: 0.815
LR 20pc Test score: 0.819


In [23]:
# Linear support vector regression on the reduced (no cust_city) feature set.
# random_state is pinned (same seed as the train/test split) so the solver's
# internal data shuffling, and therefore the reported scores, are repeatable.
sv = Pipeline(steps=[('preprocessor', preprocessor),
                      ('svr', LinearSVR(random_state=12))])
sv.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
print("SVR 80pc Train score: %.3f" % sv.score(x_m1tr, y_m1tr))
print("SVR 20pc Test score: %.3f" % sv.score(x_m1ts, y_m1ts))

SVR 80pc Train score: 0.814
SVR 20pc Test score: 0.819


In [24]:
# Gradient-boosted trees (XGBoost defaults) on the reduced (no cust_city) feature set.
xg = Pipeline([
    ('preprocessor', preprocessor),
    ('xgr', xgb.XGBRegressor()),
])
xg.fit(x_m1tr, y_m1tr)
# Pipeline.score reports R^2 for regressors.
for message, features, target in (
    ("XG 80pc Train score: %.3f", x_m1tr, y_m1tr),
    ("XG 20pc Test score: %.3f", x_m1ts, y_m1ts),
):
    print(message % xg.score(features, target))

XG 80pc Train score: 0.835
XG 20pc Test score: 0.816


In [26]:
import joblib

# Persist the fitted reduced-feature XGBoost pipeline to disk.
joblib.dump(xg, filename="model.sav")

['model.sav']