In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

import warnings
warnings.filterwarnings('ignore')

In [28]:
df = pd.read_csv('Final_Supply_Chain.csv')

In [29]:
features=df[['Type', 'Benefit per order', 'Sales per customer', 'Delivery Status', 'Late_delivery_risk', 
                    'Category Name', 'Customer City', 'Customer Country', 'Customer Id', 'Customer Segment', 
                    'Customer State', 'Customer Zipcode', 'Department Name', 'Latitude', 'Longitude', 'Market', 
                    'Order City', 'Order Country', 'order date (DateOrders)','shipping date (DateOrders)', 'Order Id', 'Order Item Cardprod Id',
                    'Order Item Discount', 'Order Item Discount Rate', 'Order Item Product Price', 'Order Item Quantity',
                    'Order Region', 'Order State', 'Product Status', 'Shipping Mode']]
features = features.drop(columns = ['Late_delivery_risk'])

In [30]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
def Labelencoder_feature(x):
    le=LabelEncoder()
    x=le.fit_transform(x)
    return x

In [31]:
features['shipping date (DateOrder)'] = pd.to_datetime(df['shipping date (DateOrders)'])
features['shipping_day_of_week'] = pd.to_datetime(df['shipping date (DateOrders)']).dt.dayofweek
features['shipping_date'] = pd.to_datetime(df['shipping date (DateOrders)']).dt.day
features['shipping_month'] = pd.to_datetime(df['shipping date (DateOrders)']).dt.month

In [32]:
target = df['Late_delivery_risk']
features=features.apply(Labelencoder_feature)
features.head()

final_features=features[['Type', 'Shipping Mode', 'Order Region',
       'Customer City', 'shipping_day_of_week','shipping_date','shipping_month']]

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#scaler = StandardScaler()
#scaled_df = scaler.fit_transform(final_features)
#scaled_df = pd.DataFrame(scaled_df) 


In [34]:
#Import ML models:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve
from sklearn.metrics import confusion_matrix

# Splitting the dataset into training and test set:

X_train, X_test, y_train, y_test = train_test_split(final_features, target, test_size=0.2, random_state=42)

# Feature Scaling:

scaler_x = MinMaxScaler((-1,1))
X_train = scaler_x.fit_transform(X_train)
X_test = scaler_x.transform(X_test)

In [35]:
#Linear Regression

C_param_range = [0.001,0.01,0.1,1,10,100]

table = pd.DataFrame(columns = ['C_parameter','Accuracy'])
table['C_parameter'] = C_param_range

j = 0
for i in C_param_range:
    
    # Apply logistic regression model to training data
    Logreg = LogisticRegression(penalty = 'l2', C = i,random_state = 42)
    Logreg.fit(X_train,y_train)
    
    # Predict using model
    y_pred_lr = Logreg.predict(X_test)
    
    # Saving accuracy score in table
    table.iloc[j,1] = accuracy_score(y_test,y_pred_lr)
    j += 1
    
table   

Unnamed: 0,C_parameter,Accuracy
0,0.001,0.687486
1,0.01,0.687486
2,0.1,0.687486
3,1.0,0.687486
4,10.0,0.687486
5,100.0,0.687486


In [36]:
# SVM 

from sklearn.svm import SVC

svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Predict using model:

y_predict_svm=svm.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_svm)
print(cnf_matrix)

Accuracy_svm=svm.score(X_test,y_test)
print(Accuracy_svm)

[[13305  3002]
 [ 8281 11516]]
0.6874861511189896


In [37]:
cv_scores_test= cross_val_score(svm,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(svm,X_train,y_train,cv=5,scoring='roc_auc')
print(cv_scores_test)
cv_scores_svm_test= cv_scores_test.mean()
cv_scores_svm_train= cv_scores_train.mean()
cv_scores_std_svm= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_svm_test))
print ('Mean cross validation train score: ' +str(cv_scores_svm_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_svm))

[0.71379047 0.71752096 0.71279906 0.71066181 0.72016942]
Mean cross validation test score: 0.7149883439853403
Mean cross validation train score: 0.7185466666233005
Standard deviation in cv scores: 0.0034118736470449204


In [38]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(bootstrap=True,n_estimators=100,criterion='entropy')
rf.fit(X_train, y_train)


y_predict_rf = rf.predict(X_test)


cnf_matrix = confusion_matrix(y_test, y_predict_rf)
print(cnf_matrix)
Accuracy_rf=rf.score(X_test,y_test)
print(Accuracy_rf)

[[14863  1444]
 [ 1720 18077]]
0.9123642809660979


In [39]:
cv_scores_test= cross_val_score(rf,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(rf,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_rf_test= cv_scores_test.mean()
cv_scores_rf_train= cv_scores_train.mean()
cv_scores_std_rf= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

[0.84580321 0.85187276 0.84705428 0.84096624 0.85054266]
Mean cross validation test score: 0.8472478291234671
Mean cross validation train score: 0.9639065762844352
Standard deviation in cv scores: 0.003842619185923175


In [40]:
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
gbc = GradientBoostingClassifier(subsample=0.8, learning_rate=0.05 , n_estimators=100, random_state=5, max_depth=9, max_leaf_nodes=100)
gbc.fit(X_train, y_train)

#Predict using the model:

y_predict_gbc = gbc.predict(X_test)

#Confusion matrix:

cnf_matrix = confusion_matrix(y_test, y_predict_gbc)
print(cnf_matrix)
Accuracy_gbc=gbc.score(X_test,y_test)
print(Accuracy_gbc)

[[14165  2142]
 [ 8397 11400]]
0.7080932860624861


In [41]:
cv_scores_test= cross_val_score(gbc,X_test,y_test,cv=5,scoring='roc_auc')
cv_scores_train= cross_val_score(gbc,X_train,y_train,cv=5,scoring='roc_auc')

print(cv_scores_test)

cv_scores_gbc_test= cv_scores_test.mean()
cv_scores_gbc_train= cv_scores_train.mean()
cv_scores_std_gbc= cv_scores_test.std()
print ('Mean cross validation test score: ' +str(cv_scores_rf_test))
print ('Mean cross validation train score: ' +str(cv_scores_rf_train))
print ('Standard deviation in cv scores: ' +str(cv_scores_std_rf))

[0.7729657  0.78315983 0.77888819 0.76682563 0.78547635]
Mean cross validation test score: 0.8472478291234671
Mean cross validation train score: 0.9639065762844352
Standard deviation in cv scores: 0.003842619185923175


In [42]:
m_Labels = ['SVM','Random Forest', 'Gradient Boosting']
scores_test = [cv_scores_svm_test, cv_scores_rf_test, cv_scores_gbc_test]
scores_train = [cv_scores_svm_train, cv_scores_rf_train, cv_scores_gbc_test]
accuracies = [Accuracy_svm, Accuracy_rf, Accuracy_gbc]

score_tab_acc = pd.DataFrame(list(zip(m_Labels, accuracies)), 
               columns =['Algorithm', 'Model accuracy score']) 

score_tab = pd.DataFrame(list(zip(m_Labels, scores_train, scores_test)), 
               columns =['Algorithm', 'ROC-AUC train score', 'ROC-AUC test score' ]) 

print(score_tab_acc)
print("\n")
print(score_tab)

           Algorithm  Model accuracy score
0                SVM              0.687486
1      Random Forest              0.912364
2  Gradient Boosting              0.708093


           Algorithm  ROC-AUC train score  ROC-AUC test score
0                SVM             0.718547            0.714988
1      Random Forest             0.963907            0.847248
2  Gradient Boosting             0.777463            0.777463


In [43]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(max_features='auto',random_state=1, n_jobs=-1)

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 10,12,16], "n_estimators": [50, 100,400,700,1000]}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

gs = gs.fit(X_train, y_train)

print(gs.best_estimator_) 
print(gs.best_score_)

RandomForestClassifier(criterion='entropy', n_estimators=700, n_jobs=-1,
                       random_state=1)
0.8848596112119201


In [45]:
rf = RandomForestClassifier(criterion = 'entropy',n_jobs = -1, n_estimators = 700, random_state = 1)
rf.fit(X_train, y_train)
y_predictions_rf = rf.predict(X_test)
acc = accuracy_score(y_test, y_predictions_rf)
cv = cross_val_score(rf, X_test, y_test,scoring='roc_auc').mean()
print(acc)
print(cv)

0.9135552847329935
0.8489443512909378


In [46]:
target = pd.Series(y_predictions_rf, name = 'Delay_status')
results = pd.DataFrame(list(zip(target, df['Customer Id'])))


In [47]:

importances = list(rf.feature_importances_)
imp=np.sort(importances)
tab=pd.DataFrame(list(zip(final_features,imp)),columns =['Features', 'Importance scores']) 
print(tab)

               Features  Importance scores
0                  Type           0.042530
1         Shipping Mode           0.075476
2          Order Region           0.092842
3         Customer City           0.117717
4  shipping_day_of_week           0.178021
5         shipping_date           0.199907
6        shipping_month           0.293507


In [48]:
from sklearn.metrics import auc, roc_curve, roc_auc_score
y_scores_rf = rf.predict_proba(X_test)
y_scores_rf = y_scores_rf[:,1]
auroc = roc_auc_score(y_test, y_scores_rf)
print("ROC-AUC Score:", auroc)

ROC-AUC Score: 0.9749182540307888


In [51]:
from sklearn.metrics import f1_score
print(f1_score(y_test,y_predictions_rf))
print(f1_score(y_test,y_predict_gbc))

0.920619579316835
0.6838837397642402
