In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split

%matplotlib inline

# Import Dataset

In [None]:
# Read the raw delivery records and preview the first rows.
dataset = pd.read_csv(
    filepath_or_buffer='../input/3rd-party-delivery-dataset/3rd Party Delivery Data.csv'
)
dataset.head()

Drop Datetime Columns

In [None]:
# Work on a copy of the raw data without the four timestamp columns.
df = dataset.drop(
    columns=[
        'Customer placed order datetime',
        'Placed order with restaurant datetime',
        'Driver at restaurant datetime',
        'Delivered to consumer datetime',
    ]
)
df.head()

In [None]:
# Disabled: uncomment to build a pandas_profiling ProfileReport for df.
#ProfileReport(df)

In [None]:
# Column dtypes and non-null counts before any dtype conversion.
df.info()

Change Dtypes of columns 

In [None]:
# 'Is ASAP' becomes an integer column.
df['Is ASAP'] = df['Is ASAP'].astype(int)

# Driver ID, Restaurant ID, Consumer ID and Delivery Region are labels, not
# quantities, so store them as pandas categoricals.
for label_col in ('Driver ID', 'Restaurant ID', 'Consumer ID', 'Delivery Region'):
    df[label_col] = df[label_col].astype('category')

In [None]:
# Display the frame after the dtype changes.
df

In [None]:
# Re-check the column dtypes now that the conversions have been applied.
df.info()

# Visualize DF

In [None]:
# Count of deliveries per region, one colored bar per region.
fig = px.histogram(
    data_frame=df,
    x='Delivery Region',
    color='Delivery Region',
).update_layout(
    title='Number of Deliveries by Region',
    yaxis_title_text='Number of Deliveries',
)
fig.show()

In [None]:
# One point per delivery: order total against restaurant, colored by region.
fig = px.scatter(
    data_frame=df,
    x='Restaurant ID',
    y='Order total',
    color='Delivery Region',
).update_layout(
    title_text='Order Total ($) by Restaurant ID',
    yaxis_title_text='Order Total ($)',
)
fig.show()

In [None]:
# One point per delivery: order total against driver, colored by region.
fig = px.scatter(
    data_frame=df,
    x='Driver ID',
    y='Order total',
    color='Delivery Region',
).update_layout(
    title_text='Order Total ($) by Driver ID',
    yaxis_title_text='Order Total ($)',
)
fig.show()

In [None]:
# Total order revenue per driver, ranked from highest to lowest.
# The ranking has to happen AFTER the aggregation: groupby() returns its
# result ordered by the group key, so sorting the raw rows first is discarded.
driverRevenue = (
    df[['Driver ID', 'Order total']]
    .groupby('Driver ID', observed=True)  # only drivers that actually appear
    .sum()
    .reset_index()
    .sort_values(by='Order total', ascending=False)
)

fig = px.bar(driverRevenue, x='Driver ID', y='Order total', color='Driver ID')
# Treat driver IDs as labels so the bars follow the ranked row order instead
# of being positioned by the ID's numeric value.
fig.update_xaxes(type='category')
fig.update_layout(
    title_text='Order Revenue ($) by Driver ID',
    yaxis_title_text='Total Revenue ($)')
fig.show()

# Total Tip ($) by Driver ID

In [None]:
# Total tips per driver, ranked from highest to lowest.
# The ranking has to happen AFTER the aggregation: groupby() returns its
# result ordered by the group key, so sorting the raw rows first is discarded.
driverTip = (
    df[['Driver ID', 'Amount of tip']]
    .rename(columns={'Amount of tip': 'Amount of Tip'})
    .groupby('Driver ID', observed=True)  # only drivers that actually appear
    .sum()
    .reset_index()
    .sort_values(by='Amount of Tip', ascending=False)
)

fig = px.bar(driverTip, x='Driver ID', y='Amount of Tip', color='Driver ID')
# Treat driver IDs as labels so the bars follow the ranked row order instead
# of being positioned by the ID's numeric value.
fig.update_xaxes(type='category')
fig.update_layout(
    title_text='Total Tip ($) by Driver ID',
    yaxis_title_text='Total Tip ($)')
fig.show()

# Amount of tip by Order total

In [None]:
# Tip amount against order total, with one OLS trendline per region.
fig = px.scatter(
    data_frame=df,
    x='Order total',
    y='Amount of tip',
    color='Delivery Region',
    opacity=0.35,
    trendline='ols',
).update_layout(
    title='Amount of Tip by Order Total',
    yaxis_title_text='Amount of Tip ($)',
    xaxis_title_text='Order Total ($)',
)
fig.show()

In [None]:
# Same tip-vs-order-total scatter, split into one panel per region.
fig = px.scatter(
    data_frame=df,
    x='Order total',
    y='Amount of tip',
    color='Delivery Region',
    opacity=0.35,
    trendline='ols',
    facet_col='Delivery Region',
).update_layout(
    title='Amount of Tip by Order Total',
    yaxis_title_text='Amount of Tip ($)',
    xaxis_title_text='Order Total ($)',
)
fig.show()

# Tip % vs Order total

In [None]:
# Tip as a percentage of the order total, one value per delivery.
tipPercentage = df.copy()
# A zero order total would give a non-finite ratio (x/0 -> inf), so mask
# those rows as NaN instead of carrying inf into the plots.
orderTotalNonZero = df['Order total'].replace(0, np.nan)
# Scale to 0-100 so the values match the column name and a "Tip (%)" axis.
tipPercentage['Tip Percentage'] = 100 * df['Amount of tip'] / orderTotalNonZero

In [None]:
# Order Total vs Tip Percentage, with one OLS trendline per region.
fig = px.scatter(tipPercentage, x='Order total', y='Tip Percentage', color='Delivery Region', opacity=0.35, trendline='ols')
# The y-axis is the 'Tip Percentage' column, so label the chart as a
# percentage rather than a dollar amount.
fig.update_layout(
    title='Tip % by Order Total',
    yaxis_title_text='Tip (%)',
    xaxis_title_text='Order Total ($)')
fig.show()

In [None]:

# Tip percentage against order total, split into one panel per region.
fig = px.scatter(
    data_frame=tipPercentage,
    x='Order total',
    y='Tip Percentage',
    color='Delivery Region',
    opacity=0.35,
    trendline='ols',
    facet_col='Delivery Region',
).update_layout(
    title='Tip % by Order Total',
    yaxis_title_text='Tip (%)',
    xaxis_title_text='Order Total ($)',
)
fig.show()

# Deliveries by Driver ID by Delivery Region

In [None]:
# Delivery counts: one row per driver, one column per delivery region.
xTab = pd.crosstab(
    index=df['Driver ID'],
    columns=df['Delivery Region'],
)
xTab

In [None]:
# Number of deliveries per driver, colored by delivery region.
fig = px.bar(
    xTab,
    color='Delivery Region',
    labels={'value': 'Number of Deliveries'},
)
# Driver IDs are labels, so force a categorical x-axis.
fig.update_xaxes(type='category').update_layout(
    title='Number of Deliveries by Driver ID',
)
fig.show()

In [None]:
# Number of deliveries per driver, one row of bars per delivery region.
fig = px.bar(
    xTab,
    color='Delivery Region',
    labels={
        'value': '# of Deliveries',
        'Delivery Region': 'Region',
    },
    facet_row='Delivery Region',
)
# Driver IDs are labels, so force a categorical x-axis.
fig.update_xaxes(type='category').update_layout(
    title='Number of Deliveries by Driver ID by Delivery Region',
)
fig.show()

# Deliveries by Restaurant ID by Delivery Region

In [None]:
# Delivery counts: one row per restaurant, one column per delivery region.
xTab2 = pd.crosstab(
    index=df['Restaurant ID'],
    columns=df['Delivery Region'],
)
xTab2

In [None]:
# Number of deliveries per restaurant, colored by delivery region.
fig = px.bar(
    xTab2,
    color='Delivery Region',
    labels={'value': 'Number of Deliveries'},
)
# Restaurant IDs are labels, so force a categorical x-axis.
fig.update_xaxes(type='category').update_layout(
    title='Number of Deliveries by Restaurant ID',
)
fig.show()

In [None]:
# Number of Deliveries by Restaurant ID, one row of bars per Delivery Region
fig = px.bar(xTab2, color='Delivery Region',
            labels={'value':'# of Deliveries',
                   'Delivery Region': 'Region'}
            ,facet_row='Delivery Region')
# Restaurant IDs are labels, so force a categorical x-axis.
fig.update_xaxes(type='category')
# xTab2 is the restaurant crosstab, so the title must name Restaurant ID
# (it previously said "Driver ID", copied from the driver chart).
fig.update_layout(
    title='Number of Deliveries by Restaurant ID by Delivery Region')
fig.show()

In [None]:
# Sunburst: regions on the inner ring, restaurants on the outer ring,
# wedges sized by summed order total.
fig = px.sunburst(
    data_frame=df,
    path=['Delivery Region', 'Restaurant ID'],
    values='Order total',
)
fig.show()

In [None]:
# Sunburst: regions on the inner ring, drivers on the outer ring,
# wedges sized by summed order total.
fig = px.sunburst(
    data_frame=df,
    path=['Delivery Region', 'Driver ID'],
    values='Order total',
)
fig.show()

In [None]:
# Sunburst: regions on the inner ring, drivers on the outer ring,
# wedges sized by summed tip amount.
fig = px.sunburst(
    data_frame=df,
    path=['Delivery Region', 'Driver ID'],
    values='Amount of tip',
)
fig.show()

In [None]:
# Density heat map of tip amount against order total, one panel per region.
fig = px.density_heatmap(
    data_frame=df,
    x="Order total",
    y="Amount of tip",
    facet_col='Delivery Region',
)
fig.show()

Top Drivers by Order Total

# Create DFs for Regressions

In [None]:
# Build the modelling frames: one master frame with the region one-hot
# encoded plus binary tip/refund flags, and one plain frame per region.
regressDF = df.copy()
idCols = ['Driver ID', 'Restaurant ID', 'Consumer ID']
dropCols = idCols + ['Delivery Region']
orderTitle = 'Order total'


def _move_column_last(frame, column):
    """Return `frame` with `column` as its last column; the others keep their order."""
    remaining = [col for col in frame.columns if col != column]
    return frame[remaining + [column]]


def _region_subset(frame, region):
    """Return the rows of `frame` for one delivery region, without the ID and region columns."""
    return frame[frame['Delivery Region'] == region].drop(columns=dropCols)


mountainView = _region_subset(regressDF, 'Mountain View')
paloAlto = _region_subset(regressDF, 'Palo Alto')
sanJose = _region_subset(regressDF, 'San Jose')

# Encode 'Delivery Region' in master list
encodedDF = pd.get_dummies(regressDF, columns=['Delivery Region']).drop(columns=idCols)
# Encode 'If Tip', 'If Refund': 1 when the amount is positive, otherwise 0
# (a missing amount compares False and therefore also becomes 0).
encodedDF['If Tip'] = (encodedDF['Amount of tip'] > 0).astype(int)
encodedDF['If Refund'] = (encodedDF['Refunded amount'] > 0).astype(int)
#encodedDF['If Time < 30min Total'] = encodedDF['Time(Min) b/w Driver Deliver and Placed at Restaurant'].apply(lambda x: 1 if x <= 30 else 0)
#encodedDF['If Time < 60min Total'] = encodedDF['Time(Min) b/w Driver Deliver and Placed at Restaurant'].apply(lambda x: 1 if x <= 60 else 0)

# Master list, with the order total moved to the last column
orderDF = _move_column_last(encodedDF, orderTitle)

# Lists by region, same column layout
mvOrder = _move_column_last(mountainView, orderTitle)  # Mountain View
paOrder = _move_column_last(paloAlto, orderTitle)      # Palo Alto
sjOrder = _move_column_last(sanJose, orderTitle)       # San Jose

# 'If Refund' Regressions

**'If Refunded' Split**

In [None]:
# 'If Refund' target with every other orderDF column as a feature, except
# 'Refunded amount' (the flag is derived from it). 80/20 split, fixed seed.
y = orderDF['If Refund']
X = orderDF.drop(columns=['If Refund', 'Refunded amount'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}.shape = {part.shape}")

**'If Refunded' Logistic Regression**

In [None]:
# Logistic regression on the 'If Refund' split; test accuracy kept as a percentage.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
orderAccLog = round(
    100 * logreg.score(X_test, y_test),
    2,
)
orderAccLog

**'If Refunded' KNN Classifier**

In [None]:
# 3-nearest-neighbours classifier on the 'If Refund' split; accuracy as a percentage.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
orderAccKnn = round(
    100 * knn.score(X_test, y_test),
    2,
)
orderAccKnn

**'If Refunded' Gaussian Naive Bayes**

In [None]:
# Gaussian naive Bayes on the 'If Refund' split; accuracy as a percentage.
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(X_train, y_train)
orderAccGaussian = round(
    100 * gaussian.score(X_test, y_test),
    2,
)
orderAccGaussian

**'If Refunded' Perceptron**

In [None]:
# Perceptron on the 'If Refund' split; accuracy as a percentage.
from sklearn.linear_model import Perceptron

perceptron = Perceptron().fit(X_train, y_train)
orderAccPerceptron = round(
    100 * perceptron.score(X_test, y_test),
    2,
)
orderAccPerceptron

**'If Refunded' Decision Tree**

In [None]:
# Decision tree on the 'If Refund' split; test accuracy kept as a percentage.
from sklearn.tree import DecisionTreeClassifier
# Seed the tree (same seed as the train/test split): scikit-learn permutes the
# candidate features at each split, so an unseeded tree can score differently
# on every run.
decisionTree = DecisionTreeClassifier(random_state=1)
decisionTree.fit(X_train, y_train)
orderAccDecisionTree = round(decisionTree.score(X_test, y_test) * 100, 2)
orderAccDecisionTree

**'If Refunded' Random Forest**

In [None]:
# Random forest (100 trees) on the 'If Refund' split; accuracy as a percentage.
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same seed as the train/test split): bootstrap sampling and
# per-split feature selection are random, so without a seed both the score and
# the feature importances change between runs.
randomForest = RandomForestClassifier(n_estimators=100, random_state=1)
randomForest.fit(X_train, y_train)
orderAccRandomForest = round(randomForest.score(X_test, y_test) * 100, 2)
orderAccRandomForest

**'If Refunded' Regression Breakdown**

In [None]:
# Test accuracy (%) of each 'If Refund' classifier, best first.
models = pd.DataFrame({
    'Models': ['Logistic Regression', 'KNN', 'Naive Bayes', 'Perceptron', 'Decision Tree', 'Random Forest'],
    'Score' : [orderAccLog, orderAccKnn, orderAccGaussian, orderAccPerceptron, orderAccDecisionTree, orderAccRandomForest]
})

models.sort_values(by='Score', ascending=False)

**'If Refunded' Logistic Feature Importance**

In [None]:
# Rank features by the magnitude of their logistic-regression coefficient
# (LogisticRegression exposes coef_, not feature_importances_).
# NOTE(review): magnitudes are only comparable if the features share a scale;
# no scaling step is visible before the fit - confirm before interpreting.
effective = pd.DataFrame({
    'feature': X.columns.tolist(),
    'feature_importance': np.abs(logreg.coef_[0]),
})
effective.sort_values(by="feature_importance", ascending=False)

**'If Refunded' Random Forest Feature Importance**

In [None]:
# Rank features by the fitted random forest's importance scores.
effective = pd.DataFrame({
    "feature_name": list(X.columns),
    "feature_importance": randomForest.feature_importances_,
})
effective.sort_values(by="feature_importance", ascending=False)

All of the time features, 'Order total', and 'Amount of tip' appear to be the most important features. Let's re-run the classifiers with just those columns.

# orderDF Dropped Features Analysis

In [None]:
# 'If Refund' target again, this time without the refund amount, the discount,
# the region dummies and the ASAP flag. 80/20 split, fixed seed.
excluded = [
    'If Refund',
    'Refunded amount',
    'Amount of discount',
    'Delivery Region_Palo Alto',
    'Delivery Region_Mountain View',
    'Delivery Region_San Jose',
    'Is ASAP',
]
y = orderDF['If Refund']
X = orderDF.drop(columns=excluded)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}.shape = {part.shape}")

**Dropped Features Logistic Regression**

In [None]:
# Logistic regression on the reduced 'If Refund' split; accuracy as a percentage.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
orderAccLog = round(
    100 * logreg.score(X_test, y_test),
    2,
)
orderAccLog

**Dropped Features KNN Classifier**

In [None]:
# 3-nearest-neighbours classifier on the reduced 'If Refund' split; accuracy as a percentage.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
orderAccKnn = round(
    100 * knn.score(X_test, y_test),
    2,
)
orderAccKnn

**Dropped Features Gaussian Naive Bayes**

In [None]:
# Gaussian naive Bayes on the reduced 'If Refund' split; accuracy as a percentage.
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(X_train, y_train)
orderAccGaussian = round(
    100 * gaussian.score(X_test, y_test),
    2,
)
orderAccGaussian

**Dropped Features Perceptron**

In [None]:
# Perceptron on the reduced 'If Refund' split; accuracy as a percentage.
from sklearn.linear_model import Perceptron

perceptron = Perceptron().fit(X_train, y_train)
orderAccPerceptron = round(
    100 * perceptron.score(X_test, y_test),
    2,
)
orderAccPerceptron


**Dropped Features Decision Tree**

In [None]:
# Decision tree on the reduced 'If Refund' split; accuracy as a percentage.
from sklearn.tree import DecisionTreeClassifier
# Seed the tree (same seed as the train/test split): scikit-learn permutes the
# candidate features at each split, so an unseeded tree can score differently
# on every run.
decisionTree = DecisionTreeClassifier(random_state=1)
decisionTree.fit(X_train, y_train)
orderAccDecisionTree = round(decisionTree.score(X_test, y_test) * 100, 2)
orderAccDecisionTree

**Dropped Features Random Forest**

In [None]:
# Random forest (100 trees) on the reduced 'If Refund' split; accuracy as a percentage.
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same seed as the train/test split): bootstrap sampling and
# per-split feature selection are random, so without a seed both the score and
# the feature importances change between runs.
randomForest = RandomForestClassifier(n_estimators=100, random_state=1)
randomForest.fit(X_train, y_train)
orderAccRandomForest = round(randomForest.score(X_test, y_test) * 100, 2)
orderAccRandomForest

**Dropped Features Breakdown**

In [None]:
# Test accuracy (%) of each 'If Refund' classifier on the reduced feature set, best first.
models = pd.DataFrame({
    'Models': ['Logistic Regression', 'KNN', 'Naive Bayes', 'Perceptron', 'Decision Tree', 'Random Forest'],
    'Score' : [orderAccLog, orderAccKnn, orderAccGaussian, orderAccPerceptron, orderAccDecisionTree, orderAccRandomForest]
})

models.sort_values(by='Score', ascending=False)

In [None]:
# Rank the remaining features by the magnitude of their logistic-regression
# coefficient (LogisticRegression exposes coef_, not feature_importances_).
# NOTE(review): magnitudes are only comparable if the features share a scale;
# no scaling step is visible before the fit - confirm before interpreting.
effective = pd.DataFrame({
    'feature': X.columns.tolist(),
    'feature_importance': np.abs(logreg.coef_[0]),
})
effective.sort_values(by="feature_importance", ascending=False)

In [None]:
# Rank the remaining features by the fitted random forest's importance scores.
effective = pd.DataFrame({
    "feature_name": list(X.columns),
    "feature_importance": randomForest.feature_importances_,
})
effective.sort_values(by="feature_importance", ascending=False)

Even with the low-importance features dropped, 'Order total' and 'Time b/w Delivery and Placed at restaurant' remain the most important features for predicting refunds in this dataset.

# 'If Tip' Regressions

**'If Tip' Split**

In [None]:
# 'If Tip' target with every other orderDF column as a feature, except
# 'Amount of tip' (the flag is derived from it). 80/20 split, fixed seed.
y = orderDF['If Tip']
X = orderDF.drop(columns=['If Tip', 'Amount of tip'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}.shape = {part.shape}")

**'If Tip' Logistic Regression**

In [None]:
# Logistic regression on the 'If Tip' split; test accuracy kept as a percentage.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
orderAccLog = round(
    100 * logreg.score(X_test, y_test),
    2,
)
orderAccLog

**'If Tip' KNN Classifier**

In [None]:
# 3-nearest-neighbours classifier on the 'If Tip' split; accuracy as a percentage.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
orderAccKnn = round(
    100 * knn.score(X_test, y_test),
    2,
)
orderAccKnn

**'If Tip' Gaussian Naive Bayes Classifier**

In [None]:
# Gaussian naive Bayes on the 'If Tip' split; accuracy as a percentage.
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(X_train, y_train)
orderAccGaussian = round(
    100 * gaussian.score(X_test, y_test),
    2,
)
orderAccGaussian

**'If Tip' Perceptron Classifier**

In [None]:
# Perceptron on the 'If Tip' split; accuracy as a percentage.
from sklearn.linear_model import Perceptron

perceptron = Perceptron().fit(X_train, y_train)
orderAccPerceptron = round(
    100 * perceptron.score(X_test, y_test),
    2,
)
orderAccPerceptron

**'If Tip' Decision Tree Classifier**

In [None]:
# Decision tree on the 'If Tip' split; test accuracy kept as a percentage.
from sklearn.tree import DecisionTreeClassifier
# Seed the tree (same seed as the train/test split): scikit-learn permutes the
# candidate features at each split, so an unseeded tree can score differently
# on every run.
decisionTree = DecisionTreeClassifier(random_state=1)
decisionTree.fit(X_train, y_train)
orderAccDecisionTree = round(decisionTree.score(X_test, y_test) * 100, 2)
orderAccDecisionTree

**'If Tip' Random Forest Classifier**

In [None]:
# Random forest (100 trees) on the 'If Tip' split; accuracy as a percentage.
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same seed as the train/test split): bootstrap sampling and
# per-split feature selection are random, so without a seed both the score and
# the feature importances change between runs.
randomForest = RandomForestClassifier(n_estimators=100, random_state=1)
randomForest.fit(X_train, y_train)
orderAccRandomForest = round(randomForest.score(X_test, y_test) * 100, 2)
orderAccRandomForest

**'If Tip' Regression Breakdown**

In [None]:
# Test accuracy (%) of each 'If Tip' classifier, best first.
models = pd.DataFrame({
    'Models': ['Logistic Regression', 'KNN', 'Naive Bayes', 'Perceptron', 'Decision Tree', 'Random Forest'],
    'Score' : [orderAccLog, orderAccKnn, orderAccGaussian, orderAccPerceptron, orderAccDecisionTree, orderAccRandomForest]
})

models.sort_values(by='Score', ascending=False)

**'If Tip' Random Forest Feature Importance**

In [None]:
# Rank the 'If Tip' features by the fitted random forest's importance scores.
effective = pd.DataFrame({
    "feature_name": list(X.columns),
    "feature_importance": randomForest.feature_importances_,
})
effective.sort_values(by="feature_importance", ascending=False)

The same features appear to matter here as in the 'If Refunded' analysis. Let's see if that still holds when we drop all the features with < 10% importance.


# 'If Tip' Dropped Features

In [None]:
# 'If Tip' target again, this time without the tip and refund amounts, the
# discount, the region dummies, the ASAP flag and the refund flag.
# 80/20 split, fixed seed.
excluded = [
    'If Tip',
    'Amount of tip',
    'Refunded amount',
    'Amount of discount',
    'Delivery Region_Palo Alto',
    'Delivery Region_Mountain View',
    'Delivery Region_San Jose',
    'Is ASAP',
    'If Refund',
]
y = orderDF['If Tip']
X = orderDF.drop(columns=excluded)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=1
)
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label}.shape = {part.shape}")

In [None]:
# Logistic regression on the reduced 'If Tip' split; accuracy as a percentage.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
orderAccLog = round(
    100 * logreg.score(X_test, y_test),
    2,
)
orderAccLog

In [None]:
# 3-nearest-neighbours classifier on the reduced 'If Tip' split; accuracy as a percentage.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
orderAccKnn = round(
    100 * knn.score(X_test, y_test),
    2,
)
orderAccKnn

In [None]:
# Gaussian naive Bayes on the reduced 'If Tip' split; accuracy as a percentage.
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(X_train, y_train)
orderAccGaussian = round(
    100 * gaussian.score(X_test, y_test),
    2,
)
orderAccGaussian

In [None]:
# Perceptron on the reduced 'If Tip' split; accuracy as a percentage.
from sklearn.linear_model import Perceptron

perceptron = Perceptron().fit(X_train, y_train)
orderAccPerceptron = round(
    100 * perceptron.score(X_test, y_test),
    2,
)
orderAccPerceptron

In [None]:
# Decision tree on the reduced 'If Tip' split; accuracy as a percentage.
from sklearn.tree import DecisionTreeClassifier
# Seed the tree (same seed as the train/test split): scikit-learn permutes the
# candidate features at each split, so an unseeded tree can score differently
# on every run.
decisionTree = DecisionTreeClassifier(random_state=1)
decisionTree.fit(X_train, y_train)
orderAccDecisionTree = round(decisionTree.score(X_test, y_test) * 100, 2)
orderAccDecisionTree

In [None]:
# Random forest (100 trees) on the reduced 'If Tip' split; accuracy as a percentage.
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same seed as the train/test split): bootstrap sampling and
# per-split feature selection are random, so without a seed both the score and
# the feature importances change between runs.
randomForest = RandomForestClassifier(n_estimators=100, random_state=1)
randomForest.fit(X_train, y_train)
orderAccRandomForest = round(randomForest.score(X_test, y_test) * 100, 2)
orderAccRandomForest

In [None]:
# Test accuracy (%) of each 'If Tip' classifier on the reduced feature set, best first.
models = pd.DataFrame({
    'Models': ['Logistic Regression', 'KNN', 'Naive Bayes', 'Perceptron', 'Decision Tree', 'Random Forest'],
    'Score' : [orderAccLog, orderAccKnn, orderAccGaussian, orderAccPerceptron, orderAccDecisionTree, orderAccRandomForest]
})

models.sort_values(by='Score', ascending=False)

In [None]:
# Rank the remaining 'If Tip' features by the magnitude of their
# logistic-regression coefficient (LogisticRegression exposes coef_, not
# feature_importances_).
# NOTE(review): magnitudes are only comparable if the features share a scale;
# no scaling step is visible before the fit - confirm before interpreting.
effective = pd.DataFrame({
    'feature': X.columns.tolist(),
    'feature_importance': np.abs(logreg.coef_[0]),
})
effective.sort_values(by="feature_importance", ascending=False)

In [None]:
# Rank the remaining 'If Tip' features by the fitted random forest's importance scores.
effective = pd.DataFrame({
    "feature_name": list(X.columns),
    "feature_importance": randomForest.feature_importances_,
})
effective.sort_values(by="feature_importance", ascending=False)