## Data Import and Preprocessing

In [1]:

import pandas as pd
from matplotlib import pyplot as plt

# Define Excel File Constant, and import function

EXCEL_FILE = "data/Muesli Project raw data 21-3.xlsx"

def import_xls(file, sheet, header=0):
    df = pd.read_excel(file, sheet_name = sheet, header=header)
    return df

    

In [2]:
# Import Excel Sheets as separate DFs

df_orders = import_xls(EXCEL_FILE, "Orders", header=1)
df_order_process = import_xls(EXCEL_FILE, "Order Process Data")
df_intern = import_xls(EXCEL_FILE, "InternData Study")
df_campaign = import_xls(EXCEL_FILE, "Campaign Data")
df_list = [df_orders, df_order_process, df_intern, df_campaign]


In [3]:
# Define Column Renaming Function
def column_rename(df):
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' ', '_')
    return df

In [40]:
# Rename all DF columns
for df in df_list:
    column_rename(df)
#display(df_orders.head())


Unnamed: 0,order_id,order_date,customer_id,customer_name,origin_channel,country/region,city,state,postal_code,region,category,sub-category,product_id,quantity
0,CA-2017-103800,2017-01-03,DP-13000,Darren Powers,Email,United States,Houston,Texas,77095.0,Central,Power Muesli,Nuts and more,OFF-PA-10000174,2.0
1,CA-2017-112326,2017-01-04,PO-19195,Phillina Ober,Facebook,United States,Naperville,Illinois,60540.0,Central,Power Muesli,No Taste All Power,OFF-LA-10003223,3.0
4,CA-2017-141817,2017-01-05,MB-18085,Mick Brown,Email,United States,Philadelphia,Pennsylvania,19143.0,East,Power Muesli,Super Mega Protein,OFF-AR-10003478,3.0
5,CA-2017-106054,2017-01-06,JO-15145,Jack O'Briant,Sales,United States,Athens,Georgia,30605.0,South,Power Muesli,Super Mega Protein,OFF-AR-10002399,3.0
6,CA-2017-130813,2017-01-06,LS-17230,Lycoris Saunders,Email,United States,Los Angeles,California,90049.0,West,Power Muesli,Nuts and more,OFF-PA-10002005,3.0


In [12]:
# Check for duplicates
for idx, df in enumerate(df_list):
    print(idx)
    print(f" Total order_id count is {df['order_id'].count()}")
    print(f" Total unique order_id count is {df['order_id'].nunique()}")
# 0 df_orders, 1 df_order_process, 2 df_intern, 3 df_campaign

0
 Total order_id count is 5009
 Total unique order_id count is 5009
1
 Total order_id count is 3002
 Total unique order_id count is 3002
2
 Total order_id count is 204
 Total unique order_id count is 204
3
 Total order_id count is 333
 Total unique order_id count is 333


In [30]:
# Define duplicate row removal on order_id function  
def duplicate_id_removal(df):
    df.drop_duplicates(subset=['order_id'], inplace=True)
    df.reset_index

In [31]:
# Remove duplicates for all DFs
for df in df_list:
     duplicate_id_removal(df)

In [50]:
# See if assumption about all order IDs being included in the order process tracking is correct
num_unique_dif = df_orders['order_id'].count() - df_order_process['order_id'].count()
print(f"We don't have all the order ids in the order process dataset. \nContrary to company assumptions there is a difference of {num_unique_dif} untracked orders")

# Investigate if all orders from 2/1/2019 on are tracked then

df_orders_recent = df_orders[df_orders['order_date'] >= df_order_process["order_date"][0]]
display(df_orders_recent.info())
display(df_order_process.info())


# df_orders['order_id'].isin(df_order_process['order_id']).count()
# ~ = not in

We don't have all the order ids in the order process dataset. 
Contrary to company assumptions there is a difference of 2007 untracked orders
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3002 entries, 4095 to 9993
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   order_id        3002 non-null   object        
 1   order_date      3002 non-null   datetime64[ns]
 2   customer_id     3002 non-null   object        
 3   customer_name   3002 non-null   object        
 4   origin_channel  3002 non-null   object        
 5   country/region  3002 non-null   object        
 6   city            3002 non-null   object        
 7   state           3002 non-null   object        
 8   postal_code     2998 non-null   float64       
 9   region          3002 non-null   object        
 10  category        3002 non-null   object        
 11  sub-category    3002 non-null   object        
 12  product_id     

None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3002 entries, 0 to 3002
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            3002 non-null   object        
 1   order_date          3002 non-null   datetime64[ns]
 2   on_truck_scan_date  3002 non-null   datetime64[ns]
 3   processing_mode     3002 non-null   object        
dtypes: datetime64[ns](2), object(2)
memory usage: 181.8+ KB


None

In [60]:
df_list = [df_orders, df_order_process, df_intern, df_campaign]

df_all = pd.merge(df_orders, df_order_process, how='left', on='order_id')
df_all = pd.merge(df_all, df_intern, how='left', on='order_id')
df_all = pd.merge(df_all, df_campaign, how='left', on='order_id')

df = pd.merge(df_orders, df_order_process, how='inner', on='order_id') 

Unnamed: 0,order_id,order_date,customer_id,customer_name,origin_channel,country/region,city,state,postal_code,region,...,order_id.1,order_date.1,on_truck_scan_date,processing_mode,order_id.2,ready_to_ship_date,pickup_date,order_id.3,arrival_scan_date,customer_name.1
0,,NaT,,,,,,,,,...,CA-2019-160304,2019-01-02,2019-01-09,Standard Processing,CA-2019-116540,2019-09-02,2019-09-03,CA-2019-109666,2019-05-03,Kunst Miller
1,,NaT,,,,,,,,,...,CA-2019-125206,2019-01-03,2019-01-07,Express,,NaT,NaT,CA-2019-138933,2019-05-03,Jack Lebron
2,,NaT,,,,,,,,,...,US-2019-116365,2019-01-03,2019-01-09,Standard Processing,CA-2019-129847,2019-09-04,2019-09-04,CA-2019-130001,2019-05-03,Heather Kirkland
3,,NaT,,,,,,,,,...,CA-2019-105207,2019-01-03,2019-01-09,Standard Processing,CA-2019-129630,2019-09-04,2019-09-04,CA-2019-113061,2019-05-06,Ed Ludwig
4,,NaT,,,,,,,,,...,US-2019-164630,2019-01-04,2019-01-11,Standard Processing,CA-2019-106278,2019-09-05,2019-09-06,CA-2019-162138,2019-05-06,Grace Kelly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9982,US-2020-158526,2020-12-29,KH-16360,Katherine Hughes,Email,United States,Louisville,Kentucky,40214.0,South,...,,NaT,NaT,,,NaT,NaT,,NaT,
9987,CA-2020-115427,2020-12-30,EB-13975,Erica Bern,Sales,United States,Fairfield,California,94533.0,West,...,,NaT,NaT,,,NaT,NaT,,NaT,
9989,CA-2020-126221,2020-12-30,CC-12430,Chuck Clark,Facebook,United States,Columbus,Indiana,47201.0,Central,...,,NaT,NaT,,,NaT,NaT,,NaT,
9990,CA-2020-143259,2020-12-30,PO-18865,Patrick O'Donnell,Email,United States,New York City,New York,10009.0,East,...,,NaT,NaT,,,NaT,NaT,,NaT,


In [None]:
# Doublecheck if we have the same amount of unique order ids in merged df as in the df_order_process
display(df['order_id'].nunique())

display(df_all.head())
display(df.head())

display(df_all.info())
display(df.info()) 

In [None]:
df["order_id"].nunique()
drop_list = ['customer_name', 'country/region', 'city', 'state', 'postal_code', 'region'] 
df = df.drop(drop_list, axis=1)


In [None]:
display(df.head(10))


In [None]:
df[df["order_date_x"] != df["order_date_y"]]
df[df["order_date_x"] == df["order_date_y"]].count()



In [None]:
df.drop("order_date_y", axis=1, inplace =True)


In [None]:
df.rename(columns = {"order_date_x": "order_date"}, inplace=True)
display(df)

In [None]:
df["pretransportation_duration"] = df["on_truck_scan_date"] - df["order_date"]
df["pretransportation_duration"] = df["pretransportation_duration"].dt.days


In [None]:
df.describe()

In [None]:
df['weekday'] = df['order_date'].dt.weekday

In [None]:
df.head()

In [None]:
# Create two labels of delivery date depending on express/ non/express processing(see flowchart)
same_arrival_day = [0, 2, 5, 6]
different_arrival_day = [1, 3, 4]

# Devide the dataframe into two groups with regard to these labels
same_day = df[df['weekday'].isin(same_arrival_day)]
different_days = df[df['weekday'].isin(different_arrival_day)]

# Doublecheck if the division is done correctly
check_difference = df['order_id'].count() - same_day['order_id'].count() - different_days['order_id'].count()
display(f'The Difference should be zero: ', check_difference)

In [None]:
# Compare descriptive statistics of each dataset
display('Same Day' ,same_day.describe())
display('Different Day' ,different_days.describe())

# Comparison:
# quantities: shows very equal distribution, though same_day orders are from 4 days and different days are from 3
 # Tuesday, Thursday and Friday are days with a lot of incoming orders


In [None]:
# We should analyse and visualize the days with the most incoming orders
# e.g. Barchart of orderquantities and weekdays


# Combine pretransportation_duration with standard_processing-column

In [None]:
# Investigate Delivery Time Assumption (average 3 days)
# Create merged df and campaign DF
campaign_merged = pd.merge(df, df_campaign , how='inner', on='order_id')

#Split into truck leaving the same day as the order and truck leaving on a different day
matched_different_days = campaign_merged[campaign_merged['weekday'].isin(different_arrival_day)].reset_index().drop("index", axis=1)
matched_same_day = campaign_merged[campaign_merged['weekday'].isin(same_arrival_day)].reset_index().drop("index", axis=1)

#Create new column for delivery duration for both DFs
matched_different_days["delivery_duration"] = matched_different_days["arrival_scan_date"] - matched_different_days["on_truck_scan_date"]
matched_same_day["delivery_duration"] = matched_same_day["arrival_scan_date"] - matched_same_day["on_truck_scan_date"]

#Simple Analysis on the two DFs
display(matched_different_days.describe())
display(matched_same_day.describe())

#Groupby weekday of truck scan

# Create new column for on truck scan weekday
campaign_merged['weekday_scan'] = campaign_merged['on_truck_scan_date'].dt.weekday
# Create new column for delivery duration
campaign_merged['delivery_duration'] = campaign_merged["arrival_scan_date"] - campaign_merged["on_truck_scan_date"]
campaign_merged['delivery_duration'].dt.days
# Create new column for total duration
campaign_merged['total_duration'] = campaign_merged["arrival_scan_date"] - campaign_merged["order_date"]
campaign_merged['delivery_duration'].dt.days

# Drop irrelevant columns
campaign_merged.drop(['quantity', 'weekday'], axis=1, inplace=True)
display(campaign_merged.describe())

In [None]:
# Groupy on truck weekday
display(campaign_merged)
groupby = campaign_merged.groupby(by="weekday_scan")
groupby.mean()


In [None]:
# KPI:
# Company Level
# Average delivery duration (standard): now --> goal
# Average delivery duration (express): now --> goal
# --> stacked barchart from order to warehouse to delivery(process issues not relevant)

# Processing Level
# Average processing duration (standard): now --> goal
# Average processing duration (express): now --> goal 
# --> visualization (process issues not relevant, besides delivery)

# Warehouse Level
# Average warehouse duration (standard): now --> goal
# Average warehouse duration (express): now --> goal 
# --> visualization (all process issues are relevant) 