##### The purpose of this script is to explore the relationships between timeliness of order deliveries, customer satisfaction, and customer sales.
# Table of Contents

### 1. Importing libraries and data

### 2. Exploratory analysis

### 3. Exporting data

## 1. Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os

In [2]:
# Enable matplotlib visuals to appear in the notebook 

%matplotlib inline

In [3]:
# Define the project root folder.
# The location can be overridden with the OLIST_PROJECT_DIR environment variable so the
# notebook can run on another machine; when the variable is unset, the original local
# folder is used unchanged.

path = os.environ.get(
    'OLIST_PROJECT_DIR',
    r'C:\Users\radav\OneDrive\Documents\Career Foundry\Data Analytics\Immersion\Achievement 6 Advanced Analytics and Dashboard Design\Olist',
)

In [4]:
# Import Olist data: the delivered-orders, products and order-items files from the
# 'Prepared Data' folder, and the product category name translation file from 'Original Data'

prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')
original_dir = os.path.join(path, '02 Data', 'Original Data')

df = pd.read_csv(os.path.join(prepared_dir, 'delivered_orders_location.csv'), index_col=[0])
df_products = pd.read_csv(os.path.join(prepared_dir, 'products_checked.csv'), index_col=[0])
df_products_english = pd.read_csv(os.path.join(original_dir, 'product_category_name_translation.csv'), index_col=False)
df_items = pd.read_csv(os.path.join(prepared_dir, 'order_items_checked.csv'), index_col=[0])

In [5]:
# Check output

df.head()

Unnamed: 0,order_id,customer_id,customer_unique_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,payment_value,act_delivery_days,seller_delivery_days,carrier_delivery_days,est_less_act_delivery_days,late_delivery,geolocation_zip_code_prefix,geolocation_state,state_name,region
0,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,871766c5855e863f6eccc05f988b23cb,delivered,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29,5.0,72.19,7,6,1,8,0,28013,RJ,Rio de Janeiro,SE
1,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,eb28e67c4c0b83846050ddfb8a35d051,delivered,2017-04-26 10:53:06,2017-04-26 11:05:13,2017-05-04 14:35:00,2017-05-12 16:04:24,2017-05-15,4.0,259.83,16,8,8,2,0,15775,SP,Sao Paulo,SE
2,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,3818d81c6709e39d06b2738a8d3a2474,delivered,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05,5.0,216.87,7,1,6,13,0,35661,MG,Minas Gerais,SE
3,00024acbcdf0a6daa1e931b038114c75,d4eb9395c8c0431ee92fce09860c5a06,af861d436cfc08b2c2ddefd0ba074622,delivered,2018-08-08 10:00:35,2018-08-08 10:10:18,2018-08-10 13:28:00,2018-08-14 13:32:39,2018-08-20,4.0,25.78,6,2,4,5,0,12952,SP,Sao Paulo,SE
4,00042b26cf59d7ce69dfabb4e55b4fd9,58dbd0b2d70206bf40e62cd34e84d795,64b576fb70d441e8f1b2d7d446e483c5,delivered,2017-02-04 13:57:51,2017-02-04 14:10:13,2017-02-16 09:46:09,2017-03-01 16:42:31,2017-03-17,5.0,218.04,25,11,13,15,0,13226,SP,Sao Paulo,SE


In [6]:
df.shape

(95287, 20)

## 2. Exploratory analysis

#### i) Order delivery performance by unique customers

In [7]:
# Keep only the columns used for the customer-level analysis

customer_cols = ['order_id', 'customer_unique_id', 'review_score', 'payment_value', 'act_delivery_days', 'late_delivery']
df_subset = df[customer_cols]

In [8]:
# Check output

df_subset.head()

Unnamed: 0,order_id,customer_unique_id,review_score,payment_value,act_delivery_days,late_delivery
0,00010242fe8c5a6d1ba2dd792cb16214,871766c5855e863f6eccc05f988b23cb,5.0,72.19,7,0
1,00018f77f2f0320c557190d7a144bdd3,eb28e67c4c0b83846050ddfb8a35d051,4.0,259.83,16,0
2,000229ec398224ef6ca0657da4fc703e,3818d81c6709e39d06b2738a8d3a2474,5.0,216.87,7,0
3,00024acbcdf0a6daa1e931b038114c75,af861d436cfc08b2c2ddefd0ba074622,4.0,25.78,6,0
4,00042b26cf59d7ce69dfabb4e55b4fd9,64b576fb70d441e8f1b2d7d446e483c5,5.0,218.04,25,0


In [9]:
# Aggregate to one row per unique customer: number of orders, mean review score,
# mean actual delivery days and total count of late deliveries

customer_groups = df_subset.groupby(['customer_unique_id'], as_index=False)
df_agg = customer_groups.agg(
    order_count=('order_id', 'count'),
    avg_review_score=('review_score', 'mean'),
    avg_act_delivery_days=('act_delivery_days', 'mean'),
    late_deliveries=('late_delivery', 'sum'),
)

In [10]:
df_agg.head()

Unnamed: 0,customer_unique_id,order_count,avg_review_score,avg_act_delivery_days,late_deliveries
0,0000366f3b9a7992bf8c76cfdf3221e2,1,5.0,6.0,0
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1,4.0,3.0,0
2,0000f46a3911fa3c0805444483337064,1,3.0,25.0,0
3,0000f6ccb0745a6a4b88665a16c9f078,1,4.0,20.0,0
4,0004aac84e0df4da2b147fca70cf8255,1,5.0,13.0,0


In [11]:
df_agg.shape

(92052, 5)

In [12]:
# Flag each customer as 'Single-order' (exactly one order) or 'Repeat' (more than one order)
# based on the order count; rows matching neither condition are left unset

order_counts = df_agg['order_count']
has_one_order = order_counts == 1
has_many_orders = order_counts > 1

df_agg.loc[has_one_order, 'customer_type'] = 'Single-order'
df_agg.loc[has_many_orders, 'customer_type'] = 'Repeat'

In [13]:
# Check output

df_agg.head()

Unnamed: 0,customer_unique_id,order_count,avg_review_score,avg_act_delivery_days,late_deliveries,customer_type
0,0000366f3b9a7992bf8c76cfdf3221e2,1,5.0,6.0,0,Single-order
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1,4.0,3.0,0,Single-order
2,0000f46a3911fa3c0805444483337064,1,3.0,25.0,0,Single-order
3,0000f6ccb0745a6a4b88665a16c9f078,1,4.0,20.0,0,Single-order
4,0004aac84e0df4da2b147fca70cf8255,1,5.0,13.0,0,Single-order


In [14]:
df_agg.describe()

Unnamed: 0,order_count,avg_review_score,avg_act_delivery_days,late_deliveries
count,92052.0,92052.0,92052.0,92052.0
mean,1.035143,4.153852,12.043982,0.083051
std,0.223674,1.279743,9.452035,0.279209
min,1.0,1.0,0.0,0.0
25%,1.0,4.0,6.0,0.0
50%,1.0,5.0,10.0,0.0
75%,1.0,5.0,15.0,0.0
max,15.0,5.0,208.0,4.0


In [15]:
# Subset of customers flagged as 'Single-order'

is_single_order = df_agg['customer_type'].eq('Single-order')
df_single = df_agg[is_single_order]

In [16]:
# Subset of customers flagged as 'Repeat'

is_repeat = df_agg['customer_type'].eq('Repeat')
df_repeat = df_agg[is_repeat]

In [17]:
# Check basic statistics for both customer groups

df_single.describe()

Unnamed: 0,order_count,avg_review_score,avg_act_delivery_days,late_deliveries
count,89283.0,89283.0,89283.0,89283.0
mean,1.0,4.152571,12.051712,0.080967
std,0.0,1.285797,9.515671,0.272786
min,1.0,1.0,0.0,0.0
25%,1.0,4.0,6.0,0.0
50%,1.0,5.0,10.0,0.0
75%,1.0,5.0,15.0,0.0
max,1.0,5.0,208.0,1.0


In [18]:
df_repeat.describe()

Unnamed: 0,order_count,avg_review_score,avg_act_delivery_days,late_deliveries
count,2769.0,2769.0,2769.0,2769.0
mean,2.168292,4.195153,11.794725,0.150235
std,0.582617,1.065647,7.09761,0.433222
min,2.0,1.0,1.0,0.0
25%,2.0,3.5,7.0,0.0
50%,2.0,4.5,10.5,0.0
75%,2.0,5.0,14.666667,0.0
max,15.0,5.0,73.0,4.0


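#### There is minimal difference between the two groups in terms of review scores. Single-order customers average fewer late deliveries per customer (0.08 vs. 0.15), but repeat customers also place more orders (2.17 on average); per order, repeat customers actually have a slightly lower late delivery rate (0.15 / 2.17 ≈ 6.9% vs. 8.1%). The analysis may be biased because the time frame being evaluated does not distinguish effectively between regular and one-off customers.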

#### ii) Product delivery performance

In [19]:
# Check output of products files

df_products.head()

Unnamed: 0,product_id,product_category_name,product_name_length,product_description_length,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [20]:
df_products_english.head()

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


In [21]:
df_products_english.shape

(71, 2)

In [22]:
# Add the English category name to each product; the left join keeps every row of df_products

category_translation = df_products_english[['product_category_name', 'product_category_name_english']]
df_products_merged = df_products.merge(category_translation, on='product_category_name', how='left')

In [23]:
# Move the English category name column to position 2 in the column order

english_col = 'product_category_name_english'
col_list = list(df_products_merged.columns)
del col_list[col_list.index(english_col)]  # remove it from its current position
col_list[2:2] = [english_col]              # re-insert it at index 2
df_products_merged = df_products_merged.reindex(columns=col_list)

In [24]:
# Check output of merged products file

df_products_merged.head()

Unnamed: 0,product_id,product_category_name,product_category_name_english,product_name_length,product_description_length,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,perfumery,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,art,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,sports_leisure,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,baby,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,housewares,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [25]:
# Attach product ids (from the order items file) to the main dataframe, then attach the
# English product category name for each product id.
# NOTE(review): the first join matches on order_id against the order items file, so an order
# with several item rows is repeated once per item. Order-level columns such as payment_value
# and review_score are then duplicated on each of those rows - confirm this is intended
# before summing or averaging them downstream.

order_products = df_items[['order_id', 'product_id']]
product_categories = df_products_merged[['product_id', 'product_category_name_english']]

df_productid = df.merge(order_products, on='order_id', how='left')
df_merged = df_productid.merge(product_categories, on='product_id', how='left')

In [26]:
# Check output

df_merged.head()

Unnamed: 0,order_id,customer_id,customer_unique_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,...,seller_delivery_days,carrier_delivery_days,est_less_act_delivery_days,late_delivery,geolocation_zip_code_prefix,geolocation_state,state_name,region,product_id,product_category_name_english
0,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,871766c5855e863f6eccc05f988b23cb,delivered,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29,5.0,...,6,1,8,0,28013,RJ,Rio de Janeiro,SE,4244733e06e7ecb4970a6e2683c13e61,cool_stuff
1,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,eb28e67c4c0b83846050ddfb8a35d051,delivered,2017-04-26 10:53:06,2017-04-26 11:05:13,2017-05-04 14:35:00,2017-05-12 16:04:24,2017-05-15,4.0,...,8,8,2,0,15775,SP,Sao Paulo,SE,e5f2d52b802189ee658865ca93d83a8f,pet_shop
2,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,3818d81c6709e39d06b2738a8d3a2474,delivered,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05,5.0,...,1,6,13,0,35661,MG,Minas Gerais,SE,c777355d18b72b67abbeef9df44fd0fd,furniture_decor
3,00024acbcdf0a6daa1e931b038114c75,d4eb9395c8c0431ee92fce09860c5a06,af861d436cfc08b2c2ddefd0ba074622,delivered,2018-08-08 10:00:35,2018-08-08 10:10:18,2018-08-10 13:28:00,2018-08-14 13:32:39,2018-08-20,4.0,...,2,4,5,0,12952,SP,Sao Paulo,SE,7634da152a4610f1595efa32f14722fc,perfumery
4,00042b26cf59d7ce69dfabb4e55b4fd9,58dbd0b2d70206bf40e62cd34e84d795,64b576fb70d441e8f1b2d7d446e483c5,delivered,2017-02-04 13:57:51,2017-02-04 14:10:13,2017-02-16 09:46:09,2017-03-01 16:42:31,2017-03-17,5.0,...,11,13,15,0,13226,SP,Sao Paulo,SE,ac6c3623068f30de03045865e4e10089,garden_tools


In [27]:
df_merged.shape

(108766, 22)

In [28]:
# Count missing values

df_merged.isnull().sum()

order_id                            0
customer_id                         0
customer_unique_id                  0
order_status                        0
order_purchase_timestamp            0
order_approved_at                   0
order_delivered_carrier_date        0
order_delivered_customer_date       0
order_estimated_delivery_date       0
review_score                        0
payment_value                       0
act_delivery_days                   0
seller_delivery_days                0
carrier_delivery_days               0
est_less_act_delivery_days          0
late_delivery                       0
geolocation_zip_code_prefix         0
geolocation_state                   0
state_name                          0
region                              0
product_id                          0
product_category_name_english    1540
dtype: int64

In [29]:
# Replace missing category names with an 'Unknown' label.
# The result is assigned back to the column instead of calling fillna(inplace=True) on the
# selected column: that in-place call acts on an intermediate object, raises a FutureWarning
# in pandas 2.x and does not update df_merged when Copy-on-Write is enabled.

df_merged['product_category_name_english'] = df_merged['product_category_name_english'].fillna('Unknown')

In [30]:
# Check output

df_merged['product_category_name_english'].isnull().sum()

0

In [31]:
# Create subset of all rows flagged as delivered late (late_delivery equals 1)

late_mask = df_merged['late_delivery'].eq(1)
df_late = df_merged[late_mask]

In [32]:
# Check output

df_late.head()

Unnamed: 0,order_id,customer_id,customer_unique_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,...,seller_delivery_days,carrier_delivery_days,est_less_act_delivery_days,late_delivery,geolocation_zip_code_prefix,geolocation_state,state_name,region,product_id,product_category_name_english
8,0005a1a1728c9d785b8e2b08b904576c,16150771dfd4776261284213b89c304e,639d23421f5517f69d0c3d6e6564cf0e,delivered,2018-03-19 18:40:33,2018-03-20 18:35:21,2018-03-28 00:37:42,2018-03-29 18:17:31,2018-03-29,1.0,...,8,1,-1,1,11075,SP,Sao Paulo,SE,310ae3c140ff94b03219ad0adc3c778f,health_beauty
11,00063b381e2406b52ad429470734ebd5,6a899e55865de6549a58d2c6845e5604,3fb97204945ca0c01bcf3eee6031c5f1,delivered,2018-07-27 17:21:27,2018-07-27 18:00:06,2018-07-30 14:52:00,2018-08-07 13:56:52,2018-08-07,5.0,...,2,7,-1,1,15910,SP,Sao Paulo,SE,f177554ea93259a5b282f24e33f65ab6,fashion_bags_accessories
21,000e906b789b55f64edcb1f84030f90d,6a3b2fc9f270df258605e22bef19fd88,3588484a539617d91500764822230fb6,delivered,2017-11-21 18:54:23,2017-11-21 19:09:02,2017-11-22 20:46:54,2017-12-09 17:27:23,2017-12-07,3.0,...,1,16,-3,1,18900,SP,Sao Paulo,SE,57d79905de06d8897872c551bfd09358,telephony
36,0017afd5076e074a48f1f1a4c7bac9c5,8085a9af46f619bc25966f151a362b0d,98758d88bf4b8eef1372ddee45d63178,delivered,2017-04-06 22:16:10,2017-04-06 22:25:19,2017-04-17 13:54:37,2017-05-23 08:32:07,2017-05-19,1.0,...,10,35,-5,1,57250,AL,Alagoas,NE,fe59a1e006df3ac42bf0ceb876d70969,computers_accessories
44,001c85b5f68d2be0cb0797afc9e8ce9a,48ed31e735f1c420ed6ca3637b7c744d,55a269f324455e78349e7b9b7e7e5911,delivered,2017-11-24 19:19:18,2017-11-24 22:38:47,2017-11-27 12:42:15,2017-12-22 18:37:40,2017-12-14,2.0,...,2,25,-9,1,8072,SP,Sao Paulo,SE,84f456958365164420cfc80fbe4c7fab,bed_bath_table


In [33]:
df_late.shape

(8502, 22)

In [34]:
df_late['product_category_name_english'].value_counts()

product_category_name_english
bed_bath_table           898
health_beauty            827
furniture_decor          673
sports_leisure           613
computers_accessories    583
                        ... 
arts_and_craftmanship      2
books_imported             2
party_supplies             2
flowers                    1
diapers_and_hygiene        1
Name: count, Length: 68, dtype: int64

#### Export the merged file for use in visualization software (Tableau) to show which product categories have the most problems with late order deliveries.

## 3. Exporting data

In [35]:
# Write the merged dataframe to a CSV file in the 'Prepared Data' folder

export_file = os.path.join(path, '02 Data', 'Prepared Data', 'delivered_products_cleaned.csv')
df_merged.to_csv(export_file)