## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import os

## Import Data

In [4]:
# Path to the main project folder. Defaults to the original author's local
# folder; set the OLIST_PROJECT_DIR environment variable to run the notebook
# on another machine without editing this cell.
path = os.environ.get('OLIST_PROJECT_DIR', r'C:\Users\steve\Documents\Olist Marketplace Analysis')

# Import the orders file. index_col = False tells pandas not to use the first
# column of the CSV as the index, so a default integer index is created.
orders_df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_orders_dataset.csv'), index_col = False)

## Analysis

#### 01. Content + Shape

In [7]:
# preview the first five rows of the orders table
orders_df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [8]:
# (number of rows, number of columns) of the loaded orders table
orders_df.shape

(99441, 8)

In [9]:
# Convert every timestamp column to datetime64[ns] so that the durations
# between order stages can be computed by simple subtraction.
datetime_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
orders_df[datetime_cols] = orders_df[datetime_cols].apply(pd.to_datetime)

In [10]:
# -- calculate the time taken by each step, in hours
# Each entry: (timedelta column, hours column, start timestamp, end timestamp)
stage_specs = [
    # purchased to approved
    ('purchase_to_approved', 'purchase_to_approved_hrs',
     'order_purchase_timestamp', 'order_approved_at'),
    # approved to delivered to carrier
    ('approved_to_carrier', 'approved_to_carrier_hrs',
     'order_approved_at', 'order_delivered_carrier_date'),
    # delivered to carrier to delivered to customer
    ('carrier_to_cust', 'carrier_to_customer_hrs',
     'order_delivered_carrier_date', 'order_delivered_customer_date'),
    # purchased to delivered to customer
    ('purchase_to_delivered', 'purchase_to_delivered_hrs',
     'order_purchase_timestamp', 'order_delivered_customer_date'),
]

for delta_col, hours_col, start_col, end_col in stage_specs:
    # elapsed time as a timedelta (missing whenever either timestamp is missing)
    orders_df[delta_col] = orders_df[end_col] - orders_df[start_col]
    # expressed in hours, rounded to the hundredths place
    orders_df[hours_col] = (orders_df[delta_col].dt.total_seconds() / 3600).round(2)

orders_df.head(3)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_to_approved,purchase_to_approved_hrs,approved_to_carrier,approved_to_carrier_hrs,carrier_to_cust,carrier_to_customer_hrs,purchase_to_delivered,purchase_to_delivered_hrs
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,0 days 00:10:42,0.18,2 days 08:47:45,56.8,6 days 01:30:13,145.5,8 days 10:28:40,202.48
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,1 days 06:42:50,30.71,0 days 11:06:33,11.11,12 days 00:56:45,288.95,13 days 18:46:08,330.77
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,0 days 00:16:34,0.28,0 days 04:54:37,4.91,9 days 04:16:29,220.27,9 days 09:27:40,225.46


In [11]:
# keep only the derived columns expressed in hours; the intermediate
# timedelta columns are no longer needed
timedelta_cols = ['purchase_to_approved', 'approved_to_carrier', 'carrier_to_cust', 'purchase_to_delivered']
orders_df = orders_df.drop(columns = timedelta_cols)

In [12]:
# column dtypes and non-null counts after the datetime conversion and
# the derived hour columns have been added
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
 8   purchase_to_approved_hrs       99281 non-null  float64       
 9   approved_to_carrier_hrs        97644 non-null  float64       
 10  carrier_to_customer_hrs        96475 non-null  float64       
 11  purchase_to_del

In [13]:
# summary statistics for the datetime and derived hour columns
# NOTE(review): the recorded output shows negative minimums for
# approved_to_carrier_hrs and carrier_to_customer_hrs, i.e. some orders have
# a later-stage timestamp that precedes the earlier stage — worth investigating
orders_df.describe()

Unnamed: 0,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_to_approved_hrs,approved_to_carrier_hrs,carrier_to_customer_hrs,purchase_to_delivered_hrs
count,99441,99281,97658,96476,99441,99281.0,97644.0,96475.0,96476.0
mean,2017-12-31 08:43:12.776581120,2017-12-31 18:35:24.098800128,2018-01-04 21:49:48.138278656,2018-01-14 12:09:19.035542272,2018-01-24 03:08:37.730111232,10.419073,67.32091,223.933117,301.408865
min,2016-09-04 21:15:19,2016-09-15 12:16:38,2016-10-08 10:34:01,2016-10-11 13:46:32,2016-09-30 00:00:00,0.0,-4109.26,-386.31,12.8
25%,2017-09-12 14:46:19,2017-09-12 23:24:16,2017-09-15 22:28:50.249999872,2017-09-25 22:07:22.249999872,2017-10-03 00:00:00,0.22,21.01,98.4,162.39
50%,2018-01-18 23:04:36,2018-01-19 11:36:13,2018-01-24 16:10:58,2018-02-02 19:28:10.500000,2018-02-15 00:00:00,0.34,43.64,170.39,245.225
75%,2018-05-04 15:42:16,2018-05-04 20:35:10,2018-05-08 13:37:45,2018-05-15 22:48:52.249999872,2018-05-25 00:00:00,14.58,85.93,288.7,377.29
max,2018-10-17 17:30:18,2018-09-03 17:40:06,2018-09-11 19:48:28,2018-10-17 13:22:46,2018-11-12 00:00:00,4509.18,3018.3,4924.58,5031.09
std,,,,,,26.038012,85.186235,210.242915,229.116711


#### 02. Value Counts

Order id

In [16]:
# count how many times each order id appears (missing values included)
order_id_counts = orders_df['order_id'].value_counts(dropna = False)
order_id_counts

order_id
e481f51cbdc54678b7cc49136f2d6af7    1
f01059d0d674e1282df4e8fbbe015aa2    1
fbc17f0f2a2125054d5ac5c22d2d5120    1
9373150545066777b1cd2bc20e93cf8e    1
917399e96f92268dfa2c0351b1b75fba    1
                                   ..
6b8986012d61963295ffa3ea869aff86    1
be879f757debd3b384b540daa6ddd97e    1
ac3ce7eda1246f39509a505242fcc169    1
e801a93b3904ca9e8350f176a037047b    1
66dea50a8b16d9b4dee7af250b4be1a5    1
Name: count, Length: 99441, dtype: int64

**Unique Order IDs** = 99,441

Customer id

In [19]:
# count how many times each customer id appears (missing values included)
customer_id_counts = orders_df['customer_id'].value_counts(dropna = False)
customer_id_counts

customer_id
9ef432eb6251297304e76186b10a928d    1
413f7e58270a32396af030a075b924be    1
eb4350b67a0264c67e5e06a038e4afbb    1
622b07d262d545d16efbd4363a89cb91    1
c701fbfa77791abd05eef9eacf7ea7a8    1
                                   ..
39585f08d13377e50fde35467984e6eb    1
eb3d995301c320683de629f5b4dd0c78    1
f2507ea56d748a23037bb1214964e87d    1
ae8269d850cd3a3d06a56877c450b3f8    1
edb027a75a1449115f6b43211ae02a24    1
Name: count, Length: 99441, dtype: int64

**Unique Customer IDs** = 99,441 *(same as order id count)*

Order Status

In [22]:
# number of orders in each status (missing values included)
order_status_counts = orders_df['order_status'].value_counts(dropna = False)
order_status_counts

order_status
delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: count, dtype: int64

**Unique Order Statuses** = 8

### CONSISTENCY CHECKS

#### 01. Mixed-Type Data

In [26]:
# Check for mixed-type columns: a column is reported when at least one of its
# values has a different Python type than the value in its first row.
# NOTE(review): missing values count as a different type here, which appears
# to be why datetime columns containing NaT are reported — confirm.
for col in orders_df.columns.tolist():
    cell_types = orders_df[[col]].map(type)
    first_row_type = orders_df[[col]].iloc[0].apply(type)
    mismatched = (cell_types != first_row_type).any(axis = 1)
    if mismatched.sum() > 0:
        print(col)

order_approved_at
order_delivered_carrier_date
order_delivered_customer_date


In [27]:
# confirm that one of the flagged columns is stored with a datetime dtype
orders_df['order_approved_at'].dtype

dtype('<M8[ns]')

*These columns are flagged because they contain missing values (NaT) alongside timestamps; they hold datetime info and will be left as such*

#### 02. Missing Values

In [30]:
# number of missing values in each column
missing_counts = orders_df.isna().sum()
missing_counts

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
purchase_to_approved_hrs          160
approved_to_carrier_hrs          1797
carrier_to_customer_hrs          2966
purchase_to_delivered_hrs        2965
dtype: int64

In [31]:
# rows where the approval timestamp is missing
approved_missing = orders_df['order_approved_at'].isna()
df_approved_null = orders_df.loc[approved_missing]
df_approved_null

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_to_approved_hrs,approved_to_carrier_hrs,carrier_to_customer_hrs,purchase_to_delivered_hrs
1130,00b1cb0320190ca0daa2c88b35206009,3532ba38a3fd242259a514ac2b6ae6b6,canceled,2018-08-28 15:26:39,NaT,NaT,NaT,2018-09-12,,,,
1801,ed3efbd3a87bea76c2812c66a0b32219,191984a8ba4cbb2145acb4fe35b69664,canceled,2018-09-20 13:54:16,NaT,NaT,NaT,2018-10-17,,,,
1868,df8282afe61008dc26c6c31011474d02,aa797b187b5466bc6925aaaa4bb3bed1,canceled,2017-03-04 12:14:30,NaT,NaT,NaT,2017-04-10,,,,
2029,8d4c637f1accf7a88a4555f02741e606,b1dd715db389a2077f43174e7a675d07,canceled,2018-08-29 16:27:49,NaT,NaT,NaT,2018-09-13,,,,
2161,7a9d4c7f9b068337875b95465330f2fc,7f71ae48074c0cfec9195f88fcbfac55,canceled,2017-05-01 16:12:39,NaT,NaT,NaT,2017-05-30,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
97696,5a00b4d35edffc56b825c3646a99ba9d,6a3bdf004ca96338fb5fad1b8d93c2e6,canceled,2017-07-02 15:38:46,NaT,NaT,NaT,2017-07-25,,,,
98415,227c804e2a44760671a6a5697ea549e4,62e7477e75e542243ee62a0ba73f410f,canceled,2017-09-28 15:02:56,NaT,NaT,NaT,2017-10-16,,,,
98909,e49e7ce1471b4693482d40c2bd3ad196,e4e7ab3f449aeb401f0216f86c2104db,canceled,2018-08-07 11:16:28,NaT,NaT,NaT,2018-08-10,,,,
99283,3a3cddda5a7c27851bd96c3313412840,0b0d6095c5555fe083844281f6b093bb,canceled,2018-08-31 16:13:44,NaT,NaT,NaT,2018-10-01,,,,


In [32]:
# rows where the delivered-to-carrier timestamp is missing
carrier_missing = orders_df['order_delivered_carrier_date'].isna()
df_carrier_null = orders_df.loc[carrier_missing]
df_carrier_null

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_to_approved_hrs,approved_to_carrier_hrs,carrier_to_customer_hrs,purchase_to_delivered_hrs
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,NaT,NaT,2017-05-09,49.05,,,
103,0760a852e4e9d89eb77bf631eaaf1c84,d2a79636084590b7465af8ab374a8cf5,invoiced,2018-08-03 17:44:42,2018-08-07 06:15:14,NaT,NaT,2018-08-21,84.51,,,
128,15bed8e2fec7fdbadb186b57c46c92f2,f3f0e613e0bdb9c7cee75504f0f90679,processing,2017-09-03 14:22:03,2017-09-03 14:30:09,NaT,NaT,2017-10-03,0.14,,,
266,8e24261a7e58791d10cb1bf9da94df5c,64a254d30eed42cd0e6c36dddb88adf0,unavailable,2017-11-16 15:09:28,2017-11-16 15:26:57,NaT,NaT,2017-12-05,0.29,,,
324,d3c8851a6651eeff2f73b0e011ac45d0,957f8e082185574de25992dc659ebbc0,processing,2016-10-05 22:44:13,2016-10-06 15:51:05,NaT,NaT,2016-12-09,17.11,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
99283,3a3cddda5a7c27851bd96c3313412840,0b0d6095c5555fe083844281f6b093bb,canceled,2018-08-31 16:13:44,NaT,NaT,NaT,2018-10-01,,,,
99313,e9e64a17afa9653aacf2616d94c005b8,b4cd0522e632e481f8eaf766a2646e86,processing,2018-01-05 23:07:24,2018-01-09 07:18:05,NaT,NaT,2018-02-06,80.18,,,
99347,a89abace0dcc01eeb267a9660b5ac126,2f0524a7b1b3845a1a57fcf3910c4333,canceled,2018-09-06 18:45:47,NaT,NaT,NaT,2018-09-27,,,,
99348,a69ba794cc7deb415c3e15a0a3877e69,726f0894b5becdf952ea537d5266e543,unavailable,2017-08-23 16:28:04,2017-08-28 15:44:47,NaT,NaT,2017-09-15,119.28,,,


In [33]:
# rows where the delivered-to-customer timestamp is missing
delivered_missing = orders_df['order_delivered_customer_date'].isna()
df_delivered_null = orders_df.loc[delivered_missing]
df_delivered_null

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_to_approved_hrs,approved_to_carrier_hrs,carrier_to_customer_hrs,purchase_to_delivered_hrs
6,136cce7faa42fdb2cefd53fdc79a6098,ed0271e0b7da060a393796590e7b737a,invoiced,2017-04-11 12:22:08,2017-04-13 13:25:17,NaT,NaT,2017-05-09,49.05,,,
44,ee64d42b8cf066f35eac1cf57de1aa85,caded193e8e47b8362864762a83db3c5,shipped,2018-06-04 16:44:48,2018-06-05 04:31:18,2018-06-05 14:32:00,NaT,2018-06-28,11.78,10.01,,
103,0760a852e4e9d89eb77bf631eaaf1c84,d2a79636084590b7465af8ab374a8cf5,invoiced,2018-08-03 17:44:42,2018-08-07 06:15:14,NaT,NaT,2018-08-21,84.51,,,
128,15bed8e2fec7fdbadb186b57c46c92f2,f3f0e613e0bdb9c7cee75504f0f90679,processing,2017-09-03 14:22:03,2017-09-03 14:30:09,NaT,NaT,2017-10-03,0.14,,,
154,6942b8da583c2f9957e990d028607019,52006a9383bf149a4fb24226b173106f,shipped,2018-01-10 11:33:07,2018-01-11 02:32:30,2018-01-11 19:39:23,NaT,2018-02-07,14.99,17.11,,
...,...,...,...,...,...,...,...,...,...,...,...,...
99283,3a3cddda5a7c27851bd96c3313412840,0b0d6095c5555fe083844281f6b093bb,canceled,2018-08-31 16:13:44,NaT,NaT,NaT,2018-10-01,,,,
99313,e9e64a17afa9653aacf2616d94c005b8,b4cd0522e632e481f8eaf766a2646e86,processing,2018-01-05 23:07:24,2018-01-09 07:18:05,NaT,NaT,2018-02-06,80.18,,,
99347,a89abace0dcc01eeb267a9660b5ac126,2f0524a7b1b3845a1a57fcf3910c4333,canceled,2018-09-06 18:45:47,NaT,NaT,NaT,2018-09-27,,,,
99348,a69ba794cc7deb415c3e15a0a3877e69,726f0894b5becdf952ea537d5266e543,unavailable,2017-08-23 16:28:04,2017-08-28 15:44:47,NaT,NaT,2017-09-15,119.28,,,


**Missing values exist** - leave all as is *(reasons listed below)*  
- **Approved at:** values are null because order was canceled  
- **Carrier date:** these are orders that have not been picked up by the carrier  
- **Deliver date:** these are orders that have not yet been received by the customer

#### 03. Duplicates

In [36]:
# create a subset containing only rows that fully duplicate an earlier row
duplicate_rows = orders_df.duplicated()
df_dups = orders_df.loc[duplicate_rows]
df_dups

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_to_approved_hrs,approved_to_carrier_hrs,carrier_to_customer_hrs,purchase_to_delivered_hrs


**No full duplicates**

### Export Data

In [39]:
# write the checked orders table to the Prepared Data folder
# NOTE(review): the index is not disabled, so the row index is written as an
# extra unnamed first column — confirm downstream readers expect that layout
export_path = os.path.join(path, '02 Data', 'Prepared Data','order_checked.csv')
orders_df.to_csv(export_path)