##### The purpose of this script is to merge data from the orders, reviews, payments and customers files, perform other data cleaning, and derive new variables to prepare the dataset for further analysis.
# Table of Contents

### 1. Importing libraries

### 2. Importing data

### 3. Exploratory analysis

### 4. Exporting data

## 1. Importing libraries

In [32]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 2. Importing data

In [33]:
# Create shortcut for importing files: root project folder that the file paths in this script are joined onto
# NOTE(review): absolute, machine-specific path — adjust it before running on another machine
path = r'C:\Users\radav\OneDrive\Documents\Career Foundry\Data Analytics\Immersion\Achievement 6 Advanced Analytics and Dashboard Design\Olist'

In [34]:
# Load the prepared Olist CSV files; the first column of each file is used as the DataFrame index
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')
df_orders = pd.read_csv(os.path.join(prepared_dir, 'orders_checked.csv'), index_col=[0])
df_reviews = pd.read_csv(os.path.join(prepared_dir, 'order_reviews_checked.csv'), index_col=[0])
df_payments = pd.read_csv(os.path.join(prepared_dir, 'order_payments_checked.csv'), index_col=[0])
df_customers = pd.read_csv(os.path.join(prepared_dir, 'customers_checked.csv'), index_col=[0])
df_products = pd.read_csv(os.path.join(prepared_dir, 'products_checked.csv'), index_col=[0])
df_items = pd.read_csv(os.path.join(prepared_dir, 'order_items_checked.csv'), index_col=[0])
df_sellers = pd.read_csv(os.path.join(prepared_dir, 'sellers_checked.csv'), index_col=[0])

## 3. Exploratory analysis

### i) Check relationship between order items and order form of payment 

In [35]:
# List payment rows whose payment_sequential is 2, i.e. orders with more than one payment record
df_payments.loc[df_payments['payment_sequential'] == 2]

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
25,5cfd514482e22bc992e7693f0e3e8df7,2,voucher,1,45.17
75,3689194c14ad4e2e7361ebd1df0e77b0,2,voucher,1,57.53
102,21b8b46679ea6482cbf911d960490048,2,voucher,1,43.12
139,82ffe097d8ddbf319a523b9bbe7725d5,2,voucher,1,30.00
164,487c1451b8fd7347d0e80e5aca887e91,2,voucher,1,30.00
...,...,...,...,...,...
103499,08fa8b49ff198d1332df4668087150ed,2,voucher,1,50.00
103500,71853944ebfe6bd5f5de0302cba14354,2,voucher,1,29.18
103520,e2da042c42ce790c81ef3a9f666a92b6,2,voucher,1,67.33
103732,f0a5b7c94819c69d12a1c0458ec74756,2,voucher,1,80.40


In [36]:
# Look up the order record for one example order that has multiple payment rows
df_orders.loc[df_orders['order_id'] == '5cfd514482e22bc992e7693f0e3e8df7']

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
55502,5cfd514482e22bc992e7693f0e3e8df7,519a8af813fe88578029697625439e8b,delivered,2017-10-13 17:19:17,2017-10-13 18:06:56,2017-10-16 19:42:28,2017-10-19 20:41:43,2017-11-06 00:00:00


In [37]:
# Show every payment row recorded for the same example order
df_payments.loc[df_payments['order_id'] == '5cfd514482e22bc992e7693f0e3e8df7']

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
25,5cfd514482e22bc992e7693f0e3e8df7,2,voucher,1,45.17
57742,5cfd514482e22bc992e7693f0e3e8df7,1,credit_card,4,665.41


In [38]:
# Show the item rows belonging to the same example order
df_items.loc[df_items['order_id'] == '5cfd514482e22bc992e7693f0e3e8df7']

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
40830,5cfd514482e22bc992e7693f0e3e8df7,1,38faaf77a4cd4dfb50ea8512e66285b0,7e93a43ef30c4f03f38b393420bc753a,2017-10-19 19:06:56,689.99,20.59


#### Form of payment is tied to the order total and is not directly linked to specific products; therefore, data analysis of payments should be conducted at the order level.

### ii) Merge data from orders, reviews and payments files

In [39]:
# Attach 'review_score' from the reviews data to the orders data; the left join keeps every order row
# NOTE(review): an order with several review rows would produce one output row per review — confirm this is intended
df_orders_reviews_merged = df_orders.merge(
    df_reviews[['order_id', 'review_score']],
    how='left',
    on='order_id',
)

In [40]:
# Preview the first five rows of the orders/reviews merge
df_orders_reviews_merged.head(n=5)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,4.0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,5.0
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,5.0
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,5.0


In [41]:
# Show the (rows, columns) dimensions of the orders/reviews merge
df_orders_reviews_merged.shape

(99992, 9)

In [42]:
# Tally the missing values in each column
df_orders_reviews_merged.isna().sum()

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 161
order_delivered_carrier_date     1793
order_delivered_customer_date    2987
order_estimated_delivery_date       0
review_score                      768
dtype: int64

In [43]:
# Merge payments df into current df with review scores.
# A left join (matching the other merges in this script) keeps every order row even when
# it has no payment record; such rows get NaN payment fields and are flagged 'left_only'
# by the '_merge' indicator. With the default inner join those rows were silently dropped
# and the indicator could only ever report 'both', so the check that follows proved nothing.
# Payment rows without a matching order are still excluded by this join.
df_orders_payments_merged = df_orders_reviews_merged.merge(
    df_payments,
    on='order_id',
    how='left',
    indicator=True,
)

In [44]:
# Preview the first five rows after adding the payment columns
df_orders_payments_merged.head(n=5)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,payment_sequential,payment_type,payment_installments,payment_value,_merge
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,1,credit_card,1,18.12,both
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,3,voucher,1,2.0,both
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,2,voucher,1,18.59,both
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,4.0,1,boleto,1,141.46,both
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,5.0,1,credit_card,3,179.12,both


In [45]:
# Count how many rows fall into each category of the '_merge' indicator column
df_orders_payments_merged.loc[:, '_merge'].value_counts()

_merge
both          104477
left_only          0
right_only         0
Name: count, dtype: int64

In [46]:
# Discard the '_merge' indicator column now that it has been inspected
df_orders_payments_merged = df_orders_payments_merged.drop('_merge', axis='columns')

In [47]:
# Preview the first five rows to confirm '_merge' is gone
df_orders_payments_merged.head(n=5)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,payment_sequential,payment_type,payment_installments,payment_value
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,1,credit_card,1,18.12
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,3,voucher,1,2.0
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,2,voucher,1,18.59
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,4.0,1,boleto,1,141.46
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,5.0,1,credit_card,3,179.12


In [48]:
# Attach 'customer_unique_id' from the customers data (left join on 'customer_id') to enable some customer-level analysis
df_merged = df_orders_payments_merged.merge(
    df_customers[['customer_id', 'customer_unique_id']],
    how='left',
    on='customer_id',
)

In [49]:
# Preview the first five rows with the new 'customer_unique_id' column
df_merged.head(n=5)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,payment_sequential,payment_type,payment_installments,payment_value,customer_unique_id
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,1,credit_card,1,18.12,7c396fd4830fd04220f754e42b4e5bff
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,3,voucher,1,2.0,7c396fd4830fd04220f754e42b4e5bff
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,2,voucher,1,18.59,7c396fd4830fd04220f754e42b4e5bff
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,4.0,1,boleto,1,141.46,af07308b275d755c9edb36a90c618231
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,5.0,1,credit_card,3,179.12,3a653a41f6f9fc3d2a113cf8398680e8


In [50]:
# Reposition 'customer_unique_id' to index 2 of the column order (the third column)
col_list = [name for name in df_merged.columns if name != 'customer_unique_id']
col_list.insert(2, 'customer_unique_id')
df_merged = df_merged.reindex(columns=col_list)

In [51]:
# Preview the first five rows to confirm the new column order
df_merged.head(n=5)

Unnamed: 0,order_id,customer_id,customer_unique_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,payment_sequential,payment_type,payment_installments,payment_value
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,1,credit_card,1,18.12
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,3,voucher,1,2.0
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,4.0,2,voucher,1,18.59
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,af07308b275d755c9edb36a90c618231,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,4.0,1,boleto,1,141.46
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,3a653a41f6f9fc3d2a113cf8398680e8,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,5.0,1,credit_card,3,179.12


In [52]:
# Show the (rows, columns) dimensions of the merged data
df_merged.shape

(104477, 14)

### iii) Convert date format and derive new variables with differences between key dates per record

In [53]:
# Parse each order date/timestamp column into pandas datetime values
columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
for date_col in columns:
    df_merged[date_col] = pd.to_datetime(df_merged[date_col])

In [54]:
# List each column's dtype to confirm the date columns were converted
df_merged.dtypes

order_id                                 object
customer_id                              object
customer_unique_id                       object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_approved_at                datetime64[ns]
order_delivered_carrier_date     datetime64[ns]
order_delivered_customer_date    datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
review_score                            float64
payment_sequential                        int64
payment_type                             object
payment_installments                      int64
payment_value                           float64
dtype: object

In [55]:
# Whole days elapsed from order approval to delivery to the customer
df_merged['act_delivery_days'] = (
    df_merged['order_delivered_customer_date']
    .sub(df_merged['order_approved_at'])
    .dt.days
)

In [56]:
# Whole days elapsed from order approval to hand-over to the carrier
df_merged['seller_delivery_days'] = (
    df_merged['order_delivered_carrier_date']
    .sub(df_merged['order_approved_at'])
    .dt.days
)

In [57]:
# Whole days elapsed from hand-over to the carrier to delivery to the customer
df_merged['carrier_delivery_days'] = (
    df_merged['order_delivered_customer_date']
    .sub(df_merged['order_delivered_carrier_date'])
    .dt.days
)

In [58]:
# Whole days between the estimated and the actual customer delivery date
# (estimated minus actual, so positive values mean delivery ahead of the estimate)
df_merged['est_less_act_delivery_days'] = (
    df_merged['order_estimated_delivery_date']
    .sub(df_merged['order_delivered_customer_date'])
    .dt.days
)

In [59]:
# Preview the first five rows including the derived day-count columns
df_merged.head(n=5)

Unnamed: 0,order_id,customer_id,customer_unique_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_score,payment_sequential,payment_type,payment_installments,payment_value,act_delivery_days,seller_delivery_days,carrier_delivery_days,est_less_act_delivery_days
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,4.0,1,credit_card,1,18.12,8.0,2.0,6.0,7.0
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,4.0,3,voucher,1,2.0,8.0,2.0,6.0,7.0
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,7c396fd4830fd04220f754e42b4e5bff,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,4.0,2,voucher,1,18.59,8.0,2.0,6.0,7.0
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,af07308b275d755c9edb36a90c618231,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,4.0,1,boleto,1,141.46,12.0,0.0,12.0,5.0
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,3a653a41f6f9fc3d2a113cf8398680e8,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,5.0,1,credit_card,3,179.12,9.0,0.0,9.0,17.0


In [60]:
# Show the (rows, columns) dimensions after adding the derived columns
df_merged.shape

(104477, 18)

#### The merged file will enable data analysis of 1) the value of total orders, 2) product fulfilment performance, 3) unique customer orders.

## 4. Exporting data

In [61]:
# Save the merged dataset as a CSV in the Prepared Data folder (the index is written too, as to_csv does by default)
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_merged.csv')
df_merged.to_csv(export_file)