# **Task 3: Data cleaning**

In [10]:
import pandas as pd

In [11]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [12]:
pd.options.mode.chained_assignment = None

## **Data importing and cleaning**

In [13]:
# import all necessary datasets
data_orders_payments = pd.read_csv('data_raw/order_payments.csv')
data_orders = pd.read_csv('data_raw/orders.csv')
data_customers = pd.read_csv('data_raw/customers.csv')
data_sellers = pd.read_csv('data_raw/sellers.csv')
data_order_items = pd.read_csv('data_raw/order_items.csv')
data_product = pd.read_csv('data_raw/products.csv')
data_geolocation = pd.read_csv('data_raw/geolocation.csv')

### **Prepare geolocation, merge with customers and orders**

In [14]:
# Take a look at customers
data_customers.head(2)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP


In [15]:
# A lot of cities, this could not be a feature in the future
# But we could use this info to compare seller and customer city
data_customers.customer_city.value_counts()

sao paulo              15540
rio de janeiro          6882
belo horizonte          2773
brasilia                2131
curitiba                1521
                       ...  
ipueira                    1
aracatu                    1
pacotuba                   1
sao joao do sobrado        1
viseu                      1
Name: customer_city, Length: 4119, dtype: int64

In [16]:
# A lot of states, this could be a feature in the future
# But we could use this info to compare seller and customer state
data_customers.customer_state.value_counts()

SP    41746
RJ    12852
MG    11635
RS     5466
PR     5045
SC     3637
BA     3380
DF     2140
ES     2033
GO     2020
PE     1652
CE     1336
PA      975
MT      907
MA      747
MS      715
PB      536
PI      495
RN      485
AL      413
SE      350
TO      280
RO      253
AM      148
AC       81
AP       68
RR       46
Name: customer_state, dtype: int64

In [17]:
data_geolocation.head(2)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP


In [18]:
data_customers.shape, data_geolocation.shape

((99441, 5), (1000163, 5))

In [19]:
# We have 278 missing values with coordinates in our dataset. 
# It could be useful in feature engineering to calculate the distance between seller and customer
# So we need only raws with lat and lng
data_customers['customer_zip_code_prefix'].isin(data_geolocation['geolocation_zip_code_prefix']).value_counts()

True     99163
False      278
Name: customer_zip_code_prefix, dtype: int64

In [20]:
# As we have mininum 100 examples for each geolocation zip code prefix and out cliet approved the 
# aggregation, we could group values by mean()
data_geolocation[data_geolocation['geolocation_zip_code_prefix'] == 14409]

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
334422,14409,-20.509897,-47.397866,franca,SP
334449,14409,-20.497396,-47.399241,franca,SP
334459,14409,-20.510459,-47.399553,franca,SP
334462,14409,-20.480940,-47.394161,franca,SP
334463,14409,-20.515413,-47.398194,franca,SP
...,...,...,...,...,...
336625,14409,-20.483960,-47.403324,franca,SP
336650,14409,-20.483960,-47.403324,franca,SP
336653,14409,-20.487255,-47.405925,franca,SP
336657,14409,-20.510459,-47.399553,franca,SP


In [21]:
# Check the number of the unique zip codes
data_geolocation.geolocation_zip_code_prefix.unique().shape

(19015,)

In [22]:
data_geolocation_agg = data_geolocation.groupby('geolocation_zip_code_prefix').agg({'geolocation_lat':'mean',
                                                                                    'geolocation_lng':'mean'})
data_geolocation_agg

Unnamed: 0_level_0,geolocation_lat,geolocation_lng
geolocation_zip_code_prefix,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,-23.550190,-46.634024
1002,-23.548146,-46.634979
1003,-23.548994,-46.635731
1004,-23.549799,-46.634757
1005,-23.549456,-46.636733
...,...,...
99960,-27.953722,-52.025511
99965,-28.183372,-52.039850
99970,-28.343766,-51.874689
99980,-28.389129,-51.843836


In [23]:
# Now its time to merge geolocation with customers
# 

data_merge_0 = pd.merge(data_customers, 
                        data_geolocation_agg, 
                        how='inner', 
                        left_on = 'customer_zip_code_prefix', 
                        right_on = 'geolocation_zip_code_prefix').reset_index()

# rename lat and lng because we will have similar from sellers dataframe
data_merge_0.rename(columns={'geolocation_lat':'customer_lat', 'geolocation_lng': 'customer_lng'}, inplace=True)

data_merge_0.drop(['customer_zip_code_prefix','customer_unique_id','index'], axis=1, inplace=True)

data_merge_0.head(2)

Unnamed: 0,customer_id,customer_city,customer_state,customer_lat,customer_lng
0,06b8999e2fba1a1fbc88172c00ba8bc7,franca,SP,-20.498489,-47.396929
1,5dca924cc99eea2dc5ba40d11ec5dd0f,franca,SP,-20.498489,-47.396929


In [24]:
# Check the shape of the final dataset
data_merge_0.shape

(99163, 5)

In [25]:
data_orders.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00


In [26]:
data_orders.order_status.value_counts().sort_values(ascending=False)

delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64

In [39]:
# As we could see everything except delivered has no delivered date.
# So, that was not a necessary data for us
data_orders[data_orders.order_status == 'shipped'].head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
44,ee64d42b8cf066f35eac1cf57de1aa85,caded193e8e47b8362864762a83db3c5,shipped,2018-06-04 16:44:48,2018-06-05 04:31:18,2018-06-05 14:32:00,,2018-06-28 00:00:00
154,6942b8da583c2f9957e990d028607019,52006a9383bf149a4fb24226b173106f,shipped,2018-01-10 11:33:07,2018-01-11 02:32:30,2018-01-11 19:39:23,,2018-02-07 00:00:00
162,36530871a5e80138db53bcfd8a104d90,4dafe3c841d2d6cc8a8b6d25b35704b9,shipped,2017-05-09 11:48:37,2017-05-11 11:45:14,2017-05-11 13:21:47,,2017-06-08 00:00:00
231,4d630f57194f5aba1a3d12ce23e71cd9,6d491c9fe2f04f6e2af6ec033cd8907c,shipped,2017-11-17 19:53:21,2017-11-18 19:50:31,2017-11-22 17:28:34,,2017-12-13 00:00:00
299,3b4ad687e7e5190db827e1ae5a8989dd,1a87b8517b7d31373b50396eb15cb445,shipped,2018-06-28 12:52:15,2018-06-28 13:11:09,2018-07-04 15:20:00,,2018-08-03 00:00:00


In [27]:
# Create mask and drop all unnecesary values
mask_not_delivered = data_orders.order_status.unique()[1:]
data_orders.drop(data_orders[data_orders.order_status.isin(mask_not_delivered)].index, axis=0, inplace=True)
data_orders.shape

(96478, 8)

In [28]:
# merge of data with orders and and customers
data_merge_1 = pd.merge(data_orders, data_merge_0, on='customer_id')

data_merge_1.drop(['order_status'], axis=1, inplace=True)

data_merge_1.head(2)

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_city,customer_state,customer_lat,customer_lng
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,sao paulo,SP,-23.576983,-46.587161
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,barreiras,BA,-12.177924,-44.660711


### **Merge sellers with geolocation, then with order_item and product**

In [29]:
data_sellers

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP
...,...,...,...,...
3090,98dddbc4601dd4443ca174359b237166,87111,sarandi,PR
3091,f8201cab383e484733266d1906e2fdfa,88137,palhoca,SC
3092,74871d19219c7d518d0090283e03c137,4650,sao paulo,SP
3093,e603cf3fec55f8697c9059638d6c8eb5,96080,pelotas,RS


In [30]:
# Check sellers shape
data_sellers.shape

(3095, 4)

In [31]:
# Prepare sellers and merge with geolocation
data_merge_1_5 = pd.merge(data_sellers, data_geolocation_agg, 
                          left_on = 'seller_zip_code_prefix', 
                          right_on='geolocation_zip_code_prefix')

data_merge_1_5.rename(columns={'geolocation_lat':'seller_lat', 'geolocation_lng': 'seller_lng'}, inplace=True)

data_merge_1_5.drop(['seller_zip_code_prefix'], axis=1, inplace=True)

In [32]:
data_merge_1_5

Unnamed: 0,seller_id,seller_city,seller_state,seller_lat,seller_lng
0,3442f8959a84dea7ee197c632cb2df15,campinas,SP,-22.893848,-47.061337
1,e0eabded302882513ced4ea3eb0c7059,campinas,SP,-22.893848,-47.061337
2,d1b65fc7debc3361ea86b5f14c68d2e2,mogi guacu,SP,-22.383437,-46.947927
3,ce3ad9de960102d0677a81f5d0bb7b2d,rio de janeiro,RJ,-22.909572,-43.177703
4,1d2732ef8321502ee8488e8bed1ab8cd,rio de janeiro,RJ,-22.909572,-43.177703
...,...,...,...,...,...
3083,f1fdf2d13186575751aa25876536d85c,sao paulo,SP,-23.530647,-46.736453
3084,98dddbc4601dd4443ca174359b237166,sarandi,PR,-23.448041,-51.869960
3085,74871d19219c7d518d0090283e03c137,sao paulo,SP,-23.657851,-46.676925
3086,e603cf3fec55f8697c9059638d6c8eb5,pelotas,RS,-31.751072,-52.323202


In [33]:
# Merge order items and prepared sellers with geolocation
data_merge_2 = pd.merge(data_order_items, data_merge_1_5, on='seller_id')

data_merge_2.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_city,seller_state,seller_lat,seller_lng
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,volta redonda,SP,-22.496953,-44.127492
1,0188777fe321843a18be24a6e9aa1e53,1,436c8d57ff8d4aa254318e9bd9b48c83,48436dade18ac8b2bce089ec2a041202,2017-07-31 14:35:11,55.9,9.94,volta redonda,SP,-22.496953,-44.127492


In [34]:
data_product

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0


In [35]:
data_merge_3 = pd.merge(data_merge_2, data_product, on='product_id')
data_merge_3.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_city,seller_state,seller_lat,seller_lng,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,volta redonda,SP,-22.496953,-44.127492,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0
1,130898c0987d1801452a8ed92a670612,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-07-05 02:44:11,55.9,17.96,volta redonda,SP,-22.496953,-44.127492,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0


In [36]:
del data_merge_0, data_merge_1_5, data_merge_2

In [37]:
data_merge_1.head(2)

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_city,customer_state,customer_lat,customer_lng
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,sao paulo,SP,-23.576983,-46.587161
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,barreiras,BA,-12.177924,-44.660711


In [38]:
data_merge_1.shape 

(96214, 11)

In [39]:
data_merge_3.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_city,seller_state,seller_lat,seller_lng,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,volta redonda,SP,-22.496953,-44.127492,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0
1,130898c0987d1801452a8ed92a670612,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-07-05 02:44:11,55.9,17.96,volta redonda,SP,-22.496953,-44.127492,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0


In [40]:
data_merge_3.shape 

(112397, 19)

In [41]:
data_merge_4 = pd.merge(data_merge_1, data_merge_3, how='inner', on='order_id')
data_merge_4.dropna(inplace=True)
data_merge_4.head(2)

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_city,customer_state,customer_lat,customer_lng,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_city,seller_state,seller_lat,seller_lng,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,sao paulo,SP,-23.576983,-46.587161,1,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72,maua,SP,-23.680729,-46.444238,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,barreiras,BA,-12.177924,-44.660711,1,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30 03:24:27,118.7,22.76,belo horizonte,SP,-19.807681,-43.980427,perfumaria,29.0,178.0,1.0,400.0,19.0,13.0,19.0


In [42]:
data_merge_4

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_city,customer_state,customer_lat,customer_lng,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_city,seller_state,seller_lat,seller_lng,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,sao paulo,SP,-23.576983,-46.587161,1,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72,maua,SP,-23.680729,-46.444238,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,barreiras,BA,-12.177924,-44.660711,1,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30 03:24:27,118.70,22.76,belo horizonte,SP,-19.807681,-43.980427,perfumaria,29.0,178.0,1.0,400.0,19.0,13.0,19.0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,vianopolis,GO,-16.745150,-48.514783,1,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-13 08:55:23,159.90,19.22,guariba,SP,-21.363502,-48.229601,automotivo,46.0,232.0,1.0,420.0,24.0,19.0,21.0
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,sao goncalo do amarante,RN,-5.774190,-35.271143,1,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106,2017-11-23 19:45:59,45.00,27.20,belo horizonte,MG,-19.837682,-43.924053,pet_shop,59.0,468.0,3.0,450.0,30.0,10.0,20.0
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,santo andre,SP,-23.676370,-46.514627,1,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8,2018-02-19 20:31:37,19.90,8.72,mogi das cruzes,SP,-23.543395,-46.262086,papelaria,38.0,316.0,4.0,250.0,51.0,15.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109656,63943bddc261676b46f01ca7ac2f7bd8,1fca14ff2861355f6e5f14306ff977a7,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-28 17:37:56,2018-03-02 00:00:00,praia grande,SP,-24.001500,-46.449864,1,f1d4ce8c6dd66c47bbaa8c6781c2a923,1f9ab4708f3056ede07124aad39a2554,2018-02-12 13:10:37,174.90,20.10,tupa,SP,-21.930548,-50.498348,bebes,52.0,828.0,4.0,4950.0,40.0,10.0,40.0
109657,83c1379a015df1e13d02aae0204711ab,1aa71eb042121263aafbe80c1b562c9c,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-21 11:24:17,2017-09-27 00:00:00,nova vicosa,BA,-17.898358,-39.373630,1,b80910977a37536adeddd63663f916ad,d50d79cb34e38265a8649c383dcffd48,2017-09-05 15:04:16,205.99,65.02,sao paulo,SP,-23.553642,-46.452661,eletrodomesticos_2,51.0,500.0,2.0,13300.0,32.0,90.0,22.0
109658,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15 00:00:00,japuiba,RJ,-22.562825,-42.694574,1,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,2018-01-12 21:36:21,179.99,40.59,ilicinea,MG,-20.940578,-45.827237,informatica_acessorios,59.0,1893.0,1.0,6550.0,20.0,20.0,20.0
109659,11c177c8e97725db2631073c19f07b62,b331b74b18dc79bcdf6532d51e1637c1,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-25 23:32:54,2018-02-15 00:00:00,japuiba,RJ,-22.562825,-42.694574,2,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,2018-01-12 21:36:21,179.99,40.59,ilicinea,MG,-20.940578,-45.827237,informatica_acessorios,59.0,1893.0,1.0,6550.0,20.0,20.0,20.0


In [43]:
data_merge_4.shape

(108106, 29)

## **Preprocessing**

In [44]:
# Count of items in order
# As a result....Probably we have to agg orders
data_merge_4.order_item_id.value_counts()

1     94624
2      9458
3      2207
4       923
5       435
6       247
7        58
8        35
9        28
10       25
11       17
12       13
13        8
14        7
15        5
16        3
17        3
18        3
19        3
20        3
21        1
Name: order_item_id, dtype: int64

In [45]:
# Check the count number of different products on one order, for example for the biggest
data_merge_4[data_merge_4.order_item_id == 21]

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_city,customer_state,customer_lat,customer_lng,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_city,seller_state,seller_lat,seller_lng,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
97856,8272b63d03f5f79c56e9e4120aec44ef,fc3d1daec319d62d49bfb5e1f83123e9,2017-07-16 18:19:25,2017-07-17 18:25:23,2017-07-20 15:45:53,2017-07-31 18:03:02,2017-07-28 00:00:00,sao paulo,SP,-23.680743,-46.784224,21,79ce45dbc2ea29b22b5a261bbb7b7ee7,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,7.8,6.57,sao paulo,SP,-23.488234,-46.469989,beleza_saude,27.0,152.0,2.0,1000.0,25.0,6.0,12.0


In [46]:
data_merge_4[data_merge_4.order_id == '8272b63d03f5f79c56e9e4120aec44ef'].head(2)

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_city,customer_state,customer_lat,customer_lng,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,seller_city,seller_state,seller_lat,seller_lng,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
97836,8272b63d03f5f79c56e9e4120aec44ef,fc3d1daec319d62d49bfb5e1f83123e9,2017-07-16 18:19:25,2017-07-17 18:25:23,2017-07-20 15:45:53,2017-07-31 18:03:02,2017-07-28 00:00:00,sao paulo,SP,-23.680743,-46.784224,1,270516a3f41dc035aa87d220228f844c,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89,sao paulo,SP,-23.488234,-46.469989,beleza_saude,45.0,232.0,3.0,800.0,21.0,4.0,15.0
97837,8272b63d03f5f79c56e9e4120aec44ef,fc3d1daec319d62d49bfb5e1f83123e9,2017-07-16 18:19:25,2017-07-17 18:25:23,2017-07-20 15:45:53,2017-07-31 18:03:02,2017-07-28 00:00:00,sao paulo,SP,-23.680743,-46.784224,12,270516a3f41dc035aa87d220228f844c,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23,1.2,7.89,sao paulo,SP,-23.488234,-46.469989,beleza_saude,45.0,232.0,3.0,800.0,21.0,4.0,15.0


In [51]:
data_merge_4[data_merge_4.order_id == '8272b63d03f5f79c56e9e4120aec44ef'].product_id.value_counts()

270516a3f41dc035aa87d220228f844c    10
05b515fdc76e888aada3c6d66c201dff    10
79ce45dbc2ea29b22b5a261bbb7b7ee7     1
Name: product_id, dtype: int64

In [53]:
data_merge_4.drop(columns=["Unnamed: 0"], inplace=True)

In [54]:
# save results
data_merge_4.to_csv('data_raw/data_merged.csv', index=False)

In [55]:
# import previous results 
data_merge_4  = pd.read_csv('data_raw/data_merged.csv')

In [56]:
# We are going to rearange columns for convinience
data_merge_4.columns.to_list()

['order_id',
 'customer_id',
 'order_purchase_timestamp',
 'order_approved_at',
 'order_delivered_carrier_date',
 'order_delivered_customer_date',
 'order_estimated_delivery_date',
 'customer_city',
 'customer_state',
 'customer_lat',
 'customer_lng',
 'order_item_id',
 'product_id',
 'seller_id',
 'shipping_limit_date',
 'price',
 'freight_value',
 'seller_city',
 'seller_state',
 'seller_lat',
 'seller_lng',
 'product_category_name',
 'product_name_lenght',
 'product_description_lenght',
 'product_photos_qty',
 'product_weight_g',
 'product_length_cm',
 'product_height_cm',
 'product_width_cm']

In [57]:
data_merge = data_merge_4[[ 'order_id',
                             'product_id',
                             'customer_id',
                             'seller_id',
                             'order_item_id',
                             'price',
                             'freight_value',
                             'product_name_lenght',
                             'product_description_lenght',
                             'product_photos_qty',
                             'product_weight_g',
                             'product_length_cm',
                             'product_height_cm',
                             'product_width_cm',
                             'customer_city',
                             'customer_state',
                             'customer_lat',
                             'customer_lng',
                             'seller_city',
                             'seller_state',
                             'seller_lat',
                             'seller_lng',
                             'order_purchase_timestamp',
                             'order_approved_at',
                             'order_delivered_carrier_date',
                             'shipping_limit_date',
                             'order_delivered_customer_date',
                             'order_estimated_delivery_date']]

In [58]:
data_merge

Unnamed: 0,order_id,product_id,customer_id,seller_id,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,shipping_limit_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,3504c0cb71d7fa48d967e0e4c94d59d9,1,29.99,8.72,40.0,268.0,4.0,500.0,19.0,8.0,13.0,sao paulo,SP,-23.576983,-46.587161,maua,SP,-23.680729,-46.444238,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-06 11:07:15,2017-10-10 21:25:13,2017-10-18 00:00:00
1,53cdb2fc8bc7dce0b6741e2150273451,595fac2a385ac33a80bd5114aec74eb8,b0830fb4747a6c6d20dea0b8c802d7ef,289cdb325fb7e7f891c38608bf9e0962,1,118.70,22.76,29.0,178.0,1.0,400.0,19.0,13.0,19.0,barreiras,BA,-12.177924,-44.660711,belo horizonte,SP,-19.807681,-43.980427,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-07-30 03:24:27,2018-08-07 15:27:45,2018-08-13 00:00:00
2,47770eb9100c2d0c44946d9cf07ec65d,aa4383b373c6aca5d8797843e5594415,41ce2a54c0b03bf3443c3d931a367089,4869f7a5dfa277a7dca6462dcf3b52b2,1,159.90,19.22,46.0,232.0,1.0,420.0,24.0,19.0,21.0,vianopolis,GO,-16.745150,-48.514783,guariba,SP,-21.363502,-48.229601,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-13 08:55:23,2018-08-17 18:06:29,2018-09-04 00:00:00
3,949d5b44dbf5de918fe9c16f97b45f8a,d0b61bfb1de832b15ba9d266ca96e5b0,f88197465ea7920adcdbec7375364d82,66922902710d126a0e7d26b0e3805106,1,45.00,27.20,59.0,468.0,3.0,450.0,30.0,10.0,20.0,sao goncalo do amarante,RN,-5.774190,-35.271143,belo horizonte,MG,-19.837682,-43.924053,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-11-23 19:45:59,2017-12-02 00:28:42,2017-12-15 00:00:00
4,ad21c59c0840e6cb83a9ceb5573f8159,65266b2da20d04dbe00c5c2d3bb7859e,8ab97904e6daea8866dbdbc4fb7aad2c,2c9e548be18521d1c43cde1c582c6de8,1,19.90,8.72,38.0,316.0,4.0,250.0,51.0,15.0,15.0,santo andre,SP,-23.676370,-46.514627,mogi das cruzes,SP,-23.543395,-46.262086,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-19 20:31:37,2018-02-16 18:17:02,2018-02-26 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108101,63943bddc261676b46f01ca7ac2f7bd8,f1d4ce8c6dd66c47bbaa8c6781c2a923,1fca14ff2861355f6e5f14306ff977a7,1f9ab4708f3056ede07124aad39a2554,1,174.90,20.10,52.0,828.0,4.0,4950.0,40.0,10.0,40.0,praia grande,SP,-24.001500,-46.449864,tupa,SP,-21.930548,-50.498348,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-12 13:10:37,2018-02-28 17:37:56,2018-03-02 00:00:00
108102,83c1379a015df1e13d02aae0204711ab,b80910977a37536adeddd63663f916ad,1aa71eb042121263aafbe80c1b562c9c,d50d79cb34e38265a8649c383dcffd48,1,205.99,65.02,51.0,500.0,2.0,13300.0,32.0,90.0,22.0,nova vicosa,BA,-17.898358,-39.373630,sao paulo,SP,-23.553642,-46.452661,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-05 15:04:16,2017-09-21 11:24:17,2017-09-27 00:00:00
108103,11c177c8e97725db2631073c19f07b62,d1c427060a0f73f6b889a5c7c61f2ac4,b331b74b18dc79bcdf6532d51e1637c1,a1043bafd471dff536d0c462352beb48,1,179.99,40.59,59.0,1893.0,1.0,6550.0,20.0,20.0,20.0,japuiba,RJ,-22.562825,-42.694574,ilicinea,MG,-20.940578,-45.827237,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-12 21:36:21,2018-01-25 23:32:54,2018-02-15 00:00:00
108104,11c177c8e97725db2631073c19f07b62,d1c427060a0f73f6b889a5c7c61f2ac4,b331b74b18dc79bcdf6532d51e1637c1,a1043bafd471dff536d0c462352beb48,2,179.99,40.59,59.0,1893.0,1.0,6550.0,20.0,20.0,20.0,japuiba,RJ,-22.562825,-42.694574,ilicinea,MG,-20.940578,-45.827237,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-12 21:36:21,2018-01-25 23:32:54,2018-02-15 00:00:00


In [61]:
data_merge['product_volume'] = data_merge['product_length_cm'] * data_merge['product_height_cm'] * data_merge['product_width_cm']

In [62]:
data_merge.drop(['product_width_cm','product_length_cm', 'product_height_cm'], axis=1, inplace=True)

In [63]:
data_merge

Unnamed: 0,order_id,product_id,customer_id,seller_id,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,shipping_limit_date,order_delivered_customer_date,order_estimated_delivery_date,product_volume
0,e481f51cbdc54678b7cc49136f2d6af7,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,3504c0cb71d7fa48d967e0e4c94d59d9,1,29.99,8.72,40.0,268.0,4.0,500.0,sao paulo,SP,-23.576983,-46.587161,maua,SP,-23.680729,-46.444238,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-06 11:07:15,2017-10-10 21:25:13,2017-10-18 00:00:00,1976.0
1,53cdb2fc8bc7dce0b6741e2150273451,595fac2a385ac33a80bd5114aec74eb8,b0830fb4747a6c6d20dea0b8c802d7ef,289cdb325fb7e7f891c38608bf9e0962,1,118.70,22.76,29.0,178.0,1.0,400.0,barreiras,BA,-12.177924,-44.660711,belo horizonte,SP,-19.807681,-43.980427,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-07-30 03:24:27,2018-08-07 15:27:45,2018-08-13 00:00:00,4693.0
2,47770eb9100c2d0c44946d9cf07ec65d,aa4383b373c6aca5d8797843e5594415,41ce2a54c0b03bf3443c3d931a367089,4869f7a5dfa277a7dca6462dcf3b52b2,1,159.90,19.22,46.0,232.0,1.0,420.0,vianopolis,GO,-16.745150,-48.514783,guariba,SP,-21.363502,-48.229601,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-13 08:55:23,2018-08-17 18:06:29,2018-09-04 00:00:00,9576.0
3,949d5b44dbf5de918fe9c16f97b45f8a,d0b61bfb1de832b15ba9d266ca96e5b0,f88197465ea7920adcdbec7375364d82,66922902710d126a0e7d26b0e3805106,1,45.00,27.20,59.0,468.0,3.0,450.0,sao goncalo do amarante,RN,-5.774190,-35.271143,belo horizonte,MG,-19.837682,-43.924053,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-11-23 19:45:59,2017-12-02 00:28:42,2017-12-15 00:00:00,6000.0
4,ad21c59c0840e6cb83a9ceb5573f8159,65266b2da20d04dbe00c5c2d3bb7859e,8ab97904e6daea8866dbdbc4fb7aad2c,2c9e548be18521d1c43cde1c582c6de8,1,19.90,8.72,38.0,316.0,4.0,250.0,santo andre,SP,-23.676370,-46.514627,mogi das cruzes,SP,-23.543395,-46.262086,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-19 20:31:37,2018-02-16 18:17:02,2018-02-26 00:00:00,11475.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108101,63943bddc261676b46f01ca7ac2f7bd8,f1d4ce8c6dd66c47bbaa8c6781c2a923,1fca14ff2861355f6e5f14306ff977a7,1f9ab4708f3056ede07124aad39a2554,1,174.90,20.10,52.0,828.0,4.0,4950.0,praia grande,SP,-24.001500,-46.449864,tupa,SP,-21.930548,-50.498348,2018-02-06 12:58:58,2018-02-06 13:10:37,2018-02-07 23:22:42,2018-02-12 13:10:37,2018-02-28 17:37:56,2018-03-02 00:00:00,16000.0
108102,83c1379a015df1e13d02aae0204711ab,b80910977a37536adeddd63663f916ad,1aa71eb042121263aafbe80c1b562c9c,d50d79cb34e38265a8649c383dcffd48,1,205.99,65.02,51.0,500.0,2.0,13300.0,nova vicosa,BA,-17.898358,-39.373630,sao paulo,SP,-23.553642,-46.452661,2017-08-27 14:46:43,2017-08-27 15:04:16,2017-08-28 20:52:26,2017-09-05 15:04:16,2017-09-21 11:24:17,2017-09-27 00:00:00,63360.0
108103,11c177c8e97725db2631073c19f07b62,d1c427060a0f73f6b889a5c7c61f2ac4,b331b74b18dc79bcdf6532d51e1637c1,a1043bafd471dff536d0c462352beb48,1,179.99,40.59,59.0,1893.0,1.0,6550.0,japuiba,RJ,-22.562825,-42.694574,ilicinea,MG,-20.940578,-45.827237,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-12 21:36:21,2018-01-25 23:32:54,2018-02-15 00:00:00,8000.0
108104,11c177c8e97725db2631073c19f07b62,d1c427060a0f73f6b889a5c7c61f2ac4,b331b74b18dc79bcdf6532d51e1637c1,a1043bafd471dff536d0c462352beb48,2,179.99,40.59,59.0,1893.0,1.0,6550.0,japuiba,RJ,-22.562825,-42.694574,ilicinea,MG,-20.940578,-45.827237,2018-01-08 21:28:27,2018-01-08 21:36:21,2018-01-12 15:35:03,2018-01-12 21:36:21,2018-01-25 23:32:54,2018-02-15 00:00:00,8000.0


In [101]:
data_merge.drop(['product_description_lenght', 'product_name_lenght','product_photos_qty'], axis=1, inplace=True)

In [103]:
data_merge.head(2)

Unnamed: 0,order_id,product_id,customer_id,seller_id,order_item_id,price,freight_value,product_weight_g,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,shipping_limit_date,order_delivered_customer_date,order_estimated_delivery_date,product_volume
0,e481f51cbdc54678b7cc49136f2d6af7,87285b34884572647811a353c7ac498a,9ef432eb6251297304e76186b10a928d,3504c0cb71d7fa48d967e0e4c94d59d9,1,29.99,8.72,500.0,sao paulo,SP,-23.576983,-46.587161,maua,SP,-23.680729,-46.444238,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-06 11:07:15,2017-10-10 21:25:13,2017-10-18 00:00:00,1976.0
1,53cdb2fc8bc7dce0b6741e2150273451,595fac2a385ac33a80bd5114aec74eb8,b0830fb4747a6c6d20dea0b8c802d7ef,289cdb325fb7e7f891c38608bf9e0962,1,118.7,22.76,400.0,barreiras,BA,-12.177924,-44.660711,belo horizonte,SP,-19.807681,-43.980427,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-07-30 03:24:27,2018-08-07 15:27:45,2018-08-13 00:00:00,4693.0


In [64]:
%%time
data_all_agg = data_merge.groupby(['order_id']).agg({ 'seller_id':'max',
                                                      'price':'sum',
                                                      'freight_value':'sum',
                                                      'product_weight_g':'sum',
                                                      'product_volume':'sum',
                                                      'customer_city':'max',
                                                      'customer_state':'max',
                                                      'customer_lat':'max',
                                                      'customer_lng':'max',
                                                      'seller_city':'max',
                                                      'seller_state':'max',
                                                      'seller_lat':'max',
                                                      'seller_lng':'max',
                                                      'order_purchase_timestamp':'max',
                                                      'order_approved_at':'max',
                                                      'shipping_limit_date':'max',
                                                      'order_delivered_carrier_date':'max',
                                                      'order_delivered_customer_date':'max',
                                                      'order_estimated_delivery_date':'max'})

Wall time: 1min 32s


In [65]:
data_all_agg

Unnamed: 0_level_0,seller_id,price,freight_value,product_weight_g,product_volume,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
00010242fe8c5a6d1ba2dd792cb16214,48436dade18ac8b2bce089ec2a041202,58.90,13.29,650.0,3528.0,campos dos goytacazes,RJ,-21.762775,-41.309633,volta redonda,SP,-22.496953,-44.127492,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29 00:00:00
00018f77f2f0320c557190d7a144bdd3,dd7ddc04e1b6c2c614352b383efe2d36,239.90,19.93,30000.0,60000.0,santa fe do sul,SP,-20.220527,-50.903424,sao paulo,SP,-23.565096,-46.518565,2017-04-26 10:53:06,2017-04-26 11:05:13,2017-05-03 11:05:13,2017-05-04 14:35:00,2017-05-12 16:04:24,2017-05-15 00:00:00
000229ec398224ef6ca0657da4fc703e,5b51032eddd242adc84c38acab88f23d,199.00,17.87,3050.0,14157.0,para de minas,MG,-19.870305,-44.593326,borda da mata,MG,-22.262584,-46.171124,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-18 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05 00:00:00
00024acbcdf0a6daa1e931b038114c75,9d7a1d34a5052409006425275ba1c2b4,12.99,12.79,200.0,2400.0,atibaia,SP,-23.089925,-46.611654,franca,SP,-20.553624,-47.387359,2018-08-08 10:00:35,2018-08-08 10:10:18,2018-08-15 10:10:18,2018-08-10 13:28:00,2018-08-14 13:32:39,2018-08-20 00:00:00
00042b26cf59d7ce69dfabb4e55b4fd9,df560393f3a51e74553ab94004ba5c87,199.90,18.14,3750.0,42000.0,varzea paulista,SP,-23.243402,-46.827614,loanda,PR,-22.929384,-53.135873,2017-02-04 13:57:51,2017-02-04 14:10:13,2017-02-13 13:57:51,2017-02-16 09:46:09,2017-03-01 16:42:31,2017-03-17 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffc94f6ce00a00581880bf54a75a037,b8bc237ba3788b23da09c0f1f3a3288c,299.99,43.41,10150.0,53400.0,sao luis,MA,-2.497993,-44.297761,itajai,SC,-26.912574,-48.673980,2018-04-23 13:57:06,2018-04-25 04:11:01,2018-05-02 04:11:01,2018-04-25 12:09:00,2018-05-10 22:56:40,2018-05-18 00:00:00
fffcd46ef2263f404302a634eb57f7eb,f3c38ab652836d21de61fb8314b69182,350.00,36.53,8950.0,44460.0,curitiba,PR,-25.566904,-49.309115,sao paulo,SP,-23.535864,-46.642819,2018-07-14 10:26:46,2018-07-17 04:31:48,2018-07-20 04:31:48,2018-07-17 08:05:00,2018-07-23 20:31:55,2018-08-01 00:00:00
fffce4705a9662cd70adb13d4a31832d,c3cfdc648177fdbbbb35635a37472c53,99.90,16.95,967.0,9576.0,sao paulo,SP,-23.597794,-46.643923,curitiba,PR,-25.469955,-49.289821,2017-10-23 17:07:56,2017-10-24 17:14:25,2017-10-30 17:14:25,2017-10-26 15:13:14,2017-10-28 12:22:22,2017-11-10 00:00:00
fffe18544ffabc95dfada21779c9644f,2b3e4a2a3ea8e01938cabda2a3e5cc79,55.99,8.72,100.0,8000.0,vinhedo,SP,-23.040252,-46.979782,sao paulo,SP,-23.635530,-46.694031,2017-08-14 23:02:59,2017-08-15 00:04:32,2017-08-21 00:04:32,2017-08-15 19:02:53,2017-08-16 21:59:40,2017-08-25 00:00:00


In [68]:
# Save our results
data_all_agg.to_csv('data_raw/data_all_agg.csv', index=False)

In [2]:
data_all_agg.reset_index(inplace=True)

NameError: name 'data_all_agg' is not defined

In [69]:
import pandas as pd
import numpy as np

In [70]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None

In [71]:
# This time we are going to import dataframe in an appropriate data types format
data_all_agg = pd.read_csv('data_raw/data_all_agg.csv',
                           
                           dtype = {'order_id': str, 
                                    'seller_id': str, 
                                    'order_item_id': int, 
                                    'price': float, 
                                    'freight_value': float, 
                                    'product_weight_g':float, 
                                    'product_volume': float,
                                    'customer_city':str,
                                    'customer_state':str,
                                    'customer_lat': float,
                                    'customer_lng':float,
                                    'seller_city':str,
                                    'seller_state':str,
                                    'seller_lat': float,
                                    'seller_lng':float},
                           parse_dates=['order_purchase_timestamp',
                                         'order_approved_at',
                                         'shipping_limit_date',
                                         'order_delivered_carrier_date',
                                         'order_delivered_customer_date',
                                         'order_estimated_delivery_date'])

In [72]:
# Check after import
data_all_agg

Unnamed: 0,seller_id,price,freight_value,product_weight_g,product_volume,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,48436dade18ac8b2bce089ec2a041202,58.90,13.29,650.0,3528.0,campos dos goytacazes,RJ,-21.762775,-41.309633,volta redonda,SP,-22.496953,-44.127492,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29
1,dd7ddc04e1b6c2c614352b383efe2d36,239.90,19.93,30000.0,60000.0,santa fe do sul,SP,-20.220527,-50.903424,sao paulo,SP,-23.565096,-46.518565,2017-04-26 10:53:06,2017-04-26 11:05:13,2017-05-03 11:05:13,2017-05-04 14:35:00,2017-05-12 16:04:24,2017-05-15
2,5b51032eddd242adc84c38acab88f23d,199.00,17.87,3050.0,14157.0,para de minas,MG,-19.870305,-44.593326,borda da mata,MG,-22.262584,-46.171124,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-18 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05
3,9d7a1d34a5052409006425275ba1c2b4,12.99,12.79,200.0,2400.0,atibaia,SP,-23.089925,-46.611654,franca,SP,-20.553624,-47.387359,2018-08-08 10:00:35,2018-08-08 10:10:18,2018-08-15 10:10:18,2018-08-10 13:28:00,2018-08-14 13:32:39,2018-08-20
4,df560393f3a51e74553ab94004ba5c87,199.90,18.14,3750.0,42000.0,varzea paulista,SP,-23.243402,-46.827614,loanda,PR,-22.929384,-53.135873,2017-02-04 13:57:51,2017-02-04 14:10:13,2017-02-13 13:57:51,2017-02-16 09:46:09,2017-03-01 16:42:31,2017-03-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94648,b8bc237ba3788b23da09c0f1f3a3288c,299.99,43.41,10150.0,53400.0,sao luis,MA,-2.497993,-44.297761,itajai,SC,-26.912574,-48.673980,2018-04-23 13:57:06,2018-04-25 04:11:01,2018-05-02 04:11:01,2018-04-25 12:09:00,2018-05-10 22:56:40,2018-05-18
94649,f3c38ab652836d21de61fb8314b69182,350.00,36.53,8950.0,44460.0,curitiba,PR,-25.566904,-49.309115,sao paulo,SP,-23.535864,-46.642819,2018-07-14 10:26:46,2018-07-17 04:31:48,2018-07-20 04:31:48,2018-07-17 08:05:00,2018-07-23 20:31:55,2018-08-01
94650,c3cfdc648177fdbbbb35635a37472c53,99.90,16.95,967.0,9576.0,sao paulo,SP,-23.597794,-46.643923,curitiba,PR,-25.469955,-49.289821,2017-10-23 17:07:56,2017-10-24 17:14:25,2017-10-30 17:14:25,2017-10-26 15:13:14,2017-10-28 12:22:22,2017-11-10
94651,2b3e4a2a3ea8e01938cabda2a3e5cc79,55.99,8.72,100.0,8000.0,vinhedo,SP,-23.040252,-46.979782,sao paulo,SP,-23.635530,-46.694031,2017-08-14 23:02:59,2017-08-15 00:04:32,2017-08-21 00:04:32,2017-08-15 19:02:53,2017-08-16 21:59:40,2017-08-25


In [73]:
# Check dtypes
data_all_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94653 entries, 0 to 94652
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   seller_id                      94653 non-null  object        
 1   price                          94653 non-null  float64       
 2   freight_value                  94653 non-null  float64       
 3   product_weight_g               94653 non-null  float64       
 4   product_volume                 94653 non-null  float64       
 5   customer_city                  94653 non-null  object        
 6   customer_state                 94653 non-null  object        
 7   customer_lat                   94653 non-null  float64       
 8   customer_lng                   94653 non-null  float64       
 9   seller_city                    94653 non-null  object        
 10  seller_state                   94653 non-null  object        
 11  seller_lat     

In [74]:
# This is a function to calculate the distance between to points 
# I have found this function in the internet
def haversine_distance(lat1, lon1, lat2, lon2):
    r = 6371
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) *   np.sin(delta_lambda / 2)**2
    res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return np.round(res, 2)

In [75]:
# Calculate the distance between customer and seller
data_all_agg['distance_km'] = haversine_distance(data_all_agg.customer_lat, data_all_agg.customer_lng,
                                                 data_all_agg.seller_lat, data_all_agg.seller_lng)

In [76]:
# Delivery time in hours
data_all_agg['delivery_time_hours'] = (data_all_agg.order_delivered_customer_date - data_all_agg.order_purchase_timestamp).dt.total_seconds() / 3600

In [82]:
data_all_agg

Unnamed: 0,seller_id,price,freight_value,product_weight_g,product_volume,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,distance_km,delivery_time_hours
0,48436dade18ac8b2bce089ec2a041202,58.90,13.29,650.0,3528.0,campos dos goytacazes,RJ,-21.762775,-41.309633,volta redonda,SP,-22.496953,-44.127492,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29,301.50,182.746111
1,dd7ddc04e1b6c2c614352b383efe2d36,239.90,19.93,30000.0,60000.0,santa fe do sul,SP,-20.220527,-50.903424,sao paulo,SP,-23.565096,-46.518565,2017-04-26 10:53:06,2017-04-26 11:05:13,2017-05-03 11:05:13,2017-05-04 14:35:00,2017-05-12 16:04:24,2017-05-15,585.56,389.188333
2,5b51032eddd242adc84c38acab88f23d,199.00,17.87,3050.0,14157.0,para de minas,MG,-19.870305,-44.593326,borda da mata,MG,-22.262584,-46.171124,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-18 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05,312.34,190.762500
3,9d7a1d34a5052409006425275ba1c2b4,12.99,12.79,200.0,2400.0,atibaia,SP,-23.089925,-46.611654,franca,SP,-20.553624,-47.387359,2018-08-08 10:00:35,2018-08-08 10:10:18,2018-08-15 10:10:18,2018-08-10 13:28:00,2018-08-14 13:32:39,2018-08-20,293.17,147.534444
4,df560393f3a51e74553ab94004ba5c87,199.90,18.14,3750.0,42000.0,varzea paulista,SP,-23.243402,-46.827614,loanda,PR,-22.929384,-53.135873,2017-02-04 13:57:51,2017-02-04 14:10:13,2017-02-13 13:57:51,2017-02-16 09:46:09,2017-03-01 16:42:31,2017-03-17,646.16,602.744444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94648,b8bc237ba3788b23da09c0f1f3a3288c,299.99,43.41,10150.0,53400.0,sao luis,MA,-2.497993,-44.297761,itajai,SC,-26.912574,-48.673980,2018-04-23 13:57:06,2018-04-25 04:11:01,2018-05-02 04:11:01,2018-04-25 12:09:00,2018-05-10 22:56:40,2018-05-18,2754.54,416.992778
94649,f3c38ab652836d21de61fb8314b69182,350.00,36.53,8950.0,44460.0,curitiba,PR,-25.566904,-49.309115,sao paulo,SP,-23.535864,-46.642819,2018-07-14 10:26:46,2018-07-17 04:31:48,2018-07-20 04:31:48,2018-07-17 08:05:00,2018-07-23 20:31:55,2018-08-01,351.73,226.085833
94650,c3cfdc648177fdbbbb35635a37472c53,99.90,16.95,967.0,9576.0,sao paulo,SP,-23.597794,-46.643923,curitiba,PR,-25.469955,-49.289821,2017-10-23 17:07:56,2017-10-24 17:14:25,2017-10-30 17:14:25,2017-10-26 15:13:14,2017-10-28 12:22:22,2017-11-10,339.06,115.240556
94651,2b3e4a2a3ea8e01938cabda2a3e5cc79,55.99,8.72,100.0,8000.0,vinhedo,SP,-23.040252,-46.979782,sao paulo,SP,-23.635530,-46.694031,2017-08-14 23:02:59,2017-08-15 00:04:32,2017-08-21 00:04:32,2017-08-15 19:02:53,2017-08-16 21:59:40,2017-08-25,72.34,46.944722


In [25]:
data_all_agg.delivery_speed_km_per_hour.sort_values()

31738     0.000000
67623     0.000000
55631     0.000000
93568     0.000000
88872     0.000000
           ...    
27772    28.902218
93552    40.014472
11128    40.367696
17601    52.541239
48394    57.095228
Name: delivery_speed_km_per_hour, Length: 94653, dtype: float64

In [83]:
# Save our results
data_all_agg.to_csv('data_raw/data_all_agg.csv', index=False)