### Import required libraries

In [50]:
import pandas as pd

### Set pd display options

In [51]:
#pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 10000)

### Load required data files to create initial consolidated file
#### Make a final orders df which has order_id, payment_value, order_item_id, product_id, product_category_name_english, product_weight_gms, product_length_cm, product_height_cm, product_width_cm, customer_order_id, customer_unique_id, customer_zip_code_prefix, customer_city, customer_state, customer_geolocation_lat, customer_geolocation_lng, seller_id, seller_zip_code_prefix, seller_city, seller_state, seller_geolocation_lat, seller_geolocation_lng, order_status, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date

In [103]:
sellers_df = pd.read_csv('../data/olist_sellers_dataset.csv')
customers_df = pd.read_csv('../data/olist_customers_dataset.csv')
geolocation_df = pd.read_csv('../data/olist_geolocation_dataset.csv', encoding='utf-32')
orders_df = pd.read_csv('../data/olist_orders_dataset.csv')
order_items_df = pd.read_csv('../data/olist_order_items_dataset.csv')
order_payments_df = pd.read_csv('../data/olist_order_payments_dataset.csv')
products_df = pd.read_csv('../data/olist_products_dataset.csv')
products_names_translation_df = pd.read_csv('../data/product_category_name_translation.csv')

UnicodeDecodeError: 'utf-32-le' codec can't decode bytes in position 0-3: code point not in range(0x110000)

### Rename/clean initial column names/dfs as required before merging
#### customer_id: customer ID token that is generated for every order. If the same customer makes multiple orders, he has multiple customer_id identifiers.  
#### customer_unique_id: which is unique to each purchaser and can be used to track their purchases over time. 
#### So, rename customer_id in orders_df and customers_df to customer_id_for_order in orders_df

In [53]:
orders_df.rename({'customer_id': 'customer_id_for_order'}, axis=1, inplace=True)
customers_df.rename({'customer_id': 'customer_id_for_order'}, axis=1, inplace=True)

### Explort the files to get basic info, stats
### Check for any missing or unusual values with any formatting errors
*Already see this in geolocation_df (check if in other files too) - s√£o paulo for sao paulo - need to fix*

*ara√ßariguama - aracariguama*

*s√£o roque - sao roque*

*ara√ßoiaba da serra - aracoiaba da serra* 

and so on ..

### utility function to explore, check, clean up files

In [54]:
def check_basic_data_issues_geo_dist_stats(df, df_type, id_type):
    print(f'Num of {df_type}: ', df.shape[0], '\n')

    # get number of unique entries
    print(f'Num of {df_type} after dropping duplicates: ', len(df[f'{id_type}']), '\n') # still 3095, so no duplicates

    # check if there are any missing values
    print(df.isnull().sum()) # None

    # check how many entries from each state
    print(f'{df_type} by state: \n', df[f'{df_type}_state'].value_counts(), '\n')

    # get the count of entries by state,city combination
    print(f'{df_type} by state and city: \n', df[[f'{df_type}_state', f'{df_type}_city']].value_counts(), '\n')

    # get count of entries by zip-code prefix
    print(f'{df_type} count by zipcode prefix: \n', df[f'{df_type}_zip_code_prefix'].value_counts(), '\n')

In [55]:
## explore, check, cleanup sellers info
## sellers df - 3095 records
check_basic_data_issues_geo_dist_stats(sellers_df, df_type='seller', id_type='seller_id')

Num of seller:  3095 

Num of seller after dropping duplicates:  3095 

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64
seller by state: 
 SP    1849
PR     349
MG     244
SC     190
RJ     171
RS     129
GO      40
DF      30
ES      23
BA      19
CE      13
PE       9
PB       6
MS       5
RN       5
MT       4
SE       2
RO       2
AC       1
MA       1
PA       1
PI       1
AM       1
Name: seller_state, dtype: int64 

seller by state and city: 
 seller_state  seller_city                             
SP            sao paulo                                   694
PR            curitiba                                    124
RJ            rio de janeiro                               93
MG            belo horizonte                               66
SP            ribeirao preto                               52
              guarulhos                                    50
              ibitinga                   

In [56]:
## explore, check, cleanup customers info
# use customer unique id since that is the unique id for a given customer
pd.set_option('display.max_rows', 5000)
check_basic_data_issues_geo_dist_stats(customers_df, df_type='customer', id_type='customer_unique_id')

Num of customer:  99441 

Num of customer after dropping duplicates:  99441 

customer_id_for_order       0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
customer by state: 
 SP    41746
RJ    12852
MG    11635
RS     5466
PR     5045
SC     3637
BA     3380
DF     2140
ES     2033
GO     2020
PE     1652
CE     1336
PA      975
MT      907
MA      747
MS      715
PB      536
PI      495
RN      485
AL      413
SE      350
TO      280
RO      253
AM      148
AC       81
AP       68
RR       46
Name: customer_state, dtype: int64 

customer by state and city: 
 customer_state  customer_city                   
SP              sao paulo                           15540
RJ              rio de janeiro                       6882
MG              belo horizonte                       2773
DF              brasilia                             2131
PR              curitiba                             1521
SP      

### Explore orders dataset

In [57]:
print('Different types of order statuses: \n', orders_df['order_status'].value_counts())

Different types of order statuses: 
 delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64


*Consider only 'delivered' order_status records*

In [58]:
orders_dlvrd_df = orders_df.query('order_status == "delivered"')
print('Total delivered orders: \n', orders_dlvrd_df['order_status'].value_counts())

Total delivered orders: 
 delivered    96478
Name: order_status, dtype: int64


In [59]:
# drop duplicate orders if any
orders_dlvrd_df = orders_dlvrd_df.drop_duplicates('order_id')
print('Total delivered orders without duplicates: \n', orders_dlvrd_df.shape[0])

Total delivered orders without duplicates: 
 96478


In [60]:
# check fo rany null or missign values
print('Missing values: \n', orders_dlvrd_df.isnull().sum(), '\n') # 24 records with atleast 1 missing value

Missing values: 
 order_id                          0
customer_id_for_order             0
order_status                      0
order_purchase_timestamp          0
order_approved_at                14
order_delivered_carrier_date      2
order_delivered_customer_date     8
order_estimated_delivery_date     0
dtype: int64 



In [61]:
# drop any missing value rows since we want only complete rows for analysis
orders_dlvrd_df.dropna(inplace=True) # 96455 records
print('Total delivered orders after dropping null/missing duplicates: \n', orders_dlvrd_df.shape[0])

Total delivered orders after dropping null/missing duplicates: 
 96455


### Get customer information into the orders dataset
#### Merge both dfs on customer_id_for_order - this should get customer_unique_id, customer_zip_code_prefix, customer_city, customer_city into the orders_df

In [73]:
## left merge on orders because we want to have all orders
orders_dlvrd_customers_df = orders_dlvrd_df.merge(customers_df, on='customer_id_for_order', how='left')
orders_dlvrd_customers_df.to_csv('/Users/salma/Research/shopping/brazilian_ecommerce_data/data/orders_dlvrd_customers_left.csv', index=False)

print('delivered orders with customers info: \n', orders_dlvrd_customers_df.shape[0]) # length of delivered orders df (96455)

#Now check for any missing values to see if orders and order items df have any missing values
print('orders_items_dlvrd_customers_df null/missing values: \n', orders_dlvrd_customers_df.isnull().sum())


#null_cust_unq_id = orders_dlvrd_customers_df[orders_dlvrd_customers_df['customer_unique_id'].isnull()]
#null_cust_unq_id.to_csv('/Users/salma/Research/shopping/brazilian_ecommerce_data/data/orders_null_customer_uniq_id.csv', index=False)

#Was dropping duplicates in sellers and customers df!!! so some customer order ids were getting dropped
#and hence not matching up when trying to merge orders and customers
#3215 customer_id_for_order values from orders df not matching up in customers df even though present

#sample customer id: 738b086814c6fcc74b8cc583f8516ee3

#check whether these 2 strings are considered equal or not
#print(orders_dlvrd_df.query('order_id == "403b97836b0c04a622354cf531062e5f"')['customer_id_for_order'])
#print(customers_df.query('customer_unique_id == "51838d41add414a0b1b989b7d251d9ee"')['customer_id_for_order'])
#print(customers_df.query('customer_id_for_order == "911e4c37f5cafe1604fe6767034bf1ae"')['customer_unique_id'])

#print(customers_df.query('customer_unique_id == "51838d41add414a0b1b989b7d251d9ee"'))

delivered orders with customers info: 
 96455
orders_items_dlvrd_customers_df null/missing values: 
 order_id                         0
customer_id_for_order            0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
customer_unique_id               0
customer_zip_code_prefix         0
customer_city                    0
customer_state                   0
dtype: int64


### Get orders geo_dist info using customer's geo dist info

In [76]:
## get orders by state
print('Orders by state: \n', orders_dlvrd_customers_df['customer_state'].value_counts())

Orders by state: 
 SP    40488
RJ    12348
MG    11351
RS     5342
PR     4923
SC     3546
BA     3256
DF     2080
ES     1995
GO     1957
PE     1593
CE     1278
PA      946
MT      886
MA      716
MS      701
PB      517
PI      476
RN      474
AL      397
SE      335
TO      274
RO      243
AM      145
AC       80
AP       67
RR       41
Name: customer_state, dtype: int64


In [77]:
## get orders by state,city combo
print('Orders by state and city: \n', orders_dlvrd_customers_df[['customer_state', 'customer_city']].value_counts())

Orders by state and city: 
 customer_state  customer_city                   
SP              sao paulo                           15043
RJ              rio de janeiro                       6600
MG              belo horizonte                       2697
DF              brasilia                             2071
PR              curitiba                             1489
SP              campinas                             1406
RS              porto alegre                         1341
BA              salvador                             1188
SP              guarulhos                            1143
                sao bernardo do campo                 911
RJ              niteroi                               825
SP              santo andre                           777
                osasco                                724
                santos                                699
                sao jose dos campos                   667
GO              goiania                              

### Get sellers info into orders_dlvrd_customers_df
#### first merge orders_dlvrd_customers_df with order_items_df to get seller_id then merge with sellers_d to get sellers geo info

In [81]:
## merge orders_dlvrd_customers_df with order_items_df on 'seller_id'
# this should get order_item_id, product_id, seller_id, shipping_limit_date, price, freight_value into orders_dlvrd_customers_df
# rename price to order_item_price since the price in this df is for each item in the order
print('order_items_df: ', order_items_df.shape[0]) #112650

# left since want to retain each order even if some corresponding info from order items is missing for that order
orders_items_dlvrd_customers_df = orders_dlvrd_customers_df.merge(order_items_df, on='order_id', how='left')
# num of rows in orders_items_dlvrd_customers_df should be 96455
print('orders_items_dlvrd_customers_df rows: ', orders_items_dlvrd_customers_df.shape[0]) # 110173 - same as order_items_df size

# Now check for any missing values to see if orders and order items df have any missing values
print('orders_items_dlvrd_customers_df null/missing values: \n', orders_items_dlvrd_customers_df.isnull().sum())

order_items_df:  112650
orders_items_dlvrd_customers_df rows:  110173
orders_items_dlvrd_customers_df null/missing values: 
 order_id                         0
customer_id_for_order            0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
customer_unique_id               0
customer_zip_code_prefix         0
customer_city                    0
customer_state                   0
order_item_id                    0
product_id                       0
seller_id                        0
shipping_limit_date              0
price                            0
freight_value                    0
dtype: int64


#### now get sellers state, city, zip info from sellers_df into orders_items_dlvrd_customers_df by merging both on the basis of seller_id

In [82]:
orders_items_dlvrd_customers_sellers_df = orders_items_dlvrd_customers_df.merge(sellers_df, on='seller_id')
print(orders_items_dlvrd_customers_sellers_df.isnull().sum())

order_id                         0
customer_id_for_order            0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
customer_unique_id               0
customer_zip_code_prefix         0
customer_city                    0
customer_state                   0
order_item_id                    0
product_id                       0
seller_id                        0
shipping_limit_date              0
price                            0
freight_value                    0
seller_zip_code_prefix           0
seller_city                      0
seller_state                     0
dtype: int64


### Get orders fulfilled by sellers geo_dist info using sellers's geo dist info

In [83]:
# get orders fulfilled from state
orders_items_dlvrd_customers_sellers_df['seller_state'].value_counts()

SP    78585
MG     8601
PR     8485
RJ     4685
SC     3999
RS     2169
DF      883
BA      624
GO      508
PE      445
MA      402
ES      364
MT      144
CE       90
RN       56
MS       50
PB       37
RO       14
PI       11
SE       10
PA        8
AM        3
Name: seller_state, dtype: int64

In [84]:
# get orders fulfilled from state, city combo
orders_items_dlvrd_customers_sellers_df[['seller_state', 'seller_city']].value_counts()

seller_state  seller_city                             
SP            sao paulo                                   27349
              ibitinga                                     7617
              santo andre                                  2886
PR            curitiba                                     2885
SP            sao jose do rio preto                        2544
MG            belo horizonte                               2388
RJ            rio de janeiro                               2350
SP            guarulhos                                    2308
              ribeirao preto                               2208
PR            maringa                                      2194
SP            piracicaba                                   1893
              itaquaquecetuba                              1639
              campinas                                     1374
              salto                                        1326
              praia grande                       

### Explore, check, clean geolocation file

In [88]:
geolocation_df['geolocation_state'].value_counts()

SP    404268
MG    126336
RJ    121169
RS     61851
PR     57859
SC     38328
BA     36045
GO     20139
ES     16748
PE     16432
DF     12986
MT     12031
CE     11674
PA     10853
MS     10431
MA      7853
PB      5538
RN      5041
PI      4549
AL      4183
TO      3576
SE      3563
RO      3478
AM      2432
AC      1301
AP       853
RR       646
Name: geolocation_state, dtype: int64

In [102]:
geolocation_df['geolocation_city'].value_counts()

sao paulo                 135800
rio de janeiro             62151
belo horizonte             27805
s├úo paulo                 24918
curitiba                   16593
                           ...  
barao de melgaco               1
prudencio thomaz               1
davin├│polis                   1
santo antonio do norte         1
altamira do paran├í            1
Name: geolocation_city, Length: 8011, dtype: int64

### Get the corresponding geolocation lat and long for each customer and seller in customers_df and sellers_df
#### So merge customers_df with geolocation_df and sellers_df with geolocation_df based on zip_code_prefix, city and state

In [67]:
## Get geo-location info for customers
print('customers df length before geo merging: ', customers_df.shape[0])
print('geolocation_df length before geo merging: ', geolocation_df.shape[0])
customers_location_df = customers_df.merge(geolocation_df, left_on=['customer_zip_code_prefix', 'customer_city', 'customer_state'],
                                                 right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'],
                                                  how = 'left')
print('customers df length after geo merging: ', customers_location_df.shape[0])
print(list(customers_location_df))

customers df length before geo merging:  99441
geolocation_df length before geo merging:  1000163
customers df length after geo merging:  14129681
['customer_id_for_order', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state', 'geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']


In [68]:
## Get geo-location info for sellers
print('sellers df length before geo merging: ', sellers_df.shape[0])
sellers_location_df = sellers_df.merge(geolocation_df, left_on=['seller_zip_code_prefix', 'seller_city', 'seller_state'],
                                                 right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'],
                                                  how = 'left')
print('sellers df length after geo merging: ', sellers_location_df.shape[0])
print(list(sellers_location_df))

sellers df length before geo merging:  3095
sellers df length after geo merging:  385721
['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state', 'geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']
