### Import required libraries

In [73]:
import pandas as pd

### Set pd display options

In [74]:
#pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 10000)

### Load required data files to create initial consolidated file
#### Make a final orders df which has order_id, payment_value, order_item_id, product_id, product_category_name_english, product_weight_gms, product_length_cm, product_height_cm, product_width_cm, customer_order_id, customer_unique_id, customer_zip_code_prefix, customer_city, customer_state, customer_geolocation_lat, customer_geolocation_lng, seller_id, seller_zip_code_prefix, seller_city, seller_state, seller_geolocation_lat, seller_geolocation_lng, order_status, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date

In [75]:
sellers_df = pd.read_csv('../data/olist_sellers_dataset.csv')
customers_df = pd.read_csv('../data/olist_customers_dataset.csv')
geolocation_df = pd.read_csv('../data/olist_geolocation_dataset.csv')
orders_df = pd.read_csv('../data/olist_orders_dataset.csv')
order_items_df = pd.read_csv('../data/olist_order_items_dataset.csv')
order_payments_df = pd.read_csv('../data/olist_order_payments_dataset.csv')
products_df = pd.read_csv('../data/olist_products_dataset.csv')
products_names_translation_df = pd.read_csv('../data/product_category_name_translation.csv')

### Rename/clean initial column names/dfs as required before merging
#### customer_id: customer ID token that is generated for every order. If the same customer makes multiple orders, he has multiple customer_id identifiers.  
#### customer_unique_id: which is unique to each purchaser and can be used to track their purchases over time. 
#### So, rename customer_id in orders_df and customers_df to customer_id_for_order in orders_df

In [76]:
orders_df.rename({'customer_id': 'customer_id_for_order'}, axis=1, inplace=True)
customers_df.rename({'customer_id': 'customer_id_for_order'}, axis=1, inplace=True)

### Explort the files to get basic info, stats
### Check for any missing or unusual values with any formatting errors
*Already see this in geolocation_df (check if in other files too) - s√£o paulo for sao paulo - need to fix*

*ara√ßariguama - aracariguama*

*s√£o roque - sao roque*

*ara√ßoiaba da serra - aracoiaba da serra* 

and so on ..

### utility function to explore, check, clean up files

In [77]:
def check_basic_data_issues_geo_dist_stats(df, df_type, id_type):
    print(f'Num of {df_type}: ', df.shape[0], '\n')

    ## drop duplicates if any
    df.drop_duplicates(f'{id_type}', inplace=True)

    # get number of unique entries
    print(f'Num of {df_type} after dropping duplicates: ', len(df[f'{id_type}']), '\n') # still 3095, so no duplicates

    # check if there are any missing values
    print(df.isnull().sum()) # None

    # check how many entries from each state
    print(f'{df_type} by state: \n', df[f'{df_type}_state'].value_counts(), '\n')

    # get the count of entries by state,city combination
    print(f'{df_type} by state and city: \n', df[[f'{df_type}_state', f'{df_type}_city']].value_counts(), '\n')

    # get count of entries by zip-code prefix
    print(f'{df_type} count by zipcode prefix: \n', df[f'{df_type}_zip_code_prefix'].value_counts(), '\n')

In [78]:
## explore, check, cleanup sellers info
## sellers df - 3095 records
check_basic_data_issues_geo_dist_stats(sellers_df, df_type='seller', id_type='seller_id')

Num of seller:  3095 

Num of seller after dropping duplicates:  3095 

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64
seller by state: 
 SP    1849
PR     349
MG     244
SC     190
RJ     171
RS     129
GO      40
DF      30
ES      23
BA      19
CE      13
PE       9
PB       6
RN       5
MS       5
MT       4
RO       2
SE       2
PI       1
AC       1
MA       1
PA       1
AM       1
Name: seller_state, dtype: int64 

seller by state and city: 
 seller_state  seller_city                             
SP            sao paulo                                   694
PR            curitiba                                    124
RJ            rio de janeiro                               93
MG            belo horizonte                               66
SP            ribeirao preto                               52
              guarulhos                                    50
              ibitinga                   

In [79]:
## explore, check, cleanup customers info
# use customer unique id since that is the unique id for a given customer
pd.set_option('display.max_rows', 5000)
check_basic_data_issues_geo_dist_stats(customers_df, df_type='customer', id_type='customer_unique_id')

Num of customer:  99441 

Num of customer after dropping duplicates:  96096 

customer_id_for_order       0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
customer by state: 
 SP    40295
RJ    12377
MG    11255
RS     5277
PR     4882
SC     3529
BA     3276
DF     2073
ES     1963
GO     1951
PE     1604
CE     1311
PA      949
MT      875
MA      725
MS      693
PB      519
PI      482
RN      474
AL      401
SE      341
TO      273
RO      239
AM      143
AC       77
AP       67
RR       45
Name: customer_state, dtype: int64 

customer by state and city: 
 customer_state  customer_city                   
SP              sao paulo                           14971
RJ              rio de janeiro                       6611
MG              belo horizonte                       2671
DF              brasilia                             2066
PR              curitiba                             1462
SP      

customer count by zipcode prefix: 
 22790    136
22793    119
24220    113
24230    112
22775    105
        ... 
70402      1
57130      1
62725      1
2123       1
68420      1
Name: customer_zip_code_prefix, Length: 14982, dtype: int64 



### Explore orders dataset

In [80]:
print('Different types of order statuses: \n', orders_df['order_status'].value_counts())

Different types of order statuses: 
 delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64


*Consider only 'delivered' order_status records*

In [81]:
orders_dlvrd_df = orders_df.query('order_status == "delivered"')
print('Total delivered orders: \n', orders_dlvrd_df['order_status'].value_counts())

Total delivered orders: 
 delivered    96478
Name: order_status, dtype: int64


In [82]:
# drop duplicate orders if any
orders_dlvrd_df = orders_dlvrd_df.drop_duplicates('order_id')
print('Total delivered orders without duplicates: \n', orders_dlvrd_df.shape[0])

Total delivered orders without duplicates: 
 96478


In [83]:
# check fo rany null or missign values
print('Missing values: \n', orders_dlvrd_df.isnull().sum(), '\n') # 24 records with atleast 1 missing value

Missing values: 
 order_id                          0
customer_id_for_order             0
order_status                      0
order_purchase_timestamp          0
order_approved_at                14
order_delivered_carrier_date      2
order_delivered_customer_date     8
order_estimated_delivery_date     0
dtype: int64 



In [84]:
# drop any missing value rows since we want only complete rows for analysis
orders_dlvrd_df.dropna(inplace=True) # 96455 records
print('Total delivered orders after dropping null/missing duplicates: \n', orders_dlvrd_df.shape[0])

Total delivered orders after dropping null/missing duplicates: 
 96455


### Get customer information into the orders dataset
#### Merge both dfs on customer_id_for_order - this should get customer_unique_id, customer_zip_code_prefix, customer_city, customer_city into the orders_df

In [111]:
## left merge on orders because we want to have all orders
# orders_dlvrd_customers_df = orders_dlvrd_df.merge(customers_df, on='customer_id_for_order', how='left')
# some 3964 customer geo info not getting merged eventhough seems like ids match

# Let's try inner merge to ensure that only matching records remain
# orders_dlvrd_customers_df = orders_dlvrd_df.merge(customers_df, on='customer_id_for_order')
# Same 96455 records. Dont seem like any missing. But check next merge and with initial missing customer id for order

# Didn't match. So check outer
orders_dlvrd_customers_df = orders_dlvrd_df.merge(customers_df, on='customer_id_for_order', how='outer')
orders_dlvrd_customers_df.to_csv('/Users/salma/Research/shopping/brazilian_ecommerce_data/data/orders_dlvrd_customers_outer.csv', index=False)

print('delivered orders with customers info: \n', orders_dlvrd_df.shape[0]) # length of delivered orders df (96455)
print('orders with cust info: ', list(orders_dlvrd_customers_df))

delivered orders with customers info: 
 96455
orders with cust info:  ['order_id', 'customer_id_for_order', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']


### Get orders geo_dist info using customer's ge dist info

In [105]:
## get orders by state
print('Orders by state: \n', orders_dlvrd_customers_df['customer_state'].value_counts())

Orders by state: 
 SP    39090
RJ    11898
MG    10986
RS     5159
PR     4765
SC     3441
BA     3156
DF     2015
ES     1927
GO     1889
PE     1545
CE     1255
PA      922
MT      855
MA      697
MS      679
PB      504
PI      464
RN      463
AL      385
SE      327
TO      267
RO      229
AM      140
AC       76
AP       66
RR       40
Name: customer_state, dtype: int64


In [106]:
## get orders by state,city combo
print('Orders by state and city: \n', orders_dlvrd_customers_df[['customer_state', 'customer_city']].value_counts())

Orders by state and city: 
 customer_state  customer_city                   
SP              sao paulo                           14494
RJ              rio de janeiro                       6344
MG              belo horizonte                       2599
DF              brasilia                             2008
PR              curitiba                             1432
SP              campinas                             1361
RS              porto alegre                         1289
BA              salvador                             1153
SP              guarulhos                            1105
                sao bernardo do campo                 881
RJ              niteroi                               787
SP              santo andre                           750
                osasco                                695
                santos                                677
                sao jose dos campos                   642
GO              goiania                              

### Get sellers info into orders_dlvrd_customers_df
#### first merge orders_dlvrd_customers_df with order_items_df to get seller_id then merge with sellers_d to get sellers geo info

In [107]:
## merge orders_dlvrd_customers_df with order_items_df on 'seller_id'
# this should get order_item_id, product_id, seller_id, shipping_limit_date, price, freight_value into orders_dlvrd_customers_df
# rename price to order_item_price since the price in this df is for each item in the order
print('order_items_df: ', order_items_df.shape[0]) #112650

# left since want to retain each order even if some corresponding info from order items is missing for that order
orders_items_dlvrd_customers_df = orders_dlvrd_customers_df.merge(order_items_df, on='order_id', how='left')
# num of rows in orders_items_dlvrd_customers_df should be 96455
print('orders_items_dlvrd_customers_df rows: ', orders_items_dlvrd_customers_df.shape[0]) # 110173 - same as order_items_df size

# Now check for any missing values to see if orders and order items df have any missing values
print('orders_items_dlvrd_customers_df null/missing values: \n', orders_items_dlvrd_customers_df.isnull().sum())

order_items_df:  112650
orders_items_dlvrd_customers_df rows:  106209
orders_items_dlvrd_customers_df null/missing values: 
 order_id                         0
customer_id_for_order            0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
customer_unique_id               0
customer_zip_code_prefix         0
customer_city                    0
customer_state                   0
order_item_id                    0
product_id                       0
seller_id                        0
shipping_limit_date              0
price                            0
freight_value                    0
dtype: int64


In [108]:
# ok lot of missing values from orders df for customers of certain order_items df. Let's check a couple
orders_items_dlvrd_customers_df[orders_items_dlvrd_customers_df['customer_unique_id'].isnull()].head(2)

Unnamed: 0,order_id,customer_id_for_order,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value


### Get the corresponding geolocation lat and long for each customer and seller in customers_df and sellers_df
#### So merge customers_df with geolocation_df and sellers_df with geolocation_df based on zip_code_prefix, city and state

In [85]:
## Get geo-location info for customers
print('customers df length before geo merging: ', customers_df.shape[0])
print('geolocation_df length before geo merging: ', geolocation_df.shape[0])
customers_location_df = customers_df.merge(geolocation_df, left_on=['customer_zip_code_prefix', 'customer_city', 'customer_state'],
                                                 right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'],
                                                  how = 'left')
print('customers df length after geo merging: ', customers_location_df.shape[0])
print(list(customers_location_df))

customers df length before geo merging:  96096
geolocation_df length before geo merging:  1000163
customers df length after geo merging:  13629145
['customer_id_for_order', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state', 'geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']


In [86]:
## Get geo-location info for sellers
print('sellers df length before geo merging: ', sellers_df.shape[0])
sellers_location_df = sellers_df.merge(geolocation_df, left_on=['seller_zip_code_prefix', 'seller_city', 'seller_state'],
                                                 right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'],
                                                  how = 'left')
print('sellers df length after geo merging: ', sellers_location_df.shape[0])
print(list(sellers_location_df))

sellers df length before geo merging:  3095
sellers df length after geo merging:  385721
['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state', 'geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']
