In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import os

pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 4000)
## to supress scientific notation
# pd.options.display.float_format = '{:.4f}'.format

#pd.options.display.float_format = '{:20,.2f}'.format
print(os.getcwd())

/Users/salma/Research/e-commerce/brazilian_e-commerce_data/brazilian_e-commerce_dataset_analysis/analysis


In [2]:
br_ecomm_feb17_sep17 = pd.read_csv('/Users/salma/Research/e-commerce/brazilian_e-commerce_data/data/br_ecomm_feb17_sep17.csv')
br_ecomm_nov17_dec17 = pd.read_csv('/Users/salma/Research/e-commerce/brazilian_e-commerce_data/data/br_ecomm_nov17_dec17.csv')

#### Create a new identifier for order_id+order_item_id - a truly unique id for every record

In [3]:
br_ecomm_feb17_sep17['order_id_item_id'] = br_ecomm_feb17_sep17['order_id'] + '_' + br_ecomm_feb17_sep17['order_item_id'].astype('str')
br_ecomm_nov17_dec17['order_id_item_id'] = br_ecomm_nov17_dec17['order_id'] + '_' + br_ecomm_nov17_dec17['order_item_id'].astype('str')

In [4]:
print('br_ecomm_feb17_sep17 order items: ', len(br_ecomm_feb17_sep17['order_id_item_id']))
print('br_ecomm_nov17_dec17 order items: ', len(br_ecomm_nov17_dec17['order_id_item_id']))

br_ecomm_feb17_sep17 order items:  242
br_ecomm_nov17_dec17 order items:  593


### Obtain only the required columns for stats and ghg calcs

In [5]:
req_cols = ['order_purchase_date', 'order_id_item_id', 'customer_zip_code_prefix',
                                                    'customer_lat', 'customer_long', 'seller_zip_code_prefix', 'seller_lat', 'seller_long',
                                                    'seller_cust_geo_dist', 'product_weight_kgs', 'product_category']

br_ecomm_feb17_sep17_req = br_ecomm_feb17_sep17.loc[:, req_cols]
br_ecomm_feb17_sep17_req.to_csv('/Users/salma/Research/e-commerce/brazilian_e-commerce_data/data/br_ecomm_feb17_sep17_req.csv')

br_ecomm_nov17_dec17_req = br_ecomm_nov17_dec17.loc[:, req_cols]
br_ecomm_nov17_dec17_req.to_csv('/Users/salma/Research/e-commerce/brazilian_e-commerce_data/data/br_ecomm_nov17_dec17_req.csv')

### Get basic descriptives

In [6]:
def get_basic_desc(df_list, df_names):
    for df, df_name in zip(df_list, df_names):
        num_order_items = df.shape[0]
        ave_product_wt = df['product_weight_kgs'].mean().round(3)
        std_product_wt = df['product_weight_kgs'].std().round(3)
        ave_dist = df['seller_cust_geo_dist'].mean().round(3)
        std_dist = df['seller_cust_geo_dist'].std().round(3)
        
        print(df_name,': ')
        print('Num of items ordered: ', num_order_items)
        print('Average product weight of order items: ', ave_product_wt, ' kgs')
        print('Std Dev of product weight of order items: ', std_product_wt, ' kgs')
        print('Average customer-seller distance: ', ave_dist, 'kms')
        print('Std Dev of customer-seller distance: ', std_dist, 'kms')
        print()

### Identify and exclude outliers

#### Create a kg_kms column to identify outliers based on that quantity; round to 3 decimals

In [7]:
br_ecomm_feb17_sep17_req['kg_kms'] = (br_ecomm_feb17_sep17_req['product_weight_kgs'] * br_ecomm_feb17_sep17_req['seller_cust_geo_dist']).round(3)
br_ecomm_nov17_dec17_req['kg_kms'] = (br_ecomm_nov17_dec17_req['product_weight_kgs'] * br_ecomm_nov17_dec17_req['seller_cust_geo_dist']).round(3)

#### Utility function to identify and exclude outliers

In [8]:
def calc_exclude_ols(df_list, df_names, num_stds):
    ret_dfs = []
    for df, df_name in zip(df_list, df_names):
        df['kg_kms_z'] = np.abs(stats.zscore(df['kg_kms'])).round(3)
        df = df[df['kg_kms_z'] < num_stds]
        print(df_name, ' rec: ', df.shape[0])
        print(df.loc[:, ['order_id_item_id', 'product_weight_kgs', 'seller_cust_geo_dist', 'kg_kms', 'kg_kms_z']].sort_values('kg_kms_z', ascending=False).head())
        print()
        ret_dfs.append(df)
    return ret_dfs 

#### Calculate total miles and total weight of the products for regular and holiday shopping months

In [9]:
def calc_total_wt_kms(df_list, df_names):
    for df,df_name in zip(df_list, df_names):
        df_product_wt = df['product_weight_kgs'].sum().round(3)
        df_total_kms = df['seller_cust_geo_dist'].sum().round(3)
        print(df_name)
        print('Total products wt: ', df_product_wt, 'kgs')
        print('Total kms: ', df_total_kms, 'kms')
        print()

### Calculate GHG Emissions considering US EPA Emissions factor

#### Overall GHG emissions value for regular and holiday shopping

Example: Calculating emissions from a truck-load move
Let’s start with calculating the emissions for a truck that travels 1,000 miles with 20 short tons of cargo (a short ton is
2,000lbs).
1. Step One: Determine the total amount of ton-miles. Multiply 1,000 miles times 20 tons, which gives us a total of
20,000 ton-miles.
2. Step Two: Get the weight-based truck emissions factor from the Freight Emissions Factors chart. This tells us that
the average freight truck in the U.S. emits 161.8 grams of CO2 per ton-mile.
3. Step Three: Multiply this emissions factor with the total ton-miles {161.8 X 20,000), which gives us a total of
3,236,000 grams of CO2. Note that this calculation does not factor in emissions of other greenhouse gases, such as
methane or HFCs.
4. Step Four: Convert the total grams into metric tons. Metric tons are the standard measurement unit for corporate
emissions of greenhouse gases. There are 1,000,000 grams in a metric ton. To convert our answer from step three we
divide it by 1,000,000. This gives us 3.24 metric tons of CO2 for this one move.

src: https://storage.googleapis.com/scsc/Green%20Freight/EDF-Green-Freight-Handbook.pdf

In [10]:
def calc_ghg_emissions_mt(df_list, df_names):
    df_co2 = []
    for df, df_name in zip(df_list, df_names):    
        #### Determine the total amount of ton-miles.
        df_ton_miles = ((df['product_weight_kgs']/1000).sum() * (df['seller_cust_geo_dist']*0.6214).sum())

        #### Multiply 161.8 grams of CO2 per ton-mile with the total ton-miles 
        df_co2_gms = df_ton_miles * 161.8

        #### Convert total grams of co2 into metric tons 
        df_co2_metric_tons = (df_co2_gms / 1000000).round(3)
        df_co2.append(df_co2_metric_tons)
        
        print(f'{df_name} co2_metric_tons: ', df_co2_metric_tons)
        print()
        
    return df_co2

In [11]:
def calc_ghg_emissions_gms(df_list, df_names, df_file_names):
    df_co2_list = []
    for df, df_name, df_file_name in zip(df_list, df_names, df_file_names):
        df.loc[:, 'co2_gms'] = (df.loc[:, 'product_weight_kgs']* df.loc[:, 'seller_cust_geo_dist']) * 0.204 # gms of co2 per kg-kms
        df_co2_list.append(df['co2_gms'].sum())
        
        print(f'{df_name} total co2 gms: ', df['co2_gms'].sum())
        df.to_csv(f'/Users/salma/Research/e-commerce/brazilian_e-commerce_data/data/{df_file_name}_co2_gms.csv', index=False)
        print()
        
    return df_co2_list

#### WITH ALL DATA POINTS

#### Get total weight and miles

In [12]:
calc_total_wt_kms(df_list=[br_ecomm_feb17_sep17_req, br_ecomm_nov17_dec17_req], df_names=['REGULAR', 'HOLIDAY'])

REGULAR
Total products wt:  353.017 kgs
Total kms:  3834.396 kms

HOLIDAY
Total products wt:  718.064 kgs
Total kms:  8578.35 kms



#### Get basic descriptives

In [13]:
get_basic_desc(df_list=[br_ecomm_feb17_sep17_req, br_ecomm_nov17_dec17_req], df_names=['REGULAR', 'HOLIDAY'])

REGULAR : 
Num of items ordered:  242
Average product weight of order items:  1.459  kgs
Std Dev of product weight of order items:  3.862  kgs
Average customer-seller distance:  15.845 kms
Std Dev of customer-seller distance:  8.376 kms

HOLIDAY : 
Num of items ordered:  593
Average product weight of order items:  1.211  kgs
Std Dev of product weight of order items:  2.549  kgs
Average customer-seller distance:  14.466 kms
Std Dev of customer-seller distance:  8.017 kms



#### GHG emissions - CO2 metric tons

In [14]:
br_ecomm_feb17_sep17_req_co2, br_ecomm_nov17_dec17_req_co2 = calc_ghg_emissions_mt(df_list=[br_ecomm_feb17_sep17_req, br_ecomm_nov17_dec17_req], df_names=['REGULAR', 'HOLIDAY'])
print()
print((((br_ecomm_nov17_dec17_req_co2 - br_ecomm_feb17_sep17_req_co2)/br_ecomm_feb17_sep17_req_co2)*100).round(3))

REGULAR co2_metric_tons:  0.136

HOLIDAY co2_metric_tons:  0.619


355.147


#### GHG emissions - CO2 grams

In [15]:
br_ecomm_feb17_sep17_req_co2, br_ecomm_nov17_dec17_req_co2 = calc_ghg_emissions_gms(
    df_list=[br_ecomm_feb17_sep17_req, br_ecomm_nov17_dec17_req], 
    df_names=['regular', 'holiday'],
    df_file_names=['br_ecomm_feb17_sep17_req', 'br_ecomm_nov17_dec17_req']
)
print()
print((((br_ecomm_nov17_dec17_req_co2 - br_ecomm_feb17_sep17_req_co2)/br_ecomm_feb17_sep17_req_co2)*100).round(3))

regular total co2 gms:  1102.4699770095986

holiday total co2 gms:  2214.4067286094905


100.859


#### WITHOUT 3-Z OUTLIERS (same for without 2.5z outliers too)

#### Exclude 3z ols

In [16]:
br_ecomm_feb17_sep17_req_wdt_3z, br_ecomm_nov17_dec17_req_wdt_3z = calc_exclude_ols(df_list=[br_ecomm_feb17_sep17_req, br_ecomm_nov17_dec17_req], df_names=['REGULAR', 'HOLIDAY'], num_stds=3)

REGULAR  rec:  236
                       order_id_item_id  product_weight_kgs  seller_cust_geo_dist   kg_kms  kg_kms_z
2    ffb5af8b918083c3291c62b20fa89319_1                6.70             25.835597  173.098     2.413
183  0653c5290cf561492eae4f626799994d_1                9.75             17.144112  167.155     2.318
136  7b5677c6b812459afed4f29edef9debc_1               12.00             13.806578  165.679     2.294
65   184603c2b2f23fd8585d48649c714ba2_1                6.50             22.029726  143.193     1.934
195  daf2a6548710139ae39eec18beb70a17_1                9.75             12.645357  123.292     1.616

HOLIDAY  rec:  584
                       order_id_item_id  product_weight_kgs  seller_cust_geo_dist   kg_kms  kg_kms_z
286  cf1ff716be04ebeff7831c79c17f9151_1               7.050             19.403270  136.793     2.489
506  cd22ef4012ae92aa8729264045bee0eb_1              12.250             10.706754  131.158     2.371
86   bc01a8209b5f19c81653540c57ca921e_1             

#### Get total weight and miles

In [17]:
calc_total_wt_kms(df_list=[br_ecomm_feb17_sep17_req_wdt_3z, br_ecomm_nov17_dec17_req_wdt_3z], df_names=['REGULAR', 'HOLIDAY'])

REGULAR
Total products wt:  222.817 kgs
Total kms:  3721.596 kms

HOLIDAY
Total products wt:  580.031 kgs
Total kms:  8371.895 kms



#### Get basic descriptives

In [18]:
get_basic_desc(df_list=[br_ecomm_feb17_sep17_req_wdt_3z, br_ecomm_nov17_dec17_req_wdt_3z], df_names=['REGULAR', 'HOLIDAY'])

REGULAR : 
Num of items ordered:  236
Average product weight of order items:  0.944  kgs
Std Dev of product weight of order items:  1.635  kgs
Average customer-seller distance:  15.769 kms
Std Dev of customer-seller distance:  8.429 kms

HOLIDAY : 
Num of items ordered:  584
Average product weight of order items:  0.993  kgs
Std Dev of product weight of order items:  1.725  kgs
Average customer-seller distance:  14.335 kms
Std Dev of customer-seller distance:  7.982 kms



#### GHG emissions - CO2 metric tons

In [19]:
br_ecomm_feb17_sep17_req_wdt_3z_co2, br_ecomm_nov17_dec17_req_wdt_3z_co2 = calc_ghg_emissions_mt(df_list=[br_ecomm_feb17_sep17_req_wdt_3z, br_ecomm_nov17_dec17_req_wdt_3z], df_names=['REGULAR', 'HOLIDAY'])
print()
print((((br_ecomm_nov17_dec17_req_wdt_3z_co2 - br_ecomm_feb17_sep17_req_wdt_3z_co2)/br_ecomm_feb17_sep17_req_wdt_3z_co2)*100).round(3))

REGULAR co2_metric_tons:  0.083

HOLIDAY co2_metric_tons:  0.488


487.952


#### GHG emissions - CO2 metric gms

In [20]:
br_ecomm_feb17_sep17_req_wdt_3z_co2, br_ecomm_nov17_dec17_req_wdt_3z_co2 = calc_ghg_emissions_gms(
    df_list=[br_ecomm_feb17_sep17_req_wdt_3z, br_ecomm_nov17_dec17_req_wdt_3z], 
    df_names=['regular', 'holiday'],
    df_file_names=['br_ecomm_feb17_sep17_req_wdt_3z', 'br_ecomm_nov17_dec17_req_wdt_3z'])
print()
print((((br_ecomm_nov17_dec17_req_wdt_3z_co2 - br_ecomm_feb17_sep17_req_wdt_3z_co2)/br_ecomm_feb17_sep17_req_wdt_3z_co2)*100).round(3))

regular total co2 gms:  649.9651159915547

holiday total co2 gms:  1594.9297711985357


145.387


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


#### WITHOUT 2-Z OUTLIERS

#### Exclude 2z ols

In [21]:
br_ecomm_feb17_sep17_req_wdt_2z, br_ecomm_nov17_dec17_req_wdt_2z = calc_exclude_ols(df_list=[br_ecomm_feb17_sep17_req, br_ecomm_nov17_dec17_req], df_names=['REGULAR', 'HOLIDAY'], num_stds=2)

REGULAR  rec:  233
                       order_id_item_id  product_weight_kgs  seller_cust_geo_dist   kg_kms  kg_kms_z
65   184603c2b2f23fd8585d48649c714ba2_1                6.50             22.029726  143.193     1.934
195  daf2a6548710139ae39eec18beb70a17_1                9.75             12.645357  123.292     1.616
55   1ad9392f23bf7f28facc4ed0c2fcb7b6_1                3.25             35.492486  115.351     1.489
106  3a0ad73a22a4ca1af66f3f7ee5c6c3fd_1                2.60             39.418362  102.488     1.283
236  a6288d192f3399f630e89e96f1079cf2_1                9.20              8.255397   75.950     0.858

HOLIDAY  rec:  579
                       order_id_item_id  product_weight_kgs  seller_cust_geo_dist   kg_kms  kg_kms_z
532  e69a8cc400ec0d55147d287df178cfe3_1               10.00             11.123107  111.231     1.952
507  cf3bd7d805afecc817b4c3abadafced6_1                6.05             18.130669  109.691     1.920
326  e134db9b2f9178ed55725a25b33e48c4_1             

#### Get total weight and miles

In [22]:
calc_total_wt_kms(df_list=[br_ecomm_feb17_sep17_req_wdt_2z, br_ecomm_nov17_dec17_req_wdt_2z], df_names=['REGULAR', 'HOLIDAY'])

REGULAR
Total products wt:  194.367 kgs
Total kms:  3664.81 kms

HOLIDAY
Total products wt:  543.976 kgs
Total kms:  8263.822 kms



#### Get basic descriptives

In [23]:
get_basic_desc(df_list=[br_ecomm_feb17_sep17_req_wdt_2z, br_ecomm_nov17_dec17_req_wdt_2z], df_names=['REGULAR', 'HOLIDAY'])

REGULAR : 
Num of items ordered:  233
Average product weight of order items:  0.834  kgs
Std Dev of product weight of order items:  1.301  kgs
Average customer-seller distance:  15.729 kms
Std Dev of customer-seller distance:  8.456 kms

HOLIDAY : 
Num of items ordered:  579
Average product weight of order items:  0.94  kgs
Std Dev of product weight of order items:  1.607  kgs
Average customer-seller distance:  14.273 kms
Std Dev of customer-seller distance:  7.909 kms



#### GHG emissions - CO2 metric tons

In [24]:
br_ecomm_feb17_sep17_req_wdt_2z_co2, br_ecomm_nov17_dec17_req_wdt_2z_co2 = calc_ghg_emissions_mt(df_list=[br_ecomm_feb17_sep17_req_wdt_2z, br_ecomm_nov17_dec17_req_wdt_2z], df_names=['REGULAR', 'HOLIDAY'])
print()
print((((br_ecomm_nov17_dec17_req_wdt_2z_co2 - br_ecomm_feb17_sep17_req_wdt_2z_co2)/br_ecomm_feb17_sep17_req_wdt_2z_co2)*100).round(3))

REGULAR co2_metric_tons:  0.072

HOLIDAY co2_metric_tons:  0.452


527.778


#### GHG emissions - CO2 grams

In [25]:
br_ecomm_feb17_sep17_req_wdt_2z_co2_gms, br_ecomm_nov17_dec17_req_wdt_2z_co2_gms = calc_ghg_emissions_gms(
    df_list=[br_ecomm_feb17_sep17_req_wdt_2z, br_ecomm_nov17_dec17_req_wdt_2z], 
    df_names=['regular', 'holiday'],
    df_file_names=['br_ecomm_feb17_sep17_req_wdt_2z', 'br_ecomm_nov17_dec17_req_wdt_2z'])
print()
print((((br_ecomm_nov17_dec17_req_wdt_2z_co2_gms - br_ecomm_feb17_sep17_req_wdt_2z_co2_gms)/br_ecomm_feb17_sep17_req_wdt_2z_co2_gms)*100).round(3))

regular total co2 gms:  546.7548811113174

holiday total co2 gms:  1469.0382029892494


168.683


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
