# Data Exploration

In [1]:
import pandas as pd

In [2]:
# Read in data
commissions = pd.read_csv('data/commissions.csv', parse_dates = ['date'], index_col=['date', 'vendor_id'])
order_lines = pd.read_csv('data/order_lines.csv', index_col=['order_id', 'product_id'])
orders = pd.read_csv('data/orders.csv',  parse_dates = ['created_at'], index_col='id')
product_promotions = pd.read_csv('data/product_promotions.csv', parse_dates = ['date'], index_col=['date', 'product_id'])
products = pd.read_csv('data/products.csv', index_col='id')
promotions = pd.read_csv('data/promotions.csv', index_col='id')

In [3]:
order_lines.isnull().any()

product_description    False
product_price          False
product_vat_rate       False
discount_rate          False
quantity               False
full_price_amount      False
discounted_amount      False
vat_amount             False
total_amount           False
dtype: bool

In [4]:
test = order_lines[(order_lines.index.get_level_values('order_id') >= 2) & (order_lines.index.get_level_values('order_id') <= 10) ]
test['quantity'].sum()

2895

In [5]:
orders.head()

Unnamed: 0_level_0,created_at,vendor_id,customer_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2019-08-01 18:43:57.052767,3,1398
3,2019-08-01 11:51:07.349383,2,7449
4,2019-08-01 15:26:46.084807,5,7082
5,2019-08-01 03:34:05.665990,5,5727
6,2019-08-01 16:30:57.126590,5,5281


In [6]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 438 entries, 2 to 439
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   created_at   438 non-null    datetime64[ns]
 1   vendor_id    438 non-null    int64         
 2   customer_id  438 non-null    int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 13.7 KB


In [7]:
products.head()

Unnamed: 0_level_0,description
id,Unnamed: 1_level_1
1,Oréos 2X
2,Škoda 3Tr
3,Oréos 4X
4,Škoda 6Tr
5,Škoda 7Tr


In [8]:
order_day = orders[orders['created_at'].dt.strftime('%Y-%m-%d') == '2019-08-02']
order_day

Unnamed: 0_level_0,created_at,vendor_id,customer_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11,2019-08-02 01:20:59.124548,9,5405
12,2019-08-02 19:23:56.135468,6,4212
13,2019-08-02 21:03:47.395720,9,4478
14,2019-08-02 21:49:39.176606,4,3250
15,2019-08-02 22:29:26.731292,2,4014
16,2019-08-02 11:19:59.721380,8,408
17,2019-08-02 12:02:30.370169,1,9593
18,2019-08-02 11:54:15.391115,4,5456
19,2019-08-02 23:13:07.183946,5,4742
20,2019-08-02 01:01:55.866474,4,8310


In [19]:
test_order = order_lines.merge(orders, left_on='order_id', right_index=True)\
                .merge(products, left_on='product_id', right_index=True)
test_order.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,product_description,product_price,product_vat_rate,discount_rate,quantity,full_price_amount,discounted_amount,vat_amount,total_amount,created_at,vendor_id,customer_id,description
order_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,794,IBM 032,21873,0.11,0.0,49,1071777,1071777.0,117895.47,1189672.0,2019-08-01 18:43:57.052767,3,1398,IBM 032
6,794,IBM 032,19050,0.11,0.388048,43,819150,501280.2,55140.819282,556421.0,2019-08-01 16:30:57.126590,5,5281,IBM 032
15,794,IBM 032,49138,0.2,0.0,43,2112934,2112934.0,422586.8,2535521.0,2019-08-02 22:29:26.731292,2,4014,IBM 032
146,794,IBM 032,46555,0.0,0.0,11,512105,512105.0,0.0,512105.0,2019-08-18 02:23:35.836475,1,9874,IBM 032
278,794,IBM 032,17948,0.2,0.423644,27,484596,279300.1,55860.01001,335160.1,2019-09-06 18:07:11.210956,8,7308,IBM 032


In [None]:
# Order lines, orders and products
order_products = order_lines.merge(orders, left_on='order_id', right_index=True)
order_products = order_products.merge(products, left_on='product_id', right_index=True)
order_products.head()

In [17]:
# 1. Total number of items sold on a day
date_str = '2019-08-01'
day_orders = order_products[order_products['created_at'].dt.strftime('%Y-%m-%d') == date_str].sort_values(by=['order_id', 'product_id'])
num_items_sold = day_orders['quantity'].sum()
num_items_sold

Unnamed: 0_level_0,Unnamed: 1_level_0,product_description,product_price,product_vat_rate,discount_rate,quantity,full_price_amount,discounted_amount,vat_amount,total_amount,created_at,vendor_id,customer_id,description
order_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,794,IBM 032,21873,0.11,0.0,49,1071777,1071777.0,117895.47,1189672.0,2019-08-01 18:43:57.052767,3,1398,IBM 032
6,794,IBM 032,19050,0.11,0.388048,43,819150,501280.2,55140.819282,556421.0,2019-08-01 16:30:57.126590,5,5281,IBM 032
15,794,IBM 032,49138,0.2,0.0,43,2112934,2112934.0,422586.8,2535521.0,2019-08-02 22:29:26.731292,2,4014,IBM 032
146,794,IBM 032,46555,0.0,0.0,11,512105,512105.0,0.0,512105.0,2019-08-18 02:23:35.836475,1,9874,IBM 032
278,794,IBM 032,17948,0.2,0.423644,27,484596,279300.1,55860.01001,335160.1,2019-09-06 18:07:11.210956,8,7308,IBM 032


In [224]:
# 1. Total number of items sold on a day
date_str = '2019-08-01'
day_orders = order_products[order_products['created_at'].dt.strftime('%Y-%m-%d') == date_str].sort_values(by=['order_id', 'product_id'])
num_items_sold = day_orders['quantity'].sum()
num_items_sold

2895

In [225]:
# 2. Total number of customers
unique_customers = pd.unique(day_orders['customer_id'])
unique_customers, len(unique_customers)

(array([1398, 7449, 7082, 5727, 5281, 9636, 5080, 3872, 4516]), 9)

In [226]:
# 3. Total maount of discount given on day
day_orders['discount_value'] = day_orders['full_price_amount'] - day_orders['discounted_amount']
total_discount = day_orders['discount_value'].sum()
total_discount

15152814.736907508

In [227]:
# 4. Average discount rate applied to items sold
average_discount = day_orders.loc[day_orders['discount_rate'] > 0, 'discount_rate'].mean()
average_discount

0.44574189572522194

In [228]:
# 5. Average order total for that day
total_day_revenue = day_orders['total_amount'].sum()
num_orders_for_day = day_orders['total_amount'].size
average_day_revenue = total_day_revenue / num_orders_for_day
average_day_revenue

1182286.0960463746

In [229]:
day_prod_promos = product_promotions[product_promotions.index.get_level_values('date') == date_str].reset_index()
day_prod_promos

Unnamed: 0,date,product_id,promotion_id
0,2019-08-01,572,3
1,2019-08-01,242,4
2,2019-08-01,416,2
3,2019-08-01,272,2
4,2019-08-01,126,3
5,2019-08-01,486,3
6,2019-08-01,227,2
7,2019-08-01,139,4
8,2019-08-01,98,3
9,2019-08-01,835,5


In [230]:
commissions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rate
date,vendor_id,Unnamed: 2_level_1
2019-08-01,1,0.29
2019-08-01,2,0.07
2019-08-01,3,0.27
2019-08-01,4,0.14
2019-08-01,5,0.05


In [231]:
day_commissions = commissions[commissions.index.get_level_values('date') == date_str].reset_index()
day_commissions

Unnamed: 0,date,vendor_id,rate
0,2019-08-01,1,0.29
1,2019-08-01,2,0.07
2,2019-08-01,3,0.27
3,2019-08-01,4,0.14
4,2019-08-01,5,0.05
5,2019-08-01,6,0.24
6,2019-08-01,7,0.29
7,2019-08-01,8,0.22
8,2019-08-01,9,0.24


In [232]:
day_orders.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,product_description,product_price,product_vat_rate,discount_rate,quantity,full_price_amount,discounted_amount,vat_amount,total_amount,created_at,vendor_id,customer_id,description_x,description_y,discount_value
order_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,365,QM3,33152,0.2,0.0,11,364672,364672.0,72934.4,437606.4,2019-08-01 18:43:57.052767,3,1398,QM3,QM3,0.0
2,417,GS8,96066,0.11,0.0,22,2113452,2113452.0,232479.72,2345932.0,2019-08-01 18:43:57.052767,3,1398,GS8,GS8,0.0
2,428,XC60,76782,0.0,0.0,30,2303460,2303460.0,0.0,2303460.0,2019-08-01 18:43:57.052767,3,1398,XC60,XC60,0.0
2,462,Deskjet 610c / 610cl,1037,0.2,0.522203,33,34221,16350.69,3270.137627,19620.83,2019-08-01 18:43:57.052767,3,1398,Deskjet 610c / 610cl,Deskjet 610c / 610cl,17870.311866
2,477,Deskjet 693c,38209,0.0,0.674717,28,1069852,348004.2,0.0,348004.2,2019-08-01 18:43:57.052767,3,1398,Deskjet 693c,Deskjet 693c,721847.758224


In [233]:
# 6. Total amount of commissions for that day
day_orders_commissions = day_orders.reset_index()
# day_orders_commissions = day_orders.merge(day_prod_promos, how='left', left_on='product_id', right_on='product_id', suffixes=('_orders','_promos'))
day_orders_commissions = day_orders_commissions.merge(day_commissions, how='left', left_on='vendor_id', right_on='vendor_id', suffixes=('','_commissions'))
day_orders_commissions['commission_amount'] = day_orders_commissions['total_amount'] * day_orders_commissions['rate']
day_orders_commissions

Unnamed: 0,order_id,product_id,product_description,product_price,product_vat_rate,discount_rate,quantity,full_price_amount,discounted_amount,vat_amount,total_amount,created_at,vendor_id,customer_id,description_x,description_y,discount_value,date,rate,commission_amount
0,2,365,QM3,33152,0.20,0.000000,11,364672,3.646720e+05,72934.400000,4.376064e+05,2019-08-01 18:43:57.052767,3,1398,QM3,QM3,0.000000e+00,2019-08-01,0.27,118153.728000
1,2,417,GS8,96066,0.11,0.000000,22,2113452,2.113452e+06,232479.720000,2.345932e+06,2019-08-01 18:43:57.052767,3,1398,GS8,GS8,0.000000e+00,2019-08-01,0.27,633401.564400
2,2,428,XC60,76782,0.00,0.000000,30,2303460,2.303460e+06,0.000000,2.303460e+06,2019-08-01 18:43:57.052767,3,1398,XC60,XC60,0.000000e+00,2019-08-01,0.27,621934.200000
3,2,462,Deskjet 610c / 610cl,1037,0.20,0.522203,33,34221,1.635069e+04,3270.137627,1.962083e+04,2019-08-01 18:43:57.052767,3,1398,Deskjet 610c / 610cl,Deskjet 610c / 610cl,1.787031e+04,2019-08-01,0.27,5297.622955
4,2,477,Deskjet 693c,38209,0.00,0.674717,28,1069852,3.480042e+05,0.000000,3.480042e+05,2019-08-01 18:43:57.052767,3,1398,Deskjet 693c,Deskjet 693c,7.218478e+05,2019-08-01,0.27,93961.145280
5,2,480,Deskjet 697c,62009,0.00,0.000000,27,1674243,1.674243e+06,0.000000,1.674243e+06,2019-08-01 18:43:57.052767,3,1398,Deskjet 697c,Deskjet 697c,0.000000e+00,2019-08-01,0.27,452045.610000
6,2,517,Deskjet 960c / 960cse / 960cxi,77184,0.00,0.000000,3,231552,2.315520e+05,0.000000,2.315520e+05,2019-08-01 18:43:57.052767,3,1398,Deskjet 960c / 960cse / 960cxi,Deskjet 960c / 960cse / 960cxi,0.000000e+00,2019-08-01,0.27,62519.040000
7,2,521,Deskjet 995c / 995ck,47082,0.11,0.000000,20,941640,9.416400e+05,103580.400000,1.045220e+06,2019-08-01 18:43:57.052767,3,1398,Deskjet 995c / 995ck,Deskjet 995c / 995ck,0.000000e+00,2019-08-01,0.27,282209.508000
8,2,549,Deskjet 6620,13463,0.00,0.000000,26,350038,3.500380e+05,0.000000,3.500380e+05,2019-08-01 18:43:57.052767,3,1398,Deskjet 6620,Deskjet 6620,0.000000e+00,2019-08-01,0.27,94510.260000
9,2,580,Officejet 330,15927,0.11,0.000000,46,732642,7.326420e+05,80590.620000,8.132326e+05,2019-08-01 18:43:57.052767,3,1398,Officejet 330,Officejet 330,0.000000e+00,2019-08-01,0.27,219572.807400


In [234]:
total_commissions_for_day = day_orders_commissions['commission_amount'].sum()
total_commissions_for_day

20833236.938148525

In [235]:
day_orders_commissions.loc[day_orders_commissions['order_id'] == 97, 'commission_amount'].mean()

nan

In [236]:
# 7. Average amount of commission per order
average_commissions = day_orders_commissions[['order_id','commission_amount']].groupby('order_id').mean().mean()
average_commissions[0], type(average_commissions[0])

(166299.43931740624, numpy.float64)

In [241]:
# 8. Total amount of commissions earned per promotion
commissions_per_promo = day_orders_commissions.merge(day_prod_promos, how='inner', left_on='product_id', right_on='product_id', suffixes=('','_promos'))
commissions_per_promo[['commission_amount', 'promotion_id', ]].groupby('promotion_id').sum()

Unnamed: 0_level_0,commission_amount
promotion_id,Unnamed: 1_level_1
2,188049.4
5,1153804.8


In [245]:
commissions_check = day_orders_commissions.loc[day_orders_commissions['product_id'] == 272, 'commission_amount'].sum()
commissions_check

188049.40000000002

In [240]:
prods = pd.unique(day_orders_commissions['product_id']).tolist()
len(prods)
check_prods = day_prod_promos['product_id'].tolist()
check_prods
found = [cp for cp in check_prods if cp in prods]
found

[272, 835]