Read in `./data/merged-sales.csv` from the previous exercise:

In [1]:
import pandas as pd

# Load the merged sales line items saved by the previous exercise and preview the first rows.
merged_sales = pd.read_csv(filepath_or_buffer='./data/merged-sales.csv')
merged_sales.head(n=5)

Unnamed: 0,order_num,line_num,date,sku,qty,category,price,revenue
0,0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4
1,641,3,2012-10-10,sku4333,8,Showers,1563.9,12511.2
2,0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93
3,1,0,2011-01-02,sku75108,3,Misc,819.55,2458.65
4,1,1,2011-01-02,sku78838,9,Showers,2531.54,22783.86


In [2]:
def make_orderid(row):
    """Return the line-item identifier ``"<order_num>_<line_num>"`` for one row.

    ``row`` is any object exposing ``order_num`` and ``line_num`` attributes
    (for example a row passed in by ``DataFrame.apply(..., axis=1)``).
    """
    order_num, line_num = row.order_num, row.line_num
    return '{}_{}'.format(order_num, line_num)

# Label every line item with its "<order_num>_<line_num>" id and make that label the index.
merged_sales = (
    merged_sales
    .assign(orderid=merged_sales.apply(make_orderid, axis=1))
    .set_index('orderid')
)
merged_sales.head()

Unnamed: 0_level_0,order_num,line_num,date,sku,qty,category,price,revenue
orderid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0_0,0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4
641_3,641,3,2012-10-10,sku4333,8,Showers,1563.9,12511.2
0_1,0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93
1_0,1,0,2011-01-02,sku75108,3,Misc,819.55,2458.65
1_1,1,1,2011-01-02,sku78838,9,Showers,2531.54,22783.86


In [3]:
# Alternative keying: a two-level (order_num, line_num) index.
# The result is only displayed; merged_sales itself keeps its orderid index.
merged_sales.set_index(keys=['order_num', 'line_num']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,sku,qty,category,price,revenue
order_num,line_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4
641,3,2012-10-10,sku4333,8,Showers,1563.9,12511.2
0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93
1,0,2011-01-02,sku75108,3,Misc,819.55,2458.65
1,1,2011-01-02,sku78838,9,Showers,2531.54,22783.86


Generate a dataframe showing the total quantity sold and mean price per category

In [5]:
# Per category: sum of `qty` and mean of `price` over that category's line items.
# The list-valued specs keep the two-level (column, statistic) header in the result.
merged_sales.groupby('category').agg(dict(qty=['sum'], price=['mean']))

Unnamed: 0_level_0,qty,price
Unnamed: 0_level_1,sum,mean
category,Unnamed: 1_level_2,Unnamed: 2_level_2
Faucets,2888,2431.517419
Misc,3238,2501.622278
Showers,3057,2469.608597
Toilets,2857,2450.498562


In [6]:
# Reusable grouping of the line items by product category.
g = merged_sales.groupby(by='category')

In [7]:
# Same summary as the agg() version, built from two named Series placed side by side
# so the result has flat column names (`qty_sum`, `price_mean`).
pd.concat(
    [
        g['qty'].sum().rename('qty_sum'),
        g['price'].mean().rename('price_mean'),
    ],
    axis='columns',
)

Unnamed: 0_level_0,qty_sum,price_mean
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Faucets,2888,2431.517419
Misc,3238,2501.622278
Showers,3057,2469.608597
Toilets,2857,2450.498562


Add a column `order_share` that shows each line item's share of its order's total revenue, expressed as a fraction (e.g. 0.30 for 30%)

In [8]:
# Group line items by order; one revenue total per order (first 20 orders shown).
g = merged_sales.groupby(by='order_num')
g['revenue'].sum().head(20)

order_num
0     31237.33
1     32879.28
2     32284.50
3     50845.71
4     17308.97
5     13462.40
6     29113.67
7     35934.42
8     25182.55
9     11458.38
10     4338.16
11     2891.96
12     3644.78
13    57723.10
14    29416.54
15    23799.06
16    48692.40
17     8353.31
18    70804.24
19    55060.08
Name: revenue, dtype: float64

In [9]:
# Per-line-item revenue, indexed by orderid, for comparison with the per-order totals.
merged_sales['revenue'].head(n=5)

orderid
0_0       9383.40
641_3    12511.20
0_1      21853.93
1_0       2458.65
1_1      22783.86
Name: revenue, dtype: float64

In [10]:
# Order total repeated on every line item: transform('sum') broadcasts each order's
# revenue sum back onto the rows of that order, so the result shares merged_sales' index.
# The grouping is spelled out here instead of reusing `g`, because `g` is bound to a
# category grouping in one cell and an order grouping in another; relying on it would make
# this result depend on which of those cells ran last.
order_sales = merged_sales.groupby('order_num')['revenue'].transform('sum')
order_sales.head(20)

orderid
0_0      31237.33
641_3    61857.12
0_1      31237.33
1_0      32879.28
1_1      32879.28
1_2      32879.28
2_0      32284.50
2_1      32284.50
2_2      32284.50
3_0      50845.71
3_1      50845.71
3_2      50845.71
4_0      17308.97
4_1      17308.97
4_2      17308.97
4_3      17308.97
5_0      13462.40
6_0      29113.67
6_1      29113.67
6_2      29113.67
Name: revenue, dtype: float64

In [11]:
# Each line item's revenue divided by its order's total revenue (aligned on the orderid index).
merged_sales['order_share'] = merged_sales['revenue'].div(order_sales)
merged_sales.head()

Unnamed: 0_level_0,order_num,line_num,date,sku,qty,category,price,revenue,order_share
orderid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0_0,0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4,0.300391
641_3,641,3,2012-10-10,sku4333,8,Showers,1563.9,12511.2,0.20226
0_1,0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93,0.699609
1_0,1,0,2011-01-02,sku75108,3,Misc,819.55,2458.65,0.074778
1_1,1,1,2011-01-02,sku78838,9,Showers,2531.54,22783.86,0.692955


Show the rows for orders 0-5, sorted by order number and then line number.

In [12]:
# Keep the line items whose order_num is below 6, ordered by order and then by line.
merged_sales.loc[merged_sales['order_num'].lt(6)].sort_values(by=['order_num', 'line_num'])

Unnamed: 0_level_0,order_num,line_num,date,sku,qty,category,price,revenue,order_share
orderid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0_0,0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4,0.300391
0_1,0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93,0.699609
1_0,1,0,2011-01-02,sku75108,3,Misc,819.55,2458.65,0.074778
1_1,1,1,2011-01-02,sku78838,9,Showers,2531.54,22783.86,0.692955
1_2,1,2,2011-01-02,sku77480,9,Faucets,848.53,7636.77,0.232267
2_0,2,0,2011-01-05,sku45155,3,Misc,2682.97,8048.91,0.249312
2_1,2,1,2011-01-05,sku67327,9,Misc,1593.51,14341.59,0.444225
2_2,2,2,2011-01-05,sku27757,8,Misc,1236.75,9894.0,0.306463
3_0,3,0,2011-01-06,sku82064,4,Misc,771.72,3086.88,0.060711
3_1,3,1,2011-01-06,sku15816,3,Showers,4399.92,13199.76,0.259604


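Without `transform`: compute each order's total with `groupby(...).sum()`, merge it back onto the line items, and then divide.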

In [13]:
# Start again from the raw file so this approach does not build on the columns added above.
df = pd.read_csv(filepath_or_buffer='./data/merged-sales.csv')
df.head(n=5)

Unnamed: 0,order_num,line_num,date,sku,qty,category,price,revenue
0,0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4
1,641,3,2012-10-10,sku4333,8,Showers,1563.9,12511.2
2,0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93
3,1,0,2011-01-02,sku75108,3,Misc,819.55,2458.65
4,1,1,2011-01-02,sku78838,9,Showers,2531.54,22783.86


In [14]:
# One revenue total per order, as a Series indexed by order_num and named `order_total`
# so that it becomes a column of that name when merged.
order_total = df.groupby(by='order_num')['revenue'].agg('sum').rename('order_total')
order_total

order_num
0      31237.33
1      32879.28
2      32284.50
3      50845.71
4      17308.97
         ...   
986    92766.81
987    62268.66
988    14172.84
989    80476.62
990    32148.18
Name: order_total, Length: 991, dtype: float64

In [15]:
# Attach each order's total revenue to its line items by matching the `order_num` column
# against the index of `order_total`.
# Any `order_total` column left by a previous run is dropped first so the cell is idempotent:
# without that, running it twice would produce `order_total_x` / `order_total_y` and the
# `order_share` calculation that follows would fail with a KeyError.
df = pd.merge(
    df.drop(columns='order_total', errors='ignore'),
    order_total,
    left_on='order_num',
    right_index=True,
)
df.head()

Unnamed: 0,order_num,line_num,date,sku,qty,category,price,revenue,order_total
0,0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4,31237.33
2,0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93,31237.33
1,641,3,2012-10-10,sku4333,8,Showers,1563.9,12511.2,61857.12
1613,641,0,2012-10-10,sku93325,9,Showers,1650.08,14850.72,61857.12
1614,641,1,2012-10-10,sku69868,2,Showers,2625.48,5250.96,61857.12


In [16]:
# Line-item revenue as a fraction of the merged-in order total.
df['order_share'] = df['revenue'].div(df['order_total'])

In [17]:
# Preview the result; order_share should match the transform-based version above.
df.head(n=5)

Unnamed: 0,order_num,line_num,date,sku,qty,category,price,revenue,order_total,order_share
0,0,0,2011-01-01,sku4333,6,Showers,1563.9,9383.4,31237.33,0.300391
2,0,1,2011-01-01,sku76536,7,Faucets,3121.99,21853.93,31237.33,0.699609
1,641,3,2012-10-10,sku4333,8,Showers,1563.9,12511.2,61857.12,0.20226
1613,641,0,2012-10-10,sku93325,9,Showers,1650.08,14850.72,61857.12,0.240081
1614,641,1,2012-10-10,sku69868,2,Showers,2625.48,5250.96,61857.12,0.084889
