In [1]:
# Loading libraries and files
import numpy as np
import pandas as pd

# Read the fact table and each of its four dimension tables from disk.
market_df = pd.read_csv(filepath_or_buffer="./global_sales_data/market_fact.csv")
customer_df = pd.read_csv(filepath_or_buffer="./global_sales_data/cust_dimen.csv")
product_df = pd.read_csv(filepath_or_buffer="./global_sales_data/prod_dimen.csv")
shipping_df = pd.read_csv(filepath_or_buffer="./global_sales_data/shipping_dimen.csv")
orders_df = pd.read_csv(filepath_or_buffer="./global_sales_data/orders_dimen.csv")

# Build master_df by inner-joining one dimension table at a time onto the
# fact table: customers (Cust_id), products (Prod_id), shipping (Ship_id)
# and finally orders (Ord_id).
# NOTE(review): the merged frame carries Order_ID_x and Order_ID_y, so two of
# the inputs appear to share a non-key Order_ID column -- confirm whether
# both copies are needed.
df_1 = pd.merge(left=market_df, right=customer_df, on="Cust_id", how="inner")
df_2 = pd.merge(left=df_1, right=product_df, on="Prod_id", how="inner")
df_3 = pd.merge(left=df_2, right=shipping_df, on="Ship_id", how="inner")
master_df = pd.merge(left=df_3, right=orders_df, on="Ord_id", how="inner")

master_df.head()

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin,...,Region,Customer_Segment,Product_Category,Product_Sub_Category,Order_ID_x,Ship_Mode,Ship_Date,Order_ID_y,Order_Date,Order_Priority
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56,...,WEST,CORPORATE,OFFICE SUPPLIES,"SCISSORS, RULERS AND TRIMMERS",36262,REGULAR AIR,28-07-2010,36262,27-07-2010,NOT SPECIFIED
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54,...,WEST,CORPORATE,OFFICE SUPPLIES,PENS & ART SUPPLIES,20513,EXPRESS AIR,08-07-2009,20513,07-07-2009,HIGH
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59,...,WEST,CORPORATE,TECHNOLOGY,TELEPHONES AND COMMUNICATION,36262,EXPRESS AIR,27-07-2010,36262,27-07-2010,NOT SPECIFIED
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37,...,WEST,CORPORATE,OFFICE SUPPLIES,PAPER,39682,EXPRESS AIR,11-11-2010,39682,09-11-2010,MEDIUM
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38,...,WEST,CORPORATE,TECHNOLOGY,OFFICE MACHINES,54019,DELIVERY TRUCK,08-07-2009,54019,01-07-2009,LOW


In [3]:
# Mean Sales within each customer segment, computed with groupby.
(
    master_df
    .groupby(by="Customer_Segment")["Sales"]
    .mean()
)

Customer_Segment
CONSUMER          1857.859965
CORPORATE         1787.680389
HOME OFFICE       1754.312931
SMALL BUSINESS    1698.124841
Name: Sales, dtype: float64

In [2]:
# E.g. average Sales per customer segment, expressed as a pivot table:
# one row per segment, a single Sales column holding the mean.
master_df.pivot_table(
    values="Sales",
    index="Customer_Segment",
    aggfunc="mean",
)

Unnamed: 0_level_0,Sales
Customer_Segment,Unnamed: 1_level_1
CONSUMER,1857.859965
CORPORATE,1787.680389
HOME OFFICE,1754.312931
SMALL BUSINESS,1698.124841


In [7]:
def is_positive(x):
    """Tell whether ``x`` is strictly greater than zero.

    Zero itself does not count as positive. The result is whatever the
    ``>`` comparison of ``x`` with 0 produces (a boolean for a plain number).
    """
    return 0 < x

# Flag every row whose Profit is strictly positive.
# is_positive is just the comparison `x > 0`, so it can be handed the whole
# Profit column at once: pandas evaluates the comparison element-wise and
# returns the same boolean column that .apply(is_positive) builds, without
# one Python function call per row.
master_df["is_profitable"] = is_positive(master_df["Profit"])
master_df.head()

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin,...,Customer_Segment,Product_Category,Product_Sub_Category,Order_ID_x,Ship_Mode,Ship_Date,Order_ID_y,Order_Date,Order_Priority,is_profitable
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56,...,CORPORATE,OFFICE SUPPLIES,"SCISSORS, RULERS AND TRIMMERS",36262,REGULAR AIR,28-07-2010,36262,27-07-2010,NOT SPECIFIED,False
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54,...,CORPORATE,OFFICE SUPPLIES,PENS & ART SUPPLIES,20513,EXPRESS AIR,08-07-2009,20513,07-07-2009,HIGH,True
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59,...,CORPORATE,TECHNOLOGY,TELEPHONES AND COMMUNICATION,36262,EXPRESS AIR,27-07-2010,36262,27-07-2010,NOT SPECIFIED,True
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37,...,CORPORATE,OFFICE SUPPLIES,PAPER,39682,EXPRESS AIR,11-11-2010,39682,09-11-2010,MEDIUM,True
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38,...,CORPORATE,TECHNOLOGY,OFFICE MACHINES,54019,DELIVERY TRUCK,08-07-2009,54019,01-07-2009,LOW,True


In [8]:
# The same profitability check written inline with an anonymous function
# instead of a named helper.
master_df["Profit"].map(lambda profit: profit > 0)

0       False
1        True
2        True
3        True
4        True
        ...  
8394     True
8395    False
8396    False
8397     True
8398     True
Name: Profit, Length: 8399, dtype: bool

In [10]:
# Add profit_per_qty: each row's Profit divided by its Order_Quantity.
master_df["profit_per_qty"] = master_df["Profit"].div(master_df["Order_Quantity"])
master_df.head()

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin,...,Product_Category,Product_Sub_Category,Order_ID_x,Ship_Mode,Ship_Date,Order_ID_y,Order_Date,Order_Priority,is_profitable,profit_per_qty
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56,...,OFFICE SUPPLIES,"SCISSORS, RULERS AND TRIMMERS",36262,REGULAR AIR,28-07-2010,36262,27-07-2010,NOT SPECIFIED,False,-1.326522
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54,...,OFFICE SUPPLIES,PENS & ART SUPPLIES,20513,EXPRESS AIR,08-07-2009,20513,07-07-2009,HIGH,True,0.350769
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59,...,TECHNOLOGY,TELEPHONES AND COMMUNICATION,36262,EXPRESS AIR,27-07-2010,36262,27-07-2010,NOT SPECIFIED,True,44.188462
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37,...,OFFICE SUPPLIES,PAPER,39682,EXPRESS AIR,11-11-2010,39682,09-11-2010,MEDIUM,True,16.961395
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38,...,TECHNOLOGY,OFFICE MACHINES,54019,DELIVERY TRUCK,08-07-2009,54019,01-07-2009,LOW,True,34.853429


In [11]:
# E.g. number of profitable orders per region: summing the boolean
# is_profitable flag counts the rows where it is True.
master_df.pivot_table(
    values="is_profitable",
    index="Region",
    aggfunc="sum",
)

Unnamed: 0_level_0,is_profitable
Region,Unnamed: 1_level_1
ATLANTIC,544
NORTHWEST TERRITORIES,194
NUNAVUT,38
ONTARIO,916
PRARIE,852
QUEBEC,360
WEST,969
YUKON,262


In [13]:
# Same count of profitable orders per region, this time through groupby.
(
    master_df
    .groupby(by="Region")["is_profitable"]
    .sum()
)

Region
ATLANTIC                 544
NORTHWEST TERRITORIES    194
NUNAVUT                   38
ONTARIO                  916
PRARIE                   852
QUEBEC                   360
WEST                     969
YUKON                    262
Name: is_profitable, dtype: int64

In [16]:
# Summing a sequence of 0/1 values counts the ones -- the reason summing the
# is_profitable flag gives the number of profitable orders.
sum((0, 1, 0))

1

In [18]:
# Per region, how many orders are profitable (True) versus not (False).
(
    master_df
    .groupby(by="Region")["is_profitable"]
    .value_counts()
)

Region                 is_profitable
ATLANTIC               True              544
                       False             536
NORTHWEST TERRITORIES  False             200
                       True              194
NUNAVUT                False              41
                       True               38
ONTARIO                True              916
                       False             910
PRARIE                 False             854
                       True              852
QUEBEC                 False             421
                       True              360
WEST                   False            1022
                       True              969
YUKON                  False             280
                       True              262
Name: count, dtype: int64

In [19]:
# Total Profit for every combination of product category (rows) and
# customer segment (columns).
master_df.pivot_table(
    values="Profit",
    index="Product_Category",
    columns="Customer_Segment",
    aggfunc="sum",
)

Customer_Segment,CONSUMER,CORPORATE,HOME OFFICE,SMALL BUSINESS
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FURNITURE,42728.26,22008.08,23979.2,28717.49
OFFICE SUPPLIES,88532.29,203037.38,121145.65,105306.11
TECHNOLOGY,156699.39,374700.54,173229.18,181684.41


In [None]:
# Assignment: Use groupby to answer the question above: "Compare the total profit across product categories and customer segments"