# Tutorial: Joining data
In this tutorial, we will calculate the total net revenue for all our bikes sold via our online store. We will use Microsoft's AdventureWorks data.

### Imports & Data Warehouse Connection

In [None]:
!pip install rasgoql, rasgotransforms --upgrade

In [2]:
import pandas as pd
import rasgoql

In [4]:
# Action needed: Set env_file to your filepath
env_file = "<path_to_env_file>"
creds = rasgoql.SnowflakeCredentials.from_env(env_file)
rql = rasgoql.connect(creds)

### Gather relevant data
To calculate the total net revenue, we need to join the individual order information with any promotion data for a given sale. This data is spread across two tables: 'FACTINTERNETSALES' and 'DIMPROMOTION'.

We have two options to load this data:

#### Option 1: Use Existing Tables in your Data Warehouse

In [None]:
# Action needed: Change your_namespace to the db.schema your data resides in
your_namespace = 'db.schema'

sales_ds = rql.dataset(f'{your_namespace}.FACTINTERNETSALES')
promotion_ds = rql.dataset(f'{your_namespace}.DIMPROMOTION')


#### Option 2: Load Data from csvs
Note: The csv files used in this demo are available at https://github.com/rasgointelligence/RasgoQL/blob/8a02062e3fea458a4ffcfe9a908f7224fa86588a/tutorials/

In [8]:
# Load Fact file into csv
sales_pd = pd.read_csv(
    'FactInternetSales.csv',
    delimiter='|',
    names=['PRODUCTKEY', 'ORDERDATEKEY', 'DUEDATEKEY', 'SHIPDATEKEY',
       'CUSTOMERKEY', 'PROMOTIONKEY', 'CURRENCYKEY', 'SALESTERRITORYKEY',
       'SALESORDERNUMBER', 'SALESORDERLINENUMBER', 'REVISIONNUMBER',
       'ORDERQUANTITY', 'UNITPRICE', 'EXTENDEDAMOUNT', 'UNITPRICEDISCOUNTPCT',
       'DISCOUNTAMOUNT', 'PRODUCTSTANDARDCOST', 'TOTALPRODUCTCOST',
       'SALESAMOUNT', 'TAXAMT', 'FREIGHT', 'CARRIERTRACKINGNUMBER',
       'CUSTOMERPONUMBER', 'ORDERDATE', 'DUEDATE', 'SHIPDATE'],
    parse_dates=True,
    encoding='utf_16_le'
    )

# Convert known date fields
sales_pd['ORDERDATE'] = pd.to_datetime(sales_pd['ORDERDATE']).dt.date
sales_pd['DUEDATE'] = pd.to_datetime(sales_pd['DUEDATE']).dt.date
sales_pd['SHIPDATE'] = pd.to_datetime(sales_pd['SHIPDATE']).dt.date

# Load Dim file into csv
promotion_pd = pd.read_csv(
    'DimPromotion.csv',
    delimiter='|',
    names=['PROMOTIONKEY', 'PROMOTIONALTERNATEKEY', 'ENGLISHPROMOTIONNAME',
       'SPANISHPROMOTIONNAME', 'FRENCHPROMOTIONNAME', 'DISCOUNTPCT',
       'ENGLISHPROMOTIONTYPE', 'SPANISHPROMOTIONTYPE', 'FRENCHPROMOTIONTYPE',
       'ENGLISHPROMOTIONCATEGORY', 'SPANISHPROMOTIONCATEGORY',
       'FRENCHPROMOTIONCATEGORY', 'STARTDATE', 'ENDDATE', 'MINQTY', 'MAXQTY'],
    parse_dates=True,
    encoding='utf_16_le'
    )

# Convert known date fields
promotion_pd['STARTDATE'] = pd.to_datetime(promotion_pd['STARTDATE']).dt.date
promotion_pd['ENDDATE'] = pd.to_datetime(promotion_pd['ENDDATE']).dt.date

In [17]:
# Load csvs into your DW as a Dataset
sales_ds = rql.dataset_from_df(
    df=sales_pd,
    table_name='FACTINTERNETSALES',
    method='replace'
)

promotion_ds = rql.dataset_from_df(
    df=promotion_pd,
    table_name='DIMPROMOTION',
    method='replace'
)

### Pivot data by date
Starting with Pandas, lets pivot the local csv containing sales data to group things by date.

In [9]:
gross_sales = pd.pivot_table(sales_pd,
                             index='ORDERDATE',
                             aggfunc= sum)
gross_sales

Unnamed: 0_level_0,CARRIERTRACKINGNUMBER,CURRENCYKEY,CUSTOMERKEY,CUSTOMERPONUMBER,DISCOUNTAMOUNT,DUEDATEKEY,EXTENDEDAMOUNT,FREIGHT,ORDERDATEKEY,ORDERQUANTITY,...,PROMOTIONKEY,REVISIONNUMBER,SALESAMOUNT,SALESORDERLINENUMBER,SALESTERRITORYKEY,SHIPDATEKEY,TAXAMT,TOTALPRODUCTCOST,UNITPRICE,UNITPRICEDISCOUNTPCT
ORDERDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-29,0.0,264,101524,0.0,0,100550550,14477.3382,361.9337,100506145,5,...,5,5,14477.3382,5,27,100550525,1158.1871,8320.9037,14477.3382,0
2010-12-30,0.0,118,66285,0.0,0,80440444,13931.5200,348.2882,80404920,4,...,4,4,13931.5200,4,31,80440424,1114.5216,8152.8372,13931.5200,0
2010-12-31,0.0,310,108147,0.0,0,100550560,15012.1782,375.3047,100506155,5,...,5,5,15012.1782,5,36,100550535,1200.9743,9098.3231,15012.1782,0
2011-01-01,0.0,129,41119,0.0,0,40220226,7156.5400,178.9136,40220202,2,...,2,2,7156.5400,2,9,40220216,572.5232,4342.5884,7156.5400,0
2011-01-02,0.0,216,99453,0.0,0,100550570,15012.1782,375.3047,100550510,5,...,5,5,15012.1782,5,41,100550545,1200.9743,9098.3231,15012.1782,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-01-24,0.0,6500,1298903,0.0,0,1309113325,1502.8500,37.5742,1309108060,65,...,65,65,1502.8500,117,374,1309108515,120.2280,632.1362,1502.8500,0
2014-01-25,0.0,8200,1541874,0.0,0,1651496892,1747.6700,43.6954,1651490250,82,...,82,82,1747.6700,166,538,1651496482,139.8136,782.6954,1747.6700,0
2014-01-26,0.0,6800,1232559,0.0,0,1369534076,1847.4600,46.1895,1369528568,68,...,68,68,1847.4600,122,420,1369533736,147.7968,849.7083,1847.4600,0
2014-01-27,0.0,6100,1177830,0.0,0,1228552688,1477.6100,36.9429,1228547747,61,...,61,61,1477.6100,123,299,1228552383,118.2088,660.3122,1477.6100,0


Before we join the promotion data in, lets isolate things to only the data that's necessary to calculate net revenue. Here we just create a new dataframe only containing the promotion key information and the discount percentage.

In [10]:
lean_promo = pd.DataFrame(data=promotion_pd,
                            columns=['PROMOTIONKEY', 'DISCOUNTPCT']
                            )
lean_promo

Unnamed: 0,PROMOTIONKEY,DISCOUNTPCT
0,1,0.0
1,2,0.02
2,3,0.05
3,4,0.1
4,5,0.15
5,6,0.2
6,7,0.35
7,8,0.1
8,9,0.3
9,10,0.5


In [11]:
# Merge the two datasets together
sales_with_promo = pd.merge(sales_pd, lean_promo, left_on='PROMOTIONKEY', right_index=True, how='left')

sales_with_promo

Unnamed: 0,PRODUCTKEY,ORDERDATEKEY,DUEDATEKEY,SHIPDATEKEY,CUSTOMERKEY,PROMOTIONKEY_x,CURRENCYKEY,SALESTERRITORYKEY,SALESORDERNUMBER,SALESORDERLINENUMBER,...,SALESAMOUNT,TAXAMT,FREIGHT,CARRIERTRACKINGNUMBER,CUSTOMERPONUMBER,ORDERDATE,DUEDATE,SHIPDATE,PROMOTIONKEY_y,DISCOUNTPCT
0,310,20101229,20110110,20110105,21768,1,19,6,SO43697,1,...,3578.2700,286.2616,89.4568,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
1,346,20101229,20110110,20110105,28389,1,39,7,SO43698,1,...,3399.9900,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
2,346,20101229,20110110,20110105,25863,1,100,1,SO43699,1,...,3399.9900,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
3,336,20101229,20110110,20110105,14501,1,100,4,SO43700,1,...,699.0982,55.9279,17.4775,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
4,346,20101229,20110110,20110105,11003,1,6,9,SO43701,1,...,3399.9900,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60393,485,20140128,20140209,20140204,15868,1,100,6,SO75122,1,...,21.9800,1.7584,0.5495,,,2014-01-28,2014-02-09,2014-02-04,2,0.02
60394,225,20140128,20140209,20140204,15868,1,100,6,SO75122,2,...,8.9900,0.7192,0.2248,,,2014-01-28,2014-02-09,2014-02-04,2,0.02
60395,485,20140128,20140209,20140204,18759,1,100,6,SO75123,1,...,21.9800,1.7584,0.5495,,,2014-01-28,2014-02-09,2014-02-04,2,0.02
60396,486,20140128,20140209,20140204,18759,1,100,6,SO75123,2,...,159.0000,12.7200,3.9750,,,2014-01-28,2014-02-09,2014-02-04,2,0.02


In [12]:
# or 
sales_pd.merge(lean_promo, left_on='PROMOTIONKEY', right_index=True, how='left')

Unnamed: 0,PRODUCTKEY,ORDERDATEKEY,DUEDATEKEY,SHIPDATEKEY,CUSTOMERKEY,PROMOTIONKEY_x,CURRENCYKEY,SALESTERRITORYKEY,SALESORDERNUMBER,SALESORDERLINENUMBER,...,SALESAMOUNT,TAXAMT,FREIGHT,CARRIERTRACKINGNUMBER,CUSTOMERPONUMBER,ORDERDATE,DUEDATE,SHIPDATE,PROMOTIONKEY_y,DISCOUNTPCT
0,310,20101229,20110110,20110105,21768,1,19,6,SO43697,1,...,3578.2700,286.2616,89.4568,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
1,346,20101229,20110110,20110105,28389,1,39,7,SO43698,1,...,3399.9900,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
2,346,20101229,20110110,20110105,25863,1,100,1,SO43699,1,...,3399.9900,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
3,336,20101229,20110110,20110105,14501,1,100,4,SO43700,1,...,699.0982,55.9279,17.4775,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
4,346,20101229,20110110,20110105,11003,1,6,9,SO43701,1,...,3399.9900,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60393,485,20140128,20140209,20140204,15868,1,100,6,SO75122,1,...,21.9800,1.7584,0.5495,,,2014-01-28,2014-02-09,2014-02-04,2,0.02
60394,225,20140128,20140209,20140204,15868,1,100,6,SO75122,2,...,8.9900,0.7192,0.2248,,,2014-01-28,2014-02-09,2014-02-04,2,0.02
60395,485,20140128,20140209,20140204,18759,1,100,6,SO75123,1,...,21.9800,1.7584,0.5495,,,2014-01-28,2014-02-09,2014-02-04,2,0.02
60396,486,20140128,20140209,20140204,18759,1,100,6,SO75123,2,...,159.0000,12.7200,3.9750,,,2014-01-28,2014-02-09,2014-02-04,2,0.02


### Calculate Net Revenue

In [13]:
# Subtract DISCOUNT_PCT from SALESAMOUNT
sales_with_promo['SALESAMOUNT_NET'] = sales_with_promo['SALESAMOUNT'] - (sales_with_promo['SALESAMOUNT'] * sales_with_promo['DISCOUNTPCT'])
sales_with_promo

Unnamed: 0,PRODUCTKEY,ORDERDATEKEY,DUEDATEKEY,SHIPDATEKEY,CUSTOMERKEY,PROMOTIONKEY_x,CURRENCYKEY,SALESTERRITORYKEY,SALESORDERNUMBER,SALESORDERLINENUMBER,...,TAXAMT,FREIGHT,CARRIERTRACKINGNUMBER,CUSTOMERPONUMBER,ORDERDATE,DUEDATE,SHIPDATE,PROMOTIONKEY_y,DISCOUNTPCT,SALESAMOUNT_NET
0,310,20101229,20110110,20110105,21768,1,19,6,SO43697,1,...,286.2616,89.4568,,,2010-12-29,2011-01-10,2011-01-05,2,0.02,3506.704600
1,346,20101229,20110110,20110105,28389,1,39,7,SO43698,1,...,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02,3331.990200
2,346,20101229,20110110,20110105,25863,1,100,1,SO43699,1,...,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02,3331.990200
3,336,20101229,20110110,20110105,14501,1,100,4,SO43700,1,...,55.9279,17.4775,,,2010-12-29,2011-01-10,2011-01-05,2,0.02,685.116236
4,346,20101229,20110110,20110105,11003,1,6,9,SO43701,1,...,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,2,0.02,3331.990200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60393,485,20140128,20140209,20140204,15868,1,100,6,SO75122,1,...,1.7584,0.5495,,,2014-01-28,2014-02-09,2014-02-04,2,0.02,21.540400
60394,225,20140128,20140209,20140204,15868,1,100,6,SO75122,2,...,0.7192,0.2248,,,2014-01-28,2014-02-09,2014-02-04,2,0.02,8.810200
60395,485,20140128,20140209,20140204,18759,1,100,6,SO75123,1,...,1.7584,0.5495,,,2014-01-28,2014-02-09,2014-02-04,2,0.02,21.540400
60396,486,20140128,20140209,20140204,18759,1,100,6,SO75123,2,...,12.7200,3.9750,,,2014-01-28,2014-02-09,2014-02-04,2,0.02,155.820000


# RASGOQL
Now lets repeat the same excersise with RasgoQL and this time have all the compute done directly in the cloud data warehouse.

In [22]:
# Grab only what we need from promo

promo_data = promotion_ds.drop_columns(
    include_cols=['PROMOTIONKEY', 'DISCOUNTPCT']
).save(table_name='PROMO_DATA')

In [23]:
# Combine them into one dataset
sales_and_promo = sales_ds.join(join_table=promo_data.fqtn,
                                 join_type='RIGHT',
                                 join_columns={'PROMOTIONKEY':'PROMOTIONKEY'}
                                )
sales_and_promo.preview()

Unnamed: 0,PRODUCTKEY,ORDERDATEKEY,DUEDATEKEY,SHIPDATEKEY,CUSTOMERKEY,PROMOTIONKEY,CURRENCYKEY,SALESTERRITORYKEY,SALESORDERNUMBER,SALESORDERLINENUMBER,...,TOTALPRODUCTCOST,SALESAMOUNT,TAXAMT,FREIGHT,CARRIERTRACKINGNUMBER,CUSTOMERPONUMBER,ORDERDATE,DUEDATE,SHIPDATE,DISCOUNTPCT
0,310,20101229,20110110,20110105,21768,1,19,6,SO43697,1,...,2171.2942,3578.27,286.2616,89.4568,,,2010-12-29,2011-01-10,2011-01-05,0.0
1,346,20101229,20110110,20110105,28389,1,39,7,SO43698,1,...,1912.1544,3399.99,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,0.0
2,346,20101229,20110110,20110105,25863,1,100,1,SO43699,1,...,1912.1544,3399.99,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,0.0
3,336,20101229,20110110,20110105,14501,1,100,4,SO43700,1,...,413.1463,699.0982,55.9279,17.4775,,,2010-12-29,2011-01-10,2011-01-05,0.0
4,346,20101229,20110110,20110105,11003,1,6,9,SO43701,1,...,1912.1544,3399.99,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,0.0
5,311,20101230,20110111,20110106,27645,1,100,4,SO43702,1,...,2171.2942,3578.27,286.2616,89.4568,,,2010-12-30,2011-01-11,2011-01-06,0.0
6,310,20101230,20110111,20110106,16624,1,6,9,SO43703,1,...,2171.2942,3578.27,286.2616,89.4568,,,2010-12-30,2011-01-11,2011-01-06,0.0
7,351,20101230,20110111,20110106,11005,1,6,9,SO43704,1,...,1898.0944,3374.99,269.9992,84.3748,,,2010-12-30,2011-01-11,2011-01-06,0.0
8,344,20101230,20110111,20110106,11011,1,6,9,SO43705,1,...,1912.1544,3399.99,271.9992,84.9998,,,2010-12-30,2011-01-11,2011-01-06,0.0
9,312,20101231,20110112,20110107,27621,1,100,4,SO43706,1,...,2171.2942,3578.27,286.2616,89.4568,,,2010-12-31,2011-01-12,2011-01-07,0.0


Lastly, we'll use the RasgoQL `math` transform to subtract DISCOUNTPCT amount from SALESAMOUNT for all orders in the warehouse. 

In [24]:
# Calculate net sales 

net_sales = sales_and_promo.math(math_ops=['SALESAMOUNT * DISCOUNTPCT', 'SALESAMOUNT - DISCOUNTTOTAL'],
                                 names=['DISCOUNTTOTAL', 'NET_SALE']
                                )
net_sales.preview()

Unnamed: 0,PRODUCTKEY,ORDERDATEKEY,DUEDATEKEY,SHIPDATEKEY,CUSTOMERKEY,PROMOTIONKEY,CURRENCYKEY,SALESTERRITORYKEY,SALESORDERNUMBER,SALESORDERLINENUMBER,...,TAXAMT,FREIGHT,CARRIERTRACKINGNUMBER,CUSTOMERPONUMBER,ORDERDATE,DUEDATE,SHIPDATE,DISCOUNTPCT,DISCOUNTTOTAL,NET_SALE
0,310,20101229,20110110,20110105,21768,1,19,6,SO43697,1,...,286.2616,89.4568,,,2010-12-29,2011-01-10,2011-01-05,0.0,0.0,3578.27
1,346,20101229,20110110,20110105,28389,1,39,7,SO43698,1,...,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,0.0,0.0,3399.99
2,346,20101229,20110110,20110105,25863,1,100,1,SO43699,1,...,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,0.0,0.0,3399.99
3,336,20101229,20110110,20110105,14501,1,100,4,SO43700,1,...,55.9279,17.4775,,,2010-12-29,2011-01-10,2011-01-05,0.0,0.0,699.0982
4,346,20101229,20110110,20110105,11003,1,6,9,SO43701,1,...,271.9992,84.9998,,,2010-12-29,2011-01-10,2011-01-05,0.0,0.0,3399.99
5,311,20101230,20110111,20110106,27645,1,100,4,SO43702,1,...,286.2616,89.4568,,,2010-12-30,2011-01-11,2011-01-06,0.0,0.0,3578.27
6,310,20101230,20110111,20110106,16624,1,6,9,SO43703,1,...,286.2616,89.4568,,,2010-12-30,2011-01-11,2011-01-06,0.0,0.0,3578.27
7,351,20101230,20110111,20110106,11005,1,6,9,SO43704,1,...,269.9992,84.3748,,,2010-12-30,2011-01-11,2011-01-06,0.0,0.0,3374.99
8,344,20101230,20110111,20110106,11011,1,6,9,SO43705,1,...,271.9992,84.9998,,,2010-12-30,2011-01-11,2011-01-06,0.0,0.0,3399.99
9,312,20101231,20110112,20110107,27621,1,100,4,SO43706,1,...,286.2616,89.4568,,,2010-12-31,2011-01-12,2011-01-07,0.0,0.0,3578.27


As you can see, rasgoQL produces the same results as our pandas workflow above.

Performing these operations using rasgoQL has two added benefits:
1. we can print the SQL used to replicate these calculations in our Data Warehouse
2. we can export the SQL as a model to use in dbt

In [25]:
# Review SQL
print(net_sales.sql())

WITH RQL_ESLZTLXZPB AS (
SELECT
  t1.PRODUCTKEY, 
  t1.ORDERDATEKEY, 
  t1.DUEDATEKEY, 
  t1.SHIPDATEKEY, 
  t1.CUSTOMERKEY, 
  t1.PROMOTIONKEY, 
  t1.CURRENCYKEY, 
  t1.SALESTERRITORYKEY, 
  t1.SALESORDERNUMBER, 
  t1.SALESORDERLINENUMBER, 
  t1.REVISIONNUMBER, 
  t1.ORDERQUANTITY, 
  t1.UNITPRICE, 
  t1.EXTENDEDAMOUNT, 
  t1.UNITPRICEDISCOUNTPCT, 
  t1.DISCOUNTAMOUNT, 
  t1.PRODUCTSTANDARDCOST, 
  t1.TOTALPRODUCTCOST, 
  t1.SALESAMOUNT, 
  t1.TAXAMT, 
  t1.FREIGHT, 
  t1.CARRIERTRACKINGNUMBER, 
  t1.CUSTOMERPONUMBER, 
  t1.ORDERDATE, 
  t1.DUEDATE, 
  t1.SHIPDATE, t2.DISCOUNTPCT
    
FROM rasgolocal.public.FACTINTERNETSALES as t1
RIGHT JOIN rasgolocal.public.PROMO_DATA as t2
ON t1.PROMOTIONKEY = t2.PROMOTIONKEY
) SELECT *
    , SALESAMOUNT * DISCOUNTPCT as DISCOUNTTOTAL
    , SALESAMOUNT - DISCOUNTTOTAL as NET_SALE
FROM RQL_ESLZTLXZPB


In [26]:
# Export to DBT 
net_sales.to_dbt('./online_store_net_salesv2')

INFO:rasgoQL:to_dbt is a beta feature. You are among the first rasgoql users to preview this feature! There may still be bugs to discover. If you experience any unexpected behaviors, please raise an issue in our github repo (https://github.com/rasgointelligence/RasgoQL/issues) or contact us directly on slack.


'./online_store_net_salesv2/rasgoql/dbt_project.yml'