In [None]:
# data ingest into postgres
# These are psql commands to run from a terminal session, not Python code for
# this kernel. The first line opens a psql session (fill in the <> placeholders);
# each \i line then executes the named SQL script inside that session.
psql --host=<> --port=5432 --username=<> --dbname=<>
\i src/create_db_tables_pg.sql
\i data/load_db_tables_pg.sql

In [4]:
import json
from urllib.parse import quote_plus

import boto3

In [5]:
# ipython-sql display settings. The option meanings noted here follow the
# ipython-sql documentation; confirm them against the installed version.
# autopandas: return query results as pandas DataFrames.
%config SqlMagic.autopandas=True
# displaycon: do not echo the connection string with each result.
%config SqlMagic.displaycon=False
# feedback: suppress the status message printed after a statement runs.
%config SqlMagic.feedback=False
# displaylimit: presumably caps how many rows of a result set are rendered -- TODO confirm.
%config SqlMagic.displaylimit=5
# Load (or reload) the sql extension that provides the %sql and %%sql magics.
%reload_ext sql

In [6]:
def get_secret(secret_name, region_name="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region used to create the Secrets Manager client.

    Returns:
        The object produced by ``json.loads`` on the response's
        ``SecretString`` field.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])

In [7]:
# Pull the RDS connection settings from Secrets Manager so that no credential
# is hard-coded in the notebook.
secret_vals = get_secret("wysde")

postgres_endpoint = secret_vals['RDS_POSTGRES_HOST']
postgres_user = secret_vals['RDS_POSTGRES_USERNAME']
postgres_pass = secret_vals['RDS_POSTGRES_PASSWORD']
port = secret_vals['RDS_POSTGRES_PORT']
dbname = "sparsh"

# Build the SQLAlchemy URL. The user name and password are URL-escaped because
# characters such as '@', '/', ':' or '#' in a credential would otherwise be
# parsed as URL delimiters and corrupt the host/port/database portion.
conn = "postgresql+psycopg2://%s:%s@%s:%s/%s" % (
    quote_plus(str(postgres_user)),
    quote_plus(str(postgres_pass)),
    postgres_endpoint,
    port,
    dbname,
)

In [40]:
# Open the database connection for the %sql / %%sql magics, using the
# SQLAlchemy URL held in the Python variable `conn` (interpolated via the braces).
%sql {conn}

## Standard Transformations

1. Projection of data
2. Filtering Data
3. Performing Aggregations
4. Joins
5. Sorting
6. Ranking

### Selecting or Projecting Data

In [8]:
%%sql
/* Preview ten rows of the orders table with every column. */
SELECT *
FROM orders
LIMIT 10;

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25,11599,CLOSED
1,2,2013-07-25,256,PENDING_PAYMENT
2,3,2013-07-25,12111,COMPLETE
3,4,2013-07-25,8827,CLOSED
4,5,2013-07-25,11318,COMPLETE
5,6,2013-07-25,7130,COMPLETE
6,7,2013-07-25,4530,COMPLETE
7,8,2013-07-25,2911,PROCESSING
8,9,2013-07-25,5657,PENDING_PAYMENT
9,10,2013-07-25,5648,PENDING_PAYMENT


In [13]:
%%sql
/* List the catalog metadata for the columns of the orders table.
   The dbname placeholder is filled in from the Python variable of the
   same name before the statement is sent. */
SELECT *
FROM information_schema.columns
WHERE table_catalog = '{dbname}'
    AND table_name = 'orders'

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,sparsh,public,orders,order_id,1,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,sparsh,public,orders,order_date,2,,NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
2,sparsh,public,orders,order_customer_id,3,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
3,sparsh,public,orders,order_status,4,,NO,character varying,45.0,180.0,...,NO,,,,,,NO,NEVER,,YES


In [14]:
%%sql
/* Project the customer id, the order month formatted as text, and the
   status for ten orders. */
SELECT
    order_customer_id,
    to_char(order_date, 'yyyy-MM') AS order_month,
    order_status
FROM orders
LIMIT 10;

Unnamed: 0,order_customer_id,order_month,order_status
0,11599,2013-07,CLOSED
1,256,2013-07,PENDING_PAYMENT
2,12111,2013-07,COMPLETE
3,8827,2013-07,CLOSED
4,11318,2013-07,COMPLETE
5,7130,2013-07,COMPLETE
6,4530,2013-07,COMPLETE
7,2911,2013-07,PROCESSING
8,5657,2013-07,PENDING_PAYMENT
9,5648,2013-07,PENDING_PAYMENT


In [15]:
%%sql
/* Count how many distinct year-month values occur in order_date. */
SELECT
    count(DISTINCT to_char(order_date, 'yyyy-MM')) AS distinct_month_count
FROM orders

Unnamed: 0,distinct_month_count
0,13


### Filtering Data

In [16]:
%%sql
/* Count the orders whose status is COMPLETE or CLOSED.
   An aggregate without GROUP BY always yields exactly one row, so no
   LIMIT clause is needed. */
SELECT COUNT(1)
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')

Unnamed: 0,count
0,30455


In [17]:
%%sql
/* Filter on order_date using a full timestamp literal and show three matches. */
SELECT *
FROM orders
WHERE order_date = '2014-01-01 00:00:00.0'
LIMIT 3

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,25876,2014-01-01,3414,PENDING_PAYMENT
1,25877,2014-01-01,5549,PENDING_PAYMENT
2,25878,2014-01-01,9084,PENDING


In [18]:
%%sql
/* Filter on order_date using a date-only literal and show three matches. */
SELECT *
FROM orders
WHERE order_date = '2014-01-01'
LIMIT 3

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,25876,2014-01-01,3414,PENDING_PAYMENT
1,25877,2014-01-01,5549,PENDING_PAYMENT
2,25878,2014-01-01,9084,PENDING


In [20]:
%%sql
/* Completed or closed orders whose formatted order_date matches the
   January 2014 LIKE prefix pattern. */
SELECT *
FROM orders
WHERE
    order_status IN ('COMPLETE', 'CLOSED')
    AND to_char(order_date, 'yyyy-MM-dd') LIKE '2014-01-%'
LIMIT 10

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,25882,2014-01-01,4598,COMPLETE
1,25888,2014-01-01,6735,COMPLETE
2,25889,2014-01-01,10045,COMPLETE
3,25891,2014-01-01,3037,CLOSED
4,25895,2014-01-01,1044,COMPLETE
5,25897,2014-01-01,6405,COMPLETE
6,25898,2014-01-01,3950,COMPLETE
7,25899,2014-01-01,8068,CLOSED
8,25900,2014-01-01,2382,CLOSED
9,25901,2014-01-01,3099,COMPLETE


In [21]:
%%sql
/* Completed or closed orders whose order_date, formatted as year-month,
   equals 2014-01. */
SELECT *
FROM orders
WHERE
    order_status IN ('COMPLETE', 'CLOSED')
    AND to_char(order_date, 'yyyy-MM') = '2014-01'
LIMIT 10

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,25882,2014-01-01,4598,COMPLETE
1,25888,2014-01-01,6735,COMPLETE
2,25889,2014-01-01,10045,COMPLETE
3,25891,2014-01-01,3037,CLOSED
4,25895,2014-01-01,1044,COMPLETE
5,25897,2014-01-01,6405,COMPLETE
6,25898,2014-01-01,3950,COMPLETE
7,25899,2014-01-01,8068,CLOSED
8,25900,2014-01-01,2382,CLOSED
9,25901,2014-01-01,3099,COMPLETE


In [22]:
%%sql
/* Count completed or closed orders placed in January 2014 using a POSIX
   regular expression match. The caret anchors the pattern to the start of
   the formatted date, so it can only match the year-month prefix rather
   than any position inside the string. */
SELECT COUNT(1)
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
    AND to_char(order_date, 'yyyy-MM-dd') ~ '^2014-01'

Unnamed: 0,count
0,2544


In [25]:
%%sql
/* Summarise completed or closed orders for the first quarter of 2014.
   order_date is a timestamp column, so the range is written half-open,
   on or after 1 January and strictly before 1 April. BETWEEN with an
   upper bound of 2014-03-31 stops at midnight and would silently drop
   any row stamped later on 31 March. */
SELECT count(1), min(order_date), max(order_date), count(DISTINCT order_date)
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
    AND order_date >= '2014-01-01'
    AND order_date < '2014-04-01'

Unnamed: 0,count,min,max,count.1
0,7594,2014-01-01,2014-03-31,89


### Table Joins

In [23]:
%%sql
/* Inner join of orders to their line items. Orders without any matching
   row in order_items are not returned. */
SELECT
    ord.order_id,
    ord.order_date,
    ord.order_status,
    itm.order_item_subtotal
FROM orders AS ord
INNER JOIN order_items AS itm
    ON ord.order_id = itm.order_item_order_id
LIMIT 10

Unnamed: 0,order_id,order_date,order_status,order_item_subtotal
0,1,2013-07-25,CLOSED,299.98
1,2,2013-07-25,PENDING_PAYMENT,199.99
2,2,2013-07-25,PENDING_PAYMENT,250.0
3,2,2013-07-25,PENDING_PAYMENT,129.99
4,4,2013-07-25,CLOSED,49.98
5,4,2013-07-25,CLOSED,299.95
6,4,2013-07-25,CLOSED,150.0
7,4,2013-07-25,CLOSED,199.92
8,5,2013-07-25,COMPLETE,299.98
9,5,2013-07-25,COMPLETE,299.95


In [24]:
%%sql
/* Left outer join keeps every order. The item columns are NULL where an
   order has no matching row in order_items. */
SELECT
    ord.order_id,
    ord.order_date,
    ord.order_status,
    itm.order_item_order_id,
    itm.order_item_subtotal
FROM orders AS ord
LEFT JOIN order_items AS itm
    ON ord.order_id = itm.order_item_order_id
ORDER BY ord.order_id
LIMIT 10

Unnamed: 0,order_id,order_date,order_status,order_item_order_id,order_item_subtotal
0,1,2013-07-25,CLOSED,1.0,299.98
1,2,2013-07-25,PENDING_PAYMENT,2.0,129.99
2,2,2013-07-25,PENDING_PAYMENT,2.0,250.0
3,2,2013-07-25,PENDING_PAYMENT,2.0,199.99
4,3,2013-07-25,COMPLETE,,
5,4,2013-07-25,CLOSED,4.0,199.92
6,4,2013-07-25,CLOSED,4.0,150.0
7,4,2013-07-25,CLOSED,4.0,299.95
8,4,2013-07-25,CLOSED,4.0,49.98
9,5,2013-07-25,COMPLETE,5.0,299.98


### Aggregations

In [26]:
%%sql
/* Revenue for order 2, summed over its line items and rounded to two decimals. */
SELECT
    round(sum(order_item_subtotal::numeric), 2) AS order_revenue
FROM order_items
WHERE order_item_order_id = 2

Unnamed: 0,order_revenue
0,579.98


In [27]:
%%sql
/* Number of orders per order_date, shown for ten of the groups. */
SELECT
    order_date,
    count(1)
FROM orders
GROUP BY order_date
LIMIT 10

Unnamed: 0,order_date,count
0,2014-07-17,162
1,2014-07-14,161
2,2013-09-03,218
3,2014-07-01,189
4,2013-08-04,187
5,2014-05-10,196
6,2014-07-24,185
7,2013-10-15,174
8,2014-05-19,97
9,2014-01-02,111


In [28]:
%%sql
/* Daily revenue per product for completed or closed orders, keeping only
   groups whose rounded revenue is at least 500, sorted by date and then
   by revenue. */
SELECT
    ord.order_date,
    itm.order_item_product_id,
    round(sum(itm.order_item_subtotal)::numeric, 2) AS revenue
FROM orders AS ord
INNER JOIN order_items AS itm
    ON ord.order_id = itm.order_item_order_id
WHERE ord.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY
    ord.order_date,
    itm.order_item_product_id
HAVING round(sum(itm.order_item_subtotal)::numeric, 2) >= 500
ORDER BY
    ord.order_date,
    revenue
LIMIT 10

Unnamed: 0,order_date,order_item_product_id,revenue
0,2013-07-25,226,599.99
1,2013-07-25,627,1079.73
2,2013-07-25,502,1650.0
3,2013-07-25,403,1949.85
4,2013-07-25,1014,2798.88
5,2013-07-25,1073,2999.85
6,2013-07-25,365,3359.44
7,2013-07-25,957,4499.7
8,2013-07-25,191,5099.49
9,2013-07-25,1004,5599.72


In [29]:
%%sql
/* Count the date and product groups whose revenue from completed or
   closed orders is at least 500. The inner query carries no ORDER BY
   because the outer count does not depend on row order, so sorting the
   derived table would only add work. */
SELECT count(1)
FROM (
    SELECT o.order_date,
        oi.order_item_product_id,
        round(sum(oi.order_item_subtotal)::numeric, 2) AS revenue
    FROM orders o JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
    WHERE o.order_status IN ('COMPLETE', 'CLOSED')
    GROUP BY o.order_date,
        oi.order_item_product_id
    HAVING round(sum(oi.order_item_subtotal)::numeric, 2) >= 500
) q

Unnamed: 0,count
0,3339


## Basic Strings and Dates

In [26]:
%%sql
/* Case-conversion functions applied to the same mixed-case literal. */
SELECT
    lower('HeLLo WoRld') AS lower_result,
    upper('HeLLo WoRld') AS upper_result,
    initcap('HeLLo WoRld') AS initcap_result

Unnamed: 0,lower_result,upper_result,initcap_result
0,hello world,HELLO WORLD,Hello World


In [27]:
%%sql
/* Locate the at-sign in each sample address with both position and strpos.
   An address that has none reports 0 from both functions. */
WITH email_ids AS (
    SELECT 'bsellan0@yellowbook.com' AS email_id
    UNION SELECT 'rstelljes1@illinois.edu'
    UNION SELECT 'mmalarkey2@webeden.co.uk'
    UNION SELECT 'emussared3@redcross.org'
    UNION SELECT 'livashin4@bloglovin.com'
    UNION SELECT 'gkeach5cbc.ca'
    UNION SELECT 'emasham6@xing.com'
    UNION SELECT 'rcobbald7@house.gov'
    UNION SELECT 'rdrohan8@washingtonpost.com'
    UNION SELECT 'aebben9@arstechnica.com'
)
SELECT
    email_id,
    position('@' IN email_id),
    strpos(email_id, '@')
FROM email_ids
ORDER BY 2, 1

Unnamed: 0,email_id,position,strpos
0,gkeach5cbc.ca,0,0
1,aebben9@arstechnica.com,8,8
2,bsellan0@yellowbook.com,9,9
3,emasham6@xing.com,9,9
4,rdrohan8@washingtonpost.com,9,9
5,livashin4@bloglovin.com,10,10
6,rcobbald7@house.gov,10,10
7,emussared3@redcross.org,11,11
8,mmalarkey2@webeden.co.uk,11,11
9,rstelljes1@illinois.edu,11,11


In [28]:
%%sql
/* Find the first and second hyphen in each id.
   pos is the position of the first hyphen, or 0 when there is none.
   pos_2nd searches the text that follows the first hyphen and converts the
   hit back to a position in the full string. It is 0 when the id has no
   hyphen or only one. The search offset is derived from pos instead of a
   fixed 5, so ids whose first hyphen is not at position 4 are handled, and
   an id with a single hyphen no longer reports that hyphen as the second. */
WITH unique_ids AS (
    SELECT '241-80-7115' AS unique_id UNION
    SELECT '694-30-6851' UNION
    SELECT '586-92-5361' UNION
    SELECT '884-65-284' UNION
    SELECT '876-99-585' UNION
    SELECT '831-59-5593' UNION
    SELECT '399-88-3617' UNION
    SELECT '733-17-4217' UNION
    SELECT '873-68-9778' UNION
    SELECT '48'
), first_dash AS (
    SELECT unique_id,
        position('-' IN unique_id) AS pos
    FROM unique_ids
)
SELECT unique_id,
    pos,
    CASE
        WHEN pos = 0 THEN 0
        WHEN position('-' IN substring(unique_id FROM pos + 1)) = 0 THEN 0
        ELSE pos + position('-' IN substring(unique_id FROM pos + 1))
    END AS pos_2nd
FROM first_dash
ORDER BY unique_id

Unnamed: 0,unique_id,pos,pos_2nd
0,241-80-7115,4,7
1,399-88-3617,4,7
2,48,0,0
3,586-92-5361,4,7
4,694-30-6851,4,7
5,733-17-4217,4,7
6,831-59-5593,4,7
7,873-68-9778,4,7
8,876-99-585,4,7
9,884-65-284,4,7


In [29]:
%%sql
/* Build a zero-padded date string from separate integer year, month and
   day parts. */
SELECT
    concat(
        year, '-',
        lpad(month::varchar, 2, '0'), '-',
        lpad(myDate::varchar, 2, '0')
    ) AS order_date
FROM (
    SELECT 2013 AS year, 7 AS month, 25 AS myDate
) AS parts

Unnamed: 0,order_date
0,2013-07-25


In [30]:
%%sql
/* Add a 32 day interval to the current date. */
SELECT
    current_date + INTERVAL '32 DAYS' AS result

Unnamed: 0,result
0,2023-02-06


In [31]:
%%sql
/* Add a three month interval to the current date. */
SELECT
    current_date + INTERVAL '3 MONTHS' AS result

Unnamed: 0,result
0,2023-04-05


In [32]:
%%sql
/* Month arithmetic starting from 31 January 2019. April has no 31st, so
   the result lands on the last day of April. */
SELECT
    '2019-01-31'::date + INTERVAL '3 MONTHS' AS result

Unnamed: 0,result
0,2019-04-30


In [33]:
%%sql
/* Add a mixed interval of months, days and hours to a date. The result
   carries a time of day as well as a date. */
SELECT
    '2019-01-31'::date + INTERVAL '3 MONTHS 3 DAYS 3 HOURS' AS result

Unnamed: 0,result
0,2019-05-03 03:00:00


In [34]:
%%sql
/* Orders from the start of the month up to and including 10 January 2014.
   order_date is a timestamp column, so the upper bound is written as
   strictly before the following day. BETWEEN with 2014-01-10 as the upper
   bound stops at midnight and would drop any row stamped later that day. */
SELECT * FROM orders
WHERE order_date >= date_trunc('MONTH', '2014-01-10'::date)
    AND order_date < '2014-01-10'::date + 1
ORDER BY order_date
LIMIT 10

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,25877,2014-01-01,5549,PENDING_PAYMENT
1,25878,2014-01-01,9084,PENDING
2,25879,2014-01-01,5118,PENDING
3,25880,2014-01-01,10146,CANCELED
4,25881,2014-01-01,3205,PENDING_PAYMENT
5,25882,2014-01-01,4598,COMPLETE
6,25883,2014-01-01,11764,PENDING
7,25884,2014-01-01,7904,PENDING_PAYMENT
8,25885,2014-01-01,7253,PENDING
9,25876,2014-01-01,3414,PENDING_PAYMENT


In [35]:
%%sql
/* Current timestamp alongside its abbreviated month name. */
SELECT
    current_timestamp AS current_timestamp,
    to_char(current_timestamp, 'Mon') AS month_name

Unnamed: 0,current_timestamp,month_name
0,2023-01-05 08:11:12.428553+00:00,Jan


In [36]:
%%sql
/* Day name of a fixed date, with its raw and trimmed lengths. The Day
   pattern pads the name with blanks, so the two lengths differ. */
SELECT
    current_timestamp AS current_timestamp,
    to_char('2020-11-17'::date, 'Day') AS dayname,
    length(to_char('2020-11-17'::date, 'Day')) AS dayname_length,
    length(trim(to_char('2020-11-17'::date, 'Day'))) AS dayname_trimmed_length

Unnamed: 0,current_timestamp,dayname,dayname_length,dayname_trimmed_length
0,2023-01-05 08:11:20.648444+00:00,Tuesday,9,7


In [37]:
%%sql
/* Extract the second hyphen-delimited field of a date string. */
SELECT
    split_part('2020-09-30', '-', 2) AS month

Unnamed: 0,month
0,9
