## STANDARD TRANSFORMATIONS

1. Projection of data
2. Filtering Data
3. Performing Aggregations
4. Joins
5. Sorting
6. Ranking

In [None]:
%load_ext sql
%env DATABASE_URL = postgresql://marcio_gabriel:123456@localhost:5432/data_engineering

# SELECTING OR PROJECTING DATA

In [None]:
%%sql
-- Preview ten rows of orders with every column projected
SELECT *
FROM orders
LIMIT 10

In [None]:
%%sql
-- Column metadata for the orders table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'orders'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Project only the columns of interest. They can be listed in any order.
SELECT order_customer_id, order_date, order_status
FROM orders
LIMIT 10;

In [None]:
%%sql
-- Show the order month (year and month as text) next to customer and status
SELECT
    order_customer_id,
    to_char(order_date, 'yyyy-MM') AS order_month,
    order_status
FROM orders
LIMIT 10

In [None]:
%%sql
-- List each distinct order month present in orders
SELECT DISTINCT
    to_char(order_date, 'yyyy-MM') AS order_month
FROM orders

In [None]:
%%sql
-- Total number of rows in orders
SELECT count(*)
FROM orders

In [None]:
%%sql
-- Number of distinct year-month values found in order_date
SELECT
    count(DISTINCT to_char(order_date, 'yyyy-MM')) AS distinct_month_count
FROM orders

# FILTERING DATA

In [None]:
%%sql
-- Preview of orders whose status is COMPLETE
SELECT *
FROM orders
WHERE order_status = 'COMPLETE'
LIMIT 10

In [None]:
%%sql
-- How many orders have the COMPLETE status
SELECT count(*)
FROM orders
WHERE order_status = 'COMPLETE'

In [None]:
%%sql
-- Every status value that occurs in orders
SELECT DISTINCT order_status
FROM orders

In [None]:
%%sql
-- Preview of orders that are either COMPLETE or CLOSED, using IN
SELECT *
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
LIMIT 10

In [None]:
%%sql
-- Count of COMPLETE or CLOSED orders using IN.
-- The aggregate returns exactly one row, so no LIMIT is needed.
SELECT COUNT(1) FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')

In [None]:
%%sql
-- Same count as the IN version, written with OR to show the two forms are equivalent.
-- The aggregate returns exactly one row, so no LIMIT is needed.
SELECT COUNT(1) FROM orders
WHERE order_status = 'COMPLETE' OR order_status = 'CLOSED'

In [None]:
%%sql
-- Orders dated 1 January 2014, filtered with a full timestamp literal
SELECT *
FROM orders
WHERE order_date = '2014-01-01 00:00:00.0'
LIMIT 3

In [None]:
%%sql
-- Orders dated 1 January 2014, filtered with a date-only literal
SELECT *
FROM orders
WHERE order_date = '2014-01-01'
LIMIT 3

This query will not work because LIKE cannot be used to compare against columns with a date data type.

In [None]:
%%sql
-- Intentional failure demo. LIKE is applied straight to order_date here, with no conversion to text
SELECT * FROM orders
WHERE order_date LIKE '2014-01%'
LIMIT 3

In [None]:
%%sql
-- COMPLETE or CLOSED orders from January 2014, matching the formatted date with LIKE
SELECT *
FROM orders
WHERE to_char(order_date, 'yyyy-MM-dd') LIKE '2014-01-%'
    AND order_status IN ('COMPLETE', 'CLOSED')
LIMIT 10

In [None]:
%%sql
-- Count of COMPLETE or CLOSED orders from January 2014, matched with LIKE
SELECT count(*)
FROM orders
WHERE to_char(order_date, 'yyyy-MM-dd') LIKE '2014-01-%'
    AND order_status IN ('COMPLETE', 'CLOSED')


In [None]:
%%sql
-- COMPLETE or CLOSED orders from January 2014, comparing the formatted year-month for equality
SELECT *
FROM orders
WHERE to_char(order_date, 'yyyy-MM') = '2014-01'
    AND order_status IN ('COMPLETE', 'CLOSED')
LIMIT 10

In [None]:
%%sql
-- Count of COMPLETE or CLOSED orders from January 2014 using a regular expression match.
-- The pattern is anchored so it can only match at the start of the formatted date.
SELECT COUNT(1) FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
    AND to_char(order_date,'yyyy-MM-dd') ~ '^2014-01-'

In [None]:
%%sql
-- Row count, first and last order date, and distinct dates for COMPLETE or CLOSED orders in Q1 2014.
-- A half-open range covers all of January to March even when order_date carries a time of day.
SELECT count(1), min(order_date), max(order_date), count(DISTINCT order_date)
FROM orders
WHERE order_status IN ('COMPLETE', 'CLOSED')
    AND order_date >= '2014-01-01'
    AND order_date < '2014-04-01'

In [None]:
%%sql
-- Users that have no password stored (NULL must be tested with IS NULL)
SELECT *
FROM users
WHERE user_password IS NULL

# TABLES JOINS

In [None]:
%%sql
-- Inner join of orders with their items, one output row per order item
SELECT
    o.order_id,
    o.order_date,
    o.order_status,
    oi.order_item_subtotal
FROM orders AS o
    INNER JOIN order_items AS oi
        ON oi.order_item_order_id = o.order_id
LIMIT 10

In [None]:
%%sql
-- Row count of orders, for comparison with the join result
SELECT count(*)
FROM orders

In [None]:
%%sql
-- Row count of order_items, for comparison with the join result
SELECT count(*)
FROM order_items

In [None]:
%%sql
-- Number of rows produced by the inner join of orders and order_items.
-- The aggregate returns exactly one row, so no LIMIT is needed.
SELECT COUNT(1)
FROM orders o JOIN order_items oi
    ON o.order_id = oi.order_item_order_id

In [None]:
%%sql
-- Number of joined rows that belong to COMPLETE or CLOSED orders.
-- The aggregate returns exactly one row, so no LIMIT is needed.
SELECT COUNT(1)
FROM orders o JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')

# JOIN TABLES - OUTER

In [None]:
%%sql
-- Left outer join keeps every order, with NULL item columns when an order has no items
SELECT
    o.order_id,
    o.order_date,
    o.order_status,
    oi.order_item_order_id,
    oi.order_item_subtotal
FROM orders AS o
    LEFT JOIN order_items AS oi
        ON oi.order_item_order_id = o.order_id
ORDER BY o.order_id
LIMIT 10

In [None]:
%%sql
-- Orders that have no matching row in order_items (left join plus IS NULL filter)
SELECT
    o.order_id,
    o.order_date,
    o.order_status,
    oi.order_item_order_id,
    oi.order_item_subtotal
FROM orders AS o
    LEFT JOIN order_items AS oi
        ON oi.order_item_order_id = o.order_id
WHERE oi.order_item_order_id IS NULL
ORDER BY o.order_id
LIMIT 10

In [None]:
%%sql
-- Right outer join keeps every order item, with NULL order columns when the order is missing
SELECT
    o.order_id,
    o.order_date,
    o.order_status,
    oi.order_item_order_id,
    oi.order_item_subtotal
FROM orders AS o
    RIGHT JOIN order_items AS oi
        ON oi.order_item_order_id = o.order_id
ORDER BY o.order_id
LIMIT 10

# PERFORMING AGGREGATIONS

In [None]:
%%sql
-- Count of non-NULL order_id values in orders
SELECT count(order_id)
FROM orders

In [None]:
%%sql
-- Number of distinct order_date values in orders
SELECT count(DISTINCT order_date)
FROM orders

In [None]:
%%sql
-- Revenue of order 2, summing item subtotals as numeric and rounding to two decimals
SELECT
    round(sum(CAST(order_item_subtotal AS numeric)), 2) AS order_revenue
FROM order_items
WHERE order_item_order_id = 2

In [None]:
%%sql
-- Number of orders per order date.
-- ORDER BY makes the ten rows kept by LIMIT deterministic (the earliest dates).
SELECT order_date,
    count(1)
FROM orders
GROUP BY order_date
ORDER BY order_date
LIMIT 10

In [None]:
%%sql
-- Number of orders for each status, listed alphabetically by status
SELECT
    order_status,
    count(*) AS status_count
FROM orders
GROUP BY order_status
ORDER BY order_status
LIMIT 10

In [None]:
%%sql
-- Revenue per order as the plain sum of item subtotals, without rounding
SELECT
    order_item_order_id,
    sum(order_item_subtotal) AS order_revenue
FROM order_items
GROUP BY order_item_order_id
ORDER BY order_item_order_id
LIMIT 10

In [None]:
%%sql
-- Revenue per order, with the summed subtotal cast to numeric and rounded to two decimals
SELECT
    order_item_order_id,
    round(CAST(sum(order_item_subtotal) AS numeric), 2) AS order_revenue
FROM order_items
GROUP BY order_item_order_id
ORDER BY order_item_order_id
LIMIT 10

In [None]:
%%sql
-- Revenue per order date and product for COMPLETE or CLOSED orders, keeping groups of at least 500
SELECT
    o.order_date,
    oi.order_item_product_id,
    round(CAST(sum(oi.order_item_subtotal) AS numeric), 2) AS revenue
FROM orders AS o
    INNER JOIN order_items AS oi
        ON oi.order_item_order_id = o.order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY
    o.order_date,
    oi.order_item_product_id
HAVING round(CAST(sum(oi.order_item_subtotal) AS numeric), 2) >= 500
ORDER BY
    o.order_date,
    revenue
LIMIT 10

In [None]:
%%sql
-- Number of (order date, product) groups whose revenue is at least 500.
-- The derived table is only counted, so it is not sorted.
SELECT count(1)
FROM
(SELECT o.order_date,
    oi.order_item_product_id,
    round(sum(oi.order_item_subtotal)::numeric,2) AS revenue
FROM orders o JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
WHERE
    o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY o.order_date,
    oi.order_item_product_id
HAVING round(sum(oi.order_item_subtotal)::numeric, 2) >= 500) AS q

# DAILY PRODUCT REVENUE

In [None]:
%%sql
-- Daily revenue per product (with product name) for COMPLETE or CLOSED orders.
-- ORDER BY makes the ten rows kept by LIMIT deterministic,
-- showing the earliest dates with the highest revenue first.
SELECT
    o.order_date,
    oi.order_item_product_id,
    p.product_name,
    round(sum(oi.order_item_subtotal::numeric),2) AS product_revenue
FROM orders o
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
    JOIN products p
        ON p.product_id = oi.order_item_product_id
WHERE
    o.order_status IN ('COMPLETE', 'CLOSED')
GROUP BY
    o.order_date,
    oi.order_item_product_id,
    p.product_name
ORDER BY
    o.order_date,
    product_revenue DESC
LIMIT 10

### Exercise 1 - Customer order count
Get the order count per customer for January 2014.

Tables - orders and customers

Data should be sorted in descending order by count and ascending order by customer id.

Output should contain customer_id, customer_first_name, customer_last_name and customer_order_count.



In [None]:
%%sql
-- Column metadata for the orders table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'orders'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Names of the tables in the public schema of data_engineering, taken from column metadata
SELECT DISTINCT
    table_schema,
    table_name
FROM information_schema.columns
WHERE table_schema = 'public'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Column metadata for the customers table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'customers'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Exercise 1. Orders per customer for January 2014, most active customers first.
-- ISO dates (year-month-day) keep the range independent of the DateStyle setting.
-- The half-open range covers the whole month, from 1 January up to but excluding 1 February.
SELECT
    c.customer_id,
    c.customer_fname,
    c.customer_lname,
    COUNT(o.order_id) AS customer_order_count
FROM orders o
JOIN customers c
    ON o.order_customer_id = c.customer_id
WHERE
    o.order_date >= '2014-01-01'
    AND o.order_date < '2014-02-01'
GROUP BY
    c.customer_id,
    c.customer_fname,
    c.customer_lname
ORDER BY
    customer_order_count DESC,
    c.customer_id ASC
LIMIT 20;

### Exercise 2 – Dormant Customers
Get the details of customers who did not place any order in January 2014.

Tables – orders and customers
Data should be sorted in ascending order by customer_id
Output should contain all the fields from customers

In [None]:
%%sql
-- Exercise 2. Customers with no order dated in January 2014, written as an anti-join with NOT EXISTS
SELECT c.*
FROM customers AS c
WHERE NOT EXISTS (
    SELECT 1
    FROM orders AS o
    WHERE o.order_customer_id = c.customer_id
        AND to_char(o.order_date, 'yyyy-MM') = '2014-01'
)
ORDER BY c.customer_id
LIMIT 10

### Exercise 3 - Revenue Per Customer
Get the revenue generated by each customer in January 2014.

Tables - orders, order_items and customers

Data should be sorted in descending order by revenue and then ascending order by customer_id

Output should contain customer_id, customer_first_name, customer_last_name, customer_revenue.

If a customer has placed no orders, the revenue for that customer should be 0.

Consider only COMPLETE and CLOSED orders



In [None]:
%%sql
-- Exercise 3. Revenue per customer from COMPLETE or CLOSED orders placed in January 2014.
-- Customers are the driving table and the order filters sit in the join condition,
-- so customers without qualifying orders are kept and reported with revenue 0.
-- Subtotals are summed first and rounded once, which avoids per-item rounding drift.
SELECT
    c.customer_id,
    c.customer_fname,
    c.customer_lname,
    coalesce(round(sum(oi.order_item_subtotal)::numeric, 2), 0) AS customer_revenue
FROM customers c
LEFT OUTER JOIN orders o
    ON o.order_customer_id = c.customer_id
    AND o.order_status IN ('COMPLETE', 'CLOSED')
    AND to_char(o.order_date, 'yyyy-MM') = '2014-01'
LEFT OUTER JOIN order_items oi
    ON o.order_id = oi.order_item_order_id
GROUP BY
    c.customer_id,
    c.customer_fname,
    c.customer_lname
ORDER BY
    customer_revenue DESC,
    c.customer_id ASC
LIMIT 10


### Exercise 4 - Revenue Per Category
Get the revenue generated for each category in January 2014.

Tables - orders, order_items, products and categories

Data should be sorted in ascending order by category_id.

Output should contain all the fields from category along with the revenue as category_revenue.

Consider only COMPLETE and CLOSED orders



In [None]:
%%sql
-- Column metadata for the products table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'products'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Column metadata for the order_items table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'order_items'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Column metadata for the categories table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'categories'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Exercise 4. Revenue per category from COMPLETE or CLOSED orders placed in January 2014
SELECT
    c.*,
    round(CAST(sum(oi.order_item_subtotal) AS numeric), 2) AS category_revenue
FROM categories AS c
    INNER JOIN products AS p
        ON p.product_category_id = c.category_id
    INNER JOIN order_items AS oi
        ON oi.order_item_product_id = p.product_id
    INNER JOIN orders AS o
        ON o.order_id = oi.order_item_order_id
WHERE o.order_status IN ('COMPLETE', 'CLOSED')
    AND to_char(o.order_date, 'yyyy-MM') = '2014-01'
GROUP BY c.category_id
ORDER BY c.category_id
LIMIT 10

### Exercise 5 - Product Count Per Department
Get the product count for each department.

Tables - departments, categories, products

Data should be sorted in ascending order by department_id

Output should contain all the fields from department and the product count as product_count

In [None]:
%%sql
-- Column metadata for the departments table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'departments'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Column metadata for the categories table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'categories'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Column metadata for the products table in the data_engineering database
SELECT *
FROM information_schema.columns
WHERE table_name = 'products'
    AND table_catalog = 'data_engineering'

In [None]:
%%sql
-- Exercise 5. Number of products in each department, reached through the categories table
SELECT
    d.*,
    count(p.product_id) AS product_count
FROM departments AS d
    INNER JOIN categories AS c
        ON c.category_department_id = d.department_id
    INNER JOIN products AS p
        ON p.product_category_id = c.category_id
GROUP BY d.department_id
ORDER BY d.department_id
LIMIT 20