

```
# This is formatted as code
```

# Nike Database Advanced SQL

## Preparations

### Installing and importing packages

In [None]:
import os

import pandas as pd
import sqlalchemy as sa

### Making a connection

In [None]:
# Connection URL for the Nike course database (PostgreSQL).
# Credentials should not live in a shared notebook: set the NIKE_DATABASE_URL
# environment variable to supply your own URL. The literal below is kept only
# as a fallback so the notebook still runs unchanged when the variable is unset.
nike_url = os.environ.get(
    "NIKE_DATABASE_URL",
    "postgresql://Test:bQNxVzJL4g6u@ep-noisy-flower-846766.us-east-2.aws.neon.tech/SQL_2_3",
)

Remember: Connecting with SQLAlchemy always works in two steps:

1.   Create an engine
2.   Make a connection

In [None]:
# Step 1: build the engine from the database URL.
engine = sa.create_engine(nike_url)

# Step 2: open a connection and switch it to AUTOCOMMIT so each statement
# run through `connection` is committed on its own.
raw_connection = engine.connect()
connection = raw_connection.execution_options(isolation_level="AUTOCOMMIT")

### Schema

### Tasks

In [None]:


# Segment customers by how many orders they placed.
# The CTE counts order_id per user_id in `orders`; the outer query labels each
# customer 'Single Customer' (exactly 1 order), 'Recurring Customer' (more
# than 1) or 'Other', and counts the customers in each label.
# GROUP BY customer_group refers to the CASE expression through its output alias.
query = """
WITH orders_per_customer AS (

SELECT user_id,
	   COUNT(order_id) AS total_orders

FROM orders

GROUP BY user_id

)

SELECT CASE WHEN total_orders = 1 THEN 'Single Customer'
       		  WHEN total_orders > 1 THEN 'Recurring Customer'
            ELSE 'Other'
      END AS customer_group,
      COUNT(user_id) AS total_customers

FROM orders_per_customer

GROUP BY customer_group

;


"""
pd.read_sql(sa.text(query),connection)

Unnamed: 0,customer_group,total_customers
0,Recurring Customer,478
1,Single Customer,8027


In [None]:
# Using a WITH statement,
# count the number of people by age group who have Novak Djokovic as their favorite tennis player.
#
# The CTE filters `customers` on fav_tennis_player and counts customer_id per
# age_group; the outer query simply returns the CTE.
# The COUNT column has no alias, so it comes back under the default name "count".


query = """
WITH people_with_Novak_as_fav AS (

SELECT COUNT(customer_id) , age_group

FROM customers

WHERE fav_tennis_player='Novak Djokovic'

GROUP BY age_group

)

SELECT * FROM people_with_Novak_as_fav



"""
pd.read_sql(sa.text(query),connection)


Unnamed: 0,count,age_group
0,131,25-34
1,124,45+
2,123,34-45
3,215,18-24


In [None]:
# Total revenue across both item tables.
# UNION ALL stacks every row of order_items and order_items_vintage (duplicates
# are kept), then sale_price is summed over the combined set.
# SELECT * on both sides requires the two tables to expose the same columns in
# the same order.
query = """

WITH total_order_items AS (

SELECT * FROM order_items

UNION ALL

SELECT * FROM order_items_vintage

)

SELECT SUM(sale_price) AS total_revenue FROM total_order_items

;


"""
pd.read_sql(sa.text(query),connection)

Unnamed: 0,total_revenue
0,461654.190568


In [None]:
# Compare the number of all orders with the number of completed orders.
# The CTE keeps only orders with status 'Complete'. LEFT JOINing it back onto
# `orders` leaves completed_orders.order_id NULL for every non-completed order,
# and COUNT(column) skips NULLs, so the second count covers completed orders only.
query = """
WITH completed_orders AS (

	SELECT * FROM orders WHERE status = 'Complete'

	)

	SELECT COUNT(orders.order_id) AS total_orders,
				 COUNT(completed_orders.order_id) AS total_completed_orders

	FROM orders
			 LEFT JOIN completed_orders ON completed_orders.order_id = orders.order_id

       """
pd.read_sql(sa.text(query),connection)

Unnamed: 0,total_orders,total_completed_orders
0,9009,2228


In [None]:
# Revenue from customers who bought more than one distinct product (order_items only).
# The CTE aggregates per user_id and the HAVING clause keeps users whose
# COUNT(DISTINCT product_id) exceeds 1; the outer query sums their per-customer
# totals. The outer SUM has no alias, so the result column is named "sum".
query = """
WITH customers_purchasing_more_1_product AS (
SELECT user_id, COUNT(DISTINCT product_id), SUM(sale_price) as total_sale_price_per_customer
FROM order_items
GROUP BY user_id
HAVING COUNT(DISTINCT product_id)>1
)

SELECT SUM(total_sale_price_per_customer) from customers_purchasing_more_1_product
       """
pd.read_sql(sa.text(query),connection)

Unnamed: 0,sum
0,41966.410077


In [None]:
# Single vs. recurring customers across both item tables, using two chained CTEs:
#   items_combined      - order_items stacked with order_items_vintage (UNION ALL)
#   orders_per_customer - number of order items per user_id
# Note that the segmentation here is based on the count of order *items*
# (order_item_id), not on the count of orders.
query = """

WITH items_combined AS (

SELECT * FROM order_items

UNION ALL

SELECT * FROM order_items_vintage

),

     orders_per_customer AS (

SELECT user_id,
			 COUNT(order_item_id) AS total_orders_items

FROM items_combined

GROUP BY user_id

)

SELECT CASE WHEN total_orders_items = 1 THEN 'Single Customer'
       		  WHEN total_orders_items > 1 THEN 'Recurring Customer'
            ELSE 'Other'
       END AS customer_group,
       COUNT(user_id) AS total_customers

FROM orders_per_customer

GROUP BY customer_group
       """
pd.read_sql(sa.text(query),connection)

Unnamed: 0,customer_group,total_customers
0,Recurring Customer,758
1,Single Customer,7851


In [None]:
# What is the total revenue
# generated by Nike Official and Nike Vintage combined from customers that purchased more than 1 product?
#
# items_combined stacks both item tables; the second CTE keeps users with more
# than one distinct product_id and totals their sale_price; the outer query
# adds those per-customer totals together (result column is named "sum").

query = """

WITH items_combined AS (

SELECT * FROM order_items

UNION ALL

SELECT * FROM order_items_vintage

),

customers_purchasing_more_1_product AS (
SELECT user_id, COUNT(DISTINCT product_id), SUM(sale_price) as total_sale_price_per_customer
FROM  items_combined
GROUP BY user_id
HAVING COUNT(DISTINCT product_id)>1
)

SELECT SUM(total_sale_price_per_customer) FROM customers_purchasing_more_1_product
"""
pd.read_sql(sa.text(query),connection)

Unnamed: 0,sum
0,50170.350076


In [None]:
# How many orders were created on the most recent created_at date available in the data?
#
# The scalar subquery finds the largest created_at in `orders`; the outer query
# counts the orders whose created_at equals that value.
# NOTE(review): if created_at holds timestamps rather than plain dates, this
# matches only the exact latest timestamp, not the whole day - confirm the column type.


query = """


SELECT COUNT(order_id) from orders
WHERE created_at = (SELECT MAX(created_at) FROM orders)


"""
pd.read_sql(sa.text(query),connection)

Unnamed: 0,count
0,2


In [None]:
# Question #1:
# What are the top customers by the total amount of revenue (aggregate of the sales price)
# for the Nike Official and Nike Vintage business units combined?

# Include the customer id, the total revenue, and the number of order items each customer has purchased.

# Only include orders that have not been cancelled or returned.
# part1 - Combine the Nike Official and Nike Vintage order items into one table (tb).
# This step only stacks the two tables with UNION ALL and returns every row;
# no filtering or aggregation happens yet.
query = """

WITH tb AS (

  	SELECT *
  	FROM order_items
  	UNION ALL
  	SELECT *
  	FROM order_items_vintage
)

SELECT * FROM tb
"""

pd.read_sql(sa.text(query),connection)

Unnamed: 0,order_item_id,order_id,user_id,product_id,created_at,shipped_at,delivered_at,returned_at,sale_price
0,OI13793,O9492,U7649,P011,2022-06-22,2022-06-25,,,8.5
1,OI93625,O64584,U51698,P011,2021-11-06,2021-11-08,,,8.5
2,OI111549,O76948,U61600,P002,2023-02-27,2023-02-26,,,8.5
3,OI115525,O79675,U63681,P002,2022-02-27,2022-02-27,,,8.5
4,OI125455,O86550,U69272,P011,2022-02-12,2022-02-12,,,8.5
...,...,...,...,...,...,...,...,...,...
9445,VI11080,V2539,U99843,P013,2023-06-20,2023-06-23,2023-06-30,2023-07-12,170.0
9446,VI13050,V3013,U99876,P014,2019-11-16,2019-11-18,2019-11-24,2019-12-06,60.0
9447,VI15708,V1373,U99900,P013,2022-12-23,2022-12-26,2023-01-02,,170.0
9448,VI19997,V8814,U99954,,2021-01-08,,,,150.0


In [None]:
# Question #1:
# What are the top customers by the total amount of revenue (aggregate of the sales price)
# for the Nike Official and Nike Vintage business units combined?
#
# Include the customer id, the total revenue, and the number of order items each customer has purchased.
#
# Only include orders that have not been cancelled or returned.
#
# part2 - aggregate the combined order items per customer.
# The status filter needs a matching row in `orders`, so an INNER JOIN is used:
# a FULL JOIN followed by a WHERE on ord.status throws the item-only rows away
# again, while orders without any items would add a spurious NULL-customer group.
# NULLS LAST keeps customers without a revenue figure from ranking first
# (PostgreSQL sorts NULLs first under DESC by default).
query = """
WITH tb AS (
    SELECT *
    FROM order_items
    UNION ALL
    SELECT *
    FROM order_items_vintage
)

SELECT tb.user_id,
       SUM(tb.sale_price) AS total_revenue,
       COUNT(tb.order_item_id) AS total_items_ordered
FROM tb
INNER JOIN orders ord
        ON tb.order_id = ord.order_id
WHERE ord.status NOT IN ('Returned','Cancelled')
GROUP BY tb.user_id
ORDER BY total_revenue DESC NULLS LAST
"""
pd.read_sql(sa.text(query), connection)

Unnamed: 0,user_id,total_revenue,total_items_ordered
0,U107237,618.00,5
1,U100067,420.00,3
2,U106401,350.00,2
3,U114456,330.00,2
4,U102325,300.00,3
...,...,...,...
6370,U23973,5.99,1
6371,U44890,5.99,1
6372,U41044,3.11,1
6373,U8563,3.11,1


In [None]:
# Question #2:
# Combine the order item data from Nike Official and Nike Vintage, and segment customers into three segments.
# (1) Customers that only purchased a single product;
# (2) Customers that purchased more than 1 product;
# (3) “Missing Data” (if none of these conditions match)

# How many customers and how much revenue (aggregate of the sales price) falls in each segment?

# Only include orders that have not been cancelled or returned.
# To make you think: what type of data could fall under the third bucket?
# part1 - first pass: customer counts per segment only.
# This step does not yet filter on order status or compute revenue, and
# num_products counts order items (order_item_id), not distinct products.
query = """
WITH tb AS (
    SELECT *
    FROM order_items
    UNION ALL
    SELECT *
    FROM order_items_vintage
),
customer_counts AS (
    SELECT
        user_id,
        COUNT(order_item_id) AS num_products
    FROM tb
    GROUP BY user_id
)
SELECT
    CASE
        WHEN num_products = 1 THEN 'Single_product_customers'
        WHEN num_products > 1 THEN 'Multi_product_customers'
        ELSE 'Missing_data'
    END AS customer_segments,
    COUNT(user_id) AS customer_count
FROM customer_counts
GROUP BY customer_segments;
"""
pd.read_sql(sa.text(query),connection)

Unnamed: 0,customer_segments,customer_count
0,Single_product_customers,7851
1,Multi_product_customers,758


In [None]:
# Question #2:
# Combine the order item data from Nike Official and Nike Vintage, and segment customers into three segments.
# (1) Customers that only purchased a single product;
# (2) Customers that purchased more than 1 product;
# (3) “Missing Data” (if none of these conditions match)
#
# How many customers and how much revenue (aggregate of the sales price) falls in each segment?
#
# Only include orders that have not been cancelled or returned.
# To make you think: what type of data could fall under the third bucket?
#
# part2 - segment on distinct products and add revenue.
# customer_type classifies each user by COUNT(DISTINCT product_id); a user whose
# items all have a NULL product_id counts 0 distinct products and therefore
# lands in 'Missing_data'.
# The status filter needs a matching row in `orders`, so an INNER JOIN is used:
# a FULL JOIN followed by a WHERE on ord.status discards item-only rows anyway
# and lets item-less orders create a NULL-customer group.
query = """
WITH tb AS (
    SELECT *
    FROM order_items
    UNION ALL
    SELECT *
    FROM order_items_vintage
),
customer_type AS (
    SELECT tb.user_id,
           CASE
               WHEN COUNT(DISTINCT tb.product_id) = 1 THEN 'One_time_customers'
               WHEN COUNT(DISTINCT tb.product_id) > 1 THEN 'Recurring_customers'
               ELSE 'Missing_data'
           END AS customer_segments,
           SUM(tb.sale_price) AS total_revenue
    FROM tb
    INNER JOIN orders ord
            ON tb.order_id = ord.order_id
    WHERE ord.status NOT IN ('Returned','Cancelled')
    GROUP BY tb.user_id
)
SELECT customer_segments,
       COUNT(DISTINCT user_id) AS customer_count,
       SUM(total_revenue) AS total_revenue
FROM customer_type
GROUP BY customer_segments
"""
pd.read_sql(sa.text(query), connection)



Unnamed: 0,customer_segments,customer_count,total_revenue
0,Missing_data,500,32505.170046
1,One_time_customers,5491,255107.470334
2,Recurring_customers,384,30747.85005


In [None]:
# Question #3:
# The Nike Official leadership team is keen to understand what % of the total revenue per state
# is coming from the Nike Official business.
#
# Create a list that shows the total revenue (aggregate of the sales price) per state,
# the revenue generated from Nike Official,
# and the % of the Nike Official revenue compared to the total revenue for every state.
#
# Only include orders that have not been cancelled or returned and order the table
# to show the state with the highest amount of revenue first, even if there is no information available about the state.
#
# part1 - combined (Official + Vintage) revenue per state.
# Join intent made explicit:
#   INNER JOIN orders   - the status filter requires a matching order row
#   LEFT JOIN customers - keep items whose customer is unknown; they are
#                         reported under a NULL state
# The former FULL JOINs produced the same sums but could also emit empty rows
# for orders without items.
query = """
WITH tb AS (
    SELECT *
    FROM order_items
    UNION ALL
    SELECT *
    FROM order_items_vintage
)

SELECT c.state,
       SUM(tb.sale_price) AS total_revenue_combined_per_state
FROM tb
INNER JOIN orders ord
        ON ord.order_id = tb.order_id
LEFT JOIN customers c
       ON c.customer_id = tb.user_id
WHERE ord.status NOT IN ('Returned','Cancelled')
GROUP BY c.state
"""
pd.read_sql(sa.text(query), connection)

Unnamed: 0,state,total_revenue_combined_per_state
0,,74396.250098
1,Pennsylvania,38534.780054
2,California,29574.010065
3,Florida,35336.120051
4,Illinois,35620.060029
5,New York,34993.620029
6,Texas,32268.810068
7,Ohio,32292.960028
8,US State,5343.880009


In [None]:
# Question #3:
# The Nike Official leadership team is keen to understand what % of the total revenue per state
# is coming from the Nike Official business.
#
# Create a list that shows the total revenue (aggregate of the sales price) per state,
# the revenue generated from Nike Official,
# and the % of the Nike Official revenue compared to the total revenue for every state.
#
# Only include orders that have not been cancelled or returned and order the table
# to show the state with the highest amount of revenue first, even if there is no information available about the state.
#
# part2 - Nike Official revenue per state (order_items only).
# Join intent made explicit:
#   INNER JOIN orders   - the status filter requires a matching order row
#   LEFT JOIN customers - keep items whose customer is unknown; they are
#                         reported under a NULL state
query = """
SELECT c.state,
       SUM(oi.sale_price) AS total_nike_official_revenue_per_state
FROM order_items oi
INNER JOIN orders ord
        ON ord.order_id = oi.order_id
LEFT JOIN customers c
       ON c.customer_id = oi.user_id
WHERE ord.status NOT IN ('Returned','Cancelled')
GROUP BY c.state
"""
pd.read_sql(sa.text(query), connection)

Unnamed: 0,state,total_nike_official_revenue_per_state
0,,44971.250098
1,Pennsylvania,24257.780054
2,California,20938.010065
3,Florida,23946.120051
4,Illinois,24054.060029
5,New York,22893.620029
6,Texas,21518.810068
7,Ohio,22567.960028
8,US State,3263.880009


In [None]:
# Question #3:
# The Nike Official leadership team is keen to understand what % of the total revenue per state
# is coming from the Nike Official business.
#
# Create a list that shows the total revenue (aggregate of the sales price) per state,
# the revenue generated from Nike Official,
# and the % of the Nike Official revenue compared to the total revenue for every state.
#
# Only include orders that have not been cancelled or returned and order the table
# to show the state with the highest amount of revenue first, even if there is no information available about the state.
#
# part3 - put both per-state figures side by side and compute the share.
#   overall_revenue       - combined (Official + Vintage) revenue per state
#   nike_official_revenue - Official-only revenue per state
# Both CTEs use INNER JOIN orders (required by the status filter) and
# LEFT JOIN customers (items without a known customer go to the NULL state).
# The final join matches states with IS NOT DISTINCT FROM so the NULL-state rows
# pair up with each other without being confused with an empty-string state.
# perc_nike_official is a fraction between 0 and 1; NULLIF avoids a division
# by zero when a state has no combined revenue.
query = """
WITH tb AS (
    SELECT *
    FROM order_items
    UNION ALL
    SELECT *
    FROM order_items_vintage
),
overall_revenue AS (
    SELECT c.state AS state,
           SUM(tb.sale_price) AS total_revenue_combined_per_state
    FROM tb
    INNER JOIN orders ord
            ON ord.order_id = tb.order_id
    LEFT JOIN customers c
           ON c.customer_id = tb.user_id
    WHERE ord.status NOT IN ('Returned','Cancelled')
    GROUP BY c.state
),
nike_official_revenue AS (
    SELECT c.state AS state,
           SUM(oi.sale_price) AS total_nike_official_revenue_per_state
    FROM order_items oi
    INNER JOIN orders ord
            ON ord.order_id = oi.order_id
    LEFT JOIN customers c
           ON c.customer_id = oi.user_id
    WHERE ord.status NOT IN ('Returned','Cancelled')
    GROUP BY c.state
)

SELECT ovr.state,
       ovr.total_revenue_combined_per_state,
       COALESCE(nor.total_nike_official_revenue_per_state, 0) AS total_nike_official_revenue_per_state,
       COALESCE(nor.total_nike_official_revenue_per_state, 0)
           / NULLIF(ovr.total_revenue_combined_per_state, 0) AS perc_nike_official
FROM overall_revenue ovr
LEFT JOIN nike_official_revenue nor
       ON ovr.state IS NOT DISTINCT FROM nor.state
ORDER BY ovr.total_revenue_combined_per_state DESC NULLS LAST
"""
pd.read_sql(sa.text(query), connection)

Unnamed: 0,state,total_revenue_combined_per_state,perc_nike_official
0,,74396.250098,0.604483
1,Pennsylvania,38534.780054,0.629504
2,Illinois,35620.060029,0.675295
3,Florida,35336.120051,0.677667
4,New York,34993.620029,0.654223
5,Ohio,32292.960028,0.698851
6,Texas,32268.810068,0.666861
7,California,29574.010065,0.707987
8,US State,5343.880009,0.61077


In [None]:
# Question #4:
# Create an overview of the orders by state.
# Summarize for each customer the number of orders that have status of Complete,
# or Canceled (Returned or Cancelled).

# Exclude all orders that are still in progress (Processing or Shipped)
# and only include orders for customers that have a state available.
# part1 - total orders per state, excluding 'Processing' and 'Shipped'.
# The inner JOIN drops orders whose user_id has no row in `customers`.
# NOTE(review): a customer row with a NULL state would still form its own
# group here - confirm none exist or filter them out explicitly.
query = """

SELECT customers.state,COUNT(DISTINCT orders.order_id) AS total_orders FROM orders
JOIN customers
ON customers.customer_id=orders.user_id
WHERE orders.status NOT IN ('Processing','Shipped')
GROUP BY customers.state

"""
pd.read_sql(sa.text(query),connection)




Unnamed: 0,state,total_orders
0,California,493
1,Florida,498
2,Illinois,532
3,New York,524
4,Ohio,501
5,Pennsylvania,481
6,Texas,491
7,US State,64


In [None]:
# Question #4:
# Create an overview of the orders by state.
# Summarize for each customer the number of orders that have status of Complete,
# or Canceled (Returned or Cancelled).

# Exclude all orders that are still in progress (Processing or Shipped)
# and only include orders for customers that have a state available.
# part2 - completed orders per state (status = 'Complete').
# The COUNT column has no alias, so it is returned under the name "count".



query = """

SELECT customers.state,COUNT(DISTINCT orders.order_id) FROM orders
JOIN customers
ON customers.customer_id=orders.user_id
WHERE orders.status='Complete'
GROUP BY customers.state

"""
pd.read_sql(sa.text(query),connection)

Unnamed: 0,state,count
0,California,235
1,Florida,253
2,Illinois,263
3,New York,260
4,Ohio,238
5,Pennsylvania,243
6,Texas,221
7,US State,30


In [None]:
# Question #4:
# Create an overview of the orders by state.
# Summarize for each customer the number of orders that have status of Complete,
# or Canceled (Returned or Cancelled).

# Exclude all orders that are still in progress (Processing or Shipped)
# and only include orders for customers that have a state available.
# part3 - cancelled orders per state (status 'Cancelled' or 'Returned').
# The COUNT column has no alias, so it is returned under the name "count".


query = """

SELECT customers.state,COUNT(DISTINCT orders.order_id) FROM orders
JOIN customers
ON customers.customer_id=orders.user_id
WHERE orders.status IN ('Cancelled','Returned')
GROUP BY customers.state

"""
pd.read_sql(sa.text(query),connection)

Unnamed: 0,state,count
0,California,258
1,Florida,245
2,Illinois,269
3,New York,264
4,Ohio,263
5,Pennsylvania,238
6,Texas,270
7,US State,34


In [None]:
# Question #4:
# Create an overview of the orders by state.
# Summarize for each customer the number of orders that have status of Complete,
# or Canceled (Returned or Cancelled).

# Exclude all orders that are still in progress (Processing or Shipped)
# and only include orders for customers that have a state available.
# part4 - combine the three per-state counts in one result.
# Each CTE counts distinct orders per state for one status filter:
#   total     - everything except 'Processing' / 'Shipped'
#   completed - status 'Complete'
#   cancelled - status 'Cancelled' or 'Returned'
# `total` drives the result; the LEFT JOINs leave NULL in completed_orders or
# cancelled_orders for a state that has no orders of that kind.

query = """

WITH total AS (SELECT customers.state AS state,COUNT(DISTINCT orders.order_id) AS total_orders FROM orders
JOIN customers
ON customers.customer_id=orders.user_id
WHERE orders.status NOT IN ('Processing','Shipped')
GROUP BY customers.state),

completed AS (
SELECT customers.state AS state,COUNT(DISTINCT orders.order_id) AS completed_orders FROM orders
JOIN customers
ON customers.customer_id=orders.user_id
WHERE orders.status='Complete'
GROUP BY customers.state
),

cancelled AS (
SELECT customers.state AS state,COUNT(DISTINCT orders.order_id) AS cancelled_orders FROM orders
JOIN customers
ON customers.customer_id=orders.user_id
WHERE orders.status IN ('Cancelled','Returned')
GROUP BY customers.state
)

SELECT total.state, total.total_orders,completed.completed_orders, cancelled.cancelled_orders
FROM total
LEFT JOIN completed
ON total.state=completed.state
LEFT JOIN cancelled
ON total.state=cancelled.state
"""
pd.read_sql(sa.text(query),connection)


Unnamed: 0,state,total_orders,completed_orders,cancelled_orders
0,California,493,235,258
1,Florida,498,253,245
2,Illinois,532,263,269
3,New York,524,260,264
4,Ohio,501,238,263
5,Pennsylvania,481,243,238
6,Texas,491,221,270
7,US State,64,30,34


In [None]:
# Question #4:
# Create an overview of the orders by state.
# Summarize for each customer the number of orders that have status of Complete,
# or Canceled (Returned or Cancelled).

# Exclude all orders that are still in progress (Processing or Shipped)
# and only include orders for customers that have a state available.
# Alternate method - simpler: one pass with conditional aggregation.
# Each CASE yields 1 for a matching row and NULL otherwise, and COUNT skips
# NULLs, so each COUNT(CASE ...) counts only the rows with that status.
# COUNT(*) has no alias and is returned as "count"; it counts joined rows,
# whereas the CTE version above counts DISTINCT order_id.

query = """
SELECT state,COUNT(*),
COUNT(CASE WHEN status='Complete' THEN 1 END) AS completed_orders,
COUNT(CASE WHEN status IN ('Cancelled','Returned') THEN 1 END) AS cancelled_orders

FROM orders
JOIN customers
ON orders.user_id=customers.customer_id
WHERE status IN ('Complete','Cancelled','Returned')
GROUP BY state
"""
pd.read_sql(sa.text(query),connection)


Unnamed: 0,state,count,completed_orders,cancelled_orders
0,Pennsylvania,481,243,238
1,California,493,235,258
2,Florida,498,253,245
3,Illinois,532,263,269
4,New York,524,260,264
5,Ohio,501,238,263
6,Texas,491,221,270
7,US State,64,30,34


In [None]:
# Rank every (state, age_group) combination by its number of customers.
# ROW_NUMBER() OVER (ORDER BY ...) numbers the rows across the whole result,
# starting at 1 for the largest num_customers. Rows with equal counts still get
# different numbers, in no guaranteed order.

query = """

WITH total_customers AS (

SELECT state,
         age_group,
         COUNT(customer_id) AS num_customers


FROM customers

GROUP BY state,
         age_group

  )

SELECT *,
			 ROW_NUMBER() OVER (ORDER BY num_customers DESC) AS rank

FROM total_customers
"""
pd.read_sql(sa.text(query),connection)


Unnamed: 0,state,age_group,num_customers,rank
0,Illinois,18-24,389,1
1,New York,18-24,384,2
2,Florida,18-24,377,3
3,Pennsylvania,18-24,366,4
4,Texas,18-24,357,5
5,Ohio,18-24,346,6
6,California,18-24,345,7
7,New York,45+,227,8
8,Illinois,25-34,220,9
9,Texas,45+,219,10


In [None]:
# Rank age groups within each state by number of customers.
# PARTITION BY state restarts the ROW_NUMBER() numbering for every state, so
# rank 1 is the largest age group of that state.
query = """

WITH total_customers AS (

  SELECT state, age_group, COUNT(DISTINCT customer_id) as num_customers
  FROM customers
  GROUP BY state,age_group
)

SELECT *,
ROW_NUMBER() OVER(PARTITION BY state ORDER BY num_customers DESC) AS rank
FROM total_customers

"""
pd.read_sql(sa.text(query),connection)


Unnamed: 0,state,age_group,num_customers,rank
0,California,18-24,345,1
1,California,25-34,197,2
2,California,45+,174,3
3,California,34-45,159,4
4,Florida,18-24,377,1
5,Florida,34-45,211,2
6,Florida,25-34,188,3
7,Florida,45+,177,4
8,Illinois,18-24,389,1
9,Illinois,25-34,220,2


In [None]:
# Calculate the difference in the number of customers between the
# maximum number of customers per state and
# the number of customers for each state and age_group combination.
#
# MAX(...) OVER (PARTITION BY state) is evaluated over the whole partition.
# No ORDER BY belongs inside the window: with one, the default frame ends at
# the current row, which turns MAX into a running maximum that only equals the
# state maximum when the sort happens to be descending.
# The ORDER BY at the end of the query only controls how the rows are displayed.
query = """
WITH total_customers AS (
    SELECT state,
           age_group,
           COUNT(DISTINCT customer_id) AS num_customers
    FROM customers
    GROUP BY state, age_group
)

SELECT *,
       num_customers - MAX(num_customers) OVER (PARTITION BY state) AS difference_to_max
FROM total_customers
ORDER BY state, num_customers DESC
"""
pd.read_sql(sa.text(query), connection)


Unnamed: 0,state,age_group,num_customers,difference_to_max
0,California,18-24,345,0
1,California,25-34,197,-148
2,California,45+,174,-171
3,California,34-45,159,-186
4,Florida,18-24,377,0
5,Florida,34-45,211,-166
6,Florida,25-34,188,-189
7,Florida,45+,177,-200
8,Illinois,18-24,389,0
9,Illinois,25-34,220,-169


In [None]:
# Creating subtotals and a grand total using ROLLUP in GROUP BY.
# part1 - plain ROLLUP: NULL values appear in place of the subtotal and total labels.
# ROLLUP(state, age_group) adds one subtotal row per state (age_group is NULL)
# and one grand-total row (state and age_group are both NULL).
query = """
SELECT state,
age_group,
COUNT(DISTINCT customer_id) AS num_customers
FROM customers

GROUP BY ROLLUP(state,age_group)


"""
pd.read_sql(sa.text(query),connection)


Unnamed: 0,state,age_group,num_customers
0,California,18-24,345
1,California,25-34,197
2,California,34-45,159
3,California,45+,174
4,California,,875
5,Florida,18-24,377
6,Florida,25-34,188
7,Florida,34-45,211
8,Florida,45+,177
9,Florida,,953


In [None]:
# Creating subtotals and a grand total using ROLLUP in GROUP BY.
# part2 - replacing the NULL placeholders using COALESCE.
# The COALESCE expressions are aliased back to `state` and `age_group`;
# without aliases both columns come back under the generic name "coalesce".
# The grand-total row (state and age_group both NULL) is labelled
# 'Total' / 'Subtotal'.
# Caveat: COALESCE cannot tell a rollup NULL from a NULL stored in the data;
# GROUPING() can be used if that distinction matters.
query = """
SELECT COALESCE(state,'Total') AS state,
       COALESCE(age_group,'Subtotal') AS age_group,
       COUNT(DISTINCT customer_id) AS num_customers
FROM customers
GROUP BY ROLLUP(state,age_group)
"""
pd.read_sql(sa.text(query), connection)


Unnamed: 0,coalesce,coalesce.1,num_customers
0,California,18-24,345
1,California,25-34,197
2,California,34-45,159
3,California,45+,174
4,California,Subtotal,875
5,Florida,18-24,377
6,Florida,25-34,188
7,Florida,34-45,211
8,Florida,45+,177
9,Florida,Subtotal,953
