# Step 7 - Answering Data Questions

Importing packages and connecting to database:

In [1]:
import pandas as pd
import os
import sqlalchemy

In [2]:
host = os.environ.get('mysql_host')
user = os.environ.get('mysql_user')
password = os.environ.get('mysql_password')
engine = sqlalchemy.create_engine(f'mysql+pymysql://{user}:{password}@{host}/trading')

## Answering the Questions

**Question 1**

What is the total portfolio value for each mentor at the end of 2020?

In [3]:
# Tables in trading DB:
pd.read_sql_query("""SHOW TABLES""", engine)

Unnamed: 0,Tables_in_trading
0,base_table_step6
1,members
2,prices
3,transactions


To answer this question, we can find all we need in the base table created 
in the [Step 6](inserire link) and in the prices table

In [4]:
pd.read_sql_query("SELECT * FROM base_table_step6 LIMIT 5", engine)

Unnamed: 0,index,first_name,region,ticker,year_end,yearly_quantity,cumulative_quantity
0,0,Vipul,United States,BTC,2017-12-31,433.56,433.56
1,1,Charlie,United States,BTC,2017-12-31,590.32,590.32
2,2,Nandita,United States,BTC,2017-12-31,1021.56,1021.56
3,3,Rowan,United States,BTC,2017-12-31,713.25,713.25
4,4,Ayush,United States,BTC,2017-12-31,794.53,794.53


In [5]:
pd.read_sql_query("SELECT * FROM prices LIMIT 5", engine)

Unnamed: 0,ticker,market_date,price,open,high,low,volume,change
0,ETH,2021-08-29,3177.84,3243.96,3282.21,3162.79,582.04K,-2.04%
1,ETH,2021-08-28,3243.9,3273.78,3284.58,3212.24,466.21K,-0.91%
2,ETH,2021-08-27,3273.58,3093.78,3279.93,3063.37,839.54K,5.82%
3,ETH,2021-08-26,3093.54,3228.03,3249.62,3057.48,118.44K,-4.17%
4,ETH,2021-08-25,3228.15,3172.12,3247.43,3080.7,923.13K,1.73%


In [6]:
pd.read_sql_query("""
    SELECT
      b.first_name,
      SUM(b.cumulative_quantity * p.price) AS portfolio_value 
    FROM base_table_step6 b
    INNER JOIN prices p
      ON b.year_end=p.market_date AND b.ticker=p.ticker
    WHERE b.year_end='2020-12-31'
    GROUP BY b.first_name
    ORDER BY portfolio_value DESC;
    """, 
    engine)

Unnamed: 0,first_name,portfolio_value
0,Nandita,105391800.0
1,Leah,100724100.0
2,Ayush,100071600.0
3,Abe,95203770.0
4,Ben,92722790.0
5,Enoch,88346420.0
6,Vikram,88000690.0
7,Danny,84696270.0
8,Sonia,67931910.0
9,Rowan,67241010.0


**Question 2**

What is the toal portfolio value for each region at the end of 2019?

We can use the same query used to solve the first question replacing the name
column with the region.

In [7]:
pd.read_sql_query("""
    SELECT
      b.region,
      SUM(b.cumulative_quantity * p.price) AS portfolio_value 
    FROM base_table_step6 b
    INNER JOIN prices p
      ON b.year_end=p.market_date AND b.ticker=p.ticker
    WHERE b.year_end='2019-12-31'
    GROUP BY b.region
    ORDER BY portfolio_value DESC;
    """,
    engine)

Unnamed: 0,region,portfolio_value
0,United States,98795320.0
1,Australia,52861330.0
2,Asia,18305320.0
3,India,16168850.0
4,Africa,16078850.0


**Question 3**

What percentage of regional portfolio values does each mentor contribute at the end of 2018? 

To answer this question we have to create two temporary tables, one in order 
to have the porfolio value by mentor and one by region.

In [8]:
pd.read_sql_query("""
    WITH cte_mentor_portfolio AS (
        SELECT
          b.region,
          b.first_name,
          ROUND(
            SUM(b.cumulative_quantity * p.price),
              2
               ) AS portfolio_value 
        FROM base_table_step6 b
        INNER JOIN prices p
          ON b.year_end=p.market_date AND b.ticker=p.ticker
        WHERE b.year_end='2018-12-31'
        GROUP BY b.first_name, b.region),
    
    cte_region_portfolio AS (
        SELECT
          b.region,
          ROUND(
            SUM(b.cumulative_quantity * p.price),
              2
               ) AS portfolio_value 
        FROM base_table_step6 b
        INNER JOIN prices p
          ON b.year_end=p.market_date AND b.ticker=p.ticker
        WHERE b.year_end='2018-12-31'
        GROUP BY b.region
    )

    SELECT
      m.region,
      m.first_name,
      ROUND(100 * m.portfolio_value / r.portfolio_value, 2) AS mentor_contribution_percentage
    FROM cte_mentor_portfolio m
    INNER JOIN cte_region_portfolio r
      ON m.region=r.region
    ORDER BY r.portfolio_value DESC, mentor_contribution_percentage DESC;
    ;
    """,
    engine)

Unnamed: 0,region,first_name,mentor_contribution_percentage
0,United States,Nandita,20.0
1,United States,Ayush,18.61
2,United States,Abe,17.45
3,United States,Rowan,14.34
4,United States,Charlie,12.52
5,United States,Alex,10.0
6,United States,Vipul,7.08
7,Australia,Danny,31.55
8,Australia,Ben,30.41
9,Australia,Sonia,24.53


**Question 4**

Does this region contribution percentage change when we look across both Bitcoin and Ethereum portfolios independently at the end of 2017?

We can use a similar approach to question 3.

In [9]:
pd.read_sql_query("""
        SELECT
          b.region,
          b.first_name,
          b.ticker,
          ROUND(
            SUM(b.cumulative_quantity * p.price),
              2
               ) AS portfolio_value 
        FROM base_table_step6 b
        INNER JOIN prices p
          ON b.year_end=p.market_date 
          AND b.ticker=p.ticker
        WHERE b.year_end='2017-12-31'
        GROUP BY b.first_name, b.region
        LIMIT 5;""",
    engine)

Unnamed: 0,region,first_name,ticker,portfolio_value
0,Australia,Sonia,ETH,8596292.6
1,Australia,Pavan,ETH,5663369.72
2,Asia,Leah,ETH,12593095.5
3,India,Vikram,ETH,10574383.43
4,Africa,Enoch,ETH,10883584.91


In [10]:
query = """
WITH cte_mentor_portfolio AS (
  SELECT
    base.region,
    base.first_name,
    base.ticker,
    base.cumulative_quantity * prices.price AS portfolio_value
  FROM base_table_step6 AS base
  INNER JOIN prices
    ON base.ticker = prices.ticker
    AND base.year_end = prices.market_date
  WHERE base.year_end = '2017-12-31'
),
cte_region_portfolio AS (
SELECT
  region,
  first_name,
  ticker,
  portfolio_value,
  SUM(portfolio_value) OVER (
    PARTITION BY region, ticker
  ) AS region_total
FROM cte_mentor_portfolio
)
-- final output
SELECT
  region,
  first_name,
  ticker,
  ROUND(100 * portfolio_value / region_total, 2) AS contribution_percentage
FROM cte_region_portfolio
ORDER BY ticker, region, contribution_percentage DESC;"""

pd.read_sql_query(query, engine)

Unnamed: 0,region,first_name,ticker,contribution_percentage
0,Africa,Enoch,BTC,100.0
1,Asia,Leah,BTC,100.0
2,Australia,Ben,BTC,32.96
3,Australia,Danny,BTC,29.99
4,Australia,Sonia,BTC,21.86
5,Australia,Pavan,BTC,15.19
6,India,Vikram,BTC,100.0
7,United States,Nandita,BTC,20.99
8,United States,Abe,BTC,17.69
9,United States,Ayush,BTC,16.32


**Bonus - Question 5**

Calculate the ranks for each mentor in the US and Australia for each year and ticker

In [11]:
final_query = """
SELECT
  EXTRACT(YEAR FROM year_end) AS year,
  region,
  first_name,
  ticker,
  RANK() OVER (
    PARTITION BY region, year_end
    ORDER BY cumulative_quantity DESC
  ) AS ranking
FROM base_table_step6
WHERE region IN ('United States', 'Australia')
ORDER BY year_end, region, ranking;"""
pd.read_sql_query(final_query, engine)

Unnamed: 0,year,region,first_name,ticker,ranking
0,2017,Australia,Ben,ETH,1
1,2017,Australia,Ben,BTC,2
2,2017,Australia,Danny,ETH,3
3,2017,Australia,Sonia,ETH,4
4,2017,Australia,Danny,BTC,5
...,...,...,...,...,...
105,2021,United States,Abe,ETH,10
106,2021,United States,Charlie,ETH,11
107,2021,United States,Vipul,BTC,12
108,2021,United States,Rowan,ETH,13


Now we can pivote this long table in a slightly easier to read wide table.

In [38]:
final_query2 = """
WITH cte_ranks AS (
SELECT
  EXTRACT(YEAR FROM year_end) AS year,
  region,
  first_name,
  ticker,
  RANK() OVER (
    PARTITION BY region, ticker, year_end
    ORDER BY cumulative_quantity DESC
  ) AS ranking
FROM base_table_step6
WHERE region IN ('United States', 'Australia')
)

SELECT
  region,
  first_name,
  MAX(CASE WHEN ticker = 'BTC' AND year = 2017 THEN ranking ELSE NULL END) AS "BTC 2017",
  MAX(CASE WHEN ticker = 'BTC' AND year = 2018 THEN ranking ELSE NULL END) AS "BTC 2018",
  MAX(CASE WHEN ticker = 'BTC' AND year = 2019 THEN ranking ELSE NULL END) AS "BTC 2019",
  MAX(CASE WHEN ticker = 'BTC' AND year = 2020 THEN ranking ELSE NULL END) AS "BTC 2020",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2017 THEN ranking ELSE NULL END) AS "ETH 2017",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2018 THEN ranking ELSE NULL END) AS "ETH 2018",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2019 THEN ranking ELSE NULL END) AS "ETH 2019",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2020 THEN ranking ELSE NULL END) AS "ETH 2020"
FROM cte_ranks
GROUP BY region, first_name
ORDER BY region, "BTC 2017";
"""
pd.read_sql_query(final_query2, engine)

Unnamed: 0,region,first_name,BTC 2017,BTC 2018,BTC 2019,BTC 2020,ETH 2017,ETH 2018,ETH 2019,ETH 2020
0,Australia,Ben,1,2,1,1,1,1,1,1
1,Australia,Danny,2,1,2,2,2,2,2,2
2,Australia,Pavan,4,4,4,4,4,4,4,4
3,Australia,Sonia,3,3,3,3,3,3,3,3
4,United States,Abe,2,3,3,3,4,4,4,4
5,United States,Alex,6,6,6,6,3,3,3,3
6,United States,Ayush,3,2,2,2,7,7,7,7
7,United States,Charlie,5,5,5,5,5,5,5,5
8,United States,Nandita,1,1,1,1,1,2,2,2
9,United States,Rowan,4,4,4,4,6,6,6,6


I don't know why but ordering per region and "BTC 2017" columns it orders the table
by region and first name first.
Without `ORDER BY` clause, or ordering only by "BAT 2017" columns, it returns the right results. 

In [42]:
final_query2 = """
WITH cte_ranks AS (
SELECT
  EXTRACT(YEAR FROM year_end) AS year,
  region,
  first_name,
  ticker,
  RANK() OVER (
    PARTITION BY region, ticker, year_end
    ORDER BY cumulative_quantity DESC
  ) AS ranking
FROM base_table_step6
WHERE region IN ('United States', 'Australia')
)

SELECT
  region,
  first_name,
  MAX(CASE WHEN ticker = 'BTC' AND year = 2017 THEN ranking ELSE NULL END) AS "BTC 2017",
  MAX(CASE WHEN ticker = 'BTC' AND year = 2018 THEN ranking ELSE NULL END) AS "BTC 2018",
  MAX(CASE WHEN ticker = 'BTC' AND year = 2019 THEN ranking ELSE NULL END) AS "BTC 2019",
  MAX(CASE WHEN ticker = 'BTC' AND year = 2020 THEN ranking ELSE NULL END) AS "BTC 2020",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2017 THEN ranking ELSE NULL END) AS "ETH 2017",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2018 THEN ranking ELSE NULL END) AS "ETH 2018",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2019 THEN ranking ELSE NULL END) AS "ETH 2019",
  MAX(CASE WHEN ticker = 'ETH' AND year = 2020 THEN ranking ELSE NULL END) AS "ETH 2020"
FROM cte_ranks
GROUP BY region, first_name
#ORDER BY "BTC 2017"
;
"""
pd.read_sql_query(final_query2, engine)

Unnamed: 0,region,first_name,BTC 2017,BTC 2018,BTC 2019,BTC 2020,ETH 2017,ETH 2018,ETH 2019,ETH 2020
0,Australia,Ben,1,2,1,1,1,1,1,1
1,Australia,Danny,2,1,2,2,2,2,2,2
2,Australia,Sonia,3,3,3,3,3,3,3,3
3,Australia,Pavan,4,4,4,4,4,4,4,4
4,United States,Nandita,1,1,1,1,1,2,2,2
5,United States,Abe,2,3,3,3,4,4,4,4
6,United States,Ayush,3,2,2,2,7,7,7,7
7,United States,Rowan,4,4,4,4,6,6,6,6
8,United States,Charlie,5,5,5,5,5,5,5,5
9,United States,Alex,6,6,6,6,3,3,3,3


# References
- [Data With Danny Course - Step 7](https://github.com/DataWithDanny/sql-masterclass/blob/main/course-content/step7.md)