In [57]:
# Import necessary modules
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables from .env file
# Ensure the .env file is in the parent directory or specify the path if elsewhere
dotenv_path = os.path.join(os.path.dirname('__file__'), '..', '.env') # Assumes .env is one level up
load_dotenv(dotenv_path=dotenv_path)
print(f"Attempting to load .env from: {dotenv_path}") # Debugging print
print(f".env loaded: {load_dotenv(dotenv_path=dotenv_path)}") # Check if loading was successful

# Establish database connection using credentials from environment variables
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Check if all necessary variables are loaded
if None in (DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME):
    # Print which variables are missing for debugging
    missing_vars = [var for var, val in {'DB_USER': DB_USER, 'DB_PASSWORD': DB_PASSWORD, 'DB_HOST': DB_HOST, 'DB_PORT': DB_PORT, 'DB_NAME': DB_NAME}.items() if val is None]
    raise Exception(f"Database credentials not fully set. Missing: {', '.join(missing_vars)}")

print("Database credentials loaded successfully.") # Confirmation

try:
    # Create SQLAlchemy engine
    conn_string = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
    engine = create_engine(conn_string)
    # Test connection
    with engine.connect() as connection:
        print("Database connection successful!")
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

# Set pandas display options (optional, you already have this later)
pd.set_option('display.max_rows', None)

Attempting to load .env from: ..\.env
.env loaded: True
Database credentials loaded successfully.
Database connection successful!


# Descriptive Analytics Query (API Data)

### Business Question:
Which GICS sectors show the strongest momentum in terms of risk-adjusted returns (Sharpe ratio) over rolling 3-month periods in the past year, and how does this compare to their historical patterns?

This would help identify:
- Which sectors are currently delivering the best returns per unit of risk
- If there are seasonal patterns in sector performance
- Whether current sector performance aligns with historical trends

In [52]:
# Descriptive Analytics Query for API Data
sql_query = '''
WITH daily_returns AS (
    SELECT 
        s.symbol,
        s.gics_sector,
        d.trade_date,
        d.year,
        d.month,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE d.trade_date >= DATE_SUB(CURDATE(), INTERVAL 5 YEAR)
),
sector_returns AS (
    SELECT 
        gics_sector,
        trade_date,
        AVG(daily_return) as avg_daily_return,
        STDDEV(daily_return) as daily_volatility,
        COUNT(DISTINCT symbol) as num_stocks
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY gics_sector, trade_date
),
rolling_metrics AS (
    SELECT 
        gics_sector,
        trade_date,
        AVG(avg_daily_return) OVER (
            PARTITION BY gics_sector 
            ORDER BY trade_date 
            ROWS BETWEEN 63 PRECEDING AND CURRENT ROW
        ) * 252 AS annualized_return,
        STDDEV(avg_daily_return) OVER (
            PARTITION BY gics_sector 
            ORDER BY trade_date 
            ROWS BETWEEN 63 PRECEDING AND CURRENT ROW
        ) * SQRT(252) AS annualized_volatility,
        num_stocks,
        MONTH(trade_date) as month,
        YEAR(trade_date) as year
    FROM sector_returns
),
sharpe_ratios AS (
    SELECT 
        gics_sector,
        trade_date,
        year,
        month,
        annualized_return,
        annualized_volatility,
        CASE 
            WHEN annualized_volatility > 0 THEN annualized_return / annualized_volatility
            ELSE NULL 
        END AS sharpe_ratio,
        num_stocks
    FROM rolling_metrics
),
current_metrics AS (
    SELECT 
        gics_sector,
        AVG(sharpe_ratio) AS current_sharpe,
        STDDEV(sharpe_ratio) AS sharpe_volatility,
        COUNT(DISTINCT trade_date) as trading_days,
        AVG(num_stocks) as avg_stocks
    FROM sharpe_ratios
    WHERE trade_date >= DATE_SUB(CURDATE(), INTERVAL 3 MONTH)
    GROUP BY gics_sector
),
historical_metrics AS (
    SELECT 
        gics_sector,
        month,
        AVG(sharpe_ratio) AS avg_historical_sharpe,
        STDDEV(sharpe_ratio) AS historical_sharpe_volatility
    FROM sharpe_ratios
    WHERE trade_date < DATE_SUB(CURDATE(), INTERVAL 3 MONTH)
    GROUP BY gics_sector, month
)
SELECT 
    cm.gics_sector,
    ROUND(cm.current_sharpe, 3) as current_3m_sharpe,
    ROUND(cm.sharpe_volatility, 3) as current_sharpe_volatility,
    ROUND(hm.avg_historical_sharpe, 3) as historical_sharpe_same_month,
    ROUND(hm.historical_sharpe_volatility, 3) as historical_sharpe_volatility,
    ROUND((cm.current_sharpe - hm.avg_historical_sharpe) / 
        CASE 
            WHEN hm.historical_sharpe_volatility = 0 THEN 1 
            ELSE hm.historical_sharpe_volatility 
        END, 2) as sharpe_z_score,
    cm.trading_days,
    ROUND(cm.avg_stocks) as avg_stocks
FROM current_metrics cm
JOIN historical_metrics hm ON cm.gics_sector = hm.gics_sector 
    AND MONTH(CURDATE()) = hm.month
ORDER BY cm.current_sharpe DESC;
'''


In [53]:
api_results = pd.read_sql(sql_query, engine)
api_results


Unnamed: 0,gics_sector,current_3m_sharpe,current_sharpe_volatility,historical_sharpe_same_month,historical_sharpe_volatility,sharpe_z_score,trading_days,avg_stocks
0,Utilities,0.924,1.123,1.168,1.824,-0.13,63,11.0
1,Financials,0.487,1.036,1.086,2.552,-0.23,63,26.0
2,Energy,0.399,1.072,1.021,6.303,-0.1,63,6.0
3,Health Care,0.27,0.871,1.837,3.052,-0.51,63,21.0
4,Consumer Discretionary,-0.294,1.497,1.53,1.815,-1.0,63,14.0
5,Information Technology,-0.543,1.292,1.397,2.085,-0.93,63,19.0
6,Industrials,-0.721,1.598,1.01,1.935,-0.89,63,19.0
7,Real Estate,-0.783,0.638,0.234,2.065,-0.49,63,10.0
8,Communication Services,-1.133,1.006,0.156,2.582,-0.5,63,4.0
9,Consumer Staples,-1.545,1.029,1.718,1.604,-2.03,63,11.0


### Data Dictionary

1. **gics_sector**
   - The Global Industry Classification Standard sector classification
   - Example: "Utilities", "Financials", etc.

2. **current_3m_sharpe** (Current 3-Month Sharpe Ratio)
   - The risk-adjusted return for the most recent 3-month period
   - Higher is better (shows better returns per unit of risk)
   - Example: Utilities at 0.924 shows positive risk-adjusted returns
   - Calculated as: (annualized return) / (annualized volatility)

3. **current_sharpe_volatility**
   - How much the Sharpe ratio fluctuated within the current 3-month period
   - Higher numbers indicate less stable performance
   - Example: Industrials at 1.598 shows high variability in risk-adjusted returns

4. **historical_sharpe_same_month**
   - The average Sharpe ratio for the same calendar month in previous years
   - Used as a seasonal benchmark
   - Example: For April 2025, this would be the average of April performances from previous years
   - Helps identify if current performance is unusual for this time of year

5. **historical_sharpe_volatility**
   - How much the Sharpe ratio typically varies in this calendar month historically
   - Higher numbers indicate this month tends to be more unpredictable
   - Example: Energy at 6.303 shows this sector historically has very volatile performance in this month

6. **sharpe_z_score**
   - How many standard deviations the current Sharpe ratio is from its historical average
   - Negative means worse than historical average
   - Example: Consumer Staples at -2.03 is performing unusually poorly compared to history
   - Values beyond ±2 are statistically significant

7. **trading_days**
   - Number of trading days in the current 3-month period
   - Should be around 63 (21 trading days × 3 months)
   - Consistent 63 across all sectors shows complete data coverage

8. **avg_stocks**
   - Average number of companies in each sector in our dataset
   - Helps assess how representative the sector metrics are
   - Example: Financials with 26 stocks vs Communication Services with 4 stocks
   - Larger numbers generally mean more reliable sector-wide metrics


### Analysis:
*   **Insight:**
*   **Recommendation:** 
*   **Prediction:** 

# Diagnostic Analytics Query (API Data)

### Business Question:
Which GICS sector in the S&P 500 has experienced the largest year-over-year increase in daily price volatility since 2014, and can we identify particular sub-industries (or individual companies) within that sector driving this trend?

### Query 1:
Used to find which GICS sector has experienced the largest year over year increase in daily price volatility since 2014

In [64]:
# Diagnostic Analytics Query for API Data
# Important to note that it is filtering for high volatility periods only
sql_query = '''
WITH daily_returns AS (
    SELECT 
        s.symbol,
        s.gics_sector,
        d.year,
        d.trade_date,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE d.year >= 2014
),
annual_volatility AS (
    SELECT 
        gics_sector,
        year,
        COUNT(DISTINCT symbol) as num_stocks,
        COUNT(DISTINCT trade_date) as trading_days,
        STDDEV(daily_return) * SQRT(252) as annual_volatility  -- Annualize the daily volatility
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY gics_sector, year
),
yoy_changes AS (
    SELECT 
        av.gics_sector,
        av.year,
        av.num_stocks,
        av.trading_days,
        av.annual_volatility as current_volatility,
        LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year) as prev_year_volatility,
        -- Calculate year-over-year change
        (av.annual_volatility - LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year)) 
            / LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year) * 100 as volatility_change_pct,
        -- Absolute change in volatility
        (av.annual_volatility - LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year)) as volatility_change_abs
    FROM annual_volatility av
)
SELECT 
    gics_sector,
    year,
    num_stocks,
    trading_days,
    ROUND(current_volatility, 4) as current_volatility,
    ROUND(prev_year_volatility, 4) as prev_year_volatility,
    ROUND(volatility_change_pct, 2) as volatility_change_pct,
    ROUND(volatility_change_abs, 4) as volatility_change_abs
FROM yoy_changes
WHERE volatility_change_pct IS NOT NULL  -- Remove first year where we can't calculate YoY change
ORDER BY 
    gics_sector,  -- Order by sector first
    year;         -- Then chronologically within each sector
'''


In [65]:
# Execute diagnostic query for API data
api_diag_results = pd.read_sql(sql_query, engine)
api_diag_results

Unnamed: 0,gics_sector,year,num_stocks,trading_days,current_volatility,prev_year_volatility,volatility_change_pct,volatility_change_abs
0,Communication Services,2020,4,253,0.3995,0.2617,52.69,0.1379
1,Communication Services,2021,4,252,0.2467,0.3995,-38.26,-0.1529
2,Communication Services,2022,4,251,0.339,0.2467,37.41,0.0923
3,Communication Services,2023,4,250,0.2592,0.339,-23.54,-0.0798
4,Communication Services,2024,4,252,0.2833,0.2592,9.3,0.0241
5,Communication Services,2025,4,69,0.3902,0.2833,37.76,0.107
6,Consumer Discretionary,2020,14,253,0.723,0.2806,157.62,0.4424
7,Consumer Discretionary,2021,14,252,0.39,0.723,-46.06,-0.333
8,Consumer Discretionary,2022,14,251,0.5344,0.39,37.05,0.1445
9,Consumer Discretionary,2023,14,250,0.3365,0.5344,-37.03,-0.1979


### Query 2:
Used to identify which sub-sectors and individual companies are driving this change

In [62]:
sql_query = '''
WITH daily_returns AS (
    SELECT 
        s.symbol,
        s.security,
        s.gics_sector,
        s.gics_industry,
        d.year,
        d.month,
        d.trade_date,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE d.year >= 2014
        AND s.gics_sector IN ('Energy', 'Financials')  -- Focus on sectors with highest volatility changes
),
company_volatility AS (
    SELECT 
        symbol,
        security,
        gics_sector,
        gics_industry,
        year,
        COUNT(DISTINCT trade_date) as trading_days,
        STDDEV(daily_return) * SQRT(252) as annual_volatility,
        AVG(daily_return) * 252 as annual_return,
        COUNT(*) as observations
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY symbol, security, gics_sector, gics_industry, year
),
industry_volatility AS (
    SELECT 
        gics_sector,
        gics_industry,
        year,
        COUNT(DISTINCT symbol) as num_companies,
        AVG(annual_volatility) as industry_volatility,
        STDDEV(annual_volatility) as volatility_dispersion,
        AVG(annual_return) as industry_return
    FROM company_volatility
    GROUP BY gics_sector, gics_industry, year
),
volatility_ranks AS (
    SELECT 
        cv.*,
        iv.industry_volatility,
        iv.num_companies,
        iv.industry_return,
        -- Rank companies within their industry by volatility
        RANK() OVER (PARTITION BY cv.gics_industry, cv.year ORDER BY cv.annual_volatility DESC) as volatility_rank,
        -- Calculate how much company volatility deviates from industry average
        (cv.annual_volatility - iv.industry_volatility) / iv.industry_volatility * 100 as volatility_vs_industry_pct
    FROM company_volatility cv
    JOIN industry_volatility iv 
        ON cv.gics_industry = iv.gics_industry 
        AND cv.year = iv.year
)
SELECT 
    gics_sector,
    gics_industry,
    year,
    symbol,
    security,
    ROUND(annual_volatility, 4) as annual_volatility,
    ROUND(annual_return * 100, 2) as annual_return_pct,
    ROUND(industry_volatility, 4) as industry_volatility,
    ROUND(volatility_vs_industry_pct, 2) as pct_above_industry_avg,
    volatility_rank,
    num_companies,
    trading_days
FROM volatility_ranks
WHERE (year = 2020 OR year = 2025)  -- Focus on COVID and current period
    AND volatility_rank <= 3  -- Show top 3 most volatile companies per industry
ORDER BY 
    year DESC,
    gics_sector,
    gics_industry,
    volatility_rank;
'''


In [63]:
# Execute diagnostic query for API data
api_diag_results = pd.read_sql(sql_query, engine)
api_diag_results

Unnamed: 0,gics_sector,gics_industry,year,symbol,security,annual_volatility,annual_return_pct,industry_volatility,pct_above_industry_avg,volatility_rank,num_companies,trading_days
0,Energy,Integrated Oil & Gas,2025,CVX,Chevron Corporation,0.3465,-13.84,0.3465,0.0,1,1,69
1,Energy,Oil & Gas Equipment & Services,2025,BKR,Baker Hughes,0.4842,-14.79,0.4842,0.0,1,1,69
2,Energy,Oil & Gas Exploration & Production,2025,APA,APA Corporation,0.7148,-126.18,0.5354,33.51,1,4,69
3,Energy,Oil & Gas Exploration & Production,2025,DVN,Devon Energy,0.599,-33.29,0.5354,11.88,2,4,69
4,Energy,Oil & Gas Exploration & Production,2025,COP,ConocoPhillips,0.4707,-36.09,0.5354,-12.08,3,4,69
5,Financials,Asset Management & Custody Banks,2025,APO,Apollo Global Management,0.5826,-80.86,0.4443,31.12,1,6,69
6,Financials,Asset Management & Custody Banks,2025,BX,Blackstone Inc.,0.5113,-94.86,0.4443,15.08,2,6,69
7,Financials,Asset Management & Custody Banks,2025,BEN,Franklin Resources,0.4269,-30.83,0.4443,-3.92,3,6,69
8,Financials,Consumer Finance,2025,DFS,Discover Financial,0.5728,-18.31,0.5126,11.74,1,3,69
9,Financials,Consumer Finance,2025,COF,Capital One,0.5273,-22.45,0.5126,2.86,2,3,69


### Analysis:
*   **Insight:** 
*   **Recommendation:** 
*   **Prediction:** 