In [57]:
# Import necessary modules
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables from .env file
# Ensure the .env file is in the parent directory or specify the path if elsewhere
dotenv_path = os.path.join(os.path.dirname('__file__'), '..', '.env') # Assumes .env is one level up
load_dotenv(dotenv_path=dotenv_path)
print(f"Attempting to load .env from: {dotenv_path}") # Debugging print
print(f".env loaded: {load_dotenv(dotenv_path=dotenv_path)}") # Check if loading was successful

# Establish database connection using credentials from environment variables
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Check if all necessary variables are loaded
if None in (DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME):
    # Print which variables are missing for debugging
    missing_vars = [var for var, val in {'DB_USER': DB_USER, 'DB_PASSWORD': DB_PASSWORD, 'DB_HOST': DB_HOST, 'DB_PORT': DB_PORT, 'DB_NAME': DB_NAME}.items() if val is None]
    raise Exception(f"Database credentials not fully set. Missing: {', '.join(missing_vars)}")

print("Database credentials loaded successfully.") # Confirmation

try:
    # Create SQLAlchemy engine
    conn_string = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
    engine = create_engine(conn_string)
    # Test connection
    with engine.connect() as connection:
        print("Database connection successful!")
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

# Set pandas display options (optional, you already have this later)
pd.set_option('display.max_rows', None)

Attempting to load .env from: ..\.env
.env loaded: True
Database credentials loaded successfully.
Database connection successful!


# Descriptive Analytics Query (API Data)

### Business Question:
Which GICS sectors show the strongest momentum in terms of risk-adjusted returns (Sharpe ratio) over rolling 3-month periods in the past year, and how does this compare to their historical patterns?

This would help identify:
- Which sectors are currently delivering the best returns per unit of risk
- If there are seasonal patterns in sector performance
- Whether current sector performance aligns with historical trends

In [52]:
# Descriptive Analytics Query for API Data
sql_query = '''
WITH daily_returns AS (
    SELECT 
        s.symbol,
        s.gics_sector,
        d.trade_date,
        d.year,
        d.month,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE d.trade_date >= DATE_SUB(CURDATE(), INTERVAL 5 YEAR)
),
sector_returns AS (
    SELECT 
        gics_sector,
        trade_date,
        AVG(daily_return) as avg_daily_return,
        STDDEV(daily_return) as daily_volatility,
        COUNT(DISTINCT symbol) as num_stocks
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY gics_sector, trade_date
),
rolling_metrics AS (
    SELECT 
        gics_sector,
        trade_date,
        AVG(avg_daily_return) OVER (
            PARTITION BY gics_sector 
            ORDER BY trade_date 
            ROWS BETWEEN 63 PRECEDING AND CURRENT ROW
        ) * 252 AS annualized_return,
        STDDEV(avg_daily_return) OVER (
            PARTITION BY gics_sector 
            ORDER BY trade_date 
            ROWS BETWEEN 63 PRECEDING AND CURRENT ROW
        ) * SQRT(252) AS annualized_volatility,
        num_stocks,
        MONTH(trade_date) as month,
        YEAR(trade_date) as year
    FROM sector_returns
),
sharpe_ratios AS (
    SELECT 
        gics_sector,
        trade_date,
        year,
        month,
        annualized_return,
        annualized_volatility,
        CASE 
            WHEN annualized_volatility > 0 THEN annualized_return / annualized_volatility
            ELSE NULL 
        END AS sharpe_ratio,
        num_stocks
    FROM rolling_metrics
),
current_metrics AS (
    SELECT 
        gics_sector,
        AVG(sharpe_ratio) AS current_sharpe,
        STDDEV(sharpe_ratio) AS sharpe_volatility,
        COUNT(DISTINCT trade_date) as trading_days,
        AVG(num_stocks) as avg_stocks
    FROM sharpe_ratios
    WHERE trade_date >= DATE_SUB(CURDATE(), INTERVAL 3 MONTH)
    GROUP BY gics_sector
),
historical_metrics AS (
    SELECT 
        gics_sector,
        month,
        AVG(sharpe_ratio) AS avg_historical_sharpe,
        STDDEV(sharpe_ratio) AS historical_sharpe_volatility
    FROM sharpe_ratios
    WHERE trade_date < DATE_SUB(CURDATE(), INTERVAL 3 MONTH)
    GROUP BY gics_sector, month
)
SELECT 
    cm.gics_sector,
    ROUND(cm.current_sharpe, 3) as current_3m_sharpe,
    ROUND(cm.sharpe_volatility, 3) as current_sharpe_volatility,
    ROUND(hm.avg_historical_sharpe, 3) as historical_sharpe_same_month,
    ROUND(hm.historical_sharpe_volatility, 3) as historical_sharpe_volatility,
    ROUND((cm.current_sharpe - hm.avg_historical_sharpe) / 
        CASE 
            WHEN hm.historical_sharpe_volatility = 0 THEN 1 
            ELSE hm.historical_sharpe_volatility 
        END, 2) as sharpe_z_score,
    cm.trading_days,
    ROUND(cm.avg_stocks) as avg_stocks
FROM current_metrics cm
JOIN historical_metrics hm ON cm.gics_sector = hm.gics_sector 
    AND MONTH(CURDATE()) = hm.month
ORDER BY cm.current_sharpe DESC;
'''


In [53]:
api_results = pd.read_sql(sql_query, engine)
api_results


Unnamed: 0,gics_sector,current_3m_sharpe,current_sharpe_volatility,historical_sharpe_same_month,historical_sharpe_volatility,sharpe_z_score,trading_days,avg_stocks
0,Utilities,0.924,1.123,1.168,1.824,-0.13,63,11.0
1,Financials,0.487,1.036,1.086,2.552,-0.23,63,26.0
2,Energy,0.399,1.072,1.021,6.303,-0.1,63,6.0
3,Health Care,0.27,0.871,1.837,3.052,-0.51,63,21.0
4,Consumer Discretionary,-0.294,1.497,1.53,1.815,-1.0,63,14.0
5,Information Technology,-0.543,1.292,1.397,2.085,-0.93,63,19.0
6,Industrials,-0.721,1.598,1.01,1.935,-0.89,63,19.0
7,Real Estate,-0.783,0.638,0.234,2.065,-0.49,63,10.0
8,Communication Services,-1.133,1.006,0.156,2.582,-0.5,63,4.0
9,Consumer Staples,-1.545,1.029,1.718,1.604,-2.03,63,11.0


### Data Dictionary

1. **gics_sector**
   - The Global Industry Classification Standard sector classification
   - Example: "Utilities", "Financials", etc.

2. **current_3m_sharpe** (Current 3-Month Sharpe Ratio)
   - The risk-adjusted return for the most recent 3-month period
   - Higher is better (shows better returns per unit of risk)
   - Example: Utilities at 0.924 shows positive risk-adjusted returns
   - Calculated as: (annualized return) / (annualized volatility)

3. **current_sharpe_volatility**
   - How much the Sharpe ratio fluctuated within the current 3-month period
   - Higher numbers indicate less stable performance
   - Example: Industrials at 1.598 shows high variability in risk-adjusted returns

4. **historical_sharpe_same_month**
   - The average Sharpe ratio for the same calendar month in previous years
   - Used as a seasonal benchmark
   - Example: For April 2025, this would be the average of April performances from previous years
   - Helps identify if current performance is unusual for this time of year

5. **historical_sharpe_volatility**
   - How much the Sharpe ratio typically varies in this calendar month historically
   - Higher numbers indicate this month tends to be more unpredictable
   - Example: Energy at 6.303 shows this sector historically has very volatile performance in this month

6. **sharpe_z_score**
   - How many standard deviations the current Sharpe ratio is from its historical average
   - Negative means worse than historical average
   - Example: Consumer Staples at -2.03 is performing unusually poorly compared to history
   - Values beyond ±2 are statistically significant

7. **trading_days**
   - Number of trading days in the current 3-month period
   - Should be around 63 (21 trading days × 3 months)
   - Consistent 63 across all sectors shows complete data coverage

8. **avg_stocks**
   - Average number of companies in each sector in our dataset
   - Helps assess how representative the sector metrics are
   - Example: Financials with 26 stocks vs Communication Services with 4 stocks
   - Larger numbers generally mean more reliable sector-wide metrics


### Analysis:
*   **Insight:**
*   **Recommendation:** 
*   **Prediction:** 

# Diagnostic Analytics Query (API Data)

### Business Question:
Which GICS sector in the S&P 500 has experienced the largest year-over-year increase in daily price volatility since 2014, and can we identify particular sub-industries (or individual companies) within that sector driving this trend?

### Query 1:
Used to find which GICS sector has experienced the largest year over year increase in daily price volatility since 2015

In [67]:
# Diagnostic Analytics Query for API Data
# Important to note that it is filtering for high volatility periods only
sql_query = '''
WITH daily_returns AS (
    -- Keep existing daily returns calculation
    SELECT 
        s.symbol,
        s.gics_sector,
        d.year,
        d.trade_date,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE d.year >= 2014
),
annual_volatility AS (
    -- Keep existing annual volatility calculation
    SELECT 
        gics_sector,
        year,
        COUNT(DISTINCT symbol) as num_stocks,
        COUNT(DISTINCT trade_date) as trading_days,
        STDDEV(daily_return) * SQRT(252) as annual_volatility
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY gics_sector, year
),
yoy_changes AS (
    -- Keep existing YoY calculations
    SELECT 
        av.gics_sector,
        av.year,
        av.num_stocks,
        av.trading_days,
        av.annual_volatility as current_volatility,
        LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year) as prev_year_volatility,
        (av.annual_volatility - LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year)) 
            / LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year) * 100 as volatility_change_pct,
        (av.annual_volatility - LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year)) as volatility_change_abs
    FROM annual_volatility av
)
SELECT 
    gics_sector,
    year,
    num_stocks,
    ROUND(current_volatility, 4) as current_volatility,
    ROUND(prev_year_volatility, 4) as prev_year_volatility,
    ROUND(volatility_change_pct, 2) as volatility_change_pct,
    ROUND(volatility_change_abs, 4) as volatility_change_abs
FROM yoy_changes
WHERE volatility_change_pct IS NOT NULL
    AND trading_days > 200  -- Filter out partial years like 2025
ORDER BY volatility_change_pct DESC
LIMIT 10;
'''


In [68]:
# Execute diagnostic query for API data
api_diag_results = pd.read_sql(sql_query, engine)
api_diag_results

Unnamed: 0,gics_sector,year,num_stocks,current_volatility,prev_year_volatility,volatility_change_pct,volatility_change_abs
0,Utilities,2020,10,0.4596,0.1514,203.63,0.3082
1,Financials,2020,26,0.5679,0.211,169.17,0.3569
2,Real Estate,2020,10,0.497,0.1899,161.74,0.3071
3,Consumer Discretionary,2020,14,0.723,0.2807,157.53,0.4423
4,Energy,2020,6,0.8233,0.3475,136.95,0.4759
5,Industrials,2020,19,0.5498,0.263,109.03,0.2868
6,Materials,2020,9,0.5191,0.2941,76.48,0.225
7,Consumer Staples,2020,11,0.3666,0.2297,59.6,0.1369
8,Health Care,2020,21,0.4557,0.2931,55.46,0.1626
9,Information Technology,2020,19,0.4968,0.3239,53.37,0.1729


# Brief Insights
1. **Top 3 Largest YoY Increases:**
   - Utilities (2020): 203.63% increase (0.1514 to 0.4596)
   - Financials (2020): 169.17% increase (0.2110 to 0.5679)
   - Real Estate (2020): 161.74% increase (0.1899 to 0.4970)

2. **Notable Patterns:**
   - The COVID-19 pandemic in 2020 caused the most significant volatility spikes across all sectors
   - Energy sector shows consistently high volatility and significant spikes:
     - 136.95% increase in 2020
     - 96.15% increase in 2025 (though this is partial year data)
   - Most sectors show cyclical patterns of volatility, with major spikes followed by significant decreases

3. **Sector Stability Analysis:**
   - Utilities traditionally had the lowest base volatility (often below 0.20) but experienced the largest percentage increase
   - Consumer Staples and Health Care generally show more stable patterns outside of 2020
   - Technology and Energy consistently show higher base volatility levels


### Query 2:
Used to identify which sub-sectors and individual companies are driving this change

In [77]:
sql_query = '''
WITH daily_returns AS (
    SELECT 
        s.symbol,
        s.security,
        s.gics_industry,
        d.year,
        d.trade_date,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE s.gics_sector = 'Utilities'
    AND d.year = 2020
),
company_metrics AS (
    SELECT 
        symbol,
        security,
        gics_industry,
        COUNT(DISTINCT trade_date) as trading_days,
        STDDEV(daily_return) * SQRT(252) as annual_volatility,
        -- Calculate compound annual return
        (EXP(SUM(LN(1 + NULLIF(daily_return, 0)))) - 1) * 100 as annual_return_pct
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY symbol, security, gics_industry
)
SELECT 
    symbol,
    security,
    gics_industry,
    ROUND(annual_volatility, 4) as annual_volatility,
    ROUND(annual_return_pct, 2) as annual_return_pct,
    trading_days
FROM company_metrics
ORDER BY annual_volatility DESC;
'''


In [78]:
# Execute diagnostic query for API data
api_diag_results = pd.read_sql(sql_query, engine)
api_diag_results

Unnamed: 0,symbol,security,gics_industry,annual_volatility,annual_return_pct,trading_days
0,CNP,CenterPoint Energy,Multi-Utilities,0.5857,-16.99,252
1,AES,AES Corporation,Independent Power Producers & Energy Traders,0.5812,21.46,252
2,EVRG,Evergy,Electric Utilities,0.4922,-9.34,252
3,EXC,Exelon,Electric Utilities,0.4889,-3.57,252
4,DTE,DTE Energy,Multi-Utilities,0.4753,-2.02,252
5,ETR,Entergy,Electric Utilities,0.468,-12.35,252
6,EIX,Edison International,Electric Utilities,0.452,-11.59,252
7,ES,Eversource Energy,Electric Utilities,0.4512,6.76,252
8,D,Dominion Energy,Multi-Utilities,0.4487,-4.28,252
9,AEE,Ameren,Multi-Utilities,0.4314,5.29,252


### Analysis:
*   **Insight:**
*   **Recommendation:** 
*   **Prediction:** 