In [9]:
# Import necessary modules
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables from .env file
# Ensure the .env file is in the parent directory or specify the path if elsewhere
dotenv_path = os.path.join(os.path.dirname('__file__'), '..', '.env') # Assumes .env is one level up
load_dotenv(dotenv_path=dotenv_path)
print(f"Attempting to load .env from: {dotenv_path}") # Debugging print
print(f".env loaded: {load_dotenv(dotenv_path=dotenv_path)}") # Check if loading was successful

# Establish database connection using credentials from environment variables
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Check if all necessary variables are loaded
if None in (DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME):
    # Print which variables are missing for debugging
    missing_vars = [var for var, val in {'DB_USER': DB_USER, 'DB_PASSWORD': DB_PASSWORD, 'DB_HOST': DB_HOST, 'DB_PORT': DB_PORT, 'DB_NAME': DB_NAME}.items() if val is None]
    raise Exception(f"Database credentials not fully set. Missing: {', '.join(missing_vars)}")

print("Database credentials loaded successfully.") # Confirmation

try:
    # Create SQLAlchemy engine
    conn_string = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
    engine = create_engine(conn_string)
    # Test connection
    with engine.connect() as connection:
        print("Database connection successful!")
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

# Set pandas display options (optional, you already have this later)
pd.set_option('display.max_rows', None)

Attempting to load .env from: ..\.env
.env loaded: True
Database credentials loaded successfully.
Database connection successful!


# Descriptive Analytics Query (API Data)

### Business Question:
Which GICS sector in the S&P 500 has experienced the largest year-over-year increase in daily price volatility since 2014 and when did it occur?

In [10]:
# Descriptive Analytics Query for API Data
sql_query = '''
WITH daily_returns AS (
    -- Keep existing daily returns calculation
    SELECT 
        s.symbol,
        s.gics_sector,
        d.year,
        d.trade_date,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE d.year >= 2014
),
annual_volatility AS (
    -- Keep existing annual volatility calculation
    SELECT 
        gics_sector,
        year,
        COUNT(DISTINCT symbol) as num_stocks,
        COUNT(DISTINCT trade_date) as trading_days,
        STDDEV(daily_return) * SQRT(252) as annual_volatility
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY gics_sector, year
),
yoy_changes AS (
    -- Keep existing YoY calculations
    SELECT 
        av.gics_sector,
        av.year,
        av.num_stocks,
        av.trading_days,
        av.annual_volatility as current_volatility,
        LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year) as prev_year_volatility,
        (av.annual_volatility - LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year)) 
            / LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year) * 100 as volatility_change_pct,
        (av.annual_volatility - LAG(av.annual_volatility) OVER (PARTITION BY av.gics_sector ORDER BY av.year)) as volatility_change_abs
    FROM annual_volatility av
)
SELECT 
    gics_sector,
    year,
    num_stocks,
    ROUND(current_volatility, 4) as current_volatility,
    ROUND(prev_year_volatility, 4) as prev_year_volatility,
    ROUND(volatility_change_pct, 2) as volatility_change_pct,
    ROUND(volatility_change_abs, 4) as volatility_change_abs
FROM yoy_changes
WHERE volatility_change_pct IS NOT NULL
    AND trading_days > 200  -- Filter out partial years like 2025
ORDER BY volatility_change_pct DESC
LIMIT 10;
'''


In [11]:
api_results = pd.read_sql(sql_query, engine)
api_results


Unnamed: 0,gics_sector,year,num_stocks,current_volatility,prev_year_volatility,volatility_change_pct,volatility_change_abs
0,Real Estate,2020,31,0.5694,0.1816,213.5,0.3878
1,Financials,2020,73,0.5586,0.2177,156.61,0.3409
2,Energy,2020,22,0.8306,0.3275,153.62,0.5031
3,Consumer Discretionary,2020,51,0.6687,0.2793,139.38,0.3893
4,Industrials,2020,76,0.5181,0.2539,104.07,0.2642
5,Materials,2020,25,0.5337,0.2805,90.26,0.2532
6,Communication Services,2020,22,0.4774,0.2632,81.37,0.2142
7,Consumer Staples,2020,37,0.3924,0.2312,69.74,0.1612
8,Health Care,2020,56,0.4697,0.2807,67.29,0.1889
9,Information Technology,2020,68,0.5283,0.327,61.57,0.2013


### Data Dictionary

1.  **`gics_sector`**:
    *   **Definition:** The Global Industry Classification Standard (GICS) sector name. This broadly categorizes companies based on their primary business activity.
    *   **Example:** 'Real Estate', 'Financials', 'Energy'.

2.  **`year`**:
    *   **Definition:** The calendar year for which the `current_volatility` metric was calculated. In this table, it's specifically 2020.
    *   **Example:** 2020.

3.  **`num_stocks`**:
    *   **Definition:** The number of distinct companies within that `gics_sector` that had sufficient data available in the database to be included in the volatility calculation for the specified `year` (2020).
    *   **Example:** 31 (for Real Estate in 2020).

4.  **`current_volatility`**:
    *   **Definition:** The calculated annualized volatility of the sector's daily returns for the specified `year` (2020). It's typically calculated as the standard deviation of the sector's average daily returns, multiplied by the square root of the number of trading days in a year (commonly √252). A higher value indicates greater price fluctuation or risk during that year.
    *   **Example:** 0.5258 (or 56.94%) for Real Estate in 2020.

5.  **`prev_year_volatility`**:
    *   **Definition:** The calculated annualized volatility of the sector's daily returns for the year *prior* to the specified `year`. In this case, it represents the volatility for 2019.
    *   **Example:** 0.1816 (or 18.16%) for Real Estate in 2019.

6.  **`volatility_change_pct`**:
    *   **Definition:** The percentage change in annualized volatility from the previous year (2019) to the current year (2020). Calculated as `((current_volatility - prev_year_volatility) / prev_year_volatility) * 100`. This highlights the relative magnitude of the volatility increase or decrease.
    *   **Example:** 213.50% for Real Estate, indicating its 2020 volatility was 213.50% higher than its 2019 volatility.

7.  **`volatility_change_abs`**:
    *   **Definition:** The absolute difference in annualized volatility between the current year (2020) and the previous year (2019). Calculated as `current_volatility - prev_year_volatility`. This shows the raw change in the volatility measure.
    *   **Example:** 0.3878 for Real Estate, meaning its annualized volatility measure increased by approximately 0.35 from 2019 to 2020.



### Analysis:

### Insight:
The Real Estate sector experienced the most dramatic volatility increase in 2020, with a 213.50% surge from 2019 (0.1816) to 2020 (0.5694), coinciding with the COVID-19 pandemic. This was significantly higher than other sectors, even surpassing traditionally volatile sectors like Energy (153.62% increase) and Financials (156.61% increase). Notably, all sectors showed at least a 60% increase in volatility during this period, indicating a market-wide impact, but defensive sectors like Consumer Staples (69.74%) and Health Care (67.29%) demonstrated relatively more stability.

### Recommendation:
1. Risk managers and portfolio managers should implement enhanced monitoring systems for Real Estate exposure, particularly during periods of economic uncertainty or public health crises.
2. Consider using Consumer Staples and Health Care sectors as potential volatility hedges, as they demonstrated the most resilience during the 2020 market turbulence (lowest volatility increases of 69.74% and 67.29% respectively).
3. Develop sector-specific stress testing scenarios that account for the possibility of sudden volatility spikes similar to those seen in 2020, especially for sectors that showed extreme sensitivity (Real Estate, Financials, Energy).

### Prediction:
While volatility levels are likely to normalize post-crisis, Real Estate sector volatility will remain sensitive to future public health or work-from-home related disruptions. The sector's dramatic response to COVID-19 (213.50% volatility increase) suggests it could experience similar outsized reactions to future events that impact physical occupancy rates or commercial property values. We may see elevated baseline volatility in Real Estate compared to pre-2020 levels as the sector continues to adapt to structural changes in how commercial and office spaces are utilized.


# Diagnostic Analytics Query (API Data)

### Business Question:
Which sub-industries or individual companies within the Real Estate sector drove the large volatility increase observed in 2020?

In [14]:
sql_query = '''
WITH daily_returns AS (
    SELECT 
        s.symbol,
        s.security,
        s.gics_industry,
        d.year,
        d.trade_date,
        fp.close_price,
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_close_price,
        (fp.close_price - LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date)) / 
        LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS daily_return
    FROM fact_price fp
    JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
    JOIN dim_date d ON fp.date_id = d.date_id
    WHERE (s.gics_sector = 'Real Estate')
    AND d.year = 2020
),
company_metrics AS (
    SELECT 
        symbol,
        security,
        COALESCE(gics_industry, 'S&P 500 Index') as gics_industry,  -- Label for benchmark
        COUNT(DISTINCT trade_date) as trading_days,
        STDDEV(daily_return) * SQRT(252) as annual_volatility,
        (EXP(SUM(LN(1 + NULLIF(daily_return, 0)))) - 1) * 100 as annual_return_pct
    FROM daily_returns
    WHERE daily_return IS NOT NULL
    GROUP BY symbol, security, gics_industry
)
SELECT 
    symbol,
    security,
    gics_industry,
    ROUND(annual_volatility, 4) as annual_volatility,
    ROUND(annual_return_pct, 2) as annual_return_pct,
    trading_days
FROM company_metrics
ORDER BY 
    CASE WHEN symbol = 'SPY' THEN 0 ELSE 1 END,  -- Show benchmark first
    annual_volatility DESC;
'''


In [15]:
# Execute diagnostic query for API data
api_diag_results = pd.read_sql(sql_query, engine)
api_diag_results

Unnamed: 0,symbol,security,gics_industry,annual_volatility,annual_return_pct,trading_days
0,SPG,Simon Property Group,Retail REITs,0.9159,-36.97,252
1,VTR,Ventas,Health Care REITs,0.8427,-7.54,252
2,KIM,Kimco Realty,Retail REITs,0.7586,-19.32,252
3,WELL,Welltower,Health Care REITs,0.7478,-15.8,252
4,REG,Regency Centers,Retail REITs,0.6993,-22.18,252
5,WY,Weyerhaeuser,Timber REITs,0.6926,16.17,252
6,HST,Host Hotels & Resorts,Hotel & Resort REITs,0.6911,-18.35,252
7,FRT,Federal Realty Investment Trust,Retail REITs,0.6726,-28.65,252
8,VICI,Vici Properties,Hotel & Resort REITs,0.6309,7.1,252
9,CBRE,CBRE Group,Real Estate Services,0.6231,2.02,252


### Data Dictionary

1. **`symbol`**
   - The stock ticker symbol used on exchanges
   - Example: 'SPG' for Simon Property Group

2. **`security`**
   - The full company name
   - Example: 'Simon Property Group'

3. **`gics_industry`**
   - The Global Industry Classification Standard sub-industry classification
   - Categories shown:
     - Retail REITs (retail property owners)
     - Health Care REITs (medical facility owners)
     - Hotel & Resort REITs (hospitality property owners)
     - Multi-Family Residential REITs (apartment complex owners)
     - Single-Family Residential REITs (house rental owners)
     - Industrial REITs (warehouse/logistics facility owners)
     - Data Center REITs (computing facility owners)
     - Telecom Tower REITs (communication infrastructure owners)
     - Self-Storage REITs (storage facility owners)
     - Office REITs (commercial office building owners)
     - Real Estate Services (property management/consulting)
     - Other Specialized REITs (mixed-use property owners)

4. **`annual_volatility`**
   - Measures the degree of variation in daily stock returns over 2020
   - Calculated as: Standard deviation of daily returns × √252 (annualization factor)
   - Higher numbers indicate more price volatility
   - Example: 0.9159 (91.59%) for SPG indicates extreme price swings

5. **`annual_return_pct`**
   - The total percentage return for 2020
   - Calculated using compound daily returns: (EXP(SUM(LN(1 + daily_return))) - 1) * 100
   - Positive numbers indicate gains, negative indicate losses
   - Example: -36.97% for SPG means investors lost 36.97% of value in 2020

6. **`trading_days`**
   - Number of days the stock traded during 2020
   - 252 is typical for a full year (excluding weekends and holidays)
   - Used to verify data completeness
   - All companies in the dataset show complete trading data with 252 days

### Notes:
- REIT stands for Real Estate Investment Trust, a company that owns, operates, or finances income-generating real estate
- All volatility and return calculations are based on the COVID-impacted year of 2020
- The data represents companies within the Real Estate sector of the S&P 500



### Analysis:

### Insight:
Retail and Health Care REITs experienced the most severe volatility in 2020, with Simon Property Group (SPG) showing the highest volatility (0.9159) among all Real Estate companies, nearly 2.5x higher than the sector's most stable company, Public Storage (0.3643). There's a clear pattern where property type significantly influenced volatility levels:

1. Most Volatile Sub-industries (>0.65 volatility):
   - Retail REITs (SPG: 0.9159, KIM: 0.7586)
   - Health Care REITs (VTR: 0.8427, WELL: 0.7478)
   - Hotel & Resort REITs (HST: 0.6911)

2. Most Stable Sub-industries (<0.45 volatility):
   - Self-Storage REITs (PSA: 0.3643, EXR: 0.4120)
   - Data Center REITs (EQIX: 0.4054, DLR: 0.4333)
   - Telecom Tower REITs (SBAC: 0.4257, CCI: 0.4258)

This pattern strongly correlates with each sub-industry's exposure to COVID-19 lockdown measures and social distancing requirements.

### Recommendation:
1. Portfolio managers should consider implementing a "hybrid REIT" strategy that balances:
   - Digital Infrastructure REITs (Data Centers, Telecom Towers) for stability
   - Traditional REITs (Retail, Healthcare) for recovery potential
   - Industrial/Logistics REITs (like Prologis) for e-commerce exposure

2. Risk managers should:
   - Develop sub-industry specific risk metrics, as the performance gap between different REIT types (0.9159 vs 0.3643) suggests traditional sector-level diversification may be insufficient
   - Increase monitoring of REITs with physical occupancy dependence
   - Consider setting different risk limits for different REIT sub-categories

### Prediction:
The bifurcation between digital and physical real estate will likely persist:
- Digital/Infrastructure REITs (Data Centers, Telecom Towers) will maintain lower volatility due to long-term contracts and essential service status
- Traditional retail and healthcare REITs will face continued volatility as they adapt to hybrid operating models
- Industrial/Logistics REITs will see steady growth with e-commerce expansion
- The sector may see increased M&A activity as stronger REITs (particularly in digital/infrastructure) acquire distressed traditional property portfolios