In [18]:
# Import necessary modules
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables from .env file
# Ensure the .env file is in the parent directory or specify the path if elsewhere
dotenv_path = os.path.join(os.path.dirname('__file__'), '..', '.env') # Assumes .env is one level up
load_dotenv(dotenv_path=dotenv_path)
print(f"Attempting to load .env from: {dotenv_path}") # Debugging print
print(f".env loaded: {load_dotenv(dotenv_path=dotenv_path)}") # Check if loading was successful

# Establish database connection using credentials from environment variables
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Check if all necessary variables are loaded
if None in (DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME):
    # Print which variables are missing for debugging
    missing_vars = [var for var, val in {'DB_USER': DB_USER, 'DB_PASSWORD': DB_PASSWORD, 'DB_HOST': DB_HOST, 'DB_PORT': DB_PORT, 'DB_NAME': DB_NAME}.items() if val is None]
    raise Exception(f"Database credentials not fully set. Missing: {', '.join(missing_vars)}")

print("Database credentials loaded successfully.") # Confirmation

try:
    # Create SQLAlchemy engine
    conn_string = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
    engine = create_engine(conn_string)
    # Test connection
    with engine.connect() as connection:
        print("Database connection successful!")
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

# Set pandas display options (optional, you already have this later)
pd.set_option('display.max_rows', None)

Attempting to load .env from: ..\.env
.env loaded: True
Database credentials loaded successfully.
Database connection successful!


# Descriptive Analytics Query (API Data)

### Business Question:
What are the average daily return and volatility (standard deviation) for each GICS sector over the past year, as computed from API data?

In [19]:
# Descriptive Analytics Query for API Data
sql_query = '''
WITH daily_returns AS (
  SELECT
    s.symbol,
    d.trade_date as date,
    fp.close_price as adj_close,
    -- Get the previous day's adjusted closing price for the same symbol
    LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_adj_close
  FROM fact_price fp
  JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
  JOIN dim_date d ON fp.date_id = d.date_id
  WHERE d.trade_date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
),
-- Calculate return based on adjusted close-to-close change
close_to_close_returns AS (
    SELECT
        symbol,
        date,
        -- Calculate return, handle division by zero if prev_adj_close is 0 or NULL
        CASE
            WHEN prev_adj_close IS NOT NULL AND prev_adj_close != 0 
            THEN (adj_close - prev_adj_close) / prev_adj_close
            ELSE NULL
        END AS daily_return
    FROM daily_returns
    WHERE prev_adj_close IS NOT NULL
)
SELECT
  s.gics_sector,
  AVG(dr.daily_return) AS avg_daily_return,
  STDDEV(dr.daily_return) AS return_volatility,
  ROW_NUMBER() OVER (ORDER BY AVG(dr.daily_return) DESC) AS sector_rank
FROM close_to_close_returns dr
JOIN dim_symbol s ON dr.symbol = s.symbol
GROUP BY s.gics_sector
ORDER BY sector_rank;
'''

In [20]:
api_results = pd.read_sql(sql_query, engine)
api_results


Unnamed: 0,gics_sector,avg_daily_return,return_volatility,sector_rank
0,Utilities,0.000848,0.021737,1
1,Financials,0.000782,0.017088,2
2,Communication Services,0.00048,0.021296,3
3,Real Estate,0.000318,0.017334,4
4,Industrials,0.000277,0.020857,5
5,Consumer Discretionary,4.3e-05,0.023566,6
6,Information Technology,-0.000146,0.026017,7
7,Health Care,-0.000179,0.017829,8
8,Consumer Staples,-0.000313,0.015525,9
9,Materials,-0.000465,0.021349,10


### Analysis:
*   **Insight:**
*   **Recommendation:** 
*   **Prediction:** 

# Diagnostic Analytics Query (API Data)

### Business Question:
Why are certain sectors experiencing higher return volatility? Are there specific months where volatility significantly deviates from the overall average for that sector?

In [29]:
# Diagnostic Analytics Query for API Data
# Important to not that it is filtering for high volatility periods only
sql_query = '''
WITH daily_returns AS (
  SELECT
    s.symbol,
    d.trade_date as date,
    fp.close_price as adj_close,
    LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_adj_close,
    s.gics_sector,
    d.year,
    d.month
  FROM fact_price fp
  JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
  JOIN dim_date d ON fp.date_id = d.date_id
),
monthly_stats AS (
    SELECT 
        gics_sector,
        year,
        month,
        -- Calculate monthly volatility
        STDDEV(
            CASE
                WHEN prev_adj_close IS NOT NULL AND prev_adj_close != 0 
                THEN (adj_close - prev_adj_close) / prev_adj_close
                ELSE NULL
            END
        ) AS monthly_volatility,
        -- Calculate average monthly return
        AVG(
            CASE
                WHEN prev_adj_close IS NOT NULL AND prev_adj_close != 0 
                THEN (adj_close - prev_adj_close) / prev_adj_close
                ELSE NULL
            END
        ) AS avg_monthly_return,
        -- Count distinct trading days
        COUNT(DISTINCT date) as distinct_trading_days,
        -- Count total observations (stocks × trading days)
        COUNT(*) as total_observations,
        -- Count distinct stocks
        COUNT(DISTINCT symbol) as num_stocks
    FROM daily_returns
    WHERE prev_adj_close IS NOT NULL
    GROUP BY gics_sector, year, month
),
sector_baselines AS (
    SELECT
        gics_sector,
        AVG(monthly_volatility) as avg_sector_volatility,
        STDDEV(monthly_volatility) as volatility_std
    FROM monthly_stats
    GROUP BY gics_sector
)
SELECT 
    ms.gics_sector,
    ms.year,
    ms.month,
    ms.monthly_volatility,
    ms.avg_monthly_return,
    sb.avg_sector_volatility as baseline_volatility,
    (ms.monthly_volatility - sb.avg_sector_volatility) / sb.volatility_std as volatility_zscore,
    ms.distinct_trading_days,
    ms.num_stocks,
    ms.total_observations
FROM monthly_stats ms
JOIN sector_baselines sb ON ms.gics_sector = sb.gics_sector
WHERE (ms.monthly_volatility - sb.avg_sector_volatility) / sb.volatility_std > 1.5
ORDER BY 
    ms.gics_sector,
    volatility_zscore DESC;
'''

# Alternate Query
'''
This query is to return all months and years, not just volatile ones

WITH daily_returns AS (
  SELECT
    s.symbol,
    d.trade_date as date,
    fp.close_price as adj_close,
    LAG(fp.close_price, 1) OVER (PARTITION BY s.symbol ORDER BY d.trade_date) AS prev_adj_close,
    s.gics_sector,
    d.year,
    d.month
  FROM fact_price fp
  JOIN dim_symbol s ON fp.symbol_id = s.symbol_id
  JOIN dim_date d ON fp.date_id = d.date_id
),
monthly_stats AS (
    SELECT 
        gics_sector,
        year,
        month,
        STDDEV(
            CASE
                WHEN prev_adj_close IS NOT NULL AND prev_adj_close != 0 
                THEN (adj_close - prev_adj_close) / prev_adj_close
                ELSE NULL
            END
        ) AS monthly_volatility,
        AVG(
            CASE
                WHEN prev_adj_close IS NOT NULL AND prev_adj_close != 0 
                THEN (adj_close - prev_adj_close) / prev_adj_close
                ELSE NULL
            END
        ) AS avg_monthly_return,
        COUNT(DISTINCT date) as distinct_trading_days,
        COUNT(DISTINCT symbol) as num_stocks,
        COUNT(*) as total_observations
    FROM daily_returns
    WHERE prev_adj_close IS NOT NULL
    GROUP BY gics_sector, year, month
),
sector_baselines AS (
    SELECT
        gics_sector,
        AVG(monthly_volatility) as avg_sector_volatility,
        STDDEV(monthly_volatility) as volatility_std
    FROM monthly_stats
    GROUP BY gics_sector
)
SELECT 
    ms.gics_sector,
    ms.year,
    ms.month,
    ms.monthly_volatility,
    ms.avg_monthly_return,
    sb.avg_sector_volatility as baseline_volatility,
    (ms.monthly_volatility - sb.avg_sector_volatility) / sb.volatility_std as volatility_zscore,
    ms.distinct_trading_days,
    ms.num_stocks,
    ms.total_observations
FROM monthly_stats ms
JOIN sector_baselines sb ON ms.gics_sector = sb.gics_sector
ORDER BY 
    ms.gics_sector,
    ms.year,
    ms.month;

'''


In [30]:
# Execute diagnostic query for API data
api_diag_results = pd.read_sql(sql_query, engine)
api_diag_results

Unnamed: 0,gics_sector,year,month,monthly_volatility,avg_monthly_return,baseline_volatility,volatility_zscore,distinct_trading_days,num_stocks,total_observations
0,Communication Services,2020,3,0.054895,-0.004789,0.017571,4.836413,22,2,44
1,Communication Services,2025,4,0.039049,-0.009101,0.017571,2.78315,9,2,18
2,Communication Services,2024,2,0.032495,-0.007265,0.017571,1.933892,20,2,40
3,Communication Services,2022,12,0.030961,-0.003992,0.017571,1.735088,21,2,42
4,Communication Services,2024,7,0.030311,0.007267,0.017571,1.650847,22,2,44
5,Consumer Discretionary,2020,3,0.088211,-0.010259,0.024911,5.413879,22,7,154
6,Consumer Discretionary,2025,4,0.060029,-0.007068,0.024911,3.003513,9,8,72
7,Consumer Discretionary,2020,4,0.056776,0.012946,0.024911,2.725269,21,7,147
8,Consumer Discretionary,2020,11,0.048327,0.007522,0.024911,2.002721,20,7,140
9,Consumer Staples,2020,3,0.055643,0.000115,0.015668,6.817545,22,7,154


### Analysis:
*   **Insight:** 
*   **Recommendation:** 
*   **Prediction:** 