In [5]:
# Import necessary modules
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

# Load environment variables from .env file
# Ensure the .env file is in the parent directory or specify the path if elsewhere
dotenv_path = os.path.join(os.path.dirname('__file__'), '..', '.env') # Assumes .env is one level up
load_dotenv(dotenv_path=dotenv_path)
print(f"Attempting to load .env from: {dotenv_path}") # Debugging print
print(f".env loaded: {load_dotenv(dotenv_path=dotenv_path)}") # Check if loading was successful

# Establish database connection using credentials from environment variables
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Check if all necessary variables are loaded
if None in (DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME):
    # Print which variables are missing for debugging
    missing_vars = [var for var, val in {'DB_USER': DB_USER, 'DB_PASSWORD': DB_PASSWORD, 'DB_HOST': DB_HOST, 'DB_PORT': DB_PORT, 'DB_NAME': DB_NAME}.items() if val is None]
    raise Exception(f"Database credentials not fully set. Missing: {', '.join(missing_vars)}")

print("Database credentials loaded successfully.") # Confirmation

try:
    # Create SQLAlchemy engine
    conn_string = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
    engine = create_engine(conn_string)
    # Test connection
    with engine.connect() as connection:
        print("Database connection successful!")
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

# Set pandas display options (optional, you already have this later)
pd.set_option('display.max_rows', None)

Attempting to load .env from: ..\.env
.env loaded: True
Database credentials loaded successfully.
Database connection successful!


# Descriptive Analytics Query (API Data)

### Business Question:
What are the average daily return and volatility (standard deviation) for each GICS sector over the past year, as computed from API data?

In [6]:
# Descriptive Analytics Query for API Data
#
# Builds the SQL text only; nothing is executed in this cell.
# Query outline:
#   1. daily_returns: for rows of stg_tiingo_api dated within the last year,
#      pairs each adj_close with the previous row's adj_close for the same
#      symbol (LAG ordered by date). The date filter is applied before LAG,
#      so each symbol's first row in the window has no previous price.
#   2. close_to_close_returns: (adj_close - prev_adj_close) / prev_adj_close,
#      dropping rows without a previous price and yielding NULL when the
#      previous price is 0.
#   3. Final SELECT: joins to dim_symbol and, per gics_sector, reports the
#      mean and standard deviation of daily_return plus a rank by mean
#      return (highest first).
# NOTE(review): in MySQL, STDDEV() is the population standard deviation;
# use STDDEV_SAMP() if a sample estimate is intended — confirm.
sql_query = '''
WITH daily_returns AS (
  SELECT
    a.symbol,
    a.date,
    a.adj_close,
    -- Get the previous day's adjusted closing price for the same symbol
    LAG(a.adj_close, 1) OVER (PARTITION BY a.symbol ORDER BY a.date) AS prev_adj_close
  FROM stg_tiingo_api a
  WHERE a.date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
),
-- Calculate return based on adjusted close-to-close change
close_to_close_returns AS (
    SELECT
        symbol,
        date,
        -- Calculate return, handle division by zero if prev_adj_close is 0 or NULL
        CASE
            WHEN prev_adj_close IS NOT NULL AND prev_adj_close != 0 
            THEN (adj_close - prev_adj_close) / prev_adj_close
            ELSE NULL
        END AS daily_return
    FROM daily_returns
    WHERE prev_adj_close IS NOT NULL
)
SELECT
  s.gics_sector,
  AVG(dr.daily_return) AS avg_daily_return,
  STDDEV(dr.daily_return) AS return_volatility,
  ROW_NUMBER() OVER (ORDER BY AVG(dr.daily_return) DESC) AS sector_rank
FROM close_to_close_returns dr
JOIN dim_symbol s ON dr.symbol = s.symbol
GROUP BY s.gics_sector
ORDER BY sector_rank;
'''

In [7]:
# Run the descriptive query through the SQLAlchemy engine and display the frame
api_results = pd.read_sql(sql=sql_query, con=engine)
api_results


Unnamed: 0,gics_sector,avg_daily_return,return_volatility,sector_rank
0,Utilities,0.000753,0.01933,1
1,Communication Services,0.000678,0.019926,2
2,Financials,0.000515,0.018344,3
3,Real Estate,0.000214,0.015821,4
4,Industrials,8.2e-05,0.020833,5
5,Consumer Discretionary,-0.000166,0.027503,6
6,Consumer Staples,-0.000219,0.018373,7
7,Health Care,-0.000291,0.020572,8
8,Information Technology,-0.000412,0.032174,9
9,Energy,-0.000429,0.021383,10


### Analysis:
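*   **Insight:** Over the past year, Utilities had the highest average daily return (about 0.075% per day) and Energy the lowest (about -0.043% per day). Information Technology was the most volatile sector (daily standard deviation of about 3.2%) while also posting a negative average return; Real Estate was the least volatile (about 1.6%). Five of the ten sectors shown had a positive average daily return. Only ten GICS sectors appear in the result (Materials is absent), so coverage of `dim_symbol` and `stg_tiingo_api` should be checked.
*   **Recommendation:** TODO
*   **Prediction:** TODO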

# Diagnostic Analytics Query (API Data)

### Business Question:
Why are certain sectors experiencing higher return volatility? Are there specific months where volatility significantly deviates from the overall average for that sector?

In [3]:
# Diagnostic Analytics Query for API Data
#
# Builds the SQL text only; nothing is executed in this cell.
# Query outline:
#   1. priced / daily_returns: adjusted close-to-close daily returns per
#      symbol over the last year (same return definition as the descriptive
#      query, so the volatility being diagnosed is the one reported there).
#      Rows without a previous price, or with a previous price of 0, are
#      skipped so the division is always defined.
#   2. returns_by_month: mean and standard deviation of daily returns per
#      GICS sector and calendar month (year/month taken from dim_date).
#   3. monthly_vs_sector: attaches each sector's own average monthly
#      volatility as the baseline. The business question asks about
#      deviation from the average "for that sector", so the baseline is
#      computed per sector rather than as one figure across all sectors.
#   4. Final SELECT: keeps only months whose volatility exceeds the sector
#      baseline and ranks them within the sector (1 = most volatile).
sql_query = '''
WITH priced AS (
  SELECT
    a.symbol,
    a.date,
    a.adj_close,
    -- Previous trading day's adjusted close for the same symbol
    LAG(a.adj_close, 1) OVER (PARTITION BY a.symbol ORDER BY a.date) AS prev_adj_close
  FROM stg_tiingo_api a
  WHERE a.date >= DATE_SUB(CURDATE(), INTERVAL 1 YEAR)
),
daily_returns AS (
  SELECT
    p.symbol,
    p.date,
    (p.adj_close - p.prev_adj_close) / p.prev_adj_close AS daily_return
  FROM priced p
  -- Skip rows with no previous price and avoid dividing by zero
  WHERE p.prev_adj_close IS NOT NULL
    AND p.prev_adj_close != 0
),
returns_by_month AS (
  SELECT
    s.gics_sector,
    d.year,
    d.month,
    AVG(dr.daily_return) AS avg_return,
    STDDEV(dr.daily_return) AS volatility
  FROM daily_returns dr
  JOIN dim_symbol s ON dr.symbol = s.symbol
  JOIN dim_date d ON dr.date = d.trade_date
  GROUP BY s.gics_sector, d.year, d.month
),
monthly_vs_sector AS (
  SELECT
    rbm.gics_sector,
    rbm.year,
    rbm.month,
    rbm.avg_return,
    rbm.volatility,
    -- Baseline: the sector's own average monthly volatility
    AVG(rbm.volatility) OVER (PARTITION BY rbm.gics_sector) AS sector_avg_volatility
  FROM returns_by_month rbm
)
SELECT
    mvs.gics_sector,
    mvs.year,
    mvs.month,
    mvs.avg_return,
    mvs.volatility,
    RANK() OVER (PARTITION BY mvs.gics_sector ORDER BY mvs.volatility DESC) AS vol_rank,
    mvs.sector_avg_volatility
FROM monthly_vs_sector mvs
WHERE mvs.volatility > mvs.sector_avg_volatility
ORDER BY mvs.gics_sector, mvs.year, mvs.month;
'''


In [4]:
# Run the diagnostic query through the SQLAlchemy engine and display the frame
api_diag_results = pd.read_sql(sql=sql_query, con=engine)
api_diag_results

NameError: name 'conn' is not defined

### Analysis:
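*   **Insight:** TODO: the saved output of the diagnostic cell is a `NameError`, so there are no results to interpret yet. Re-run the notebook from the top (Restart Kernel, then Run All) and fill this in.
*   **Recommendation:** TODO
*   **Prediction:** TODO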