In [28]:
# Import necessary modules
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables from .env file
# Ensure the .env file is in the parent directory or specify the path if elsewhere
dotenv_path = os.path.join(os.path.dirname('__file__'), '..', '.env') # Assumes .env is one level up
load_dotenv(dotenv_path=dotenv_path)
print(f"Attempting to load .env from: {dotenv_path}") # Debugging print
print(f".env loaded: {load_dotenv(dotenv_path=dotenv_path)}") # Check if loading was successful

# Establish database connection using credentials from environment variables
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

# Check if all necessary variables are loaded
if None in (DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME):
    # Print which variables are missing for debugging
    missing_vars = [var for var, val in {'DB_USER': DB_USER, 'DB_PASSWORD': DB_PASSWORD, 'DB_HOST': DB_HOST, 'DB_PORT': DB_PORT, 'DB_NAME': DB_NAME}.items() if val is None]
    raise Exception(f"Database credentials not fully set. Missing: {', '.join(missing_vars)}")

print("Database credentials loaded successfully.") # Confirmation

try:
    # Create SQLAlchemy engine
    conn_string = f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
    engine = create_engine(conn_string)
    # Test connection
    with engine.connect() as connection:
        print("Database connection successful!")
except Exception as e:
    print(f"Database connection failed: {e}")
    raise

# Set pandas display options (optional, you already have this later)
pd.set_option('display.max_rows', None)

Attempting to load .env from: ..\.env
.env loaded: True
Database credentials loaded successfully.
Database connection successful!


# Descriptive Analytics Query (Wikipedia Web Scrape Data)

### Business Question:
What is the current sector composition of the S&P 500, and which sectors have the highest concentration of companies?

In [29]:
# Descriptive Analytics Query for Wikipedia Data
sql_query_wiki_desc = '''
WITH SectorCounts AS (
    -- Calculate the number of companies in each GICS sector
    SELECT
        gics_sector,
        COUNT(symbol) AS num_companies
    FROM
        dim_symbol -- Using the cleaned dimension table
    WHERE
        gics_sector IS NOT NULL AND gics_sector != '' -- Ensure sector data is valid
    GROUP BY
        gics_sector
)
-- Select sector counts and rank them
SELECT
    gics_sector,
    num_companies,
    ROW_NUMBER() OVER (ORDER BY num_companies DESC) as sector_rank
FROM
    SectorCounts
ORDER BY
    sector_rank;
'''

In [30]:
api_results = pd.read_sql(sql_query_wiki_desc, engine)
api_results


Unnamed: 0,gics_sector,num_companies,sector_rank
0,Industrials,78,1
1,Financials,73,2
2,Information Technology,69,3
3,Health Care,60,4
4,Consumer Discretionary,51,5
5,Consumer Staples,38,6
6,Utilities,31,7
7,Real Estate,31,8
8,Materials,26,9
9,Energy,23,10


### Data Dictionary - Descriptive Query Results

| Column Name    | Data Type | Description |
|---------------|-----------|-------------|
| gics_sector   | string    | Global Industry Classification Standard (GICS) sector name, representing the highest level of industry classification (e.g., Information Technology, Healthcare) |
| num_companies | integer   | Total count of companies within each GICS sector currently listed in the S&P 500 |
| sector_rank   | integer   | Ranking of sectors based on the number of companies, where 1 represents the sector with the most companies |

### Analysis:
*   **Insight:**
*   **Recommendation:** 
*   **Prediction:** 

# Diagnostic Analytics Query (Wikipedia Web Scrape Data)

### Business Question:
Within the Information Technology sector (the largest sector by company count), which sub-industries are most prominent, and what is the average tenure (years since being added to the S&P 500) of companies within those dominant sub-industries?

In [26]:
# Diagnostic Analytics Query for Wikipedia Data
sql_query_wiki_diag = '''
WITH TopSectorIndustries AS (
    SELECT
        ds.symbol,
        ds.gics_industry,
        CAST(rw.`Date added` AS DATE) as date_added_parsed
    FROM
        dim_symbol ds
    JOIN
        raw_wikipedia_sp500 rw ON ds.symbol = rw.`Symbol`
    WHERE
        ds.gics_sector = 'Information Technology'
        AND rw.`Date added` IS NOT NULL
        AND rw.`Date added` != ''
),
IndustryTenure AS (
    SELECT
        symbol,
        gics_industry,
        date_added_parsed,
        CASE
            WHEN date_added_parsed IS NOT NULL AND date_added_parsed <= CURDATE()
            THEN TIMESTAMPDIFF(YEAR, date_added_parsed, CURDATE())
            ELSE 0
        END AS years_in_index
    FROM
        TopSectorIndustries
    WHERE date_added_parsed IS NOT NULL
)
SELECT
    gics_industry,
    COUNT(symbol) AS num_companies,
    AVG(years_in_index) AS avg_years_in_index,
FROM
    IndustryTenure
GROUP BY
    gics_industry
ORDER BY
    num_companies DESC,
    avg_years_in_index DESC;
'''

In [27]:
# Execute diagnostic query for API data
api_diag_results = pd.read_sql(sql_query_wiki_diag, engine)
api_diag_results

Unnamed: 0,gics_industry,avg_years_in_index,num_companies
0,Semiconductors,16.5714,14
1,Application Software,12.9231,13
2,"Technology Hardware, Storage & Peripherals",19.25,8
3,Systems Software,10.6667,6
4,Communications Equipment,27.4,5
5,IT Consulting & Other Services,22.0,5
6,Semiconductor Materials & Equipment,15.4,5
7,Electronic Equipment & Instruments,6.8,5
8,Internet Services & Infrastructure,12.0,3
9,Electronic Components,23.0,2


### Data Dictionary - Diagnostic Query Results

| Column Name        | Data Type | Description |
|-------------------|-----------|-------------|
| gics_industry     | string    | GICS industry name, representing a more granular classification level below sector (e.g., Software, Semiconductors) |
| num_companies     | integer   | Count of companies within each GICS industry currently listed in the S&P 500 |
| avg_years_in_index| decimal   | Average tenure (in years) of companies within each industry, calculated from their respective dates of addition to the S&P 500 index |

### Analysis:
*   **Insight:**
*   **Recommendation:** 
*   **Prediction:** 