In [0]:
%pip install yfinance lxml
# yfinance: market-data client used by the download cell further down.
# lxml: HTML parser backend that pandas.read_html can use for the Wikipedia table.
# NOTE(review): versions are unpinned, so results may drift between runs — consider pinning.
dbutils.library.restartPython() # Restarts the Python kernel so the freshly installed packages can be imported

In [0]:
import pandas as pd
import requests
from io import StringIO

# 1. Wikipedia page that lists the current S&P 500 constituents
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# 2. Browser-like headers; without a User-Agent the request is rejected with HTTP 403
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# 3. Fetch the page and extract the ticker list
try:
    # A timeout keeps the cell from hanging forever if the server never answers.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # Surface any non-2xx HTTP status as an exception

    # 4. Parse every <table> on the page.
    # StringIO wraps the HTML text so read_html treats it as a file-like object.
    tables = pd.read_html(StringIO(response.text))

    # Pick the first table that actually has a 'Symbol' column instead of
    # trusting that the constituents table is always at position 0.
    sp500_df = next((table for table in tables if "Symbol" in table.columns), None)
    if sp500_df is None:
        raise ValueError("No table with a 'Symbol' column was found on the page")

    # 5. Clean ticker symbols: Yahoo uses '-' where the listing uses '.' (e.g. BRK.B -> BRK-B).
    # Missing symbols are dropped so no NaN ends up in the ticker list.
    tickers = (
        sp500_df['Symbol']
        .dropna()
        .str.replace('.', '-', regex=False)
        .tolist()
    )
    if not tickers:
        raise ValueError("The constituents table contained no ticker symbols")

    print(f"Success! Extracted {len(tickers)} tickers.")
    print(f"Sample: {tickers[:5]}")

except Exception as e:
    # Report the problem, then re-raise: later cells need `tickers`, so continuing
    # would only fail further down (or silently reuse a stale list from an earlier run).
    print(f"Error extracting tickers: {e}")
    raise

In [0]:
# yfinance is installed by the %pip cell at the top of the notebook; importing it
# here guarantees `yf` is defined when the notebook is run on a fresh kernel.
import yfinance as yf
from pyspark.sql.functions import col

# 1. Download data (last 5 years to keep it manageable; change to period='max' for full history)
print("Downloading market data...")
data = yf.download(tickers, period="5y", group_by='ticker', threads=True)

# Fail early with a clear message instead of an obscure reshape/Spark error below.
if data.empty:
    raise RuntimeError("yfinance returned no data for the requested tickers")

# 2. Reshape data for Spark
# With group_by='ticker' the columns are a MultiIndex of (Ticker, Price Type).
# Stacking level 0 moves the ticker into the row index, leaving one column per
# price type; reset_index then turns Date and Ticker into ordinary columns.
data_stacked = data.stack(level=0).rename_axis(['Date', 'Ticker']).reset_index()

# 3. Convert to a Spark DataFrame and
# 4. normalise column names (spaces -> underscores) in a single assignment.
raw_spark_df = spark.createDataFrame(data_stacked)
spark_df = raw_spark_df.select(
    [col(c).alias(c.replace(' ', '_')) for c in raw_spark_df.columns]
)

display(spark_df)

In [0]:
# Load the "sp500_raw_closes" table and render it for inspection.
# NOTE(review): no cell visible in this notebook writes that table, and `raw_df`
# is not referenced by the analysis cell below — confirm this cell is still needed.
raw_df = spark.read.table("sp500_raw_closes")
display(raw_df)
# DBTITLE 1


In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# --- A. Calculate Mean, Median, StdDev ---
# One row per ticker. The median comes from percentile_approx, so it is an
# approximation rather than an exact median.
stats_df = spark_df.groupBy("Ticker").agg(
    F.mean("Close").alias("Mean_Close"),
    F.stddev("Close").alias("StdDev_Close"),
    F.percentile_approx("Close", 0.5).alias("Median_Close")
)

# --- B. Calculate Mode (Most Frequent Closing Price) ---
# 1. Count how often each closing price occurs per ticker
freq_df = spark_df.groupBy("Ticker", "Close").count()

# 2. Number the prices within each ticker, most frequent first.
#    row_number() assigns exactly one row the value 1 per ticker, whereas rank()
#    gives 1 to every tied price. Ties are the norm for float prices (most occur
#    once), and they would multiply rows in the join below.
#    Ties are broken deterministically by taking the lowest closing price.
window_spec = Window.partitionBy("Ticker").orderBy(F.desc("count"), F.asc("Close"))
ranked_mode = freq_df.withColumn("rank", F.row_number().over(window_spec))

# 3. Keep only the top row per ticker (the mode)
mode_df = ranked_mode.filter(F.col("rank") == 1).select(
    F.col("Ticker"),
    F.col("Close").alias("Mode_Close")
)

# --- C. Join All Metrics ---
# mode_df has at most one row per ticker, so the result stays one row per ticker.
final_analysis = stats_df.join(mode_df, on="Ticker", how="left")

# Display results
display(final_analysis)