import libraries

In [6]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup

def get_data(ticker):
    data = yf.download(ticker, start="2022-01-01", end="2024-01-01")
    return data

Gather Data - use S&P 500 stocks

In [7]:
# Fetch the list of S&P 500 companies from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})

tickers = []
company_names = []
sectors = []
for row in table.findAll('tr')[1:]:
    ticker = row.findAll('td')[0].text.strip()
    company_name = row.findAll('td')[1].text.strip()
    sector = row.findAll('td')[3].text.strip()
    tickers.append(ticker)
    company_names.append(company_name)
    sectors.append(sector)

# Create a DataFrame
sp500 = pd.DataFrame({'Ticker': tickers, 'Company': company_names, 'Sector': sectors})

IndexError: list index out of range

Fetch financial data

In [None]:
# Initialize empty lists to store data
revenues = []
market_caps = []
betas = []

for ticker in sp500['Ticker']:
    try:
        stock = yf.Ticker(ticker)
        
        # Get total revenue from quarterly financials
        quarterly_financials = stock.quarterly_financials
        revenue = quarterly_financials.loc['Total Revenue'].sum()
        revenues.append(revenue)
        
        # Get market cap and beta
        info = stock.info
        market_cap = info.get('marketCap', np.nan)
        beta = info.get('beta', np.nan)
        market_caps.append(market_cap)
        betas.append(beta)
        
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        revenues.append(np.nan)
        market_caps.append(np.nan)
        betas.append(np.nan)

sp500['Revenue'] = revenues
sp500['MarketCap'] = market_caps
sp500['Beta'] = betas

Data Cleaning/handling

In [None]:
# Remove rows with NaNs in Revenue, MarketCap, or Beta
sp500_clean = sp500.dropna(subset=['Revenue', 'MarketCap', 'Beta'])

#categorize by market cap
# Define market cap categories
def categorize_market_cap(market_cap):
    if market_cap >= 10e9:
        return 'Large Cap'
    elif market_cap >= 2e9:
        return 'Mid Cap'
    else:
        return 'Small Cap'

# Apply the categorization
sp500_clean['MarketCapCategory'] = sp500_clean['MarketCap'].apply(categorize_market_cap)

Clustering implementation (define clustering function)

In [None]:
def cluster_market_cap_category(df, category):
    df_category = df[df['MarketCapCategory'] == category]
    # Use Beta and Revenue as features
    X = df_category[['Beta', 'Revenue']].values
    
    # Handle any remaining NaNs
    X = X[~np.isnan(X).any(axis=1)]
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Determine optimal number of clusters
    sse = []
    silhouette_scores = []
    K = range(2, 10)
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_scaled)
        sse.append(kmeans.inertia_)
        silhouette_avg = silhouette_score(X_scaled, kmeans.labels_)
        silhouette_scores.append(silhouette_avg)
    
    # Plot elbow method and silhouette scores
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(K, sse, 'bx-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Sum of Squared Errors')
    plt.title(f'Elbow Method for {category}')
    
    plt.subplot(1, 2, 2)
    plt.plot(K, silhouette_scores, 'bx-')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Silhouette Score')
    plt.title(f'Silhouette Analysis for {category}')
    plt.tight_layout()
    plt.show()
    
    # Choose optimal k (e.g., k with max silhouette score)
    optimal_k = K[silhouette_scores.index(max(silhouette_scores))]
    print(f"Optimal number of clusters for {category}: {optimal_k}")
    
    # Apply k-means with optimal k
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    
    # Add labels to original DataFrame
    df_category = df_category.iloc[~np.isnan(X).any(axis=1), :].copy()
    df_category['Cluster'] = labels
    
    return df_category

Apply Clustering to each market cap category

In [None]:
# Apply clustering to Large Cap
df_large_cap = cluster_market_cap_category(sp500_clean, 'Large Cap')

# Apply clustering to Mid Cap
df_mid_cap = cluster_market_cap_category(sp500_clean, 'Mid Cap')

# If there are Small Cap companies, apply clustering
if 'Small Cap' in sp500_clean['MarketCapCategory'].unique():
    df_small_cap = cluster_market_cap_category(sp500_clean, 'Small Cap')

Visualization/Analysis

In [None]:
# Combine clustered data
clustered_data = pd.concat([df_large_cap, df_mid_cap], ignore_index=True)
#plot data / visualize data
def plot_clusters(df_category):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df_category, x='Revenue', y='Beta', hue='Cluster', palette='Set1')
    plt.title(f"Clusters in {df_category['MarketCapCategory'].iloc[0]} Category")
    plt.xlabel('Revenue (Standardized)')
    plt.ylabel('Beta (Standardized)')
    plt.show()

#plot for large market cap companies
plot_clusters(df_large_cap)
#midcap
plot_clusters(df_mid_cap)

Cluster Profiles

In [None]:
def cluster_profiles(df_category):
    clusters = df_category['Cluster'].unique()
    for cluster in clusters:
        print(f"\nCluster {cluster} in {df_category['MarketCapCategory'].iloc[0]}:")
        cluster_df = df_category[df_category['Cluster'] == cluster]
        print("Statistical Summary:")
        print(cluster_df[['Revenue', 'Beta']].describe())
        print("\nRepresentative Companies:")
        print(cluster_df[['Ticker', 'Company']].head())
        
#cluster profiles for large cap
cluster_profiles(df_large_cap)
#cluster profiles for mid cap
cluster_profiles(df_mid_cap)
