[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/romiaprilian7406/sp500-relative-valuation-estimation/blob/main/notebooks/sp500_relative_valuation_dataset.ipynb)


# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from tqdm.auto import tqdm
import time
import warnings

warnings.filterwarnings('ignore')

# Global Configuration

In [2]:
# Notebook-wide settings: the constituent list to download, the sectors to drop, and the pause between requests.
DATA_URL = "https://raw.githubusercontent.com/romiaprilian7406/sp500-companies/main/data/sp500_companies.csv"
EXCLUDED_SECTORS = ['Financials', 'Real Estate'] # Sectors whose balance-sheet structure differs from the rest
REQUEST_DELAY = 0.5 # Seconds to pause between tickers, to avoid Yahoo Finance rate limits

# Ticker Management

In [3]:
# Download the S&P 500 list, drop unwanted sectors, and normalise ticker symbols
def load_tickers(url, excluded_sectors):
    """Read the constituent CSV and return the rows that should be processed.

    Parameters
    ----------
    url : anything accepted by ``pd.read_csv``
        Location of a CSV with at least ``Symbol`` and ``GICS Sector`` columns.
    excluded_sectors : list of str
        ``GICS Sector`` values whose rows are removed.

    Returns
    -------
    pandas.DataFrame
        The remaining rows (original index labels kept), with every ``.`` in
        ``Symbol`` replaced by ``-`` (e.g. ``BRK.B`` -> ``BRK-B``) so that
        yfinance can resolve the symbol.
    """
    print(f"Mengunduh list ticker S&P500 dari: {url}")
    constituents = pd.read_csv(url)
    total_rows = len(constituents)

    # Keep only the rows whose sector is not on the exclusion list
    keep_mask = ~constituents['GICS Sector'].isin(excluded_sectors)
    selected = constituents.loc[keep_mask].assign(
        # Literal (non-regex) replacement of the dot separator
        Symbol=lambda frame: frame['Symbol'].str.replace('.', '-', regex=False)
    )

    # Simple run summary
    kept_rows = len(selected)
    print(f"Total Awal    : {total_rows}")
    print(f"Dibuang       : {total_rows - kept_rows} (Sectors: {excluded_sectors})")
    print(f"Total Target  : {kept_rows}")

    return selected

# Build a {ticker: sector} lookup for quick reference
def get_sector_map(df):
    """Map each value of ``Symbol`` to the ``GICS Sector`` on the same row.

    If a symbol occurs more than once, the sector from its last row wins.
    """
    symbols = df['Symbol']
    sectors = df['GICS Sector']
    return {symbol: sector for symbol, sector in zip(symbols, sectors)}

# Data Extraction

In [4]:
# Internal helper: safely read a float from a dict (e.g. stock.info) or a
# pandas Series (e.g. one column of a financial statement).
def safe_get(source, keys):
    """Return the first usable value stored under ``keys`` as a float.

    Parameters
    ----------
    source : dict, pandas.Series, or None
        Container to read from. Membership is tested with ``in`` and the value
        is read with ``source[key]``, so for a Series the keys are index labels.
    keys : str or list of str
        A single key, or several aliases tried in the given order.

    Returns
    -------
    float
        The first value that is present, convertible to ``float`` and not NaN.
        ``np.nan`` when no alias yields such a value. This function never
        raises for missing or malformed entries.
    """
    if source is None:
        return np.nan

    # Accept a bare string for both dicts and Series
    if isinstance(keys, str):
        keys = [keys]

    for key in keys:
        try:
            if key not in source:
                continue
            value = float(source[key])
        except (TypeError, ValueError):
            # None, pd.NA, non-numeric strings or non-scalar entries:
            # treat as missing and move on to the next alias.
            continue

        # A present-but-NaN entry must not hide a later alias that has data
        if not np.isnan(value):
            return value

    return np.nan

# Best-effort read of the leading column of one financial statement
def _latest_statement_column(load_statement):
    """Return column 0 of the DataFrame produced by ``load_statement()``.

    ``load_statement`` is a zero-argument callable so that the (possibly
    network-backed) statement is loaded exactly once, inside the ``try``.
    An empty float Series is returned when the statement is missing, empty,
    or loading it raises, so one bad statement never affects the others.
    """
    try:
        statement = load_statement()
        if statement is None or statement.empty:
            return pd.Series(dtype=float)
        return statement.iloc[:, 0]
    except Exception:
        return pd.Series(dtype=float)


# Fetch the full set of fundamentals for one ticker via yfinance
def fetch_financial_data(ticker, sector):
    """Collect market data and statement items for ``ticker``.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.Ticker``.
    sector : str
        Copied unchanged into the result under ``'Sector'``.

    Returns
    -------
    dict or None
        A flat record with ``Ticker``, ``Sector`` and the numeric fields listed
        below; any field that cannot be read is ``np.nan`` (see ``safe_get``).
        ``None`` when the ticker could not be processed at all; the reason is
        printed so the failure is visible to the reader.
    """
    try:
        stock = yf.Ticker(ticker)

        # 1. Market data and estimates
        info = stock.info

        # 2. Financial statements, each read independently (fail-safe).
        #    Column 0 is taken as the most recent reported period.
        #    The cash-flow statement is intentionally not fetched: no field
        #    below reads it, and a failure there used to blank out the other
        #    two statements as well.
        inc = _latest_statement_column(lambda: stock.financials)
        bal = _latest_statement_column(lambda: stock.balance_sheet)

        # 3. Assemble the record
        return {
            'Ticker': ticker,
            'Sector': sector,

            # TARGET VARIABLES
            'EnterpriseValue': safe_get(info, 'enterpriseValue'),
            'EBITDA': safe_get(inc, ['EBITDA', 'Normalized EBITDA']),

            # MODEL FEATURES (Growth & Risk)
            'ForwardPE': safe_get(info, 'forwardPE'),
            'TrailingPE': safe_get(info, 'trailingPE'),
            'Beta': safe_get(info, 'beta'),
            'TotalRevenue': safe_get(inc, 'Total Revenue'),

            # ROIC COMPONENTS (Quality)
            'EBIT': safe_get(inc, ['EBIT', 'Operating Income', 'Operating Profit']),
            'PretaxIncome': safe_get(inc, ['Pretax Income', 'Income Before Tax']),
            'TaxProvision': safe_get(inc, ['Tax Provision', 'Income Tax Expense']),

            # BALANCE SHEET (Invested Capital)
            'TotalAssets': safe_get(bal, 'Total Assets'),
            'TotalEquity': safe_get(bal, ['Stockholders Equity', 'Total Equity Gross Minority Interest']),
            # No 'Total Liab' fallback: total liabilities is a different
            # quantity from total debt, so a missing value stays NaN.
            'TotalDebt': safe_get(bal, 'Total Debt'),
            'CashAndEquivalents': safe_get(bal, ['Cash And Cash Equivalents', 'Cash', 'Cash Financial'])
        }

    except Exception as exc:
        # Still best-effort (the caller skips None), but no longer silent
        print(f"[GAGAL] {ticker}: {type(exc).__name__}: {exc}")
        return None

# Execution

In [5]:
# Load tickers
df_sp500 = load_tickers(DATA_URL, EXCLUDED_SECTORS)
tickers = df_sp500['Symbol'].tolist()
sector_map = get_sector_map(df_sp500)

# Processing loop
results = []          # one record (dict) per successfully fetched ticker
failed_tickers = []   # tickers for which fetch_financial_data returned nothing
print(f"\nMemulai pengambilan data untuk {len(tickers)} saham")

# tqdm renders a progress bar over the ticker list
for ticker in tqdm(tickers, desc="Fetching Data"):

    sector = sector_map.get(ticker, 'Unknown')
    data = fetch_financial_data(ticker, sector)

    if data:
        results.append(data)
    else:
        # Keep track of what was skipped so the gap in the dataset is explainable
        failed_tickers.append(ticker)

    # Rate limiting between consecutive requests
    time.sleep(REQUEST_DELAY)

print(f"Proses Selesai, {len(results)} data berhasil diambil")
if failed_tickers:
    print(f"Gagal diambil ({len(failed_tickers)}): {failed_tickers}")

Mengunduh list ticker S&P500 dari: https://raw.githubusercontent.com/romiaprilian7406/sp500-companies/main/data/sp500_companies.csv
Total Awal    : 503
Dibuang       : 107 (Sectors: ['Financials', 'Real Estate'])
Total Target  : 396

Memulai pengambilan data untuk 396 saham


Fetching Data:   0%|          | 0/396 [00:00<?, ?it/s]

Proses Selesai, 396 data berhasil diambil


In [6]:
# Build the dataset from the fetched records.
# Stop here when nothing was fetched: otherwise df_final would stay undefined
# (or keep the value from an earlier run of this cell), and the cells below
# would fail with a NameError or silently analyse and export stale data.
if not results:
    raise RuntimeError("GAGAL: Tidak ada data")

df_final = pd.DataFrame(results)

# Show a preview
print("\nPreview data:")
display(df_final.head())


Preview data:


Unnamed: 0,Ticker,Sector,EnterpriseValue,EBITDA,ForwardPE,TrailingPE,Beta,TotalRevenue,EBIT,PretaxIncome,TaxProvision,TotalAssets,TotalEquity,TotalDebt,CashAndEquivalents
0,MMM,Industrials,94186770000.0,7373000000.0,18.855309,25.953821,1.148,24575000000.0,6010000000.0,4819000000.0,804000000.0,39868000000.0,3842000000.0,13659000000.0,5600000000.0
1,AOS,Industrials,9529010000.0,786500000.0,16.587528,18.44609,1.335,3818100000.0,707700000.0,701000000.0,167400000.0,3240000000.0,1883500000.0,216700000.0,239600000.0
2,ABT,Health Care,225884500000.0,10790000000.0,22.362432,15.944723,0.714,41950000000.0,7572000000.0,7013000000.0,-6389000000.0,81414000000.0,47664000000.0,15021000000.0,7616000000.0
3,ABBV,Health Care,459663000000.0,14910000000.0,15.794969,170.54544,0.349,56334000000.0,6524000000.0,3716000000.0,-570000000.0,135161000000.0,3325000000.0,67144000000.0,5524000000.0
4,ACN,Information Technology,167500500000.0,11867330000.0,17.972982,21.984774,1.261,69672980000.0,10498950000.0,10270390000.0,2437993000.0,65394900000.0,31195450000.0,8182866000.0,11478730000.0


# Simple EDA

In [7]:
# Shape
rows, cols = df_final.shape
print("Dimensi Dataset")
print(f"Total Baris    : {rows}")
print(f"Total Kolom    : {cols}\n")

# Info: DataFrame.info() prints its report itself and returns None, so it must
# not be wrapped in print() (that would add a stray "None" to the output)
df_final.info()
print()

# Features are all columns except the metadata and target columns.
# The target columns are the two fields labelled "TARGET VARIABLES" where the
# records are built in fetch_financial_data.
metadata_cols = ['Ticker', 'Sector']
target_cols = ['EnterpriseValue', 'EBITDA']
non_feature_cols = set(metadata_cols + target_cols)
features_list = [col for col in df_final.columns if col not in non_feature_cols]

print(f"Jumlah Fitur : {len(features_list)}")
print(f"Daftar Fitur : {features_list}\n")

# Descriptive statistics of the numeric columns
print("\nStatistik Deskriptif")
numeric_cols = df_final.select_dtypes(include=[np.number]).columns.tolist()
numeric_summary = df_final[numeric_cols].describe().T.round(2)

try:
    display(numeric_summary)
except NameError:
    # display() only exists inside IPython/Jupyter; fall back to plain text elsewhere
    print(numeric_summary)

# Missing values
print("\nLaporan Missing Values:")
missing_count = df_final.isnull().sum()
missing_pct = (missing_count / len(df_final)) * 100

missing_df_final = pd.DataFrame({
    'Jumlah Hilang': missing_count,
    'Persentase (%)': missing_pct
})

# Keep only columns that have missing values, sorted from most to fewest
missing_only = missing_df_final[missing_df_final['Jumlah Hilang'] > 0].sort_values(by='Jumlah Hilang', ascending=False)

# Format the percentage as text only after sorting, so sorting stays numeric
missing_only['Persentase (%)'] = missing_only['Persentase (%)'].map('{:.2f}%'.format)

if not missing_only.empty:
    print(missing_only)
else:
    print("Tidak ada missing values")

Dimensi Dataset
Total Baris    : 396
Total Kolom    : 15

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396 entries, 0 to 395
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Ticker              396 non-null    object 
 1   Sector              396 non-null    object 
 2   EnterpriseValue     393 non-null    float64
 3   EBITDA              391 non-null    float64
 4   ForwardPE           396 non-null    float64
 5   TrailingPE          373 non-null    float64
 6   Beta                390 non-null    float64
 7   TotalRevenue        394 non-null    float64
 8   EBIT                394 non-null    float64
 9   PretaxIncome        394 non-null    float64
 10  TaxProvision        393 non-null    float64
 11  TotalAssets         396 non-null    float64
 12  TotalEquity         396 non-null    float64
 13  TotalDebt           393 non-null    float64
 14  CashAndEquivalents  396 non-null    float64
dtyp

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
EnterpriseValue,393.0,148251200000.0,455631100000.0,3747220000.0,24774220000.0,48024800000.0,106658100000.0,4104056000000.0
EBITDA,391.0,7576849000.0,17951780000.0,-7649000000.0,1559950000.0,2829500000.0,6446804000.0,160165000000.0
ForwardPE,396.0,21.08,20.78,-133.05,13.73,18.65,24.32,213.27
TrailingPE,373.0,37.63,55.13,5.01,19.48,25.85,34.98,604.33
Beta,390.0,0.97,0.43,-0.08,0.64,0.99,1.25,2.54
TotalRevenue,394.0,38063470000.0,76534560000.0,705823000.0,7079825000.0,14652710000.0,30639750000.0,680985000000.0
EBIT,394.0,5499791000.0,14662030000.0,-10176000000.0,1080575000.0,2088500000.0,4705525000.0,133050000000.0
PretaxIncome,394.0,4929216000.0,14464310000.0,-12210000000.0,910250000.0,1729500000.0,4121250000.0,132729000000.0
TaxProvision,393.0,916803000.0,2540082000.0,-6389000000.0,143100000.0,323000000.0,761461000.0,21795000000.0
TotalAssets,396.0,51709930000.0,79915350000.0,1248020000.0,12240000000.0,23827300000.0,54931500000.0,624894000000.0



Laporan Missing Values:
                 Jumlah Hilang Persentase (%)
TrailingPE                  23          5.81%
Beta                         6          1.52%
EBITDA                       5          1.26%
EnterpriseValue              3          0.76%
TotalDebt                    3          0.76%
TaxProvision                 3          0.76%
TotalRevenue                 2          0.51%
PretaxIncome                 2          0.51%
EBIT                         2          0.51%


# Export Dataset

In [8]:
# Write the dataset to a CSV file (path is relative to the working directory),
# omitting the DataFrame index, then report where it was saved.
file_name = 'sp500_relative_valuation_dataset.csv'
df_final.to_csv(file_name, index=False)
print(f"File dataset disimpan ke: {file_name}")

File dataset disimpan ke: sp500_relative_valuation_dataset.csv
