In [25]:
# Import necessary modules
import os
import requests
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import time

In [14]:
# Load environment variables from .env
load_dotenv()

True

In [15]:
# Establish a database connection using SQLAlchemy and credentials from the .env file
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')

if None in (DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME):
    raise Exception("Database credentials are not fully set in the .env file.")

engine = create_engine(f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

In [16]:
# --- Query tickers from raw_wikipedia_sp500 ---
with engine.connect() as conn:
    query_wiki = text("SELECT DISTINCT symbol FROM raw_wikipedia_sp500")
    df_wiki = pd.read_sql(query_wiki, conn)

In [17]:
# Standardize ticker symbols: strip any extra whitespace and convert to uppercase
df_wiki['symbol'] = df_wiki['symbol'].str.strip().str.upper()

In [18]:
# --- Query tickers already processed in raw_prices ---
with engine.connect() as conn:
    query_prices = text("SELECT DISTINCT symbol FROM raw_prices")
    df_prices = pd.read_sql(query_prices, conn)

if not df_prices.empty:
    processed_tickers = set(df_prices['symbol'].str.strip().str.upper())
else:
    processed_tickers = set()

In [19]:
# Compute the set difference: tickers in Wikipedia that are not yet in raw_prices
all_tickers = set(df_wiki['symbol'])
new_tickers = list(all_tickers - processed_tickers)
new_tickers.sort()  # Optional: sort for consistency

In [20]:
# Define batch size (e.g., 50 tickers per execution)
batch_size = 50
tickers_to_process = new_tickers[:batch_size]

print(f"Total tickers in Wikipedia table: {len(all_tickers)}")
print(f"Tickers already processed in raw_prices: {len(processed_tickers)}")
print(f"New tickers to process in this batch: {len(tickers_to_process)}")
print("Tickers in current batch:", tickers_to_process)

Total tickers in Wikipedia table: 503
Tickers already processed in raw_prices: 1
New tickers to process in this batch: 50
Tickers in current batch: ['A', 'ABBV', 'ABNB', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP', 'ADSK', 'AEE', 'AEP', 'AES', 'AFL', 'AIG', 'AIZ', 'AJG', 'AKAM', 'ALB', 'ALGN', 'ALL', 'ALLE', 'AMAT', 'AMCR', 'AMD', 'AME', 'AMGN', 'AMP', 'AMT', 'AMZN', 'ANET', 'ANSS', 'AON', 'AOS', 'APA', 'APD', 'APH', 'APO', 'APTV', 'ARE', 'ATO', 'AVB', 'AVGO', 'AVY', 'AWK', 'AXON', 'AXP', 'AZO', 'BA']


In [23]:
# Defines get_eod_prices function 
def get_eod_prices(symbol, start="2019-01-01", end=None, resample_freq="daily"):
    """
    Fetches historical EOD price data for a given symbol from the Tiingo API
    and returns a DataFrame with columns: trade_date, open, high, low, close, volume, symbol.
    """
    import requests
    import pandas as pd
    TIINGO_KEY = os.getenv('TIINGO_KEY')
    if not TIINGO_KEY:
        raise Exception("TIINGO_KEY not set in the .env file.")
    
    base_url = f"https://api.tiingo.com/tiingo/daily/{symbol}/prices"
    params = {
        'startDate': start,
        'format': 'json',
        'token': TIINGO_KEY,
        'resampleFreq': resample_freq
    }
    if end:
        params['endDate'] = end
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    if not data:
        raise Exception(f"No data returned from Tiingo for symbol: {symbol}")
    df = pd.DataFrame(data)
    # Keep only necessary columns; note: column 'date' will be renamed to 'trade_date'
    columns_to_keep = ['date', 'open', 'high', 'low', 'close', 'volume']
    df = df[[col for col in columns_to_keep if col in df.columns]]
    df['date'] = pd.to_datetime(df['date'])
    for col in ['open', 'high', 'low', 'close']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    if 'volume' in df.columns:
        df['volume'] = pd.to_numeric(df['volume'], errors='coerce')
    # Rename 'date' to 'trade_date' to match our table schema
    df.rename(columns={'date': 'date'}, inplace=True)
    df['symbol'] = symbol
    df = df.sort_values('date').reset_index(drop=True)
    return df

In [26]:
# Process each ticker in the current batch 
for symbol in tickers_to_process:
    try:
        print(f"Processing symbol: {symbol}")
        df_symbol = get_eod_prices(symbol)
        # Insert the fetched DataFrame into the raw_prices table (append new data)
        df_symbol.to_sql(name="raw_prices", con=engine, if_exists="append", index=False)
        print(f"Symbol {symbol} processed successfully.")
    except Exception as e:
        print(f"Error processing {symbol}: {e}")
    # Short delay between requests to be gentle on the API
    time.sleep(1)

print("Batch processing complete. Run this cell again after one hour to process the next batch.")

Processing symbol: A
Error processing A: (pymysql.err.IntegrityError) (1062, "Duplicate entry 'A-2019-01-02' for key 'raw_prices.PRIMARY'")
[SQL: INSERT INTO raw_prices (date, open, high, low, close, volume, symbol) VALUES (%(date)s, %(open)s, %(high)s, %(low)s, %(close)s, %(volume)s, %(symbol)s)]
[parameters: [{'date': datetime.datetime(2019, 1, 2, 0, 0, tzinfo=datetime.timezone.utc), 'open': 66.5, 'high': 66.57, 'low': 65.3, 'close': 65.69, 'volume': 2113304, 'symbol': 'A'}, {'date': datetime.datetime(2019, 1, 3, 0, 0, tzinfo=datetime.timezone.utc), 'open': 65.53, 'high': 65.78, 'low': 62.0, 'close': 63.27, 'volume': 5383926, 'symbol': 'A'}, {'date': datetime.datetime(2019, 1, 4, 0, 0, tzinfo=datetime.timezone.utc), 'open': 64.09, 'high': 65.95, 'low': 64.09, 'close': 65.46, 'volume': 3123654, 'symbol': 'A'}, {'date': datetime.datetime(2019, 1, 7, 0, 0, tzinfo=datetime.timezone.utc), 'open': 65.64, 'high': 67.43, 'low': 65.61, 'close': 66.85, 'volume': 3235055, 'symbol': 'A'}, {'date