# Historical Stock Price Data

### Polygon: https://polygon.io/dashboard

In [1]:
# Polygon.io API key, read from the environment so the secret is never
# committed with the notebook. Set it before launching Jupyter, e.g.
#   export POLYGON_API_KEY="..."
# NOTE: a key that was previously hard-coded here should be treated as leaked
# and rotated in the Polygon dashboard.
import os

API_KEY = os.environ.get("POLYGON_API_KEY", "")
if not API_KEY:
    # Do not raise here: later cells that only read cached files still work.
    print("POLYGON_API_KEY is not set; Polygon.io requests will be rejected.")

In [2]:
# Import libraries
import requests
import pandas as pd
import pandas_market_calendars as mcal
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm  # Use tqdm.auto if not in notebook
from time import sleep
import os

In [3]:
# Google BigQuery Authentication
from google.cloud import bigquery
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Export data
from pandas_gbq import to_gbq
import pickle

from google.auth.exceptions import RefreshError

# Path to the OAuth client-secret JSON downloaded from the GCP Console.
# Override it with the GCP_CLIENT_SECRET_FILE environment variable; the default
# keeps the original machine-specific location so existing setups still work.
CLIENT_SECRET_FILE = os.path.expanduser(
    os.environ.get(
        "GCP_CLIENT_SECRET_FILE",
        "/Users/ryanrunchey/credentials/gcp_credentials/client_secret_295707256455-0fsr3bqoc89psl22fgp2cfipbd4m1s1v.apps.googleusercontent.com.json",
    )
)
SCOPES = ['https://www.googleapis.com/auth/cloud-platform']

# Local cache for the user credentials obtained from the interactive flow.
TOKEN_FILE = 'token.pickle'

# Load cached credentials if present.
# SECURITY: pickle.load can execute arbitrary code. Only load a token file
# that this notebook wrote itself; never one received from somebody else.
credentials = None
if os.path.exists(TOKEN_FILE):
    with open(TOKEN_FILE, 'rb') as token:
        credentials = pickle.load(token)

token_changed = False

# A cached token may be expired. Try a silent refresh first, and fall back to
# the interactive flow if the refresh token is missing or no longer accepted.
if credentials is not None and not credentials.valid:
    if credentials.refresh_token:
        try:
            credentials.refresh(Request())
            token_changed = True
        except RefreshError:
            credentials = None
    else:
        credentials = None

# Authenticate interactively when nothing usable is cached.
if credentials is None:
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, SCOPES)
    credentials = flow.run_local_server(port=0)
    token_changed = True

# Persist new or refreshed credentials for reuse on the next run.
if token_changed:
    with open(TOKEN_FILE, 'wb') as token:
        pickle.dump(credentials, token)

# Initialize the BigQuery client with those credentials
client = bigquery.Client(credentials=credentials, project="ryanrunchey")

In [4]:
# Start dates (ISO strings), one per yearly cohort
start_dates = [
    '2002-09-18', '2003-09-17', '2004-09-15', '2005-09-16',
    '2006-09-15', '2007-09-14', '2008-09-17', '2009-09-16',
    '2010-09-15', '2011-09-16', '2012-09-14', '2013-09-18',
    '2014-09-17', '2015-09-16', '2016-09-16', '2017-09-15',
    '2018-09-14', '2019-09-18', '2020-09-16', '2021-09-16',
    # '2022-09-14'  # Optional: add if needed
]

# NYSE trading calendar
nyse = mcal.get_calendar('NYSE')


def _first_trading_day_on_or_after(day):
    """Return the first NYSE session on or after `day` as a YYYY-MM-DD string.

    Only the 20 calendar days following `day` are searched.
    """
    sessions = nyse.schedule(start_date=day, end_date=day + timedelta(days=20))
    return sessions.index[0].strftime("%Y-%m-%d")


# End date for each start date: first NYSE trading day >= start_date + 851 days
start_date_objs = [datetime.strptime(day, "%Y-%m-%d") for day in start_dates]
end_dates = [
    _first_trading_day_on_or_after(start_dt + timedelta(days=851))
    for start_dt in start_date_objs
]

# Pair every start date with its end date
df_ranges = pd.DataFrame(
    list(zip(start_dates, end_dates)),
    columns=["start_date", "end_date"],
)

df_ranges

Unnamed: 0,start_date,end_date
0,2002-09-18,2005-01-18
1,2003-09-17,2006-01-17
2,2004-09-15,2007-01-16
3,2005-09-16,2008-01-15
4,2006-09-15,2009-01-13
5,2007-09-14,2010-01-12
6,2008-09-17,2011-01-18
7,2009-09-16,2012-01-17
8,2010-09-15,2013-01-14
9,2011-09-16,2014-01-14


In [5]:
# Write the date ranges to local files: CSV (index column included) and pickle
df_ranges.to_csv("historical_stock_dates.csv")
df_ranges.to_pickle("historical_stock_dates.pkl")

# Upload the same frame to BigQuery, replacing the table if it already exists
to_gbq(
    df_ranges,
    "historical_stock_price_returns.historical_stock_dates",
    project_id="ryanrunchey",
    if_exists="replace",  # or "append"
)

100%|██████████| 1/1 [00:00<00:00, 4739.33it/s]


In [6]:
# Every distinct start or end date, in ascending order.
# The set literal merges both columns and removes duplicates in one step.
dates = sorted({*df_ranges['start_date'], *df_ranges['end_date']})

# Preview
print(dates)

['2002-09-18', '2003-09-17', '2004-09-15', '2005-01-18', '2005-09-16', '2006-01-17', '2006-09-15', '2007-01-16', '2007-09-14', '2008-01-15', '2008-09-17', '2009-01-13', '2009-09-16', '2010-01-12', '2010-09-15', '2011-01-18', '2011-09-16', '2012-01-17', '2012-09-14', '2013-01-14', '2013-09-18', '2014-01-14', '2014-09-17', '2015-01-13', '2015-09-16', '2016-01-19', '2016-09-16', '2017-01-17', '2017-09-15', '2018-01-16', '2018-09-14', '2019-01-15', '2019-09-18', '2020-01-14', '2020-09-16', '2021-01-12', '2021-09-16', '2022-01-18', '2023-01-17', '2024-01-16']


In [7]:
# Exchange codes to query for the ticker-to-exchange mapping.
# Code reference: 'AMEX': 'XASE', 'NASDAQ': 'XNAS', 'NYSE': 'XNYS'
exchanges = [
    "XASE",
    "XNAS",
]

# Accumulator for the {ticker, exchange, date} records collected below
all_mappings = list()

def get_exchange_tickers_for_date(exchange, date, timeout=30):
    """List the stock tickers Polygon reports as active on one exchange and date.

    Queries the `/v3/reference/tickers` endpoint and follows `next_url`
    pagination until no further page is returned.

    Args:
        exchange: Exchange code passed as the `exchange` query parameter
            (e.g. "XNAS").
        date: Date string passed as the `date` query parameter.
        timeout: Seconds to wait for each HTTP request. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A list of dicts with keys "ticker", "exchange" and "date", one per
        result row across all pages.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        requests.Timeout: If a request exceeds `timeout`.
    """
    tickers = []
    url = f"https://api.polygon.io/v3/reference/tickers?market=stocks&exchange={exchange}&active=true&date={date}&limit=1000&apiKey={API_KEY}"

    # One session for all pages so the underlying connection is reused.
    with requests.Session() as session:
        while url:
            r = session.get(url, timeout=timeout)
            r.raise_for_status()
            data = r.json()
            # `or []` also covers a present-but-null "results" field.
            for item in data.get("results") or []:
                tickers.append({
                    "ticker": item["ticker"],
                    "exchange": exchange,
                    "date": date
                })
            # next_url does not carry the API key, so it is re-appended here.
            url = data.get("next_url")
            if url:
                url += f"&apiKey={API_KEY}"
            # sleep(0.1)  # Optional: use if rate limits encountered

    return tickers

# Build the full mapping: one listing query per (date, exchange) pair
for date in dates:
    print(f"Processing date: {date}")
    for mic in exchanges:
        # In-place extend of the shared accumulator
        all_mappings += get_exchange_tickers_for_date(mic, date)

# Turn the collected records into a frame and write it out without the index
df = pd.DataFrame(all_mappings)
df.to_csv("all_ticker_exchange_mappings.csv", index=False)
print("✅ Saved: all_ticker_exchange_mappings.csv")

Processing date: 2002-09-18
Processing date: 2003-09-17
Processing date: 2004-09-15
Processing date: 2005-01-18
Processing date: 2005-09-16
Processing date: 2006-01-17
Processing date: 2006-09-15
Processing date: 2007-01-16
Processing date: 2007-09-14
Processing date: 2008-01-15
Processing date: 2008-09-17
Processing date: 2009-01-13
Processing date: 2009-09-16
Processing date: 2010-01-12
Processing date: 2010-09-15
Processing date: 2011-01-18
Processing date: 2011-09-16
Processing date: 2012-01-17
Processing date: 2012-09-14
Processing date: 2013-01-14
Processing date: 2013-09-18
Processing date: 2014-01-14
Processing date: 2014-09-17
Processing date: 2015-01-13
Processing date: 2015-09-16
Processing date: 2016-01-19
Processing date: 2016-09-16
Processing date: 2017-01-17
Processing date: 2017-09-15
Processing date: 2018-01-16
Processing date: 2018-09-14
Processing date: 2019-01-15
Processing date: 2019-09-18
Processing date: 2020-01-14
Processing date: 2020-09-16
Processing date: 202

In [8]:
# Load full ticker-to-exchange map with dates.
# Every column is read as a plain string, and keep_default_na=False stops
# pandas from converting values that look like missing-data markers (for
# example a ticker spelled "NA") into NaN. With the defaults such a ticker
# would silently disappear from the per-date ticker sets built below.
df_map = pd.read_csv(
    "all_ticker_exchange_mappings.csv",
    dtype=str,
    keep_default_na=False,
)

# Keep only the exchanges listed in `exchanges`
df_map = df_map[df_map["exchange"].isin(exchanges)]

# Create a dictionary: {date: set of valid tickers}
date_ticker_map = (
    df_map.groupby("date")["ticker"]
    .apply(set)
    .to_dict()
)

# List of dates to fetch (ascending)
DATES = sorted(date_ticker_map.keys())

# Fetch grouped EOD data for a given date
def fetch_grouped_for_date(date, timeout=30):
    """Fetch one day of grouped price data and keep only mapped tickers.

    Calls the `/v2/aggs/grouped/locale/us/market/stocks/{date}` endpoint with
    `adjusted=true`, then keeps the rows whose ticker is in
    `date_ticker_map[date]`.

    Args:
        date: Date string used in the URL and as the lookup key in
            `date_ticker_map` and `df_map`.
        timeout: Seconds to wait for the HTTP request. This function runs in a
            thread pool, so a request that never returns would stall the whole
            pipeline.

    Returns:
        A list of dicts with keys "exchange", "ticker", "date" and "close".
        Any error is printed and results in an empty list (best effort), so
        one failing date does not abort the remaining downloads.
    """
    url = f"https://api.polygon.io/v2/aggs/grouped/locale/us/market/stocks/{date}?adjusted=true&apiKey={API_KEY}"
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        # `or []` also covers a present-but-null "results" field.
        results = data.get("results") or []
        rows = []

        valid_tickers = date_ticker_map.get(date, set())
        # ticker -> exchange for this date only
        exchange_lookup = df_map[df_map["date"] == date].set_index("ticker")["exchange"].to_dict()

        for item in results:
            ticker = item["T"]
            if ticker in valid_tickers:
                rows.append({
                    "exchange": exchange_lookup[ticker],
                    "ticker": ticker,
                    "date": date,
                    "close": item.get("c")
                })

        return rows

    except Exception as e:
        # Deliberately broad: report the failure and continue with other dates.
        print(f"Error fetching data for {date}: {e}")
        return []

# Fetch all grouped data in parallel by date
def fetch_all_grouped_data(dates, max_workers=16):
    """Download grouped data for every date concurrently.

    Args:
        dates: Iterable of date strings; each is passed to
            `fetch_grouped_for_date`.
        max_workers: Size of the thread pool.

    Returns:
        A flat list of all row dicts, in completion order rather than input
        order.
    """
    collected = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each submitted future back to the date it was created for.
        pending = {}
        for day in dates:
            pending[pool.submit(fetch_grouped_for_date, day)] = day

        progress = tqdm(
            as_completed(pending),
            total=len(pending),
            desc="Fetching grouped data",
        )
        for finished in progress:
            collected += finished.result()
    return collected

# Final pipeline runner
def run_grouped_pipeline():
    """Fetch grouped data for all of `DATES` and return it as a DataFrame.

    Returns:
        A DataFrame with columns exchange, ticker, date, close (in that
        order), or an empty DataFrame when nothing was fetched.
    """
    print("Fetching grouped EOD data...")
    prices = pd.DataFrame(fetch_all_grouped_data(DATES))

    if not prices.empty:
        # Fix the column order for downstream consumers.
        return prices[['exchange', 'ticker', 'date', 'close']]

    print("No data returned.")
    return prices

In [9]:
# Run the full download; `df` is rebound here to the resulting price table.
df = run_grouped_pipeline()
# Preview the first rows
df.head()

Fetching grouped EOD data...


Fetching grouped data:   0%|          | 0/39 [00:00<?, ?it/s]

Unnamed: 0,exchange,ticker,date,close
0,XASE,KRY,2006-09-15,3.13
1,XASE,VUG,2006-09-15,54.29
2,XASE,BHM,2006-09-15,10.9005
3,XASE,BGF,2006-09-15,18.95
4,XASE,EIF,2006-09-15,14.24


In [10]:
# Sanity check: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85859 entries, 0 to 85858
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   exchange  85859 non-null  object 
 1   ticker    85859 non-null  object 
 2   date      85859 non-null  object 
 3   close     85859 non-null  float64
dtypes: float64(1), object(3)
memory usage: 2.6+ MB


In [11]:
# Earliest and latest date present in the fetched data
df['date'].min(), df['date'].max()

('2003-09-17', '2024-01-16')

# Write Data to BigQuery

In [12]:
# Write the price table to local files: CSV (index column included) and pickle
df.to_csv("polygon_historical_stock_prices.csv")
df.to_pickle("polygon_historical_stock_prices.pkl")

# Upload the same frame to BigQuery, replacing the table if it already exists
to_gbq(
    df,
    "historical_stock_price_returns.polygon_historical_stock_prices",
    project_id="ryanrunchey",
    if_exists="replace",  # or "append"
)

100%|██████████| 1/1 [00:00<00:00, 6615.62it/s]
