# Historical Stock Price Returns

**Historical Price Data**
- We can provide up to 20 years end of day stock price history for all of the exchanges we support. Intraday data (1, 5, 10, 15, 30 and 60 minute bars) is available back to Jan 1, 2008. All of our end of day historical stock prices have been cleaned for spikes and other anomalies and adjusted for splits.
https://www.eoddata.com/products/historicaldata.aspx

## Setup Environment

In [7]:
# Import libraries

import pandas as pd
import numpy as np

import os
import zipfile
from datetime import datetime, timedelta

In [6]:
# Google BigQuery Authentication

from google.cloud import bigquery
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Export data

from pandas_gbq import to_gbq
import pickle

# Set your OAuth client ID JSON downloaded from GCP Console
# CLIENT_SECRET_FILE = 'client_secret.json'  # downloaded from GCP
CLIENT_SECRET_FILE = os.path.expanduser("/Users/ryanrunchey/credentials/gcp_credentials/client_secret_295707256455-0fsr3bqoc89psl22fgp2cfipbd4m1s1v.apps.googleusercontent.com.json")
SCOPES = ['https://www.googleapis.com/auth/cloud-platform']

# Authenticate interactively (stores a token locally for reuse)
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        credentials = pickle.load(token)
else:
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, SCOPES)
    credentials = flow.run_local_server(port=0)
    with open('token.pickle', 'wb') as token:
        pickle.dump(credentials, token)

# Initialize the BigQuery client with those credentials
client = bigquery.Client(credentials=credentials, project="ryanrunchey")

### Test BigQuery Read/Write

In [13]:
# Test query
query = """
SELECT
  underlying_symbol,
  symbol,
  SUM(quantity) AS quantity,
  SUM(fees_and_commissions) AS fees_and_commissions,
  SUM(amount) AS amount,
  SAFE_DIVIDE(SUM(amount), SUM(quantity)) AS net_price_per_unit
FROM
  ryanrunchey.account_transactions.fct_transactions
WHERE
  underlying_symbol = 'ENVX'
GROUP BY
  1,2
ORDER BY
  1,2
"""

df = client.query(query).to_dataframe()
df

Unnamed: 0,underlying_symbol,symbol,quantity,fees_and_commissions,amount,net_price_per_unit
0,ENVX,-ENVX231117C15,0.0,2.72,345.28,
1,ENVX,-ENVX240119C10,0.0,2.68,-152.68,
2,ENVX,-ENVX240119C17.5,0.0,2.73,529.19,
3,ENVX,-ENVX240119C20,0.0,1.36,258.6,
4,ENVX,-ENVX240719C15,0.0,4.02,-169.02,
5,ENVX,-ENVX250117C12.5,0.0,6.74,3538.99,
6,ENVX,-ENVX250117C20,0.0,10.09,3348.51,
7,ENVX,-ENVX260116C8,-36.0,24.5,13257.5,-368.263888889
8,ENVX,ENVX,6407.637,0.22,-58547.82,-9.137193633
9,ENVX,ENVX 01/19/2024 9.00 C,0.0,5.29,-901.29,


## Unzip Data

In [2]:
def unzip_all_in_dir(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.zip'):
                zip_path = os.path.join(root, file)
                extract_dir = os.path.splitext(zip_path)[0]  # create folder with same name
                
                print(f"Unzipping: {zip_path} → {extract_dir}")
                
                try:
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(extract_dir)
                except zipfile.BadZipFile:
                    print(f"⚠️ Skipped bad zip file: {zip_path}")

# 🔧 Replace with your target path
target_folder = '/Users/ryanrunchey/Downloads/EODData'
unzip_all_in_dir(target_folder)

Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2008.zip → /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2008
Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2020.zip → /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2020
Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2021.zip → /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2021
Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2009.zip → /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2009
Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2023.zip → /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2023
Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2022.zip → /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2022
Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2019.zip → /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2019
Unzipping: /Users/ryanrunchey/Downloads/EODData/NASDAQ/NASDAQ_2018.zip → /Users/ryanrunchey/Downl

## Import Data

In [7]:
# Import stock price data into a single dataframe: df_import

exchanges = ['AMEX', 'NASDAQ', 'NYSE', 'OTCBB']

df_list = []

for exchange in exchanges:
    
    directory = '/Users/ryanrunchey/Library/CloudStorage/SynologyDrive-On-demand/Documents/Finance/Analysis/Historical Stock Data/EOD Data/' + exchange + '/unzipped'

    for root, subdirectories, files in os.walk(directory):
    #     for subdirectory in subdirectories:
    #         print(os.path.join(root, subdirectory))
        for file in files:
            if '.csv' in file:
    #             print(os.path.join(root, file))
    #             read the csv file
                df = pd.read_csv(os.path.join(root, file))
                df['exchange'] = exchange
                df_list.append(df)

df_import = pd.concat(df_list)
df_import.columns = df_import.columns.map(lambda col: col.lower())
df_import.head()

Unnamed: 0,symbol,date,open,high,low,close,volume,exchange
0,ACU,28-Jul-2009,8.21,8.66,8.11,8.11,10000,AMEX
1,AE,28-Jul-2009,15.7,15.9,15.53,15.85,700,AMEX
2,AFK,28-Jul-2009,27.45,28.1,27.13,27.5,19600,AMEX
3,AGG,28-Jul-2009,102.01,102.53,102.01,102.27,674500,AMEX
4,AGQ,28-Jul-2009,83.4,83.76,80.02,81.5,540500,AMEX


In [8]:
df_import.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41259600 entries, 0 to 185
Data columns (total 8 columns):
 #   Column    Dtype  
---  ------    -----  
 0   symbol    object 
 1   date      object 
 2   open      float64
 3   high      float64
 4   low       float64
 5   close     float64
 6   volume    int64  
 7   exchange  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 2.8+ GB


In [9]:
# Format, clean, and sort dataframe

df_import['date'] = pd.to_datetime(df_import['date'])
df_import = df_import[['exchange', 'symbol', 'date', 'open', 'high', 'low', 'close', 'volume']]
df_import.sort_values(['symbol', 'date'], inplace=True)
df_import.reset_index(drop=True, inplace=True)
df_import.head()

Unnamed: 0,exchange,symbol,date,open,high,low,close,volume
0,NYSE,A,2001-01-01,39.1631,39.1631,39.1631,39.1631,0
1,NYSE,A,2001-01-02,38.5408,38.5408,35.093,36.3948,1617800
2,NYSE,A,2001-01-03,35.1431,40.4149,34.02,40.1502,3220800
3,NYSE,A,2001-01-04,40.6867,42.6967,39.5637,41.6667,3146200
4,NYSE,A,2001-01-05,40.9514,41.6667,38.269,39.3848,2344100


In [15]:
# Export to csv
df_import.to_csv("historical_stock_prices.csv")

# Export to pickle
df_import.to_pickle("historical_stock_prices.pkl")

# Export to BigQuery
to_gbq(
    dataframe=df_import,
    destination_table="historical_stock_price_returns.historical_stock_prices",
    project_id="ryanrunchey",
    if_exists="replace"  # or "append"
)

100%|██████████| 1/1 [00:00<00:00, 3923.58it/s]


## Pivot Data

In [16]:
# Pivot to count # of stocks less than X price by exchange

list_dates = ['2002-09-16', '2003-09-15', '2004-09-14', '2005-09-16', '2006-09-15', '2007-09-14', '2008-09-15', \
              '2009-09-14', '2010-09-14', '2011-09-16', '2012-09-14', '2013-09-16', '2014-09-15', '2015-09-14', \
              '2016-09-16', '2017-09-15', '2018-09-14', '2019-09-14', '2020-09-14', '2021-09-14', '2022-09-14']

pd_list_dates = pd.to_datetime(list_dates)  # ensure correct dtype
mask = df_import['date'].isin(pd_list_dates)

df_count_equities = df_import[mask].copy(deep=True)
df_count_equities['close_under_3'] = (df_count_equities['close'] <= 3)
df_count_equities['close_under_2'] = (df_count_equities['close'] <= 2)
df_count_equities['close_under_1'] = (df_count_equities['close'] <= 1)
df_count_equities['close_under_0_50'] = (df_count_equities['close'] <= 0.5)
df_count_equities_agg = df_count_equities.groupby(['exchange', 'date']).agg({
    'symbol':'count', 
    'close_under_3':'sum',
    'close_under_2':'sum',
    'close_under_1':'sum',
    'close_under_0_50':'sum',
})
df_count_equities_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,symbol,close_under_3,close_under_2,close_under_1,close_under_0_50
exchange,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AMEX,2002-09-16,180,20,12,6,3
AMEX,2003-09-15,185,14,5,0,0
AMEX,2004-09-14,204,11,5,1,0
AMEX,2005-09-16,236,12,9,4,1
AMEX,2006-09-15,294,18,9,6,2
...,...,...,...,...,...,...
OTCBB,2017-09-15,3532,1739,1624,1487,1314
OTCBB,2018-09-14,3919,1991,1872,1698,1521
OTCBB,2020-09-14,4787,2677,2511,2279,2026
OTCBB,2021-09-14,5323,2954,2772,2499,2157


In [17]:
# Export to csv
df_count_equities_agg.to_csv("pivot_count_of_stocks_priced_less_than_x.csv")

# Export to pickle
df_count_equities_agg.to_pickle("pivot_count_of_stocks_priced_less_than_x.pkl")

# Export to BigQuery
to_gbq(
    dataframe=df_count_equities_agg,
    destination_table="historical_stock_price_returns.pivot_count_of_stocks_priced_less_than_x",
    project_id="ryanrunchey",
    if_exists="replace"  # or "append"
)

100%|██████████| 1/1 [00:00<00:00, 12633.45it/s]


## Price Return Calculations

In [18]:
# day = 850
day = 425
df = df_import[['exchange', 'symbol', 'date', 'close']].copy(deep=True)
df['number_of_days'] = day
df['date_end'] = df['date'] + timedelta(days=day)
df.head()

Unnamed: 0,exchange,symbol,date,close,number_of_days,date_end
0,NYSE,A,2001-01-01,39.1631,425,2002-03-02
1,NYSE,A,2001-01-02,36.3948,425,2002-03-03
2,NYSE,A,2001-01-03,40.1502,425,2002-03-04
3,NYSE,A,2001-01-04,41.6667,425,2002-03-05
4,NYSE,A,2001-01-05,39.3848,425,2002-03-06


In [19]:
df_merge = pd.merge(df, df, how='left', left_on=['exchange', 'symbol', 'date_end'], right_on=['exchange', 'symbol', 'date'], suffixes=('', '_y'))
df_merge.drop(columns=['date_y', 'number_of_days_y', 'date_end_y'], inplace=True)
df_merge.rename(columns={'close_y': 'close_end'}, inplace=True)
df_merge.head()

Unnamed: 0,exchange,symbol,date,close,number_of_days,date_end,close_end
0,NYSE,A,2001-01-01,39.1631,425,2002-03-02,
1,NYSE,A,2001-01-02,36.3948,425,2002-03-03,
2,NYSE,A,2001-01-03,40.1502,425,2002-03-04,25.0072
3,NYSE,A,2001-01-04,41.6667,425,2002-03-05,24.9356
4,NYSE,A,2001-01-05,39.3848,425,2002-03-06,24.6209


In [20]:
df_merge['price_difference'] = df_merge['close_end'] - df_merge['close']
df_merge['gain'] = df_merge['price_difference'] / df_merge['close']
df_merge.head()

Unnamed: 0,exchange,symbol,date,close,number_of_days,date_end,close_end,price_difference,gain
0,NYSE,A,2001-01-01,39.1631,425,2002-03-02,,,
1,NYSE,A,2001-01-02,36.3948,425,2002-03-03,,,
2,NYSE,A,2001-01-03,40.1502,425,2002-03-04,25.0072,-15.143,-0.377159
3,NYSE,A,2001-01-04,41.6667,425,2002-03-05,24.9356,-16.7311,-0.401546
4,NYSE,A,2001-01-05,39.3848,425,2002-03-06,24.6209,-14.7639,-0.374863


In [21]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41259605 entries, 0 to 41259604
Data columns (total 9 columns):
 #   Column            Dtype         
---  ------            -----         
 0   exchange          object        
 1   symbol            object        
 2   date              datetime64[ns]
 3   close             float64       
 4   number_of_days    int64         
 5   date_end          datetime64[ns]
 6   close_end         float64       
 7   price_difference  float64       
 8   gain              float64       
dtypes: datetime64[ns](2), float64(4), int64(1), object(2)
memory usage: 2.8+ GB


In [22]:
# Export to pickle
df_merge.to_pickle("stock_price_returns_2001_to_2023_number_of_days_425.pkl")

# Export to BigQuery
to_gbq(
    dataframe=df_merge,
    destination_table="historical_stock_price_returns.stock_price_returns_2001_to_2023_number_of_days_425",
    project_id="ryanrunchey",
    if_exists="replace"  # or "append"
)

100%|██████████| 1/1 [00:00<00:00, 3916.25it/s]
