In [2]:
import yfinance as yf
import pandas as pd
from datetime import datetime, time, timedelta
from pytz import timezone

In [3]:
# Constants
TICKER = "TSLA"
# START = "2017-01-01"
# END = "2017-04-30"

# Stock Market Info
STOCK_MARKET_TZ = timezone('US/Eastern')
STOCK_MARKET_OPENS = time(hour=9, minute=30)
STOCK_MARKET_CLOSES = time(hour=4, minute=0)

# File Names
# ticker_raw_file_name = f"{TICKER}_raw_{START}-{END}.csv"
# index_raw_file_name = f"INDEX_raw_{START}-{END}.csv"

In [4]:
def get_different_rows(source_df, new_df):
    """Returns just the rows from the new dataframe that differ from the source dataframe"""
    merged_df = source_df.merge(new_df, indicator=True, how='outer')
    changed_rows_df = merged_df[merged_df['_merge'] == 'right_only']
    return changed_rows_df.drop('_merge', axis=1)

# Interesting stuff
We should compare the stock prices to an index. This will hopefully account for the changes that affect the entire market and are not related to Tesla news. We may want to account for the beta of the stock ("For example, if a stock's beta is 1.2, it is assumed to be 20% more volatile than the market."). Tesla's historical betas can be downloaded with `https://widget3.zacks.com/data/chart/json/TSLA/beta/www.zacks.com?` (uses the S&P 500 Index) (Src: `https://www.zacks.com/stock/chart/TSLA/fundamental/beta`). If the tesla stock increases by a higher percentage than the beta times the percentage increase in the index

The following request finds 489 news headlines starting from `Fri Feb 11 2022 06:02:00 GMT+0000` (25 days ago)
Last news headline was "Fed Rate Hikes, Zillow, Ford, Elon Musk And Stock Markets - Five Things You Must Know"
Most recent headline is "Dow Jones Futures: What To Do After Today's Stock Market Dive As Russia-Ukraine War Continues"
`https://query1.finance.yahoo.com/v1/finance/search?q=TSLA&quotesCount=0&newsCount=0`

Apparently the full query is `https://query2.finance.yahoo.com/v1/finance/search?q=apple&lang=en-US&region=US&quotesCount=6&newsCount=2&listsCount=2&enableFuzzyQuery=false&quotesQueryId=tss_match_phrase_query&multiQuoteQueryId=multi_quote_single_token_query&newsQueryId=news_cie_vespa&enableCb=true&enableNavLinks=true&enableEnhancedTrivialQuery=true&enableResearchReports=true&researchReportsCount=2`
Src: `https://github.com/ranaroussi/yfinance/issues/837`

Yahoo Finance stock price data with an interval of 1 minute can only be used with a period of 7 days. We will likely have to make one request for each week of data.
Yahoo Finance data appears to miss some minutes. Also, we should ignore posts that are outside the trading period of 9:30 AM to 4:00 PM ET.

If telsa stock is not working, we can move to cryptocurrencies (in particular a meme crypto like doge coin)

Woops. "Intraday data cannot extend last 60 days" for Yahoo Finance

In [4]:
# Get each stock ticker
stock = yf.Ticker(TICKER)
index = yf.Ticker("SNP")

In [5]:
def download_stock_data_old(ticker, start=None, end=None, interval="1m"):
    if interval != "1d":
        now = datetime.now(STOCK_MARKET_TZ)

        # max_period is how much data we can get with one request
        # date_range is how far back we can get data from
        if interval == "1m":
            max_period = timedelta(days=7)
            date_range = timedelta(days=30)
        elif interval == "2m":
            # Appears to be missing data
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "5m":
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "15m":
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "30m":
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "60m" or interval == "1h":
            max_period = timedelta(days=730)
            date_range = timedelta(days=730)
        else:
            raise ValueError("Interval has not been tested for max_period and date_range")

        if start is None:
            start = now - date_range

        # If the market had already opened, skip the partial day's data
        if start.time() > STOCK_MARKET_OPENS:
            # Rounding up skips the starting day
            start = start.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
        else:
            start = start.replace(hour=0, minute=0, second=0, microsecond=0)

        if end is None:
            end = now

        # If the market has not yet closed, skip the partial day's data
        if end.time() < STOCK_MARKET_CLOSES:
            # Rounding down skips the ending day
            end = end.replace(hour=0, minute=0, second=0, microsecond=0)
        else:
            end = end.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)

        prices = []

        period_start = start

        while period_start < end:
            next_week = min(period_start + max_period, end)
            hist = ticker.history(start=period_start, end=next_week, interval=interval)

            prices.append(hist)

            period_start = next_week

        full_hist = pd.concat(prices, axis=0)
    else:
        full_hist = ticker.history(period="max", interval=interval)

    start_date = full_hist.index[0].strftime('%Y-%m-%d')
    end_date = full_hist.index[-1].strftime('%Y-%m-%d')
    full_hist.to_csv(f"{ticker.ticker}_raw_{interval}_{start_date}_{end_date}.csv")
    return full_hist

In [11]:
def download_stock_data(ticker, start=None, end=None, interval="1m"):
    now = datetime.now(STOCK_MARKET_TZ)

    if end is None:
        end = now

    # If the market has not yet closed, skip the partial day's data
    if end.time() < STOCK_MARKET_CLOSES:
        # Rounding down skips the ending day
        end = end.replace(hour=0, minute=0, second=0, microsecond=0)
    else:
        end = end.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)

    if interval != "1d":
        # max_period is how much data we can get with one request
        # date_range is how far back we can get data from
        if interval == "1m":
            max_period = timedelta(days=7)
            date_range = timedelta(days=30)
        elif interval == "2m":
            # Appears to be missing data
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "5m":
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "15m":
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "30m":
            max_period = timedelta(days=60)
            date_range = timedelta(days=60)
        elif interval == "60m" or interval == "1h":
            max_period = timedelta(days=730)
            date_range = timedelta(days=730)
        else:
            raise ValueError("Interval has not been tested for max_period and date_range")

        if start is None:
            start = now - date_range

        # If the market had already opened, skip the partial day's data
        if start.time() > STOCK_MARKET_OPENS:
            # Rounding up skips the starting day
            start = start.replace(hour=0, minute=0, second=0, microsecond=0) + timedelta(days=1)
        else:
            start = start.replace(hour=0, minute=0, second=0, microsecond=0)

        prices = []

        period_start = start

        while period_start < end:
            next_week = min(period_start + max_period, end)
            hist = ticker.history(start=period_start, end=next_week, interval=interval)

            prices.append(hist)

            period_start = next_week

        full_hist = pd.concat(prices, axis=0)
    else:
        full_hist = ticker.history(end=end, interval=interval, period='max')

    start_date = full_hist.index[0].strftime('%Y-%m-%d')
    end_date = full_hist.index[-1].strftime('%Y-%m-%d')
    full_hist.to_csv(f"{ticker.ticker}_raw_{interval}_{start_date}_{end_date}.csv")
    return full_hist

In [7]:
stock_prices = download_stock_data(stock)
index_prices = download_stock_data(index)

In [14]:
stock_prices = download_stock_data(stock, interval="1d")
index_prices = download_stock_data(index, interval="1d")

In [15]:
stock_prices

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-06-29,3.800000,5.000000,3.508000,4.778000,93831500,0,0.0
2010-06-30,5.158000,6.084000,4.660000,4.766000,85935500,0,0.0
2010-07-01,5.000000,5.184000,4.054000,4.392000,41094000,0,0.0
2010-07-02,4.600000,4.620000,3.742000,3.840000,25699000,0,0.0
2010-07-06,4.000000,4.000000,3.166000,3.222000,34334500,0,0.0
...,...,...,...,...,...,...,...
2022-04-18,989.030029,1014.919983,973.409973,1004.289978,17238400,0,0.0
2022-04-19,1005.059998,1034.939941,995.330017,1028.150024,16615900,0,0.0
2022-04-20,1030.000000,1034.000000,975.250000,977.200012,23570400,0,0.0
2022-04-21,1074.729980,1092.219971,996.419983,1008.780029,35138800,0,0.0


In [10]:
full_hist = stock.history(interval="1d")
full_hist

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-03-23,979.940002,1040.699951,976.400024,999.109985,40225400,0,0
2022-03-24,1009.72998,1024.48999,988.799988,1013.919983,22973600,0,0
2022-03-25,1008.0,1021.799988,997.320007,1010.640015,20677200,0,0
2022-03-28,1065.099976,1097.880005,1053.599976,1091.839966,34168700,0,0
2022-03-29,1107.98999,1114.77002,1073.109985,1099.569946,24538300,0,0
2022-03-30,1091.170044,1113.949951,1084.0,1093.98999,19955000,0,0
2022-03-31,1094.569946,1103.140015,1076.640015,1077.599976,16330900,0,0
2022-04-01,1081.150024,1094.75,1066.640015,1084.589966,18012900,0,0
2022-04-04,1089.380005,1149.910034,1072.530029,1145.449951,27345300,0,0
2022-04-05,1136.300049,1152.869995,1087.300049,1091.26001,26691700,0,0


In [35]:
# Interpolate missing minutes, but don't interpolate when the market is closed
index_prices.groupby([index_prices.index.date]).resample("T").interpolate()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Unnamed: 0_level_1,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-02-25,2022-02-25 09:30:00-05:00,50.070000,50.509998,50.070000,50.509998,7251.000,0.0,0.0
2022-02-25,2022-02-25 09:31:00-05:00,50.439999,50.439999,50.439999,50.439999,315.000,0.0,0.0
2022-02-25,2022-02-25 09:32:00-05:00,50.426249,50.426249,50.424999,50.426249,440.125,0.0,0.0
2022-02-25,2022-02-25 09:33:00-05:00,50.412499,50.412499,50.409999,50.412499,565.250,0.0,0.0
2022-02-25,2022-02-25 09:34:00-05:00,50.398750,50.398750,50.394999,50.398750,690.375,0.0,0.0
...,...,...,...,...,...,...,...,...
2022-03-07,2022-03-07 15:56:00-05:00,48.570000,48.590000,48.570000,48.580002,1097.000,0.0,0.0
2022-03-07,2022-03-07 15:57:00-05:00,48.590000,48.590000,48.570000,48.570000,769.000,0.0,0.0
2022-03-07,2022-03-07 15:58:00-05:00,48.599998,48.599998,48.570000,48.570000,2933.000,0.0,0.0
2022-03-07,2022-03-07 15:59:00-05:00,48.619999,48.625000,48.560001,48.570000,3821.000,0.0,0.0


In [36]:
# Calculate Percentage Change, but not for the first price of the day
index_prices.groupby([index_prices.index.date]).pct_change()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-02-25 09:30:00-05:00,,,,,,,
2022-02-25 09:31:00-05:00,0.007390,-0.001386,0.007390,-0.001386,-0.956558,,
2022-02-25 09:39:00-05:00,-0.002181,-0.002181,-0.002379,-0.002181,3.177778,,
2022-02-25 09:40:00-05:00,0.000000,0.000993,0.000199,0.000795,2.642857,,
2022-02-25 09:41:00-05:00,0.001192,0.000198,0.001192,0.000397,-0.965791,,
...,...,...,...,...,...,...,...
2022-03-07 15:56:00-05:00,-0.000206,0.000000,0.000206,0.000000,-0.319901,,
2022-03-07 15:57:00-05:00,0.000412,0.000000,0.000000,-0.000206,-0.298997,,
2022-03-07 15:58:00-05:00,0.000206,0.000206,0.000000,0.000000,2.814044,,
2022-03-07 15:59:00-05:00,0.000412,0.000514,-0.000206,0.000000,0.302762,,


In [37]:
# Calculate percent change, even for the missing minutes
index_prices.groupby([index_prices.index.date]).resample("T").interpolate().pct_change()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Unnamed: 0_level_1,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-02-25,2022-02-25 09:30:00-05:00,,,,,,,
2022-02-25,2022-02-25 09:31:00-05:00,0.007390,-0.001386,0.007390,-0.001386,-0.956558,,
2022-02-25,2022-02-25 09:32:00-05:00,-0.000273,-0.000273,-0.000297,-0.000273,0.397222,,
2022-02-25,2022-02-25 09:33:00-05:00,-0.000273,-0.000273,-0.000297,-0.000273,0.284294,,
2022-02-25,2022-02-25 09:34:00-05:00,-0.000273,-0.000273,-0.000298,-0.000273,0.221362,,
...,...,...,...,...,...,...,...,...
2022-03-07,2022-03-07 15:56:00-05:00,-0.000206,0.000000,0.000206,0.000000,-0.319901,,
2022-03-07,2022-03-07 15:57:00-05:00,0.000412,0.000000,0.000000,-0.000206,-0.298997,,
2022-03-07,2022-03-07 15:58:00-05:00,0.000206,0.000206,0.000000,0.000000,2.814044,,
2022-03-07,2022-03-07 15:59:00-05:00,0.000412,0.000514,-0.000206,0.000000,0.302762,,


In [10]:
index_prices.groupby([index_prices.index.date]).pct_change().dropna(how='all')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-24 09:35:00-05:00,0.004670,0.004122,0.006255,0.004122,-0.344786,,
2022-01-24 09:40:00-05:00,0.000581,0.003386,0.003497,-0.000193,-0.522713,,
2022-01-24 09:45:00-05:00,0.001429,-0.002530,-0.003097,-0.003483,-0.633663,,
2022-01-24 09:50:00-05:00,-0.004036,-0.004036,-0.005631,-0.005631,0.509154,,
2022-01-24 09:55:00-05:00,-0.005919,-0.004949,-0.000976,-0.000781,0.920855,,
...,...,...,...,...,...,...,...
2022-03-23 15:40:00-04:00,0.000000,-0.000621,-0.002073,-0.002279,0.809034,,
2022-03-23 15:45:00-04:00,-0.002073,-0.002279,-0.000623,-0.000727,-0.541162,,
2022-03-23 15:50:00-04:00,-0.000415,0.000519,-0.001247,0.001039,2.609914,,
2022-03-23 15:55:00-04:00,-0.000208,0.000934,0.001248,0.000519,1.528209,,


In [16]:
beta = pd.read_json(f"https://widget3.zacks.com/data/chart/json/{TICKER}/beta/www.zacks.com?", storage_options={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}).sort_index()
start_date = beta.index[0].strftime('%Y-%m-%d')
end_date = beta.index[-1].strftime('%Y-%m-%d')
beta.to_csv(f"{TICKER}_beta_{start_date}_{end_date}.csv")

In [14]:
stock_prices.to_csv('test.csv')

<DstTzInfo 'America/New_York' LMT-1 day, 19:04:00 STD>

In [17]:
beta

Unnamed: 0,beta
2012-03-31,0.1019
2012-04-30,0.1775
2012-05-31,0.3636
2012-06-30,0.3792
2012-07-31,0.3809
...,...
2021-11-30,2.0310
2021-12-31,1.9780
2022-01-31,2.0000
2022-02-28,2.0440
