# 12.1 Simulating screeners
In backtesting, we need to be able to simulate what a screener would have returned on a given date; otherwise we would have to simulate all stocks simultaneously. Because computing this is expensive, the results are saved in the <code>processed/cache/</code> folder. There are of course some constraints: it must be technically possible to obtain the same results from real screeners, either online or via APIs (e.g. IBKR TWS). This means that intraday scanners are limited. For long-term screeners it does not matter, because we can download everything after the end of the trading day.

Warning: The screener results must not suffer from look-ahead bias. For example, the 14-day volume is only known after day 14 has closed. I therefore always shift the trading days by 1 to account for this, so that all statistics are actually available on the date they are stored under.

In [1]:
from utils import remove_extended_hours, get_market_dates, get_tickers, get_data, first_trading_date_after_equal, first_trading_date_after
from datetime import datetime, date, timedelta, time
import mplfinance as mpf
import pandas as pd
import numpy as np
import os
import json
import pyarrow as pa
import pyarrow.parquet as pq
import ast
DATA_PATH = "../data/polygon/"

**Top liquid stocks**

In [11]:
def store_top_n_liquid(n=500, start = date(2000, 1, 1), end = date(2100, 1, 1)):
    """Calculate the top N liquid stocks per quarter and store them in processed/cache/top_{n}_liquid.json

    Liquidity is measured as quarterly turnover: the summed volume of the quarter multiplied
    by the last close of that quarter. Only tickers of type "CS" are considered.
    Each quarter's ranking is stored under the ISO date returned by
    first_trading_date_after_equal(quarter end + 1 day), so the ranking is keyed by a date
    that lies after the quarter it was computed from.

    Args:
        n (int, optional): the amount of stocks. Defaults to 500.
        start (date, optional): the start date. Defaults to date(2000, 1, 1).
        end (date, optional): the end date. Defaults to date(2100, 1, 1).
    """
    tickers = get_tickers()
    tickers = tickers[tickers['type'] == "CS"]

    # One Series of quarterly turnover per ticker, named after the ticker ID.
    turnover_per_ticker = []
    for index, ticker_id in enumerate(tickers['ID']):
        # Progress indicator
        if index % 100 == 0 and index != 0:
            print(index)

        bars = get_data(ticker_id, columns=['volume', 'close'], start=start, end=end)
        # If there is no data between the start and end date, skip the ticker
        if bars.empty:
            continue

        # NOTE: pandas 2.2 deprecates the 'Q' alias in favour of 'QE'; 'Q' is kept for older pandas versions.
        quarterly = bars.resample('Q').agg({'close': 'last', 'volume': 'sum'})
        turnover = quarterly['volume'] * quarterly['close']
        turnover_per_ticker.append(turnover.rename(ticker_id))

    # Rows are quarter ends, columns are turnovers, column names are IDs.
    # A single outer-joining concat replaces merging ticker by ticker, which re-copied the
    # growing frame on every iteration.
    if turnover_per_ticker:
        quarterly_all = pd.concat(turnover_per_ticker, axis=1).sort_index()
    else:
        quarterly_all = pd.DataFrame()

    # Store results in dates_and_IDs
    dates_and_IDs = {} # {'2022-01-01': ['AAPL', 'MSFT', ...], '2022-04-01': ['NVDA', 'AMD', ...], ...}
    for datetime_, row in quarterly_all.iterrows():
        # Tickers without turnover in this quarter (NaN) must not take up a slot in the ranking
        top_n_stocks = row.dropna().nlargest(n).index.tolist()
        next_trading_date = first_trading_date_after_equal(datetime_.to_pydatetime().date() + timedelta(days=1))
        dates_and_IDs[next_trading_date.isoformat()] = top_n_stocks

    # Store to json (csv is not a convenient format to store variable length lists)
    with open(DATA_PATH + f'processed/cache/top_{n}_liquid.json', 'w') as f:
        json.dump(dates_and_IDs, f)

def get_top_n_liquid(day, n=500):
    """Retrieve the top N liquid stocks that apply on a given day. If no cache exists, calculate them.

    The cache maps ISO dates to lists of IDs; the entry with the latest date that is
    not after `day` is returned.

    Args:
        day (date): the date to query
        n (int, optional): the amount of stocks to query. Defaults to 500.

    Returns:
        tuple: (date_to_query, IDs), where date_to_query is the ISO-formatted cache key that
            was selected and IDs is the list of IDs stored under that key.

    Raises:
        ValueError: if the cache contains no entry dated on or before `day`.
    """
    cache_file = DATA_PATH + f'processed/cache/top_{n}_liquid.json'
    if not os.path.isfile(cache_file):
        store_top_n_liquid(n)

    with open(cache_file, 'r') as f:
        data = json.load(f)

    # ISO-formatted dates sort chronologically as plain strings, so no parsing is needed.
    day_iso = day.isoformat()
    date_to_query = max((key for key in data if key <= day_iso), default=None)
    if date_to_query is None:
        raise ValueError(f"No top {n} liquid stocks are stored on or before {day_iso}")
    return date_to_query, data[date_to_query]

In [13]:
# Look up which top-100 list applies on 2023-08-25 and preview its first five IDs.
day, data = get_top_n_liquid(day=date(2023, 8, 25), n=100)
print(day, data[:5], sep="\n")

2023-07-03
['TSLA-2019-01-01', 'NVDA-2019-01-01', 'AAPL-2019-01-01', 'MSFT-2019-01-01', 'AMD-2019-01-01']


**Top winners/losers**

In [167]:
def store_top_movers(sign, percentage, time_, start = date(2000, 1, 1), end = date(2100, 1, 1)):
    """Store the top gainers/losers (compared to previous close) if percentage is above threshold in processed/cache/gainers_p{percentage}{sign}_t{HHMM}.json
    This is done for every day and specified time.

    The change is 100 * (close of the minute bar at `time_` / previous daily close - 1).
    With sign '+', a ticker is listed on a day when the change is above `percentage`.
    With sign '-', a ticker is listed on a day when the change is below -abs(`percentage`),
    so percentage=20 means "down more than 20%".
    Only tickers of type "CS" or "ADRC" are considered.

    Args:
        sign (string): either '+' or '-' for higher or lower
        percentage (int): the percentage threshold
        time_ (time): the time to calculate the change each day
        start (date, optional): the start date. Defaults to date(2000, 1, 1).
        end (date, optional): the end date. Defaults to date(2100, 1, 1).

    Raises:
        ValueError: if sign is neither '+' nor '-'.
    """
    # Reject a bad sign before any data is loaded; otherwise the selection below would be undefined.
    if sign not in ('+', '-'):
        raise ValueError(f"sign must be '+' or '-', got {sign!r}")

    tickers = get_tickers()
    tickers = tickers[tickers['type'].isin(['CS', 'ADRC'])]

    dates = get_market_dates()
    dates = [day for day in dates if start <= day <= end] # Filter dates
    dates_and_IDs = {day.isoformat(): [] for day in dates} # Create dictionary with empty lists

    for index, row in tickers.iterrows():
        ticker_id = row['ID']
        daily = get_data(ticker_id, columns=['close'], start=start, end=end)
        minute = get_data(ticker_id, timeframe=1, columns=['close'], start=start, end=end)

        # If there is no data between the start and end date, skip the ticker
        if daily.empty or minute.empty:
            continue

        daily['prev_close'] = daily['close'].shift(1)
        minute.rename(columns={'close': 'close_at_time'}, inplace=True)

        # TO REMOVE: drops repeated timestamps, keeping the first occurrence
        minute = minute[~minute.index.duplicated()]
        daily = daily[~daily.index.duplicated()]

        # Build our data for this ticker: one row per day holding the bar at exactly time_
        data = minute.between_time(time_, time_)
        data.index = pd.DatetimeIndex(data.index).normalize() # Remove the 'time' part, which mean setting at 00:00 in order to merge with daily data

        data = pd.concat([data, daily[['prev_close']]], axis=1)
        data['change'] = 100*(data['close_at_time'] / data['prev_close'] - 1)

        # Select the dates that had a change higher/lower than the threshold.
        # Days without a bar at time_ or without a previous close give NaN and are never selected.
        if sign == '+':
            is_mover = data['change'] > percentage
        else:
            # Losers are measured against the negative threshold
            is_mover = data['change'] < -abs(percentage)

        mover_dates = pd.to_datetime(data[is_mover].index).date

        # Put it in the dictionary
        for day in mover_dates:
            dates_and_IDs[day.isoformat()].append(ticker_id)

        # For timing
        if index % 250 == 0:
            print(index)

    # Store to json
    with open(DATA_PATH + f'processed/cache/gainers_p{percentage}{sign}_t{time_.strftime("%H%M")}.json', 'w') as f:
        json.dump(dates_and_IDs, f)

In [168]:
# Build the '+' movers cache: threshold 20, measured at 16:29, for 2019-01-01 through 2023-09-01.
store_top_movers(sign = '+', percentage = 20, time_ = time(16, 29), start = date(2019, 1, 1), end = date(2023, 9, 1))

0


In [None]:
# Build the '-' movers cache with the same threshold, time and date range as the '+' run.
store_top_movers(sign = '-', percentage = 20, time_ = time(16, 29), start = date(2019, 1, 1), end = date(2023, 9, 1))

Should always keep original ID.

In [154]:
# Debugging cell: repeats the per-ticker steps of store_top_movers for one ID so the
# intermediate frames can be inspected.
start = date(2019, 1, 1)
end = date(2023, 9, 1)

id = 'PRDO-2019-01-01'
daily = get_data(id, columns=['close'], start=start, end=end)
# Previous row's close, used as the reference price for the daily change
daily['prev_close'] = daily['close'].shift(1)
minute = get_data(id, timeframe=1, columns=['close'], start=start, end=end)
minute.rename(columns={'close': 'close_at_time'}, inplace=True)

# Keep only the bars stamped exactly 17:59. No de-duplication is applied here.
data = minute.between_time(time(17, 59), time(17, 59))
data.index = pd.DatetimeIndex(data.index).normalize() # Remove the 'time' part, which mean setting at 00:00 in order to merge with daily data

In [158]:
# Mark every index entry that repeats an earlier one (True = the date already appeared above).
data.index.duplicated()

array([False, False, False, ...,  True,  True,  True])