# 11. Simulating screeners
In backtesting, we need to be able to reproduce what a screener would have returned at each point in time; otherwise we would have to simulate all stocks simultaneously. Because computing screener results is computationally expensive, the results are saved in the <code>output/screens</code> folder. There are of course some constraints. It must have been technically possible to obtain the screener results at that moment, either online or via an API (e.g. IBKR TWS), which limits the intraday scanners that can be simulated. For long-term screeners this does not matter, because everything can be downloaded after the end of the trading day.

Warning: The screener results must not suffer from look-ahead bias. For example, a 14-day volume is only known after the close of day 14, so it can only be acted upon from the next trading day onward. I therefore always shift the trading days by 1, so that every statistic is actually available on the date it is stored under.

The screeners produce watchlists. There are two types:
* Daily watchlists. These have a value for every day and the key is a date.
* Intraday watchlists. The keys are datetimes; however, there may not be a ticker list for every minute (to save computation time).

In [80]:
from utils import remove_extended_hours, get_market_dates, get_market_calendar, get_tickers, get_data, \
    first_trading_date_after_equal, first_trading_date_after, last_trading_date_before, last_trading_date_before_equal
from datetime import datetime, date, timedelta, time
from dateutil.relativedelta import relativedelta
import mplfinance as mpf
import pandas as pd
import numpy as np
import os
import json
import pyarrow as pa
import pyarrow.parquet as pq
import ast
# Relative base path that is prefixed to the 'processed/cache/...' file names written by the mover functions below
DATA_PATH = "../data/polygon/"

# Calendar bounds passed to pd.date_range when looping over the half-year periods
START_DATE = date(2019, 6, 1)
END_DATE = date(2024, 3, 1)

**Top liquid stocks**

In backtesting, instead of screening all stocks, most of my systems only need a liquid universe such as the S&P 500 stocks. However, I don't care about the exact holdings of the S&P 500: the only reason to choose the index would be volume and liquidity, and for that purpose turnover is far more informative than index membership.

So to create the T100, T500, T1500 and T3000 indices, I simply select the stocks with the highest turnover in each 6-month period.

In [None]:
# Size of the universe to keep per half year; 500 produces the T500 list
# (change to 100 / 1500 / 3000 for the other lists).
TOP_N = 500

tickers = get_tickers()
tickers = tickers[tickers['type'] == 'CS'] # No ETFs, ADRs and indices
dates_and_IDs = {} # {'2022-01-01': ['AAPL', 'MSFT', ...], '2022-07-01': ['NVDA', 'AMD', ...], ...}

# Loop through half years (month-end dates, 6 months apart).
# NOTE: pandas >= 2.2 deprecates the 'M' alias in favour of 'ME'; switch to '6ME' when upgrading.
for end_datetime in pd.date_range(start=START_DATE, end=END_DATE, freq='6M', inclusive='right'):
    # Get trading dates
    end_date = last_trading_date_before_equal(end_datetime.date())
    end_date_to_query = last_trading_date_before(end_date) # Subtract 1 day to account for look-ahead bias
    start_date = first_trading_date_after_equal(end_datetime.date() - relativedelta(months=6))

    all_bars = []
    # Find IDs with data in the specific half year that are not delisted before the end of the half year
    for i, row in tickers[tickers['end_date'] >= end_date_to_query].iterrows():
        bars = get_data(row['ID'], start_date, end_date_to_query, columns=['close', 'volume'])
        if bars.empty:
            continue # Nothing traded in this window
        # Turnover is the traded value per bar; it must be computed per bar and then summed.
        # sum(volume) * mean(close) would misweight days on which price and volume move together.
        bars['turnover'] = bars['close'] * bars['volume']
        bars['ID'] = row['ID']
        all_bars.append(bars[['ID', 'turnover']])

    # Rank by total turnover over the window and keep the TOP_N most traded IDs
    if all_bars:
        all_tickers = pd.concat(all_bars).groupby("ID")[['turnover']].sum()
        all_tickers = all_tickers.sort_values(by="turnover", ascending=False)
        top_ids = list(all_tickers.head(TOP_N).index)
    else:
        top_ids = [] # pd.concat raises on an empty list, so handle "no data at all" explicitly

    # The list is keyed by the last trading day of the half year, but only uses data up to the day before
    dates_and_IDs[end_date.isoformat()] = top_ids
    print(end_datetime)

# Store to json
with open(f'../data/output/dailyscreens/T{TOP_N}.json', 'w') as f:
    json.dump(dates_and_IDs, f)

In [123]:
# Reload the stored watchlist from disk
with open('../data/output/dailyscreens/T500.json', 'r') as f: 
    T500 = json.load(f)

# Show the first five IDs stored under the 2019-06-28 key (keys are ISO-formatted date strings)
T500[date(2019, 6, 28).isoformat()][:5]

['AMZN-2019-06-03',
 'AAPL-2019-06-03',
 'META-2022-06-09',
 'MSFT-2019-06-03',
 'AMD-2019-06-03']

**Top winners/losers**

In [7]:
def store_top_movers(sign: str, percentage: float, time_: time, start: date = date(2000, 1, 1), end: date = date(2100, 1, 1)) -> None:
    """Store, per market day, the tickers that moved beyond a threshold compared to the previous close.

    For every ticker of type 'CS' or 'ADRC', the 1-minute close at ``time_`` is compared with the
    close of the previous daily bar. The resulting dictionary ``{ISO date: [ID, ...]}`` has an entry
    (possibly an empty list) for every market date between ``start`` and ``end`` and is written to
    ``DATA_PATH + processed/cache/movers_p{percentage}{sign}_t{HHMM}.json``.

    Args:
        sign (string): either '+' or '-' for winners or losers
        percentage (float/int): the percentage threshold. For '+' the change must be above ``percentage``;
            for '-' it must be below ``-abs(percentage)``, so 20 and -20 both mean "down more than 20%".
        time_ (time): the time to calculate the change each day
        start (date, optional): the start date. Defaults to date(2000, 1, 1).
        end (date, optional): the end date. Defaults to date(2100, 1, 1).

    Raises:
        ValueError: if ``sign`` is not '+' or '-'.
    """
    # Validate before doing any expensive work; an unknown sign would otherwise only fail
    # (or silently reuse stale results) deep inside the ticker loop.
    if sign not in ('+', '-'):
        raise ValueError(f"sign must be '+' or '-', got {sign!r}")
    # Losers are the days below the *negative* threshold
    threshold = percentage if sign == '+' else -abs(percentage)

    tickers = get_tickers()
    tickers = tickers[tickers['type'].isin(['CS', 'ADRC'])]

    dates = get_market_dates()
    dates = [day for day in dates if (day >= start and day <= end)] # Filter dates
    dates_and_IDs = {day.isoformat(): list() for day in dates} # Create dictionary with empty lists

    for index, row in tickers.iterrows():
        ticker_id = row['ID']
        daily = get_data(ticker_id, columns=['close'], start=start, end=end)
        minute = get_data(ticker_id, timeframe=1, columns=['close'], start=start, end=end)

        # If there is no data between the start and end date, skip the ticker
        if daily.empty or minute.empty:
            continue

        # Drop duplicated timestamps *before* shifting, so that prev_close really is the previous bar
        # and the column-wise concat below works on unique indices (marked "TO REMOVE" by the author)
        minute = minute[~minute.index.duplicated()]
        daily = daily[~daily.index.duplicated()]

        prev_close = daily['close'].shift(1).rename('prev_close')

        # Build our data for this ticker: the minute bar at time_ of every day
        at_time = minute.between_time(time_, time_).rename(columns={'close': 'close_at_time'})
        at_time.index = pd.DatetimeIndex(at_time.index).normalize() # Remove the 'time' part, which mean setting at 00:00 in order to merge with daily data

        data = pd.concat([at_time, prev_close], axis=1)
        change = 100*(data['close_at_time'] / data['prev_close'] - 1)

        # Select the dates that had a change higher/lower than the threshold
        if sign == '+':
            mover_index = change[change > threshold].index
        else:
            mover_index = change[change < threshold].index

        # Put it in the dictionary
        for day in pd.to_datetime(mover_index).date:
            key = day.isoformat()
            # Only market dates within [start, end] have an entry; ignore anything else
            # instead of aborting the whole run with a KeyError
            if key in dates_and_IDs:
                dates_and_IDs[key].append(ticker_id)

        # For timing
        if index % 250 == 0:
            print(index)

    # Store to json
    with open(DATA_PATH + f'processed/cache/movers_p{percentage}{sign}_t{time_.strftime("%H%M")}.json', 'w') as f:
        json.dump(dates_and_IDs, f)

In [None]:
# Cache the '+' movers at 13:00 with a 20% threshold for 2019-01-01 .. 2023-09-01
store_top_movers(
    sign='+',
    percentage=20,
    time_=time(13, 0),
    start=date(2019, 1, 1),
    end=date(2023, 9, 1),
)

In [10]:
def store_top_intraday_movers(sign: str, percentage: float, time_: time, start: date = date(2000, 1, 1), end: date = date(2100, 1, 1)) -> None:
    """Store, per market day, the tickers that moved beyond a threshold compared to the daily open.

    For every ticker of type 'CS' or 'ADRC', the 1-minute close at ``time_`` is compared with the
    open of the daily bar of the same date. The resulting dictionary ``{ISO date: [ID, ...]}`` has an
    entry (possibly an empty list) for every market date between ``start`` and ``end`` and is written to
    ``DATA_PATH + processed/cache/movers_intraday_p{percentage}{sign}_t{HHMM}.json``.

    Args:
        sign (string): either '+' or '-' for winners or losers
        percentage (float/int): the percentage threshold. For '+' the change must be above ``percentage``;
            for '-' it must be below ``-abs(percentage)``, so 20 and -20 both mean "down more than 20%".
        time_ (time): the time to calculate the change each day
        start (date, optional): the start date. Defaults to date(2000, 1, 1).
        end (date, optional): the end date. Defaults to date(2100, 1, 1).

    Raises:
        ValueError: if ``sign`` is not '+' or '-'.
    """
    # Validate before doing any expensive work; an unknown sign would otherwise only fail
    # (or silently reuse stale results) deep inside the ticker loop.
    if sign not in ('+', '-'):
        raise ValueError(f"sign must be '+' or '-', got {sign!r}")
    # Losers are the days below the *negative* threshold
    threshold = percentage if sign == '+' else -abs(percentage)

    tickers = get_tickers()
    tickers = tickers[tickers['type'].isin(['CS', 'ADRC'])]

    dates = get_market_dates()
    dates = [day for day in dates if (day >= start and day <= end)] # Filter dates
    dates_and_IDs = {day.isoformat(): list() for day in dates} # Create dictionary with empty lists

    for index, row in tickers.iterrows():
        ticker_id = row['ID']
        daily = get_data(ticker_id, columns=['open'], start=start, end=end)
        minute = get_data(ticker_id, timeframe=1, columns=['close'], start=start, end=end)

        # If there is no data between the start and end date, skip the ticker
        if daily.empty or minute.empty:
            continue

        # NOTE(review): unlike store_top_movers, duplicated timestamps are not dropped here;
        # this presumably relies on the stored data being free of duplicates - confirm.

        # Build our data for this ticker: the minute bar at time_ of every day
        at_time = minute.between_time(time_, time_).rename(columns={'close': 'close_at_time'})
        at_time.index = pd.DatetimeIndex(at_time.index).normalize() # Remove the 'time' part, which mean setting at 00:00 in order to merge with daily data

        data = pd.concat([at_time, daily[['open']]], axis=1)
        change = 100*(data['close_at_time'] / data['open'] - 1)

        # Select the dates that had a change higher/lower than the threshold
        if sign == '+':
            mover_index = change[change > threshold].index
        else:
            mover_index = change[change < threshold].index

        # Put it in the dictionary
        for day in pd.to_datetime(mover_index).date:
            key = day.isoformat()
            # Only market dates within [start, end] have an entry; ignore anything else
            # instead of aborting the whole run with a KeyError
            if key in dates_and_IDs:
                dates_and_IDs[key].append(ticker_id)

        # For timing
        if index % 250 == 0:
            print(index)

    # Store to json
    with open(DATA_PATH + f'processed/cache/movers_intraday_p{percentage}{sign}_t{time_.strftime("%H%M")}.json', 'w') as f:
        json.dump(dates_and_IDs, f)

In [None]:
# Cache the '+' intraday movers at 15:55 with a 20% threshold for 2019-01-01 .. 2023-09-01
store_top_intraday_movers(
    sign='+',
    percentage=20,
    time_=time(15, 55),
    start=date(2019, 1, 1),
    end=date(2023, 9, 1),
)