In [5]:
import os
import time
from pathlib import Path

import numpy as np
import pandas as pd
from numba import njit

In [6]:
# Resolve data paths relative to the current working directory.
# NOTE: Path().resolve() is the cwd, not a discovered repository root, so the
# notebook must be launched from the directory that contains `data/`.
project_root = Path().resolve()
data_dir = project_root / 'data' / 'stock_data' / 'despac'

# Load stock data CSV
stock_data_path = data_dir / 'stock_data.csv'
df = pd.read_csv(stock_data_path)

# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

# Sort by ticker and date so each ticker forms one contiguous, chronological block
df = df.sort_values(['ticker', 'date']).reset_index(drop=True)

print(f"Loaded {len(df):,} rows")
print(f"Unique tickers: {df['ticker'].nunique()}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print("\nFirst few rows:")
print(df.head())
print("\nDataFrame info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()


Loaded 439,343 rows
Unique tickers: 587
Date range: 2019-02-19 00:00:00 to 2025-12-29 00:00:00

First few rows:
        date ticker   close
0 2024-10-25   AAGR  0.0002
1 2024-10-28   AAGR  0.0002
2 2024-10-29   AAGR  0.0086
3 2024-10-30   AAGR  0.0002
4 2024-10-31   AAGR  0.0004

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 439343 entries, 0 to 439342
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    439343 non-null  datetime64[ns]
 1   ticker  439343 non-null  object        
 2   close   439343 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 10.1+ MB
None


In [7]:
# Cell 2 — Numba forward window max (exclusive): future_max[i] = max(close[i+1 : i+N+1])

@njit
def future_window_max_exclusive_numba(close, N):
    """Forward-looking rolling maximum that excludes the current element.

    For every position ``i`` computes ``max(close[i+1 : i+N+1])`` in a single
    pass using a monotonic deque of indices.

    Parameters
    ----------
    close : 1-D float array
        Values of a single series, in chronological order.
    N : int
        Number of following elements to look at. ``N <= 0`` yields all NaN.

    Returns
    -------
    numpy.ndarray of float64, same length as ``close``
        ``out[i]`` is the largest non-NaN value among the next ``N`` elements,
        or NaN when that window is empty (the last element) or holds only NaN.

    Notes
    -----
    NaN inputs are skipped rather than pushed onto the deque: every comparison
    against NaN is False, so a stored NaN would break the "values decrease
    from head to tail" invariant and make the result depend on where the NaN
    happens to sit inside the window.
    """
    n = close.shape[0]
    out = np.full(n, np.nan)

    if N <= 0 or n == 0:
        return out

    # Deque emulated with a preallocated index array plus head/tail cursors.
    # Each index is pushed at most once, so n slots are always enough.
    q = np.empty(n, dtype=np.int64)
    head = 0
    tail = 0

    right = 1  # next index that has not been offered to the deque yet

    for i in range(n):
        # The window for i is (i, i+N], clipped to the end of the series.
        target_right = min(i + N, n - 1)

        while right <= target_right:
            v = close[right]
            if not np.isnan(v):
                # Drop smaller-or-equal values: they can never be the maximum
                # of any window that also contains `right`.
                while tail > head and v >= close[q[tail - 1]]:
                    tail -= 1
                q[tail] = right
                tail += 1
            right += 1

        # Discard indices that are no longer strictly ahead of i.
        while tail > head and q[head] <= i:
            head += 1

        if tail > head:
            out[i] = close[q[head]]

    return out


# Cell 3 — FAST FILTER FLAGS (row-level + ticker-level) using future_max
# "Up X% within N days" per row: future_max/close - 1 >= pct
# pct is decimal: 10% => 0.10

def add_up_within_n_days_flag(df: pd.DataFrame, N: int, pct: float) -> pd.DataFrame:
    """Flag rows whose close is followed by a gain of at least ``pct``.

    Parameters
    ----------
    df : DataFrame with ``ticker``, ``date`` and ``close`` columns.
    N : look-ahead length, counted in rows of the same ticker (i.e. the next
        N observations, not N calendar days).
    pct : required gain as a decimal fraction (10% -> 0.10).

    Returns
    -------
    A new DataFrame sorted by (ticker, date) with a fresh 0..n-1 index and
    three extra columns:

    * ``future_max``  - highest close among the next N rows of the ticker
      (NaN when there is no following row),
    * ``best_return`` - ``future_max / close - 1``; NaN when ``future_max`` is
      NaN or when ``close`` is not strictly positive,
    * ``hit_up``      - ``best_return >= pct`` (False wherever it is NaN).

    The input frame is not modified.
    """
    # sort_values/reset_index already return a new frame, so no extra copy is needed.
    work = df.sort_values(["ticker", "date"]).reset_index(drop=True)

    close = work["close"].to_numpy(np.float64)
    tick = work["ticker"].to_numpy()

    n = len(work)
    future_max = np.full(n, np.nan, dtype=np.float64)

    # After sorting, each ticker is one contiguous block; a block starts
    # wherever the ticker differs from the previous row.
    starts = np.r_[0, np.flatnonzero(tick[1:] != tick[:-1]) + 1]
    ends = np.r_[starts[1:], n]

    for s, e in zip(starts, ends):
        # A block shorter than 2 rows has no forward window; leave it NaN.
        if e - s >= 2:
            future_max[s:e] = future_window_max_exclusive_numba(close[s:e], N)

    # A return relative to a zero or negative close is meaningless: dividing
    # by 0 gives +inf (a guaranteed false "hit") and a negative base flips the
    # sign. Leave those rows as NaN so they can never be flagged.
    priced = close > 0.0
    best_return = np.full(n, np.nan, dtype=np.float64)
    best_return[priced] = future_max[priced] / close[priced] - 1.0
    hit_up = best_return >= pct

    work["future_max"] = future_max
    work["best_return"] = best_return
    work["hit_up"] = hit_up
    return work

def tickers_that_hit(work: pd.DataFrame) -> list[str]:
    """Return the tickers that have at least one row with ``hit_up`` set.

    ``work`` must already carry the ``ticker`` and ``hit_up`` columns. The
    result follows the group order produced by ``groupby("ticker")``.
    """
    any_hit_by_ticker = work.groupby("ticker")["hit_up"].any()
    # Keep only the groups whose aggregated flag is True.
    hitters = any_hit_by_ticker.loc[any_hit_by_ticker]
    return hitters.index.tolist()


# Cell 4 — OPTIONAL: "cycles" (non-repeating, chainable) for highlighting
# Cycle definition:
#   start i, end = first day within N where close[end] >= close[i]*(1+pct)
#   then set i = end (chainable: the hit day itself starts the next search)

@njit
def find_up_cycles_one_ticker_chainable(close, N, pct):
    """Find chained "up" cycles within a single ticker's close series.

    Starting from position 0, look at most ``N`` elements ahead for the first
    close that is at least ``close[start] * (1 + pct)``. When one is found the
    pair (start, hit) is recorded and the hit position becomes the next start;
    otherwise the start advances by one. Starts whose close is not strictly
    positive are skipped.

    Returns two int64 arrays of equal length: the start positions and the
    matching end positions, both relative to ``close``.
    """
    n = close.shape[0]
    # A cycle start is consumed at most once, so n slots always suffice.
    starts = np.empty(n, dtype=np.int64)
    ends = np.empty(n, dtype=np.int64)
    k = 0

    growth = 1.0 + pct
    last = n - 1

    pos = 0
    while pos < last:
        base = close[pos]
        nxt = pos + 1  # default: no hit, slide the start forward by one

        if base > 0.0:
            target = base * growth
            stop = min(pos + N, last)
            for j in range(pos + 1, stop + 1):
                if close[j] >= target:
                    starts[k] = pos
                    ends[k] = j
                    k += 1
                    nxt = j  # chain: the hit day is the next start
                    break

        pos = nxt

    return starts[:k], ends[:k]


# Cell 5 — wrapper: build a cycles DataFrame (ticker, start/end dates, returns)
# Great for "highlighting cycles" without repeated overlapping starts.

def compute_up_cycles_df(df: pd.DataFrame, N: int, pct: float) -> pd.DataFrame:
    """Build a table of chained "up" cycles for every ticker in ``df``.

    Parameters
    ----------
    df : DataFrame with ``ticker``, ``date`` and ``close`` columns.
    N : maximum look-ahead per cycle, counted in rows of the same ticker.
    pct : required gain as a decimal fraction (10% -> 0.10).

    Returns
    -------
    DataFrame with one row per cycle and the columns
    ``ticker, start_date, end_date, start_close, end_close, return,
    days_to_hit, start_row, end_row, N, pct``.

    ``start_row``/``end_row`` index into ``df`` after sorting by
    (ticker, date) with a fresh 0..n-1 index, and ``days_to_hit`` is their
    difference (a row count). The same columns are returned when no cycle is
    found, so callers can rely on one schema.
    """
    # sort_values/reset_index already return a new frame, so no extra copy is needed.
    work = df.sort_values(["ticker", "date"]).reset_index(drop=True)

    close = work["close"].to_numpy(np.float64)
    tick = work["ticker"].to_numpy()
    dates = work["date"].to_numpy()

    n = len(work)
    # Contiguous per-ticker blocks of the sorted frame.
    starts = np.r_[0, np.flatnonzero(tick[1:] != tick[:-1]) + 1]
    ends = np.r_[starts[1:], n]

    start_parts = []
    end_parts = []
    for s, e in zip(starts, ends):
        if e - s < 2:
            continue

        st_idx, en_idx = find_up_cycles_one_ticker_chainable(close[s:e], N, pct)
        if st_idx.shape[0] == 0:
            continue

        # Shift block-relative positions to positions in the sorted frame.
        start_parts.append(st_idx + s)
        end_parts.append(en_idx + s)

    if start_parts:
        start_row = np.concatenate(start_parts)
        end_row = np.concatenate(end_parts)
    else:
        start_row = np.empty(0, dtype=np.int64)
        end_row = np.empty(0, dtype=np.int64)

    # Defensive: never report a cycle that starts from a non-positive close.
    keep = ~(close[start_row] <= 0)
    start_row = start_row[keep]
    end_row = end_row[keep]

    start_close = close[start_row]
    end_close = close[end_row]

    # Assemble all cycles at once instead of appending one tuple per cycle.
    cycles = pd.DataFrame({
        "ticker": tick[start_row],
        "start_date": dates[start_row],
        "end_date": dates[end_row],
        "start_close": start_close,
        "end_close": end_close,
        "return": end_close / start_close - 1.0,
        "days_to_hit": end_row - start_row,
        "start_row": start_row,
        "end_row": end_row,
    })

    cycles["start_date"] = pd.to_datetime(cycles["start_date"])
    cycles["end_date"] = pd.to_datetime(cycles["end_date"])
    cycles = cycles.sort_values(["ticker", "start_date"], ignore_index=True)
    # Record the parameters on every result, including the empty one.
    cycles["N"] = N
    cycles["pct"] = pct
    return cycles


In [8]:
# Cell 6 — run it
# Flag every row of the full dataset whose close is followed, within the next
# N rows of the same ticker, by a close at least `pct` higher, then collect
# the tickers that have at least one flagged row.

N = 6
pct = 0.10  # 10%

work = add_up_within_n_days_flag(df, N=N, pct=pct)
tickers = tickers_that_hit(work)

# Cell result: number of qualifying tickers plus the first 25 of them
len(tickers), tickers[:25]


(583,
 ['AAGR',
  'ABL',
  'ABLV',
  'ABP',
  'ABVE',
  'ACEL',
  'ACHR',
  'ADGM',
  'ADN',
  'ADSE',
  'ADTH',
  'ADV',
  'AENT',
  'AEON',
  'AERT',
  'AEVA',
  'AFRI',
  'AGAE',
  'AGILQ',
  'AHCO',
  'AIEV',
  'AIIO',
  'AIRJ',
  'AISP',
  'AKLI'])

In [9]:
# Test 1: Verify logic with simple synthetic data
print("=" * 60)
print("TEST 1: Simple synthetic data")
print("=" * 60)

# Create a simple test case: stock goes from $10 to $11 in 3 days (10% gain)
test_data = pd.DataFrame({
    'ticker': ['TEST'] * 5,
    'date': pd.date_range('2024-01-01', periods=5),
    'close': [10.0, 10.5, 10.8, 11.0, 10.9]  # 10% gain from day 0 to day 3
})

print("\nTest data:")
print(test_data)

N = 6
pct = 0.10  # 10%

test_work = add_up_within_n_days_flag(test_data, N=N, pct=pct)
print("\nResult with flags:")
print(test_work[['date', 'close', 'future_max', 'best_return', 'hit_up']])

print("\n✓ Day 0: close=$10, future_max should be $11 (from day 3)")
print(f"  future_max[0] = {test_work.iloc[0]['future_max']:.2f}")
print(f"  best_return[0] = {test_work.iloc[0]['best_return']:.2%}")
print(f"  hit_up[0] = {test_work.iloc[0]['hit_up']}")

# Verify: Day 0 should hit because it goes from $10 to $11 (10% gain) within 6 days
assert test_work['hit_up'].iloc[0], "Day 0 should hit 10% gain!"
assert abs(test_work['best_return'].iloc[0] - 0.10) < 0.001, "Best return should be ~10%"

# Pin the whole column, not just day 0: no later day can still reach +10%,
# and the final day has no forward window at all.
assert test_work['hit_up'].tolist() == [True, False, False, False, False], "Only day 0 should hit"
assert np.isnan(test_work['future_max'].iloc[-1]), "Last day has no future window"
print("\n✅ TEST 1 PASSED: Correctly identifies 10% gain within window")


TEST 1: Simple synthetic data

Test data:
  ticker       date  close
0   TEST 2024-01-01   10.0
1   TEST 2024-01-02   10.5
2   TEST 2024-01-03   10.8
3   TEST 2024-01-04   11.0
4   TEST 2024-01-05   10.9

Result with flags:
        date  close  future_max  best_return  hit_up
0 2024-01-01   10.0        11.0     0.100000    True
1 2024-01-02   10.5        11.0     0.047619   False
2 2024-01-03   10.8        11.0     0.018519   False
3 2024-01-04   11.0        10.9    -0.009091   False
4 2024-01-05   10.9         NaN          NaN   False

✓ Day 0: close=$10, future_max should be $11 (from day 3)
  future_max[0] = 11.00
  best_return[0] = 10.00%
  hit_up[0] = True

✅ TEST 1 PASSED: Correctly identifies 10% gain within window


In [10]:
# Test 2: Edge case - stock that doesn't hit the target
print("\n" + "=" * 60)
print("TEST 2: Stock that doesn't hit target")
print("=" * 60)

test_data2 = pd.DataFrame({
    'ticker': ['NO_HIT'] * 5,
    'date': pd.date_range('2024-01-01', periods=5),
    'close': [10.0, 10.2, 10.3, 10.4, 10.5]  # Only 5% gain, not 10%
})

test_work2 = add_up_within_n_days_flag(test_data2, N=6, pct=0.10)
print("\nTest data:")
print(test_data2[['date', 'close']])
print("\nResult:")
print(test_work2[['date', 'close', 'future_max', 'best_return', 'hit_up']])

# Day 0 should NOT hit because max is only $10.50 (5% gain)
assert not test_work2['hit_up'].iloc[0], "Day 0 should NOT hit 10% gain!"
# Every later day starts from a higher close, so none of them can hit either.
assert not test_work2['hit_up'].any(), "No day should hit 10% gain!"
print("\n✅ TEST 2 PASSED: Correctly identifies when target is NOT met")



TEST 2: Stock that doesn't hit target

Test data:
        date  close
0 2024-01-01   10.0
1 2024-01-02   10.2
2 2024-01-03   10.3
3 2024-01-04   10.4
4 2024-01-05   10.5

Result:
        date  close  future_max  best_return  hit_up
0 2024-01-01   10.0        10.5     0.050000   False
1 2024-01-02   10.2        10.5     0.029412   False
2 2024-01-03   10.3        10.5     0.019417   False
3 2024-01-04   10.4        10.5     0.009615   False
4 2024-01-05   10.5         NaN          NaN   False

✅ TEST 2 PASSED: Correctly identifies when target is NOT met


In [11]:
# Test 3: Multiple tickers - verify grouping works
print("\n" + "=" * 60)
print("TEST 3: Multiple tickers")
print("=" * 60)

test_data3 = pd.DataFrame({
    'ticker': ['TICK1'] * 3 + ['TICK2'] * 3,
    'date': pd.date_range('2024-01-01', periods=6),
    'close': [10.0, 11.0, 10.5,  # TICK1: 10% gain on day 1
              20.0, 21.0, 20.5]   # TICK2: 5% gain (not enough)
})

test_work3 = add_up_within_n_days_flag(test_data3, N=6, pct=0.10)
print("\nTest data:")
print(test_data3)
print("\nResult:")
print(test_work3[['ticker', 'date', 'close', 'hit_up']])

tickers_hit = tickers_that_hit(test_work3)
print(f"\nTickers that hit: {tickers_hit}")
assert 'TICK1' in tickers_hit, "TICK1 should hit!"
assert 'TICK2' not in tickers_hit, "TICK2 should NOT hit!"

# The ticker-level checks alone cannot catch a window leaking across the
# ticker boundary: TICK1 hits on row 0 either way. If TICK1's last two rows
# could see TICK2's $20+ closes they would be flagged too, so pin every row.
assert test_work3['hit_up'].tolist() == [True, False, False, False, False, False], \
    "Forward window must not cross from TICK1 into TICK2"
print("\n✅ TEST 3 PASSED: Correctly handles multiple tickers")



TEST 3: Multiple tickers

Test data:
  ticker       date  close
0  TICK1 2024-01-01   10.0
1  TICK1 2024-01-02   11.0
2  TICK1 2024-01-03   10.5
3  TICK2 2024-01-04   20.0
4  TICK2 2024-01-05   21.0
5  TICK2 2024-01-06   20.5

Result:
  ticker       date  close  hit_up
0  TICK1 2024-01-01   10.0    True
1  TICK1 2024-01-02   11.0   False
2  TICK1 2024-01-03   10.5   False
3  TICK2 2024-01-04   20.0   False
4  TICK2 2024-01-05   21.0   False
5  TICK2 2024-01-06   20.5   False

Tickers that hit: ['TICK1']

✅ TEST 3 PASSED: Correctly handles multiple tickers


In [12]:
# Test 4: Verify the future_max logic - should find max in forward window
print("\n" + "=" * 60)
print("TEST 4: Verify future_max calculation")
print("=" * 60)

# Create data where max appears at different positions
test_data4 = pd.DataFrame({
    'ticker': ['MAX_TEST'] * 7,
    'date': pd.date_range('2024-01-01', periods=7),
    'close': [10.0, 12.0, 11.0, 13.0, 11.5, 12.5, 11.0]  # Max is 13.0 at index 3
})

test_work4 = add_up_within_n_days_flag(test_data4, N=4, pct=0.20)  # 20% threshold
print("\nTest data:")
print(test_data4)
print("\nResult with future_max:")
print(test_work4[['date', 'close', 'future_max', 'best_return', 'hit_up']])

# Day 0: future_max should be 13.0 (max of days 1-4)
print(f"\nDay 0: close=${test_work4.iloc[0]['close']:.2f}, future_max should be $13.00")
print(f"  future_max[0] = ${test_work4.iloc[0]['future_max']:.2f}")
assert abs(test_work4['future_max'].iloc[0] - 13.0) < 0.01, "future_max[0] should be 13.0"

# Day 0: best_return = (13.0 / 10.0) - 1 = 0.30 = 30%
print(f"  best_return[0] = {test_work4.iloc[0]['best_return']:.2%}")
assert abs(test_work4['best_return'].iloc[0] - 0.30) < 0.01, "best_return should be 30%"

# Day 0 should hit because 30% > 20%
assert test_work4['hit_up'].iloc[0], "Day 0 should hit 20% threshold!"

# Check every window, including the ones where the maximum slides out
# (day 3 onwards) and the empty window on the last day. The values are
# copied from the input, so exact comparison is safe.
assert test_work4['future_max'].iloc[:-1].tolist() == [13.0, 13.0, 13.0, 12.5, 12.5, 11.0], \
    "future_max should track the max of the next 4 closes"
assert np.isnan(test_work4['future_max'].iloc[-1]), "Last day has no future window"
print("\n✅ TEST 4 PASSED: future_max correctly finds max in forward window")



TEST 4: Verify future_max calculation

Test data:
     ticker       date  close
0  MAX_TEST 2024-01-01   10.0
1  MAX_TEST 2024-01-02   12.0
2  MAX_TEST 2024-01-03   11.0
3  MAX_TEST 2024-01-04   13.0
4  MAX_TEST 2024-01-05   11.5
5  MAX_TEST 2024-01-06   12.5
6  MAX_TEST 2024-01-07   11.0

Result with future_max:
        date  close  future_max  best_return  hit_up
0 2024-01-01   10.0        13.0     0.300000    True
1 2024-01-02   12.0        13.0     0.083333   False
2 2024-01-03   11.0        13.0     0.181818   False
3 2024-01-04   13.0        12.5    -0.038462   False
4 2024-01-05   11.5        12.5     0.086957   False
5 2024-01-06   12.5        11.0    -0.120000   False
6 2024-01-07   11.0         NaN          NaN   False

Day 0: close=$10.00, future_max should be $13.00
  future_max[0] = $13.00
  best_return[0] = 30.00%

✅ TEST 4 PASSED: future_max correctly finds max in forward window


In [13]:
# Test 5: Spot check on real data, cross-checked against a brute-force window max
print("\n" + "=" * 60)
print("TEST 5: Spot check on real data")
print("=" * 60)

N = 6
pct = 0.10  # 10%

# Get a sample ticker
sample_ticker = df['ticker'].iloc[0]
sample_data = df[df['ticker'] == sample_ticker].head(20).copy()

print(f"\nSample ticker: {sample_ticker}")
print("First 10 days:")
print(sample_data[['date', 'close']].head(10))

sample_work = add_up_within_n_days_flag(sample_data, N=N, pct=pct)
print("\nFirst 10 days with flags:")
print(sample_work[['date', 'close', 'future_max', 'best_return', 'hit_up']].head(10))

# Recompute the forward maximum the slow, obvious way and require an exact
# match (values are copied from the input, so no tolerance is needed; NaNs in
# the same positions compare equal in assert_array_equal).
sample_close = sample_work['close'].to_numpy()
naive_future_max = np.array([
    sample_close[i + 1:i + 1 + N].max() if i + 1 < len(sample_close) else np.nan
    for i in range(len(sample_close))
])
np.testing.assert_array_equal(sample_work['future_max'].to_numpy(), naive_future_max)

# Check if this ticker hits
sample_hit = sample_work['hit_up'].any()
print(f"\nDoes {sample_ticker} hit 10% within 6 days? {sample_hit}")

if sample_hit:
    # idxmax on a boolean column returns the label of the first True
    first_hit_idx = sample_work['hit_up'].idxmax()
    first_hit = sample_work.loc[first_hit_idx]
    print(f"\nFirst hit at index {first_hit_idx}:")
    print(f"  Date: {first_hit['date']}")
    print(f"  Close: ${first_hit['close']:.2f}")
    print(f"  Future max: ${first_hit['future_max']:.2f}")
    print(f"  Best return: {first_hit['best_return']:.2%}")

print("\n✅ TEST 5 PASSED: Real data processing works")



TEST 5: Spot check on real data

Sample ticker: AAGR
First 10 days:
        date   close
0 2024-10-25  0.0002
1 2024-10-28  0.0002
2 2024-10-29  0.0086
3 2024-10-30  0.0002
4 2024-10-31  0.0004
5 2024-11-01  0.0004
6 2024-11-04  0.0002
7 2024-11-05  0.0002
8 2024-11-06  0.0002
9 2024-11-07  0.0002

First 10 days with flags:
        date   close  future_max  best_return  hit_up
0 2024-10-25  0.0002      0.0086    42.000000    True
1 2024-10-28  0.0002      0.0086    42.000000    True
2 2024-10-29  0.0086      0.0004    -0.953488   False
3 2024-10-30  0.0002      0.0004     1.000000    True
4 2024-10-31  0.0004      0.0004     0.000000   False
5 2024-11-01  0.0004      0.0004     0.000000   False
6 2024-11-04  0.0002      0.0004     1.000000    True
7 2024-11-05  0.0002      0.0004     1.000000    True
8 2024-11-06  0.0002      0.0004     1.000000    True
9 2024-11-07  0.0002      0.0004     1.000000    True

Does AAGR hit 10% within 6 days? True

First hit at index 0:
  Date: 2024-10-2

In [14]:
# Test 6: Verify cycles function works correctly
print("\n" + "=" * 60)
print("TEST 6: Test cycles function for highlighting")
print("=" * 60)

# Create data with multiple hits
test_data6 = pd.DataFrame({
    'ticker': ['CYCLE_TEST'] * 10,
    'date': pd.date_range('2024-01-01', periods=10),
    'close': [10.0, 11.0, 10.5, 12.0, 11.5, 13.0, 12.5, 14.0, 13.5, 13.0]
    # Chained cycles (each hit day is the next start):
    #   10.0 -> 11.0 (+10%, day 0 -> 1)
    #   11.0 -> 13.0 (+18%, day 1 -> 5; 12.0 on day 3 is below the 12.1 target)
    #   12.5 -> 14.0 (+12%, day 6 -> 7; 13.0 on day 5 finds no 14.3 within 6 rows)
})

cycles = compute_up_cycles_df(test_data6, N=6, pct=0.10)
print("\nTest data:")
print(test_data6[['date', 'close']])
print("\nCycles found:")
print(cycles[['start_date', 'end_date', 'start_close', 'end_close', 'return', 'days_to_hit']])

# Verify cycles: pin the exact chain rather than only "at least one"
assert len(cycles) == 3, "Should find exactly three chained cycles"
assert cycles['start_close'].tolist() == [10.0, 11.0, 12.5]
assert cycles['end_close'].tolist() == [11.0, 13.0, 14.0]
assert cycles['days_to_hit'].tolist() == [1, 4, 1]
print(f"\nFound {len(cycles)} cycles")
for idx, row in cycles.iterrows():
    actual_return = (row['end_close'] / row['start_close']) - 1.0
    print(f"  Cycle {idx+1}: {row['start_date'].date()} -> {row['end_date'].date()}, "
          f"${row['start_close']:.2f} -> ${row['end_close']:.2f}, "
          f"{actual_return:.2%} return, {row['days_to_hit']} days")

print("\n✅ TEST 6 PASSED: Cycles function works correctly")



TEST 6: Test cycles function for highlighting

Test data:
        date  close
0 2024-01-01   10.0
1 2024-01-02   11.0
2 2024-01-03   10.5
3 2024-01-04   12.0
4 2024-01-05   11.5
5 2024-01-06   13.0
6 2024-01-07   12.5
7 2024-01-08   14.0
8 2024-01-09   13.5
9 2024-01-10   13.0

Cycles found:
  start_date   end_date  start_close  end_close    return  days_to_hit
0 2024-01-01 2024-01-02         10.0       11.0  0.100000            1
1 2024-01-02 2024-01-06         11.0       13.0  0.181818            4
2 2024-01-07 2024-01-08         12.5       14.0  0.120000            1

Found 3 cycles
  Cycle 1: 2024-01-01 -> 2024-01-02, $10.00 -> $11.00, 10.00% return, 1 days
  Cycle 2: 2024-01-02 -> 2024-01-06, $11.00 -> $13.00, 18.18% return, 4 days
  Cycle 3: 2024-01-07 -> 2024-01-08, $12.50 -> $14.00, 12.00% return, 1 days

✅ TEST 6 PASSED: Cycles function works correctly


In [15]:
# Test 7: Performance check - time one full pass over the real dataset
print("\n" + "=" * 60)
print("TEST 7: Performance check")
print("=" * 60)

N = 6
pct = 0.10

# Time the operation. perf_counter() is a monotonic, high-resolution clock,
# so the measurement is not affected by wall-clock adjustments and does not
# collapse to 0 on systems where time.time() ticks coarsely (which would make
# the rows-per-second division below fail).
start = time.perf_counter()
work = add_up_within_n_days_flag(df, N=N, pct=pct)
tickers = tickers_that_hit(work)
elapsed = time.perf_counter() - start

print(f"Processed {len(df):,} rows across {df['ticker'].nunique()} tickers")
print(f"Time: {elapsed:.3f} seconds")
print(f"Rows per second: {len(df) / elapsed:,.0f}")
print(f"Found {len(tickers)} tickers that hit {pct:.0%} within {N} days")

# This should be very fast (< 1 second for 400k rows); the 5 second budget
# leaves headroom for slower machines.
assert elapsed < 5.0, f"Too slow! Took {elapsed:.2f} seconds"
print("\n✅ TEST 7 PASSED: Performance is acceptable")



TEST 7: Performance check
Processed 439,343 rows across 587 tickers
Time: 0.061 seconds
Rows per second: 7,155,292
Found 583 tickers that hit 10% within 6 days

✅ TEST 7 PASSED: Performance is acceptable


In [20]:
# Build the options-data path from project_root instead of hard-coding one
# user's home directory, mirroring how the stock CSV is located.
# NOTE(review): assumes the notebook's working directory is the repository
# root that contains data/option_data/despac — confirm before relying on it.
options_data_path = project_root / 'data' / 'option_data' / 'despac' / 'options_data.parquet'
x = pd.read_parquet(options_data_path)
x.columns

Index(['date', 'ticker', 'expiration_date', 'no_options_data', 'contractID',
       'symbol', 'expiration', 'strike', 'type', 'last', 'mark', 'bid',
       'bid_size', 'ask', 'ask_size', 'volume', 'open_interest',
       'implied_volatility', 'delta', 'gamma', 'theta', 'vega', 'rho',
       'date_str'],
      dtype='object')