In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Build the working price frame in one pipeline:
#   1. read only the timestamp and mid-price columns,
#   2. add simple returns of the mid price and drop rows without a return,
#   3. renumber rows, parse the timestamp into `date`,
#   4. keep rows dated 2023-01-01 or later (index labels are NOT renumbered
#      after this filter).
df = (
    pd.read_parquet('data/prices.parquet', engine='fastparquet')[['datetime', 'mid']]
    .assign(returns=lambda frame: frame['mid'].pct_change())
    .dropna(subset='returns')
    .reset_index(drop=True)
    .assign(date=lambda frame: pd.to_datetime(frame['datetime']))
    .loc[lambda frame: frame["date"] >= "2023-01-01 00:00:00"]
)

# Independent snapshot of the prepared frame.
df_copy = df.copy()

import sys
# Add the parent directory to the module search path before importing from `cpd`
# (presumably the package lives one level above this notebook — confirm).
sys.path.append("..")
from cpd.detect import detect_single_cp
from cpd.utils import infer_dt

# %pip install numba

Unnamed: 0,datetime,mid,returns,date
5699588,2023-01-12 14:56:38,133.225,-0.000113,2023-01-12 14:56:38
5699589,2023-01-12 14:56:39,133.235,0.000075,2023-01-12 14:56:39
5699590,2023-01-12 14:56:40,133.240,0.000038,2023-01-12 14:56:40
5699591,2023-01-12 14:56:41,133.230,-0.000075,2023-01-12 14:56:41
5699592,2023-01-12 14:56:42,133.235,0.000038,2023-01-12 14:56:42
...,...,...,...,...
5722983,2023-01-13 14:56:33,134.155,0.000000,2023-01-13 14:56:33
5722984,2023-01-13 14:56:34,134.155,0.000000,2023-01-13 14:56:34
5722985,2023-01-13 14:56:35,134.155,0.000000,2023-01-13 14:56:35
5722986,2023-01-13 14:56:36,134.155,0.000000,2023-01-13 14:56:36


In [None]:
# --- Parallel change-point sweep over a fixed window of the return series ---
# Every end index in `ends` is handed to `process_end` in a worker process. Each
# result row is streamed to a CSV as it arrives and also collected into `out`.

import multiprocessing as mp
try:
    mp.set_start_method("spawn")
except RuntimeError:
    # Deliberate best-effort: the start method can only be fixed once per
    # interpreter, so this raises when the cell is re-run. The pool below is
    # given an explicit spawn context, so it does not depend on this call.
    pass

import numpy as np, pandas as pd, os, time, concurrent.futures
import csv
from tqdm import tqdm
from cpd_pool_helpers import init_pool, process_end

# Positional row window of the prepared price frame to analyse.
SLICE_START, SLICE_STOP = 183398, 206798
# Distance between consecutive window end indices.
END_STRIDE = 30
# Destination of the streamed results.
RESULTS_CSV = "lobster_cpd_7200lbw_2.csv"

# --- Prepare the data once ---
# Slice from the `df_copy` snapshot instead of from `df` itself: slicing `df` in
# place made this cell non-idempotent (a second run re-sliced the already-sliced
# frame and came back empty).
df = (
    df_copy.iloc[SLICE_START:SLICE_STOP]
    .sort_values("date")
    .reset_index(drop=True)
)
dates = pd.to_datetime(df["date"]).to_numpy()
returns = df["returns"].to_numpy(float)
lookback_window_length = 7200
DT = 1.0   # per the original note: matches the 'X' trick → infer_dt(df) == 1.0

csv_fields = ["date", "t", "cp_location", "cp_location_norm", "cp_score", "gap", "runtime"]

ends = range(lookback_window_length + 1, len(returns), END_STRIDE)
n_workers = os.cpu_count() or 1
# Aim for roughly four chunks per worker, never fewer than one item per chunk.
chunksize = max(1, len(ends) // (n_workers * 4))

rows = []
# The file is opened once for the whole run. `newline=""` is what the csv module
# requires of file objects it writes to (otherwise extra blank lines appear on
# platforms that translate "\n").
with open(RESULTS_CSV, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(csv_fields)
    f.flush()
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=n_workers,
        mp_context=mp.get_context("spawn"),  # explicit: do not rely on the global default
        initializer=init_pool,
        initargs=(returns, dates, lookback_window_length, DT),
    ) as ex:
        for row in tqdm(ex.map(process_end, ends, chunksize=chunksize), total=len(ends)):
            rows.append(row)
            writer.writerow([row[field] for field in csv_fields])
            # Flush per row so partial results survive an interrupted run.
            f.flush()

# With no end indices (series not longer than the lookback window) there are no
# rows; build an empty frame that still has the expected columns so the sort
# below, and later cells, do not fail with KeyError.
out = pd.DataFrame(rows) if rows else pd.DataFrame(columns=csv_fields)
out = out.sort_values("t").reset_index(drop=True)

 47%|████▋     | 616/1320 [11:29<04:32,  2.59it/s]   

In [5]:
# Keep an independent snapshot of the detection results. A plain `out_copy = out`
# only creates a second name for the same DataFrame, so any later in-place change
# to `out` (new columns, a new index) would silently change the "copy" as well.
out_copy = out.copy()

In [6]:
# Index the detection results by timestamp and make sure `df["date"]` is datetime.
#
# Guarded so the cell can be re-run: once `date` has been moved into the index
# there is no `date` column left, and the unguarded version raised
# KeyError: 'date' on the second execution. `out` is rebound to a new frame
# instead of being modified in place, so other names that still point at the
# previous object are left untouched.
if 'date' in out.columns:
    out = out.assign(date=pd.to_datetime(out['date'])).set_index('date')
df['date'] = pd.to_datetime(df['date'])

In [7]:
# Align the detection results to the timestamps in `df` (labels absent from
# `out` become all-NaN rows) and write the aligned table to disk.
aligned_to_ticks = out.reindex(df['date'])
aligned_to_ticks.to_csv('need_backfill.csv')

In [8]:
# Display only: rows of `out` with no missing value in any column.
# The result is not assigned, so `out` itself is unchanged.
out.dropna(axis=0, how="any")

Unnamed: 0_level_0,t,cp_location,cp_location_norm,cp_score,gap,runtime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-04 14:53:19,7200,3983,0.553194,1.0,1532.739791,16.859674
2023-01-04 14:53:29,7210,3973,0.551806,1.0,1527.231509,6.988360
2023-01-04 14:53:39,7220,3963,0.550417,1.0,1516.564668,6.524758
2023-01-04 14:53:49,7230,3953,0.549028,1.0,1504.720756,7.044664
2023-01-04 14:53:59,7240,3943,0.547639,1.0,1509.708693,7.517483
...,...,...,...,...,...,...
2023-01-05 11:55:49,19950,1063,0.147639,1.0,323.391454,3.944732
2023-01-05 11:55:59,19960,1053,0.146250,1.0,322.286889,3.609988
2023-01-05 11:56:09,19970,1043,0.144861,1.0,318.542661,3.239823
2023-01-05 11:56:19,19980,1033,0.143472,1.0,317.788417,3.203406


In [65]:
def _to_whole_seconds(stamps):
    """Return `stamps` as datetime64[ns] with any sub-second part dropped.

    The values are formatted without fractional seconds and parsed back, so two
    timestamps within the same second compare equal afterwards.
    """
    return pd.to_datetime(stamps).dt.strftime('%Y-%m-%d %H:%M:%S').astype('datetime64[ns]')


df['date'] = _to_whole_seconds(df['date'])

# `out` may hold `date` either as a regular column or as its index (after a
# `set_index('date')`). Reading out['date'] in the second case raised KeyError,
# so build a merge-ready frame that has the column in both situations.
if 'date' in out.columns:
    out['date'] = _to_whole_seconds(out['date'])
    out_keyed = out
else:
    out_keyed = out.reset_index()
    out_keyed['date'] = _to_whole_seconds(out_keyed['date'])

# Left join keeps every row of `df`; rows without a detection result get NaN.
df.merge(out_keyed, on='date', how='left').tail()

Unnamed: 0,datetime,mid,returns,date,t,cp_location,cp_location_norm,cp_score,gap,runtime
9196,2023-01-04 14:06:35,127.015,-0.000118,2023-01-04 14:06:35,,,,,,
9197,2023-01-04 14:06:36,126.99,-0.000197,2023-01-04 14:06:36,,,,,,
9198,2023-01-04 14:06:37,127.035,0.000354,2023-01-04 14:06:37,,,,,,
9199,2023-01-04 14:06:38,127.01,-0.000197,2023-01-04 14:06:38,,,,,,
9200,2023-01-04 14:06:39,127.015,3.9e-05,2023-01-04 14:06:39,,,,,,


In [18]:
from cpd.woods import calibrated_woods_score
import numpy as np

gaps = out["gap"]
gap_values = gaps.to_numpy(dtype=float)

# Heuristic calibration on this dataset: threshold at the 75th percentile of the
# gaps, scale by their standard deviation.
# NaN-aware statistics are used on purpose: a single missing gap would otherwise
# make np.quantile return NaN and poison the threshold for every row.
nc_threshold = float(np.nanquantile(gap_values, 0.75))
gap_spread = float(np.nanstd(gap_values))
# Fall back to 1.0 when the spread is zero or not finite. The previous
# `np.std(gaps) or 1.0` only caught zero, because NaN is truthy.
scale = gap_spread if np.isfinite(gap_spread) and gap_spread > 0 else 1.0

# One calibrated score per row; each gap is passed through unchanged.
out["cp_score_calibrated"] = [calibrated_woods_score(g, nc_threshold, scale) for g in gaps]