## Jupyter notebook: Daily (“d”) failure isolation

In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Locate the project root: the nearest directory, starting at the current
# working directory and moving upwards, whose name is "m4_tsc_experiment".
# If no such directory exists on the way up, the filesystem root is used.
start_dir = Path.cwd()
ancestry = (start_dir, *start_dir.parents)
PROJECT_ROOT = next(
    (folder for folder in ancestry if folder.name == "m4_tsc_experiment"),
    ancestry[-1],  # last entry is the filesystem root
)

# Exported windows CSV that the following cells read.
DATA_EXPORT_DIR = PROJECT_ROOT.joinpath("data", "export")
csv_file = DATA_EXPORT_DIR.joinpath("windows_tsc_l3_y.csv")

# Sanity report: resolved root, and whether the CSV is really there.
for caption, value in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("CSV exists?  ", csv_file.exists()),
    ("CSV path:    ", csv_file),
):
    print(caption, value)


PROJECT_ROOT: /home/rafmontano/Documents/PhD/2025/PycharmProjects/m4_tsc_experiment
CSV exists?   True
CSV path:     /home/rafmontano/Documents/PhD/2025/PycharmProjects/m4_tsc_experiment/data/export/windows_tsc_l3_y.csv


In [2]:
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the resolved
# working directory and moving upwards, that contains a "src" entry.
# The search stops at the filesystem root if none is found.
p = Path.cwd().resolve()
while True:
    if p == p.parent or (p / "src").exists():
        break
    p = p.parent

PROJECT_ROOT = p
print("Detected PROJECT_ROOT:", PROJECT_ROOT)

# Put the root first on sys.path (unless it is already listed) so that
# `import src...` resolves from any notebook location.
root_entry = str(PROJECT_ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Show which entry currently leads the import search path.
import importlib
print("sys.path[0]:", sys.path[0])


Detected PROJECT_ROOT: /home/rafmontano/Documents/PhD/2025/PycharmProjects/m4_tsc_experiment
sys.path[0]: /home/rafmontano/Documents/PhD/2025/PycharmProjects/m4_tsc_experiment


## Cell 2 — Read raw CSV head (sanity: columns + dtypes)

In [3]:
# Read only the first five rows: enough to inspect column names and dtypes
# without loading the whole file.
df_head = pd.read_csv(csv_file, nrows=5)

print("Head shape:", df_head.shape)
print("Columns:", list(df_head.columns)[:15], "...")
print("\nDtypes (first 30):")
print(df_head.dtypes.head(30))

# Show up to ten distinct label values (as strings), or a marker when the
# file has no "label" column.
if "label" in df_head.columns:
    label_examples = df_head["label"].astype(str).unique()[:10]
else:
    label_examples = "NO label col"
print("\nLabel unique examples:", label_examples)


Head shape: (5, 13)
Columns: ['label', 't1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10', 't11', 't12'] ...

Dtypes (first 30):
label      int64
t1       float64
t2       float64
t3       float64
t4       float64
t5       float64
t6       float64
t7       float64
t8       float64
t9       float64
t10      float64
t11      float64
t12      float64
dtype: object

Label unique examples: ['1' '2']


## Cell 3 — Use your loader on a small slice (key: confirm return types)

In [4]:
from src.python.load_tsc_windows import load_tsc_windows

# Loader options for this run.
# NOTE(review): numseries was 20000 in an earlier, smaller run; with None the
# recorded output shows 467458 rows, so None presumably means "no limit" —
# confirm against load_tsc_windows.
loader_options = dict(
    numseries=None,
    label_col="label",
    drop_non_numeric=False,
)
X, y = load_tsc_windows(csv_file, **loader_options)

# Report what the loader returned: container types first, then dtype and shape.
# getattr(..., None) keeps the report working even if an object lacks the attribute.
print("Type(X):", type(X))
print("Type(y):", type(y))
for caption, loaded, attribute in (
    ("X dtype:", X, "dtype"),
    ("y dtype:", y, "dtype"),
    ("X shape:", X, "shape"),
    ("y shape:", y, "shape"),
):
    print(caption, getattr(loaded, attribute, None))


Type(X): <class 'numpy.ndarray'>
Type(y): <class 'numpy.ndarray'>
X dtype: float32
y dtype: int64
X shape: (467458, 12)
y shape: (467458,)


## Cell 4 — Train/test split (key: confirm types after split)

In [5]:
# 80/20 split, stratified on the labels, with a fixed seed so the partition
# is the same on every run.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report container type and shape of each part of the split.
for caption, part in (
    ("Type(X_train):", X_train),
    ("Type(X_test) :", X_test),
    ("Type(y_train):", y_train),
    ("Type(y_test) :", y_test),
):
    print(caption, type(part), "shape:", getattr(part, "shape", None))


Type(X_train): <class 'numpy.ndarray'> shape: (373966, 12)
Type(X_test) : <class 'numpy.ndarray'> shape: (93492, 12)
Type(y_train): <class 'numpy.ndarray'> shape: (373966,)
Type(y_test) : <class 'numpy.ndarray'> shape: (93492,)


## Check data before Cell 5 — count NaN/Inf values in X_train

In [6]:
import numpy as np

# Count non-finite entries in the training windows and express them as a
# share of all cells.
total_cells = X_train.size
n_nan = np.isnan(X_train).sum()
n_inf = np.isinf(X_train).sum()

for caption, count in (
    ("NaN count:", n_nan),
    ("Inf count:", n_inf),
    ("Total cells:", total_cells),
):
    print(caption, count)

print("NaN %:", (n_nan / total_cells) * 100)
print("Inf %:", (n_inf / total_cells) * 100)


NaN count: 0
Inf count: 0
Total cells: 4487592
NaN %: 0.0
Inf %: 0.0


In [7]:
import numpy as np

# Defensive clean-up: replace NaN, +Inf and -Inf with 0.0 in both splits.
# X_train and X_test are rebound to the cleaned arrays.
# NOTE(review): an earlier note said the data had a single NaN; the recorded
# scan of X_train reports none — confirm whether this step is still needed.
zero_fill = {"nan": 0.0, "posinf": 0.0, "neginf": 0.0}
X_train, X_test = (
    np.nan_to_num(split, **zero_fill) for split in (X_train, X_test)
)

# Verify that no NaN remains in either split.
for caption, split in (
    ("NaNs after fix (train):", X_train),
    ("NaNs after fix (test) :", X_test),
):
    print(caption, np.isnan(split).sum())


NaNs after fix (train): 0
NaNs after fix (test) : 0


## Cell 5 — Minimal 1NN fit to reproduce the exact crash

In [8]:
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
import numpy as np

# Make sure the training windows are a float32 NumPy array.
X_train_knn = np.asarray(X_train, dtype=np.float32)

# If 3D (n, 1, L), collapse to 2D (n, L) by keeping the first channel.
if X_train_knn.ndim == 3:
    X_train_knn = X_train_knn[:, 0, :]

# Single source of truth for the distance measure, so the log line below
# always reports what the classifier is actually configured with.
# (The log line previously said "euclidean" while the classifier used "dtw".)
knn_distance = "dtw"

# 1-NN baseline.
knn = KNeighborsTimeSeriesClassifier(
    n_neighbors=1,
    distance=knn_distance,
    algorithm="brute_incr",   # intended to avoid precomputing the full distance matrix
    n_jobs=-1,                # -1 = all processors; lower this if memory becomes a problem
)

print(f"About to fit 1-NN ({knn_distance}) on:", X_train_knn.shape)
knn.fit(X_train_knn, y_train)
print("1-NN fit OK")


About to fit 1-NN (euclidean) on: (373966, 12)
1-NN fit OK
