In [1]:
# Environment bootstrap: upgrade pip first, then install pandas.
%pip install --upgrade pip
%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Install joblib and scikit-learn; both are imported by later cells.
%pip install joblib
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Using cached xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl (2.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [6]:
from scipy.sparse import csr_matrix, hstack


In [7]:
# =========================
# STEP 0 — Setup (no histograms; removed Abs_Burst_Peak_List)
# =========================
# Optional installs (only if missing):
# pip install numpy pandas scikit-learn xgboost lightgbm joblib pyarrow fastparquet

import os, re, ast, json, math, joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight

# ---- Paths / config ----
# The CSV location can be overridden through the COMPILED_NETWORKS_CSV
# environment variable so the notebook is not tied to a single machine.
# The original absolute path is kept as the default.
DATA_PATH = os.environ.get(
    "COMPILED_NETWORKS_CSV",
    "/Users/rohan/Documents/Research/distrubution_data_ml/Compiled_Networks.csv",
)
LABEL_COL = "NeuronType"

# ONLY these four list columns now (Abs_Burst_Peak_List removed)
columns_to_process = [
    "Burst_Peak_List",
    "Burst_Times_List",      # used for timing, L, bursts/min
    "IBI_List",
    "SpikesPerBurst_List",
]

TIME_COL = "Burst_Times_List"

# Explicit exceptions rather than `assert`: assertions are removed when Python
# runs with -O, which would let a bad path or schema slip through.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"File not found: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)

# Each row = a recording (no unique id present) → synthesize one
df = df.reset_index(drop=True)
df["recording_id"] = [f"row_{i:06d}" for i in range(len(df))]

# Sanity check: the label and every list column must exist in the CSV
missing = [c for c in [LABEL_COL] + columns_to_process if c not in df.columns]
if missing:
    raise KeyError(f"Missing columns in CSV: {missing}")

df[[LABEL_COL] + columns_to_process].head()


Unnamed: 0,NeuronType,Burst_Peak_List,Burst_Times_List,IBI_List,SpikesPerBurst_List
0,MxHEMI,"14.6441,3.9478,2.4697,4.4859,3.7445,1.552,1.52...","0.5,0.4,0.5,0.4,0.5,0.6,0.5,0.7,0.5,0.6,0.6,0....","21,7.9,21.8,13.3,5.1,11.3,13.7,24.3,22.2,3.3,3...","7349,1673,1135,1915,1842,839,740,1049,7600,850..."
1,MxWT,8.4741,0.5,,4043
2,FxHET,"1.4235,6.7396,6.7726,6.6778","0.6,0.5,0.4,0.5","7.1,83,85.4",814328628603139
3,MxHEMI,"12.4079,2.199,6.2069,4.4418,1.8126,1.8054,3.72...","0.5,0.7,0.4,0.4,0.4,0.7,0.6,0.5,0.5,0.5,0.6,0....","24.5,25.7,16.4,5.3,25.2,15.3,14.6,21.5,23.3,17...","6015,1387,2649,1885,762,1115,2030,5998,886,172..."
4,MxWT,8.3836,0.5,,4153


In [8]:
# =========================
# STEP 1 — Parse list columns into numeric arrays (robust to missing 'recording_id')
# =========================
import ast, re
import numpy as np
import pandas as pd

def parse_list_cell(x):
    """Convert one cell holding a list of numbers into a 1-D float array.

    Accepted inputs:
      * list / tuple / ndarray            -> converted directly (flattened)
      * NaN / None / "" / "nan" / "none"  -> empty array
      * Python/JSON-style literals        -> "[1, 2, 3]", "(1, 2)", "1,2,3"
      * delimiter-separated text          -> "1 2 3", "1;2;3", "[1. 2. 3.]"

    Returns
    -------
    np.ndarray
        1-D array of dtype float. Empty when the cell is missing or when any
        token cannot be interpreted as a number.
    """
    empty = np.array([], dtype=float)

    # Sequences must be handled BEFORE pd.isna(): pd.isna() on a list/array
    # returns an element-wise mask, and using that mask in `if` raises
    # "truth value of an array is ambiguous" for more than one element.
    if isinstance(x, (list, tuple, np.ndarray)):
        try:
            return np.asarray(x, dtype=float).ravel()
        except (TypeError, ValueError):
            return empty

    if pd.isna(x):
        return empty

    s = str(x).strip()
    if s == "" or s.lower() in {"nan", "none"}:
        return empty

    # Try Python/JSON literal like "[1,2,3]" (a bare "1,2,3" evaluates to a tuple)
    try:
        v = ast.literal_eval(s)
        if isinstance(v, (list, tuple)):
            return np.asarray(v, dtype=float).ravel()
    except Exception:
        # Not a valid literal (or a ragged one) — fall through to token parsing.
        pass

    # Fallback: split on commas, semicolons, whitespace and brackets so that
    # NumPy-style reprs such as "[1. 2. 3.]" are parsed instead of discarded.
    try:
        toks = [t for t in re.split(r"[,;\s\[\]()]+", s) if t != ""]
        return np.array([float(t) for t in toks], dtype=float)
    except ValueError:
        return empty

# --- locate an existing recording-id column (case/whitespace-insensitive) ---
norm_map = {c.lower().strip(): c for c in df.columns}
candidate_keys = [
    "recording_id", "recordingid", "rec_id", "record_id", "recordid",
    "recording id", "record id", "recording"  # trailing entries are looser matches
]
# First candidate that matches wins; None when nothing matches.
rec_col = next((norm_map[k] for k in candidate_keys if k in norm_map), None)

# Always end up with a fresh frame that has a column named exactly 'recording_id'.
if rec_col is None:
    # No recording id present — create a positional one
    df = df.assign(recording_id=np.arange(len(df)))
elif rec_col != "recording_id":
    df = df.rename(columns={rec_col: "recording_id"})
else:
    df = df.copy()

# --- restrict columns_to_process to the columns that are actually present ---
present_cols = [c for c in columns_to_process if c in df.columns]
missing_cols = [c for c in columns_to_process if c not in df.columns]
if missing_cols:
    print(f"[WARN] Skipping missing columns: {missing_cols}")

# --- parse every list-like cell, column by column (row order is preserved) ---
parsed = {c: [parse_list_cell(cell) for cell in df[c]] for c in present_cols}
rec_ids = df["recording_id"].tolist()


In [9]:
# =========================
# STEP 3 — Feature helpers (robust stats only)
# =========================
def median_absolute_deviation(x):
    """Return the median of |x - median(x)|, ignoring NaNs.

    An empty array yields NaN.
    """
    if not x.size:
        return np.nan
    center = np.nanmedian(x)
    abs_dev = np.abs(x - center)
    return float(np.nanmedian(abs_dev))

def robust_stats(x):
    """Summarise a 1-D numeric array as a dict of descriptive statistics.

    Keys, in order: mean, std (ddof=1), median, mad, min, q10, q25, q50, q75,
    q90, max, skew, kurtosis. Every value is NaN when `x` is empty.
    """
    stat_names = (
        "mean", "std", "median", "mad", "min",
        "q10", "q25", "q50", "q75", "q90",
        "max", "skew", "kurtosis",
    )
    if not x.size:
        return dict.fromkeys(stat_names, np.nan)

    series = pd.Series(x, dtype=float)
    # Listed in the same order as `stat_names`.
    computed = [
        series.mean(),
        series.std(ddof=1),
        series.median(),
        median_absolute_deviation(series.values),
        series.min(),
        *(series.quantile(q) for q in (0.10, 0.25, 0.50, 0.75, 0.90)),
        series.max(),
        series.skew(),
        series.kurtosis(),
    ]
    return {name: float(value) for name, value in zip(stat_names, computed)}


In [10]:
# =========================
# STEP 4 — Build per-recording features (counts/timing + robust stats only)
# =========================
FEATURE_ROWS = []


def _parsed_cell(col, idx):
    """Return the parsed array for (col, idx); empty if `col` was never parsed.

    Columns missing from the CSV are skipped by the parsing step, so they have
    no entry in `parsed`. Falling back to an empty array keeps this step
    consistent with that behaviour instead of raising KeyError/IndexError.
    """
    values = parsed.get(col)
    if values is None:
        return np.array([], dtype=float)
    return values[idx]


for i, rec_id in enumerate(rec_ids):
    # Timing & L from burst times
    # NOTE(review): the sample rows displayed for Burst_Times_List are
    # unordered values around 0.4–0.8, which produces spans below one second
    # and bursts_per_min in the thousands. The column may hold per-burst
    # durations rather than onset timestamps — confirm before trusting
    # first/last/span/bursts_per_min.
    times = _parsed_cell(TIME_COL, i)
    times = np.sort(times) if times.size else times

    # L: prefer count of times; fallback to max length among other lists
    fallback_lengths = [_parsed_cell(c, i).size for c in columns_to_process]
    L = int(times.size) if times.size else (int(max(fallback_lengths)) if fallback_lengths else 0)
    first_t = float(times[0]) if times.size else np.nan
    last_t  = float(times[-1]) if times.size else np.nan
    span    = float(max(0.0, last_t - first_t)) if times.size else np.nan
    # Rate is undefined for an empty list or a zero-length span → NaN
    bursts_per_min = (L / (span/60.0)) if (times.size and span > 0) else np.nan

    # NaNs are kept as-is here; the following step flags and imputes them.
    feat = {
        "recording_id": rec_id,
        "L": L,
        "bursts_per_min": bursts_per_min,
        "first_burst_time": first_t,
        "last_burst_time": last_t,
        "approx_recording_span_sec": span,
        "has_bursts": 1 if L > 0 else 0,
    }

    # Robust stats for each list feature (NO histograms)
    for c in columns_to_process:
        for k, v in robust_stats(_parsed_cell(c, i)).items():
            feat[f"{c}__{k}"] = v

    FEATURE_ROWS.append(feat)

feat_table = pd.DataFrame(FEATURE_ROWS)
feat_table.head()


Unnamed: 0,recording_id,L,bursts_per_min,first_burst_time,last_burst_time,approx_recording_span_sec,has_bursts,Burst_Peak_List__mean,Burst_Peak_List__std,Burst_Peak_List__median,...,SpikesPerBurst_List__mad,SpikesPerBurst_List__min,SpikesPerBurst_List__q10,SpikesPerBurst_List__q25,SpikesPerBurst_List__q50,SpikesPerBurst_List__q75,SpikesPerBurst_List__q90,SpikesPerBurst_List__max,SpikesPerBurst_List__skew,SpikesPerBurst_List__kurtosis
0,row_000000,20,3000.0,0.4,0.8,0.4,1,4.36487,4.542092,2.60735,...,397.0,740.0,848.9,1036.25,1255.0,1860.25,7088.9,7600.0,1.977862,2.417899
1,row_000001,1,,0.5,0.5,0.0,1,8.4741,,8.4741,...,0.0,4043.0,4043.0,4043.0,4043.0,4043.0,4043.0,4043.0,,
2,row_000002,4,1200.0,0.4,0.6,0.2,1,5.403375,2.653541,6.7087,...,213.0,814.0,1427.8,2348.5,2999.5,3175.75,3241.9,3286.0,-1.863203,3.515524
3,row_000003,13,2600.0,0.4,0.7,0.3,1,5.103554,4.446465,3.6186,...,841.0,762.0,814.0,905.0,1727.0,2649.0,6011.6,6269.0,1.203322,-0.260229
4,row_000004,1,,0.5,0.5,0.0,1,8.3836,,8.3836,...,0.0,4153.0,4153.0,4153.0,4153.0,4153.0,4153.0,4153.0,,


In [11]:
# =========================
# STEP 5 — L=0-safe: missingness indicators + median impute
# =========================
X = feat_table.copy()

# Numeric feature columns that receive a missingness flag and imputation.
# Excluded: 'has_bursts' (already binary) and 'recording_id' (an identifier —
# it is an integer column when the parsing step synthesizes it with np.arange,
# and flagging it would leave a 'recording_id__isnan' column in the features).
num_cols = [
    c for c in X.columns
    if pd.api.types.is_numeric_dtype(X[c]) and c not in ("has_bursts", "recording_id")
]

# Flags must be captured BEFORE imputation, otherwise they would all be 0.
nan_flags = X[num_cols].isna().astype(int).add_suffix("__isnan")

# Median-impute numerics; a column that is entirely NaN falls back to 0.0.
# NOTE(review): these medians are computed over every row, i.e. before the
# train/test split performed in the next step, so test rows influence the
# imputed values. Consider fitting the medians on the training rows only.
medians = X[num_cols].median(skipna=True).fillna(0.0)
X = pd.concat([X.fillna(medians), nan_flags], axis=1)

# Keep ID for reference but don't feed it to the model
X = X[["recording_id"] + [c for c in X.columns if c != "recording_id"]]

# Attach label (NeuronType) aligned by row order
y = df[LABEL_COL].copy()

print("Shapes → X:", X.shape, " y:", y.shape)
X.head()


Shapes → X: (186, 116)  y: (186,)


Unnamed: 0,recording_id,L,bursts_per_min,first_burst_time,last_burst_time,approx_recording_span_sec,has_bursts,Burst_Peak_List__mean,Burst_Peak_List__std,Burst_Peak_List__median,...,SpikesPerBurst_List__mad__isnan,SpikesPerBurst_List__min__isnan,SpikesPerBurst_List__q10__isnan,SpikesPerBurst_List__q25__isnan,SpikesPerBurst_List__q50__isnan,SpikesPerBurst_List__q75__isnan,SpikesPerBurst_List__q90__isnan,SpikesPerBurst_List__max__isnan,SpikesPerBurst_List__skew__isnan,SpikesPerBurst_List__kurtosis__isnan
0,row_000000,20,3000.0,0.4,0.8,0.4,1,4.36487,4.542092,2.60735,...,0,0,0,0,0,0,0,0,0,0
1,row_000001,1,4800.0,0.5,0.5,0.0,1,8.4741,3.027797,8.4741,...,0,0,0,0,0,0,0,0,1,1
2,row_000002,4,1200.0,0.4,0.6,0.2,1,5.403375,2.653541,6.7087,...,0,0,0,0,0,0,0,0,0,0
3,row_000003,13,2600.0,0.4,0.7,0.3,1,5.103554,4.446465,3.6186,...,0,0,0,0,0,0,0,0,0,0
4,row_000004,1,4800.0,0.5,0.5,0.0,1,8.3836,3.027797,8.3836,...,0,0,0,0,0,0,0,0,1,1


In [12]:
# =========================
# STEP 6 — Train/test split & label encoding
# =========================
# Keep only the rows that carry a label, re-indexing features and labels alike
mask = y.notna()
X, y = (obj.loc[mask].reset_index(drop=True) for obj in (X, y))

# The id is bookkeeping only: remember it, then strip it from the model matrix
recording_ids = X["recording_id"].tolist()
X_model = X.drop(columns="recording_id")

# Integer-encode NeuronType
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# Stratified hold-out split
X_tr, X_te, y_tr, y_te = train_test_split(
    X_model,
    y_enc,
    test_size=0.25,
    stratify=y_enc,
    random_state=42,
)

# Balanced class weights, expanded to one weight per training sample.
# `classes` is sorted (np.unique), so searchsorted maps each label to its weight.
classes = np.unique(y_tr)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr)
class_weight_map = dict(zip(classes, class_weights))
sample_weight_tr = np.asarray(class_weights[np.searchsorted(classes, y_tr)], dtype=float)

print("Classes:", list(le.classes_))
print("Class weights:", class_weight_map)


Classes: ['FxHET', 'MxHEMI', 'MxWT']
Class weights: {np.int64(0): np.float64(0.8910256410256411), np.int64(1): np.float64(2.0144927536231885), np.int64(2): np.float64(0.7239583333333334)}


In [13]:
from xgboost import XGBClassifier
# Base multi-class booster; its trees are reused later as a leaf-index embedding.
xgb_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "tree_method": "hist",
    "n_estimators": 700,
    "learning_rate": 0.05,
    "max_depth": 7,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)
# Per-sample weights carry the balanced class weights computed for the training split.
xgb.fit(X_tr.astype(np.float32), y_tr, sample_weight=sample_weight_tr)


0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.9
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [14]:
from sklearn.preprocessing import OneHotEncoder

# Leaf index reached in every tree, for the training and test rows respectively
leaf_tr, leaf_te = (xgb.apply(part.astype(np.float32)) for part in (X_tr, X_te))

# One-hot encode the leaf indices; the encoder is fitted on training leaves only,
# and leaves first seen at test time are ignored (encoded as all zeros).
enc = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
enc.fit(leaf_tr)
leaf_tr_ohe = enc.transform(leaf_tr)
leaf_te_ohe = enc.transform(leaf_te)

print("Leaf index shapes:", leaf_tr.shape, leaf_te.shape)
print("Leaf embedding shapes:", leaf_tr_ohe.shape, leaf_te_ohe.shape)


Leaf index shapes: (139, 2100) (47, 2100)
Leaf embedding shapes: (139, 8291) (47, 8291)


## Approach A: leaf embeddings → Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

# Linear meta-model trained on the one-hot leaf embedding alone
meta = LogisticRegression(max_iter=3000).fit(leaf_tr_ohe, y_tr)

pred = meta.predict(leaf_te_ohe)
f1_macro_lr = f1_score(y_te, pred, average="macro")
acc_lr = accuracy_score(y_te, pred)
print("Leaf→LR  F1_macro:", f1_macro_lr)
print("Leaf→LR  Acc:", acc_lr)


Leaf→LR  F1_macro: 0.6508352758352758
Leaf→LR  Acc: 0.6808510638297872


## Approach B: concatenate X + leaf embeddings → new XGBoost

In [16]:
from scipy.sparse import csr_matrix, hstack
from xgboost import XGBClassifier

# Original features as sparse matrices so they can be stacked with the leaf one-hots
X_tr_sparse, X_te_sparse = csr_matrix(X_tr), csr_matrix(X_te)

# Augmented design matrix: [original features | leaf embedding]
X_tr_aug = hstack((X_tr_sparse, leaf_tr_ohe))
X_te_aug = hstack((X_te_sparse, leaf_te_ohe))

# Second-stage booster, shallower and with fewer rounds than the first one
xgb2_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "tree_method": "hist",
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "random_state": 42,
}
xgb2 = XGBClassifier(**xgb2_params)

xgb2.fit(X_tr_aug, y_tr, sample_weight=sample_weight_tr)
pred2 = xgb2.predict(X_te_aug)

print("X+Leaf→XGB  F1_macro:", f1_score(y_te, pred2, average="macro"))
print("X+Leaf→XGB  Acc:", accuracy_score(y_te, pred2))


X+Leaf→XGB  F1_macro: 0.6348039215686274
X+Leaf→XGB  Acc: 0.6382978723404256
