# Prediction Models
- Predictive modeling **IS NOT** helpful in the context of noisy data.
- $R^2$ is very low, i.e. the model captures almost no patterns in the data.
- Potential alternatives: stochastic modeling, noise removal techniques, etc.

## Use AFTER the per-track mean-velocity cosine analysis

In [None]:
# One-cell sklearn GridSearchCV pipeline on track-level mean-velocity cosine
# Label: 1 if cos_mean_velocity < -0.7 (net motion toward wave), else 0

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)

# 1. Build track-level feature table
# cos_mean_velocity is read from track_mean_vel (computed earlier in the
# notebook); tracks whose cosine is NaN are dropped here.
# NOTE(review): assumes track_mean_vel is indexed by TRACK_ID, since its index
# is intersected with the TRACK_ID group keys below -- confirm.
cos_track = track_mean_vel["cos_mean_velocity"].dropna()

# Per-track means of EDGE_TIME and SPEED
track_time_mean = merged.groupby("TRACK_ID")["EDGE_TIME"].mean()
track_speed_mean = merged.groupby("TRACK_ID")["SPEED"].mean()

# Optional: displacement from tracks CSV (if available).
# Only load failures (missing/unreadable file, malformed or empty CSV, missing
# column) are treated as "feature unavailable", and the reason is printed so
# the feature never disappears silently. Anything else is a real bug and is
# allowed to propagate.
tracks_path = "results/trial_3/20260206_224003_subset_tracks.csv"
try:
    # Rows 1-3 after the header line are skipped before parsing.
    tracks_df = pd.read_csv(tracks_path, header=0, skiprows=[1, 2, 3])
    disp_series = tracks_df.set_index("TRACK_ID")["TRACK_DISPLACEMENT"]
except (OSError, KeyError, ValueError) as exc:
    print(f"Skipping track_displacement feature ({type(exc).__name__}: {exc})")
    disp_series = pd.Series(dtype=float)

# An empty series means the optional feature is not used at all.
has_displacement = not disp_series.empty

# Align all features on the TRACK_IDs shared by every source
common_ids = (
    cos_track.index
    .intersection(track_time_mean.index)
    .intersection(track_speed_mean.index)
)
if has_displacement:
    common_ids = common_ids.intersection(disp_series.index)

cos_track = cos_track.loc[common_ids]
X_features = pd.DataFrame({
    "mean_edge_time": track_time_mean.loc[common_ids],
    "mean_speed": track_speed_mean.loc[common_ids],
})
if has_displacement:
    X_features["track_displacement"] = disp_series.loc[common_ids]

print(f"Number of tracks used for modeling: {len(common_ids)}")

# 2. Binary target derived from the mean-velocity cosine
# A track is labelled 1 ("toward") when its cosine is strictly below the
# threshold, and 0 ("other") otherwise.
threshold = -0.7
y = cos_track.lt(threshold).astype(int)

class_counts = y.value_counts().sort_index()
print(f"Class distribution (0=other, 1=toward):\n{class_counts}")

# 3. Stratified 70/30 hold-out split with a fixed seed
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, **split_options)

# 4. Model: standardize the features, then fit a logistic regression,
# with hyper-parameters chosen by 5-fold grid search scored on F1.
# The penalty is left at its default ('l2'); only C and class_weight are
# searched, which avoids deprecation warnings.
scaler_step = ("scaler", StandardScaler())
clf_step = ("clf", LogisticRegression(solver="lbfgs", max_iter=1000))
pipe = Pipeline(steps=[scaler_step, clf_step])

# Keys follow the "<step>__<param>" convention so they reach the "clf" step.
param_grid = dict(
    clf__C=[0.01, 0.1, 1.0, 10.0],
    clf__class_weight=[None, "balanced"],
)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the best pipeline found by the search for the steps that follow.
best_model = grid.best_estimator_
print("\nBest parameters:")
print(grid.best_params_)

# 5. Held-out evaluation of the tuned pipeline
y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
# The three ratio metrics share the same call shape; zero_division=0 is passed
# to each, exactly as for the classification report below.
prec, rec, f1 = (
    scorer(y_test, y_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

print("\n=== Test-set Performance ===")
summary_rows = (
    ("Accuracy:  ", acc),
    ("Precision: ", prec),
    ("Recall:    ", rec),
    ("F1-score:  ", f1),
)
for label, value in summary_rows:
    print(f"{label}{value:.3f}")
print("Confusion matrix (rows=true, cols=pred):")
print(cm)

print("\nClassification report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=["other", "toward"],
    zero_division=0,
)
print(report)

# 6. Report the fitted coefficients and intercept of the best model
# The classifier sits behind the scaler in the pipeline, so these weights
# apply to the standardized features; they are listed in X_features column
# order.
fitted_steps = best_model.named_steps
clf, scaler = fitted_steps["clf"], fitted_steps["scaler"]

# Index [0] selects the first (here: only expected) row of coef_/intercept_.
coef, intercept = clf.coef_[0], clf.intercept_[0]

print("=== Best logistic regression weights (on standardized features) ===")
for name, w in zip(X_features.columns, coef):
    print(f"  {name}: {w:.4f}")
print(f"Intercept (bias): {intercept:.4f}")

In [None]:
# 3D view of the fitted logistic-regression decision surface (track-level features)
from utils import plot_logistic_decision_surface_3d

# NOTE(review): feature_indices=(0, 1) presumably selects the first two
# columns of X_features (mean_edge_time, mean_speed) -- confirm against the
# helper, in particular for the case where track_displacement is also present.
plot_logistic_decision_surface_3d(
    best_model,
    X_features,
    y,
    feature_indices=(0, 1),
    grid_size=50,
)

## Ideas

### Wave Statistical Questions
- Permutation test comparing the velocities/speeds of the group above the cosine threshold with those of the group below it.

### Centroid Statistical Questions
- Is the mean of `v_radial_mean` significantly < 0? → one‑sample test to see if cells, on average, move inward.
- Do early vs late edges differ in `v_radial` or `r_centroid`?
  - Split `EDGE_TIME` into windows and compare distributions.
- Does initial distance predict net inward motion?
  - Correlation or simple regression on `v_radial_mean` ~ `r_initial`.