In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from pyvbmc import VBMC
import corner
from tqdm.notebook import tqdm
import pickle
import random
from scipy.integrate import cumulative_trapezoid as cumtrapz

from time_vary_norm_utils import (
    up_or_down_RTs_fit_fn, cum_pro_and_reactive_time_vary_fn,
    rho_A_t_VEC_fn, up_or_down_RTs_fit_wrt_stim_fn, rho_A_t_fn, cum_A_t_fn,
    CDF_E_minus_small_t_NORM_rate_norm_l_time_varying_fn, rho_E_minus_small_t_NORM_rate_norm_time_varying_fn)
from types import SimpleNamespace
from time_vary_and_norm_simulators import psiam_tied_data_gen_wrapper_rate_norm_fn

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.model_selection import GroupKFold


In [2]:
# Load the raw experiment table.
exp_df = pd.read_csv('../outExp.csv')

# Rows with abort_event == 3 but no RTwrtStim are treated as invalid:
# count them, report the count, then drop them.
invalid_rows = exp_df['RTwrtStim'].isna() & (exp_df['abort_event'] == 3)
count = invalid_rows.sum()
print("Number of rows where RTwrtStim is NaN and abort_event == 3:", count)
exp_df = exp_df[~invalid_rows].copy()

# Restrict to the 'Comparable' batch, to trials whose LED_trial is NaN or 0,
# and to a single animal.
animal = 37
in_comparable_batch = exp_df['batch_name'] == 'Comparable'
led_nan_or_zero = exp_df['LED_trial'].isin([np.nan, 0])
all_df = exp_df[in_comparable_batch & led_nan_or_zero]
all_df = all_df[all_df['animal'] == animal]

# Per-trial binary indicators used later as regression target / lag sources.
# NOTE(review): in the saved model output the coefficients of abort_k and
# short_poke_k are identical for every k, which suggests short_poke == is_abort
# for this animal (i.e. the `< 300` cut never excludes an abort). Confirm the
# units of TotalFixTime (ms vs. s) before trusting this threshold.
df = all_df.assign(
    is_abort=lambda d: (d['abort_event'] == 3).astype(int),
    short_poke=lambda d: ((d['is_abort'] == 1) & (d['TotalFixTime'] < 300)).astype(int),
    rewarded=lambda d: (d['success'] == 1).astype(int),
)

Number of rows where RTwrtStim is NaN and abort_event == 3: 16


# Create lagged predictors for each session

In [3]:
# Number of previous trials used as history for each trial.
T = 3

# Lagged-column prefix -> per-trial source column it is built from.
lag_sources = {'rewarded': 'rewarded', 'abort': 'is_abort', 'short_poke': 'short_poke'}

for k in range(1, T + 1):
    # Shift within each session so history never crosses a session boundary;
    # the first k rows of every session therefore get NaN for lag k.
    # NOTE(review): assumes rows are already in trial order within each session — confirm.
    shifted = df.groupby('session')[list(lag_sources.values())].shift(k)
    for prefix, source in lag_sources.items():
        df[f'{prefix}_{k}'] = shifted[source]


# Drop the first T trials of each session (their lagged predictors are NaN)


In [4]:
# Display the distinct session ids currently in df (before rows with missing lags are dropped).
df['session'].unique()

array([ 72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
        85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
        98,  99, 100, 101, 102, 103, 104, 105, 106])

In [5]:
# Names of all lagged columns, ordered lag-by-lag:
# rewarded_1, abort_1, short_poke_1, rewarded_2, ...
lag_cols = []
for k in range(1, T + 1):
    lag_cols.extend(f'{var}_{k}' for var in ('rewarded', 'abort', 'short_poke'))

# Keep only trials with a complete T-trial history, then renumber the rows.
complete_history = df.dropna(subset=lag_cols)
df = complete_history.reset_index(drop=True)


In [6]:
# Display the distinct session ids currently in df, to check which sessions remain.
df['session'].unique()


array([ 72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
        85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
        98,  99, 100, 101, 102, 103, 104, 105, 106])

# Randomly split sessions → train 25, test 10

In [7]:
# Retain the lagged predictors plus the target (short_poke) and the
# session / animal identifiers; plus anything else you need.
keep_cols = [*lag_cols, 'short_poke', 'session', 'animal']
df = df.loc[:, keep_cols].copy()

In [8]:
# Session-level train/test split: every trial of a session lands on the same
# side, so whole sessions (not individual trials) are held out.
SPLIT_SEED = 0          # fixed seed so Restart & Run All reproduces the same split
N_TRAIN_SESSIONS = 25   # sessions used for training; the remainder form the test set

sessions = df['session'].unique()
if len(sessions) <= N_TRAIN_SESSIONS:
    # Without this guard the slice below would silently yield an empty test set.
    raise ValueError(
        f"Need more than {N_TRAIN_SESSIONS} sessions to hold out a test set, "
        f"got {len(sessions)}"
    )

# Seeded, non-mutating shuffle (the original used the unseeded global RNG in place).
sessions = np.random.default_rng(SPLIT_SEED).permutation(sessions)
train_sessions = sessions[:N_TRAIN_SESSIONS]
test_sessions = sessions[N_TRAIN_SESSIONS:]

train_df = df[df['session'].isin(train_sessions)].copy()
test_df = df[df['session'].isin(test_sessions)].copy()

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")



Train shape: (19146, 12), Test shape: (7757, 12)


In [10]:
# Display the column labels of the training split as a sanity check before building X / y.
train_df.columns

Index(['rewarded_1', 'abort_1', 'short_poke_1', 'rewarded_2', 'abort_2',
       'short_poke_2', 'rewarded_3', 'abort_3', 'short_poke_3', 'short_poke',
       'session', 'animal'],
      dtype='object')

# Prepare predictors and target

In [11]:
# Automatically generate predictor column names based on T, ordered lag-by-lag:
# rewarded_1, abort_1, short_poke_1, rewarded_2, ...
predictor_cols = [f'{var}_{k}'
                  for k in range(1, T + 1)
                  for var in ['rewarded', 'abort', 'short_poke']]

# Fail fast, with a readable message, if any predictor is absent from either
# split. The original only printed a warning for the training frame and then
# hit a KeyError on the indexing below.
for split_name, split_df in (('train', train_df), ('test', test_df)):
    missing_cols = [col for col in predictor_cols if col not in split_df.columns]
    if missing_cols:
        raise KeyError(
            f"these predictors are missing in the {split_name} dataframe → {missing_cols}"
        )

# Assign X and y; the target is the current trial's short_poke indicator.
X_train = train_df[predictor_cols]
y_train = train_df['short_poke']

X_test = test_df[predictor_cols]
y_test = test_df['short_poke']

# Diagnostic: exactly duplicated predictor columns carry no extra information
# and make the individual coefficients uninterpretable (an L2-penalised fit
# just splits the weight evenly between them). The saved fit output shows
# identical coefficients for abort_k and short_poke_k, which is the signature
# of such duplicates. Only a warning is emitted; the feature set is unchanged.
duplicate_pairs = [
    (first, second)
    for i, first in enumerate(predictor_cols)
    for second in predictor_cols[i + 1:]
    if X_train[first].equals(X_train[second])
]
if duplicate_pairs:
    print(f"Warning: identical predictor columns in the training set → {duplicate_pairs}")


# Logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression of short_poke on the lagged predictors.
# max_iter is raised to 1000 to give lbfgs ample room to converge.
model = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    penalty='l2',
)
# Left as the last expression so the fitted estimator is displayed by the notebook.
model.fit(X=X_train, y=y_train)


In [13]:
# Iterations the solver actually ran; a value well below max_iter=1000 indicates it converged.
print(model.n_iter_)


[11]


In [14]:
# Fitted weights (one per entry of predictor_cols, in that order) followed by the intercept.
print(model.coef_, model.intercept_, sep='\n')


[[0.05841412 0.21011313 0.21011313 0.35373983 0.22924894 0.22924894
  0.29264352 0.13603035 0.13603035]]
[-2.55489401]


# Check training-set performance (accuracy and ROC-AUC)

In [15]:
# Class probabilities on the training set; column 1 is taken as P(short_poke == 1).
# NOTE(review): this relies on model.classes_ being [0, 1] — confirm both classes
# are present in y_train.
train_proba = model.predict_proba(X_train)
y_train_prob = train_proba[:, 1]

# Hard class predictions on the same rows.
y_train_pred = model.predict(X_train)


In [16]:
from sklearn.metrics import accuracy_score, roc_auc_score

# ROC-AUC uses the predicted probabilities; accuracy uses the hard labels.
# NOTE(review): the saved output pairs a high accuracy (0.883) with a ROC-AUC
# close to chance (0.581); compare accuracy against the majority-class rate
# before reading it as predictive skill.
auc = roc_auc_score(y_true=y_train, y_score=y_train_prob)
acc = accuracy_score(y_true=y_train, y_pred=y_train_pred)

print(f"Training Accuracy: {acc:.3f}, ROC-AUC: {auc:.3f}")


Training Accuracy: 0.883, ROC-AUC: 0.581
