In [None]:
import os
import argparse
import torch
from sklearn.model_selection import train_test_split
from lib.utils import (
    load_config,
    get_experiment_dir,
    make_windowed_dataset_from_sessions,
    get_participant_id,
    get_participant_projects,
    get_raw_dataset_path,
    get_sessions_for_project,
    generate_dataset_summary
)

# --- Configuration ----------------------------------------------------------
participant = 'ejaz'

# Windowing / labeling settings shared by the train and validation datasets.
WINDOW_SIZE = 3000
WINDOW_STRIDE = 3000
LABELING = 'andrew smoking labels'
SENSOR_CONFIG = {'use_accelerometer': True, 'use_gyroscope': True}
SPLIT_SEED = 42

# NOTE(review): these accumulators are never filled by the cells visible here;
# they are kept only in case another cell still refers to them.
participant_X_train = []
participant_y_train = []
participant_X_test = []
participant_y_test = []

# --- Collect sessions --------------------------------------------------------
# Resolve the participant code to an id, then to the projects of that id.
participant_id = get_participant_id(participant)
print(f"Participant ID: {participant_id}")

projects = get_participant_projects(participant_id)
print(f"Projects for {participant}: {projects}")

# Keep sessions that are verified as smoking and not explicitly dropped
# (keep != 0), tagging each with the raw dataset path of its project.
sessions = []
for project_name in projects:
    print(f"Processing project: {project_name}")
    raw_dataset_path = get_raw_dataset_path(project_name)
    ss = [
        s for s in get_sessions_for_project(project_name)
        if s.get('keep') != 0 and s.get('smoking_verified') == 1
    ]
    for s in ss:
        s['raw_dataset_path'] = raw_dataset_path
    sessions.extend(ss)

if len(sessions) < 2:
    raise ValueError(
        f"Need at least 2 usable sessions to split, found {len(sessions)} "
        f"for participant {participant!r}"
    )

# --- Train / validation split ------------------------------------------------
# 80% of the sessions go to training and 20% are held out.
train_sessions, temp_sessions = train_test_split(
    sessions,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# The held-out part is split in half and merged again right away, so the net
# result is an 80/20 train/val split. The second split is kept only because it
# fixes the order of `val_sessions`. It needs at least two sessions, so it is
# skipped for tiny hold-out sets instead of crashing.
if len(temp_sessions) >= 2:
    val_sessions, test_sessions = train_test_split(
        temp_sessions,
        test_size=0.5,
        random_state=SPLIT_SEED,
    )
else:
    val_sessions, test_sessions = list(temp_sessions), []

val_sessions = val_sessions + test_sessions
print(f"Target participant split - Train: {len(train_sessions)}, Val: {len(val_sessions)}")

# --- Windowed datasets -------------------------------------------------------
# NOTE(review): `raw_dataset_path` is whatever the last loop iteration left
# behind. Every session also carries its own 'raw_dataset_path'; confirm which
# of the two the helper actually uses when there are several projects.
dataset_kwargs = dict(
    window_size=WINDOW_SIZE,
    window_stride=WINDOW_STRIDE,
    raw_dataset_path=raw_dataset_path,
    labeling=LABELING,
)

X_train, y_train = make_windowed_dataset_from_sessions(
    sessions=train_sessions,
    sensor_config=dict(SENSOR_CONFIG),
    **dataset_kwargs,
)

X_val, y_val = make_windowed_dataset_from_sessions(
    sessions=val_sessions,
    sensor_config=dict(SENSOR_CONFIG),
    **dataset_kwargs,
)

# One label column per window, matching the single-logit model output.
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

In [None]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import plotly.express as px
from torch import nn
from torch.nn.functional import relu

class ObnoxiouslySimpleCNN(nn.Module):
    """Five Conv1d+ReLU layers, global average pooling, dropout and a linear head.

    Args:
        input_channels: number of channels of the input signal.
        base_channels: number of feature channels used by every conv layer.
        dropout: dropout probability applied to the pooled features.

    ``forward`` takes a ``(batch, input_channels, length)`` tensor and returns
    one raw logit per example, shape ``(batch, 1)``.
    """

    def __init__(self, input_channels, base_channels, dropout):
        super().__init__()

        # Kernel 7 with padding 1 shortens the sequence by 4 samples
        # (e.g. 3000 -> 2996); receptive field 7.
        self.stem = nn.Conv1d(input_channels, base_channels, kernel_size=7, padding=1)

        # Kernel 5 with padding 2 preserves the length; each layer widens the
        # receptive field by 4 samples: 11, 15, 19, 23.
        same_length = dict(kernel_size=5, padding=2)
        self.conv1 = nn.Conv1d(base_channels, base_channels, **same_length)
        self.conv2 = nn.Conv1d(base_channels, base_channels, **same_length)
        self.conv3 = nn.Conv1d(base_channels, base_channels, **same_length)
        self.conv4 = nn.Conv1d(base_channels, base_channels, **same_length)

        self.gap = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(base_channels, 1)

    def forward(self, x):
        # Every convolution is followed by a ReLU.
        for conv in (self.stem, self.conv1, self.conv2, self.conv3, self.conv4):
            x = relu(conv(x))

        # Average over time: (batch, channels, length) -> (batch, channels).
        pooled = self.gap(x).squeeze(-1)
        return self.classifier(self.dropout(pooled))

# 6 input channels, 8 feature channels, 50% dropout before the classifier.
model = ObnoxiouslySimpleCNN(input_channels=6, base_channels=8, dropout=0.5)

# The model emits raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# mode='max': the monitored metric is expected to increase. The learning rate
# is halved once it has not improved for more than `patience` scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=5,
)

In [None]:
# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Pair each window tensor with its label tensor.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)

# Training batches are reshuffled every epoch; validation order stays fixed.
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
valloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# Training history, kept in its own cell so the training cell can be re-run
# to continue training without wiping the curves:
#   lossi     - mean training loss of every epoch
#   val_lossi - validation loss at each validation pass
#   val_f1i   - validation F1 at each validation pass
#   val_i     - epoch index of each validation pass
lossi, val_lossi, val_f1i, val_i = [], [], [], []

# Global epoch counter, advanced by the training cell.
epoch = 0

In [None]:
from sklearn.metrics import f1_score

N_EPOCHS = 500   # epochs to run each time this cell is executed
VAL_EVERY = 10   # validate (and step the LR scheduler) every VAL_EVERY epochs

# Follow the device the model already lives on instead of hard-coding 'cuda'.
device = next(model.parameters()).device


def evaluate(model, loader, criterion, device):
    """Run one pass over `loader` without gradients.

    Returns:
        (mean_batch_loss, macro_f1), where predictions are the sigmoid of the
        logits rounded to 0/1.
    """
    model.eval()
    loss_total = 0.0
    preds, labels = [], []

    with torch.no_grad():
        for Xi, yi in loader:
            Xi = Xi.to(device)
            yi = yi.to(device)
            logits = model(Xi)
            loss_total += criterion(logits, yi).item()

            preds.append(torch.sigmoid(logits).round().cpu())
            labels.append(yi.cpu())

    macro_f1 = f1_score(
        torch.cat(labels).numpy(),
        torch.cat(preds).numpy(),
        average='macro',
    )
    return loss_total / len(loader), macro_f1


def plot_history(lossi, val_i, val_lossi, val_f1i, path='loss.png'):
    """Save the train/val loss curves (log scale) and the val F1 curve to `path`."""
    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(7.2, 10))
    ax[0].plot(lossi)
    ax[0].plot(val_i, val_lossi, color='red')
    ax[0].set_yscale('log')

    ax[1].plot(val_i, val_f1i, color='green')

    fig.savefig(path)
    plt.close(fig)  # close this figure explicitly so figures do not pile up


for _ in range(N_EPOCHS):
    # --- one training epoch ---
    model.train()
    loss_epoch = 0.0
    for Xi, yi in trainloader:
        Xi = Xi.to(device)
        yi = yi.to(device)
        logits = model(Xi)
        loss = criterion(logits, yi)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_epoch += loss.item()
    loss_epoch /= len(trainloader)
    lossi.append(loss_epoch)

    # --- periodic validation ---
    if epoch % VAL_EVERY == 0:
        val_loss, val_f1 = evaluate(model, valloader, criterion, device)

        # The scheduler monitors validation F1, so it only advances on
        # validation epochs.
        scheduler.step(val_f1)

        val_lossi.append(val_loss)
        val_i.append(epoch)
        val_f1i.append(val_f1)

        plot_history(lossi, val_i, val_lossi, val_f1i)

    # Between validation passes the val metrics printed here are the ones from
    # the most recent pass.
    print(f'Epoch {epoch}: train loss {loss_epoch:.4f}, val loss {val_loss:.4f}, val f1 {val_f1:.4f}, lr {optimizer.param_groups[0]["lr"]:.6f}')
    epoch += 1

In [None]:

    # Stand-alone validation pass: evaluates the model on `valloader`, steps
    # the LR scheduler, extends the history lists and refreshes loss.png.
    # NOTE(review): every line of this cell is indented one level. It
    # presumably relies on the notebook front end stripping a uniform leading
    # indent; confirm, or dedent / delete the cell if it is a leftover.
    if epoch % 10 == 0:
        # Use the device the model lives on instead of assuming CUDA.
        device = next(model.parameters()).device

        model.eval()
        val_loss_total = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for Xi, yi in valloader:
                Xi = Xi.to(device)
                yi = yi.to(device)
                logits = model(Xi)
                loss = criterion(logits, yi)
                val_loss_total += loss.item()

                # Threshold the sigmoid output at 0.5 to get hard predictions.
                all_preds.append(torch.sigmoid(logits).round().cpu())
                all_labels.append(yi.cpu())

        val_loss = val_loss_total / len(valloader)
        val_f1 = f1_score(
            torch.cat(all_labels).numpy(),
            torch.cat(all_preds).numpy(),
            average='macro',
        )

        scheduler.step(val_f1)

        val_lossi.append(val_loss)
        val_i.append(epoch)
        val_f1i.append(val_f1)

        fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(7.2, 10))
        ax[0].plot(lossi)
        ax[0].plot(val_i, val_lossi, color='red')
        ax[0].set_yscale('log')

        ax[1].plot(val_i, val_f1i, color='green')

        fig.savefig('loss.png')
        plt.close(fig)  # close this specific figure

In [None]:
# Visualize the first validation session in the time domain together with the
# model's predictions.
from lib.utils import load_data, resample
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

session = val_sessions[0]
labeling = 'andrew smoking labels'
window_size = 3000
window_stride = 3000        # stride used for the training windows; not used below
step = 250                  # hop between consecutive inference windows
decision_threshold = 0.9    # averaged probability above which a sample counts as smoking
sensor_config = {'use_accelerometer': True, 'use_gyroscope': True}

session_name = session['session_name']
raw_dataset_path = session['raw_dataset_path']
start_ns = session.get('start_ns')
stop_ns = session.get('stop_ns')

print(f"Generating windowed dataset for session: {session_name}")

# Determine which columns to use based on the sensor config.
sensor_columns = []
if sensor_config.get('use_accelerometer', True):
    sensor_columns.extend(['accel_x', 'accel_y', 'accel_z'])
if sensor_config.get('use_gyroscope', False):
    sensor_columns.extend(['gyro_x', 'gyro_y', 'gyro_z'])

# --- Load the session and build the per-sample label column -----------------
bouts = [b for b in session['bouts'] if b['label'] == labeling]

df = load_data(raw_dataset_path, session_name, sensor_config, start_ns, stop_ns)
df = resample(df)
df['label'] = 0

for bout in bouts:
    in_bout = (df['ns_since_reboot'] >= bout['start']) & (df['ns_since_reboot'] <= bout['end'])
    df.loc[in_bout, 'label'] = 1

# Some recordings name the accelerometer axes x/y/z; normalize them.
if 'accel_x' not in df.columns and sensor_config.get('use_accelerometer', True):
    if 'x' in df.columns:
        df = df.rename(columns={'x': 'accel_x', 'y': 'accel_y', 'z': 'accel_z'})

data_columns = sensor_columns + ['label']
missing_columns = [col for col in data_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing columns {missing_columns} in session {session_name}")

# --- Window the session ------------------------------------------------------
data = torch.tensor(df[data_columns].values, dtype=torch.float32)
n_real_samples = data.shape[0]

if n_real_samples < window_size:
    # Zero pad so that at least one full window can be extracted.
    padding = torch.zeros((window_size - n_real_samples, data.shape[1]), dtype=torch.float32)
    data = torch.cat([data, padding], dim=0)
    print(f"Zero-padded session {session_name} from {n_real_samples} to {data.shape[0]} samples")

# unfold -> (num_windows, len(data_columns), window_size); the last channel is
# the label, everything before it is sensor data.
windowed_data = data.unfold(dimension=0, size=window_size, step=step)
X = windowed_data[:, :-1, :].contiguous()
# A window is positive when any of its samples is labeled.
y = (~(windowed_data[:, -1, :] == 0).all(dim=1)).float()

# --- Predict every window ----------------------------------------------------
model.eval()
device = next(model.parameters()).device  # follow the model instead of assuming CUDA

with torch.no_grad():
    logits = model(X.to(device))
    predictions = torch.sigmoid(logits).cpu().numpy()

# --- Spread the window probabilities back onto the sample axis ---------------
num_windows = predictions.shape[0]
time_domain_length = (num_windows - 1) * step + window_size
time_domain_preds = torch.zeros(time_domain_length)
overlap_counts = torch.zeros(time_domain_length)

for i, pred in enumerate(predictions):
    start_idx = i * step
    end_idx = start_idx + window_size
    time_domain_preds[start_idx:end_idx] += pred.item()
    overlap_counts[start_idx:end_idx] += 1

# Average where windows overlap; every sample is covered by at least one window.
time_domain_preds = time_domain_preds / overlap_counts

# Align frame and predictions. The predictions can be shorter than the frame
# (trailing samples that do not fill a window) or longer (zero-padded short
# session), so both are cut to the common length. `.copy()` avoids writing
# into a view of the original frame.
n_keep = min(len(df), time_domain_length)
df = df.iloc[:n_keep].copy()
sample_probs = time_domain_preds[:n_keep]

# NOTE: despite its name, 'logits' holds averaged sigmoid probabilities; the
# column name is kept because the plotting cell refers to it.
df['logits'] = sample_probs.numpy()
df['y_pred'] = (sample_probs > decision_threshold).int().numpy()

# --- Per-sample evaluation ---------------------------------------------------
y_true = df['label'].values
y_pred = df['y_pred'].astype(int).values
ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
print(classification_report(y_true, y_pred))

In [None]:

import plotly.express as px
# Plot every 20th sample: accelerometer axes, ground-truth label, thresholded
# prediction and the averaged model output.
plot_columns = ['accel_x', 'accel_y', 'accel_z', 'label', 'y_pred', 'logits']
fig = px.line(
    df.iloc[::20],
    x='ns_since_reboot',
    y=plot_columns,
    title=f'Session {session_name} Smoking Prediction',
)
fig.show(renderer='browser')