In [1]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta, datetime

def generate_dataset(num_subjects=10, min_timesteps=10, max_timesteps=20, seed=37):
    random.seed(seed)
    np.random.seed(seed)
    all_rows = []                     
    true_changepoints = {} 
    # Define CPTs (Conditional Probability Tables)
    
    # Segment 1: Mapping from blood_pressure_t-1 to pain_level
    pain_cpt_seg1 = {
        0: [0.8, 0.1, 0.1],  # low BP → no pain
        1: [0.1, 0.8, 0.1],  # normal BP → mild pain
        2: [0.1, 0.1, 0.8],  # high BP → severe pain
    }

    # Segment 2: Noisier mapping from blood_pressure_t-1 to pain_level
    pain_cpt_seg2 = {
        0: [0.4, 0.3, 0.3],
        1: [0.3, 0.4, 0.3],
        2: [0.3, 0.3, 0.4],
    }

    # Segment 1: pain_level_t-1 predicts mobility
    mobility_cpt_seg1 = {
        0: [0.9, 0.1],  # no pain → walking
        1: [0.4, 0.6],  # mild pain → mostly walking
        2: [0.2, 0.8],  # severe pain → mostly bedridden
    }

    # Segment 2: Noisier mapping between pain_level_t-1 and mobility
    mobility_cpt_seg2 = {
        0: [0.6, 0.4],
        1: [0.5, 0.5],
        2: [0.8, 0.2],
    }

    # Generate data per subject
    for subject_id in range(1, num_subjects + 1):
        # Random number of time steps per subject
        num_timesteps = random.randint(min_timesteps, max_timesteps)

        # Define the changepoint (midpoint of time steps)
        changepoint = num_timesteps // 2
        true_changepoints[subject_id] = changepoint

        # Generate time series dates for the subject
        start_date = datetime(2023, random.randint(1, 12), random.randint(1, 28))
        timestamps = [start_date + timedelta(days=2 * i) for i in range(num_timesteps)]

        # Initialize first values randomly
        bp = [random.choice([0, 1, 2])]  # blood_pressure
        pain = [random.choice([0, 1, 2])]
        mobility = [random.choice([0, 1])]
        hr = [random.choice([0, 1, 2])]
        ox = [random.choice([0, 1])]

        # Generate remaining values using CPTs
        for t in range(1, num_timesteps):
            bp.append(random.choice([0, 1, 2]))

            # Use CPTs depending on segment (before/after changepoint)
            if t < changepoint:
                pain_probs = pain_cpt_seg1[bp[t - 1]]
                mobility_probs = mobility_cpt_seg1[pain[t - 1]]
            else:
                pain_probs = pain_cpt_seg2[bp[t - 1]]
                mobility_probs = mobility_cpt_seg2[pain[t - 1]]

            # Sample next values using the CPTs
            pain.append(np.random.choice([0, 1, 2], p=pain_probs))
            mobility.append(np.random.choice([0, 1], p=mobility_probs))
            hr.append(random.choice([0, 1, 2]))   
            ox.append(random.choice([0, 1]))      

        # Collect all rows for this subject
        for t in range(num_timesteps):
            all_rows.append({
                'subject_id': subject_id,
                'timestamp': timestamps[t].date(),
                'blood_pressure': bp[t],
                'pain_level': pain[t],
                'mobility': mobility[t],
                'heart_rate': hr[t],
                'oxygen_level': ox[t],
            })

    # Create a DataFrame and sort for proper ordering
    df = pd.DataFrame(all_rows)
    df = df.sort_values(by=['subject_id', 'timestamp']).reset_index(drop=True)

    return df, true_changepoints

# Generate the dataset and print the head
df_health, ground_truth_cp = generate_dataset()
subject_lengths = df_health.groupby("subject_id").size().to_dict()
df_health.head()


Unnamed: 0,subject_id,timestamp,blood_pressure,pain_level,mobility,heart_rate,oxygen_level
0,1,2023-10-03,2,2,0,2,1
1,1,2023-10-05,1,2,1,2,0
2,1,2023-10-07,2,1,1,1,1
3,1,2023-10-09,1,2,1,2,1
4,1,2023-10-11,1,1,1,0,0


In [None]:
# Group by subject_id and count rows
subject_lengths = df_health.groupby("subject_id").size()
print("Number of rows per subject:")
for sid, length in subject_lengths.items():
    print(f"Subject {sid}: {length} time steps")


Number of rows per subject:
Subject 1: 20 time steps
Subject 2: 14 time steps
Subject 3: 19 time steps
Subject 4: 12 time steps
Subject 5: 17 time steps
Subject 6: 15 time steps
Subject 7: 20 time steps
Subject 8: 11 time steps
Subject 9: 14 time steps
Subject 10: 13 time steps


In [2]:
# Preprocessing Step: Add Lagged Variables and Encode Categories
# Function to create lagged (t-1) variables per subject
from sklearn.preprocessing import LabelEncoder
def create_lagged_df(df, id_col='subject_id', time_col='timestamp'):
    """
    For each subject:
    - Sorts data by timestamp
    - Adds lagged (t-1) versions of all feature columns
    - Returns a combined dataframe with current and lagged variables
    """
    all_lagged_rows = []

    for subject_id, group in df.groupby(id_col):
        group = group.sort_values(by=time_col).reset_index(drop=True)

        # Create lagged version (shift down by 1)
        lagged = group.shift(1)

        # Rename lagged columns to include "_t-1", except id and timestamp
        lagged.columns = [f"{col}_t-1" if col not in [id_col, time_col] else col for col in lagged.columns]

        # Combine original and lagged data side-by-side
        combined = pd.concat([group, lagged], axis=1).dropna().reset_index(drop=True)

        # Drop duplicate columns like subject_id and timestamp if they exist twice
        combined = combined.loc[:, ~combined.columns.duplicated()]

        all_lagged_rows.append(combined)

    return pd.concat(all_lagged_rows).reset_index(drop=True)

# Function to encode categorical columns using LabelEncoder

def encode_categorical_columns(df, exclude_cols=['subject_id', 'timestamp']):
    
    df_encoded = df.copy()
    encoders = {}

    for col in df.columns:
        if col in exclude_cols:
            continue
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df[col])
        encoders[col] = le

    return df_encoded, encoders

# Run Preprocessing

df_lagged = create_lagged_df(df_health)
df_encoded, encoders = encode_categorical_columns(df_lagged)
print("Original (lagged) columns:", df_lagged.columns.tolist())
print("Encoded data sample:")
print(df_encoded.head())


Original (lagged) columns: ['subject_id', 'timestamp', 'blood_pressure', 'pain_level', 'mobility', 'heart_rate', 'oxygen_level', 'blood_pressure_t-1', 'pain_level_t-1', 'mobility_t-1', 'heart_rate_t-1', 'oxygen_level_t-1']
Encoded data sample:
   subject_id   timestamp  blood_pressure  pain_level  mobility  heart_rate  \
0           1  2023-10-05               1           2         1           2   
1           1  2023-10-07               2           1         1           1   
2           1  2023-10-09               1           2         1           2   
3           1  2023-10-11               1           1         1           0   
4           1  2023-10-13               0           1         1           1   

   oxygen_level  blood_pressure_t-1  pain_level_t-1  mobility_t-1  \
0             0                   2               2             0   
1             1                   1               2             1   
2             1                   2               1             1   
3    

In [11]:
import numpy as np
from scipy.special import gammaln

# Calculates BDe score for one variable given its parent set
def bde_node_score(data, target, parents, alpha=1.0):
    state_count = data[target].nunique()
    prior = alpha / state_count
    score = 0.0

    if parents:
        grouped = data.groupby(parents)
    else:
        grouped = [(None, data)]

    for _, group in grouped:
        counts = group[target].value_counts().to_dict()
        total = sum(counts.values())

        part = gammaln(alpha) - gammaln(alpha + total)
        for k in range(state_count):
            part += gammaln(prior + counts.get(k, 0)) - gammaln(prior)
        score += part

    return score

# Computes total BDe score for a graph (dict of {child: [parents]})
def bde_graph_score(data, graph, alpha=1.0):
    return sum(bde_node_score(data, child, parents, alpha) for child, parents in graph.items())

# Define the actual (true) dependency structure
true_graph = {
    'pain_level': ['blood_pressure_t-1'],
    'mobility': ['pain_level_t-1'],
    'blood_pressure': [],
    'heart_rate': [],
    'oxygen_level': [],
    'blood_pressure_t-1': [],
    'pain_level_t-1': [],
    'mobility_t-1': [],
    'heart_rate_t-1': [],
    'oxygen_level_t-1': []
}

# Compare with the null (empty) graph
empty_graph = {
    col: [] for col in df_encoded.columns if col not in ['subject_id', 'timestamp']
}

# Score both structures
score_real = bde_graph_score(df_encoded, true_graph)
score_null = bde_graph_score(df_encoded, empty_graph)

# Print comparison
print("BDe Score Comparison:")
print("  True graph score :", score_real)
print("  Empty graph score:", score_null)
print("  Difference        :", score_real - score_null)


BDe Score Comparison:
  True graph score : -1372.4736720260603
  Empty graph score: -1394.4686811783158
  Difference        : 21.995009152255534


In [None]:
# RJMCMC per subjec

import numpy as np
import pandas as pd
import random
from collections import Counter, defaultdict
from scipy.special import gammaln
import copy

# BDe score for a single variable given its parent set
def node_bde(data, target, parents, alpha=1.0):
    r = data[target].nunique()
    prior = alpha / r
    score = 0.0

    grouped = data.groupby(parents) if parents else [(None, data)]

    for _, group in grouped:
        counts = group[target].value_counts().to_dict()
        total = sum(counts.values())
        term = gammaln(alpha) - gammaln(alpha + total)
        for val in range(r):
            term += gammaln(prior + counts.get(val, 0)) - gammaln(prior)
        score += term
    return score

# Score across segments (changepoint-aware)
def segmented_score(data, segments, child, parents, alpha=1.0):
    result = 0.0
    for seg in np.unique(segments):
        subset = data.iloc[np.where(segments == seg)[0]]
        if not subset.empty:
            result += node_bde(subset, child, parents, alpha)
    return result

# Log-prior on changepoint configurations
def cp_prior_penalty(segments, lam=1.0):
    return -lam * np.sum(segments[1:] != segments[:-1])

# Suggest graph neighbors by adding/removing edges
def suggest_neighbors(graph, nodes, limit=3):
    proposals = []

    for child in nodes:
        parents = set(graph.get(child, []))
        for other in nodes:
            if other == child or other in parents:
                continue
            if len(parents) >= limit or "_t-1" not in other:
                continue
            new_graph = copy.deepcopy(graph)
            new_graph[child] = list(parents | {other})
            proposals.append(new_graph)

        for p in parents:
            new_graph = copy.deepcopy(graph)
            new_graph[child] = list(parents - {p})
            proposals.append(new_graph)

    return proposals

# Changepoint vector initialization
def init_segments(data, variables):
    return {var: np.zeros(len(data), dtype=int) for var in variables}

# Segment birth
def propose_birth(seg):
    points = [t for t in range(1, len(seg) - 1) if seg[t] == seg[t - 1]]
    if not points:
        return seg, False
    pos = random.choice(points)
    new_seg = seg.copy()
    current = seg[pos]
    new_id = max(seg) + 1
    for i in range(pos, len(seg)):
        if seg[i] == current:
            new_seg[i] = new_id
        else:
            break
    return new_seg, True

# Segment death
def propose_death(seg):
    unique = np.unique(seg)
    if len(unique) <= 1:
        return seg, False
    merge = random.choice(unique[1:])
    old = merge - 1
    new_seg = seg.copy()
    new_seg[seg == merge] = old
    _, reindexed = np.unique(new_seg, return_inverse=True)
    return reindexed, True

# Segment shift
def propose_shift(seg):
    points = [i for i in range(1, len(seg)) if seg[i] != seg[i - 1]]
    if not points:
        return seg, False
    move = random.choice(points)
    new_idx = move + random.choice([-1, 1])
    if not (0 < new_idx < len(seg)):
        return seg, False
    new_seg = seg.copy()
    before, after = seg[move - 1], seg[move]
    if new_idx < move:
        new_seg[new_idx:move] = after
        new_seg[new_idx] = before
    else:
        new_seg[move:new_idx] = before
        new_seg[move] = after
    return new_seg, True

# Main RJMCMC per-subject runner with BMA
def run_rjmcmc(df, features, **kwargs):
    all_segments = {feat: np.zeros(len(df), dtype=int) for feat in features}
    edge_log = Counter()
    sample_total = 0

    for subj_id, data in df.groupby("subject_id"):
        print(f"Processing subject {subj_id}")
        data = data.reset_index(drop=True)
        nodes = [f for f in data.columns if f not in ['subject_id', 'timestamp']]
        segs = init_segments(data, nodes)
        graph = {n: [] for n in nodes}

        def total_logscore(G, V):
            return sum(
                segmented_score(data, V[n], n, G[n], kwargs['alpha']) + cp_prior_penalty(V[n], kwargs['lambda_cp'])
                for n in nodes
            )

        score = total_logscore(graph, segs)
        best = copy.deepcopy(segs)
        best_val = score

        for i in range(kwargs['num_iters']):
            print(f"  Step {i}", end='')

            if random.random() < kwargs['p_structure']:
                choices = suggest_neighbors(graph, nodes, kwargs['max_parents'])
                new_graph = random.choice(choices)
                new_score = total_logscore(new_graph, segs)
                accept = np.exp(new_score - score)
                print(f" | structure move | Δ={new_score - score:.2f}, p={accept:.2f}")
                if random.random() < min(1.0, accept):
                    graph = new_graph
                    score = new_score
                    if score > best_val:
                        best_val = score
                        best = copy.deepcopy(segs)
            else:
                node = random.choice(nodes)
                move_type = random.choice(['birth', 'death', 'shift'])
                move_fn = {'birth': propose_birth, 'death': propose_death, 'shift': propose_shift}[move_type]
                trial, ok = move_fn(segs[node])
                if not ok:
                    continue
                new_score = segmented_score(data, trial, node, graph[node], kwargs['alpha']) + cp_prior_penalty(trial, kwargs['lambda_cp'])
                old_score = segmented_score(data, segs[node], node, graph[node], kwargs['alpha']) + cp_prior_penalty(segs[node], kwargs['lambda_cp'])
                delta = new_score - old_score
                accept = np.exp(delta)
                print(f" | changepoint {move_type} on {node} | Δ={delta:.2f}, p={accept:.2f}")
                if random.random() < min(1.0, accept):
                    segs[node] = trial
                    score = total_logscore(graph, segs)
                    if score > best_val:
                        best_val = score
                        best = copy.deepcopy(segs)

            if i % 50 == 0:
                for tgt, srcs in graph.items():
                    for src in srcs:
                        edge_log[(tgt, src)] += 1
                sample_total += 1

        for node in nodes:
            all_segments[node][data.index] = best[node]

        print(f"Subject {subj_id} done | best score = {best_val:.2f}")

    final = defaultdict(list)
    for (child, parent), count in edge_log.items():
        if count / sample_total >= 0.3:
            final[child].append(parent)

    return final, all_segments

# Example usage
cols = [c for c in df_encoded.columns if c not in ['subject_id', 'timestamp']]
learned_graph, segment_map = run_rjmcmc(
    df_encoded,
    features=cols,
    num_iters=4000,
    alpha=1.0,
    lambda_cp=4.0,
    max_parents=3,
    p_structure=0.5
)

print("Final Graph Structure:")
for tgt, srcs in learned_graph.items():
    print(f"{tgt} <- {srcs}")


Processing subject 1
  Step 0 | structure move | Δ=-0.44, p=0.65
  Step 1 | structure move | Δ=-1.76, p=0.17
  Step 2 | changepoint birth on blood_pressure_t-1 | Δ=-2.70, p=0.07
  Step 3 | structure move | Δ=-0.91, p=0.40
  Step 4 | structure move | Δ=-1.24, p=0.29
  Step 5 | structure move | Δ=-1.57, p=0.21
  Step 6 | structure move | Δ=-1.31, p=0.27
  Step 7 | changepoint birth on pain_level_t-1 | Δ=-4.56, p=0.01
  Step 8 | changepoint birth on oxygen_level | Δ=-5.97, p=0.00
  Step 9 | structure move | Δ=0.55, p=1.73
  Step 10 | changepoint birth on heart_rate | Δ=-6.72, p=0.00
  Step 11 | structure move | Δ=-1.77, p=0.17
  Step 12 | structure move | Δ=-0.72, p=0.49
  Step 13  Step 14 | structure move | Δ=-0.25, p=0.78
  Step 15  Step 16  Step 17 | changepoint birth on blood_pressure_t-1 | Δ=-3.88, p=0.02
  Step 18  Step 19  Step 20 | structure move | Δ=-1.57, p=0.21
  Step 21  Step 22  Step 23  Step 24 | structure move | Δ=-1.77, p=0.17
  Step 25 | structure move | Δ=-1.29, p=0.27
 

In [None]:
def print_changepoint_summary(segments, data):
    print("Changepoint Summary:\n")
    for var, seg_vec in segments.items():
        # Detect changepoints (where segment ID changes)
        breaks = [i for i in range(1, len(seg_vec)) if seg_vec[i] != seg_vec[i - 1]]
        times = data.loc[breaks, 'timestamp'].tolist() if 'timestamp' in data.columns else None
        num_segments = len(np.unique(seg_vec))

        print(f"{var}:")
        print(f"  • Segments: {num_segments}")
        print(f"  • Changepoints: {len(breaks)}")

        if breaks:
            print(f"  • Indices: {breaks}")
            if times:
                print(f"  • Timestamps: {times}")
        print()
print_changepoint_summary(segment_map, df_encoded)



🔍 Changepoint Summary:

blood_pressure:
  • Segments: 2
  • Changepoints: 2
  • Indices: [7, 12]
  • Timestamps: [datetime.date(2023, 10, 19), datetime.date(2023, 10, 29)]

pain_level:
  • Segments: 1
  • Changepoints: 0

mobility:
  • Segments: 1
  • Changepoints: 0

heart_rate:
  • Segments: 1
  • Changepoints: 0

oxygen_level:
  • Segments: 1
  • Changepoints: 0

blood_pressure_t-1:
  • Segments: 2
  • Changepoints: 2
  • Indices: [13, 19]
  • Timestamps: [datetime.date(2023, 10, 31), datetime.date(2023, 12, 9)]

pain_level_t-1:
  • Segments: 1
  • Changepoints: 0

mobility_t-1:
  • Segments: 1
  • Changepoints: 0

heart_rate_t-1:
  • Segments: 3
  • Changepoints: 3
  • Indices: [13, 15, 19]
  • Timestamps: [datetime.date(2023, 10, 31), datetime.date(2023, 11, 4), datetime.date(2023, 12, 9)]

oxygen_level_t-1:
  • Segments: 1
  • Changepoints: 0

