In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Paths to the files in Google Drive
train_data_path = '/content/drive/My Drive/Train_data.csv'
test_data_path = '/content/drive/My Drive/Test_data.csv'
val_data_path = '/content/drive/My Drive/Val_data.csv'

# Load the CSV files
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
val_data = pd.read_csv(val_data_path)

# Combine the datasets
combined_data = pd.concat([train_data, test_data, val_data], axis=0, ignore_index=True)

# Check the combined data
print(f"Combined dataset contains {combined_data.shape[0]} rows and {combined_data.shape[1]} columns.")


Combined dataset contains 4727 rows and 83 columns.


In [None]:
# Dropping 'File_Path' and 'Split' columns from the dataset
combined_data = combined_data.drop(columns=['File_Path', 'Split'])

# Display the updated DataFrame
print(combined_data.columns)


Index(['Patient_ID', 'Age', 'Size_cm', 'Recurrence', 'Sex_F', 'Sex_M',
       'Polyp_Location_ Descending', 'Polyp_Location_ Sigmoid',
       'Polyp_Location_ Splenic Flexure', 'Polyp_Location_Anastomosis',
       'Polyp_Location_Asc / Transverse ?', 'Polyp_Location_Ascending',
       'Polyp_Location_Cecum', 'Polyp_Location_Cecum\nKissing Polyps',
       'Polyp_Location_Descending', 'Polyp_Location_Descending ',
       'Polyp_Location_Hepatic Flexure', 'Polyp_Location_Ileocecal Valve',
       'Polyp_Location_Ileocecal Valve\nKissing Polyps',
       'Polyp_Location_Rectosigmoid', 'Polyp_Location_Rectosigmoid ',
       'Polyp_Location_Rectum', 'Polyp_Location_Sigmoid',
       'Polyp_Location_Sigmoid, Splenic Flexure',
       'Polyp_Location_Splenic Flexure', 'Polyp_Location_Transvers Colon',
       'Polyp_Location_Transverse', 'Circum_0.3333333333333333', 'Circum_<1/3',
       'Circum_>1/3', 'Cross_Two_Folds_Between Folds', 'Cross_Two_Folds_Neg',
       'Cross_Two_Folds_On Fold', 'Cross_

In [None]:
combined_data.columns

Index(['Patient_ID', 'Age', 'Size_cm', 'Recurrence', 'Sex_F', 'Sex_M',
       'Polyp_Location_ Descending', 'Polyp_Location_ Sigmoid',
       'Polyp_Location_ Splenic Flexure', 'Polyp_Location_Anastomosis',
       'Polyp_Location_Asc / Transverse ?', 'Polyp_Location_Ascending',
       'Polyp_Location_Cecum', 'Polyp_Location_Cecum\nKissing Polyps',
       'Polyp_Location_Descending', 'Polyp_Location_Descending ',
       'Polyp_Location_Hepatic Flexure', 'Polyp_Location_Ileocecal Valve',
       'Polyp_Location_Ileocecal Valve\nKissing Polyps',
       'Polyp_Location_Rectosigmoid', 'Polyp_Location_Rectosigmoid ',
       'Polyp_Location_Rectum', 'Polyp_Location_Sigmoid',
       'Polyp_Location_Sigmoid, Splenic Flexure',
       'Polyp_Location_Splenic Flexure', 'Polyp_Location_Transvers Colon',
       'Polyp_Location_Transverse', 'Circum_0.3333333333333333', 'Circum_<1/3',
       'Circum_>1/3', 'Cross_Two_Folds_Between Folds', 'Cross_Two_Folds_Neg',
       'Cross_Two_Folds_On Fold', 'Cross_

In [None]:

# Define the path where you want to save the file
save_path = '/content/drive/My Drive/combined_data.csv'

# Save the updated DataFrame to a CSV file
combined_data.to_csv(save_path, index=False)

print(f"Dataset saved to {save_path}")


Dataset saved to /content/drive/My Drive/combined_data.csv


In [None]:
# Extract relevant features
morphological_features = [col for col in combined_data.columns if 'Polyp_Location_' in col or
                          'Paris_' in col or
                          'Pit_' in col or
                          'LST_Type_' in col or
                          'Diagnosis_' in col or
                          'Dysplasia_Grade_' in col]

# Include 'Size_cm' as it's essential for growth modeling
selected_features = ['Patient_ID', 'Size_cm'] + morphological_features

# Subset the dataset to selected features
prepared_data = combined_data[selected_features]

# Check the shape and columns of the prepared dataset
print(f"Prepared dataset contains {prepared_data.shape[0]} rows and {prepared_data.shape[1]} columns.")
prepared_data.head()


Prepared dataset contains 4727 rows and 65 columns.


Unnamed: 0,Patient_ID,Size_cm,Polyp_Location_ Descending,Polyp_Location_ Sigmoid,Polyp_Location_ Splenic Flexure,Polyp_Location_Anastomosis,Polyp_Location_Asc / Transverse ?,Polyp_Location_Ascending,Polyp_Location_Cecum,Polyp_Location_Cecum\nKissing Polyps,...,Diagnosis_T + V,Diagnosis_Traditional Serrated Adenoma,Diagnosis_Tubular,Diagnosis_Villous,Dysplasia_Grade_Differentiation_HGD,Dysplasia_Grade_Differentiation_LGD,Dysplasia_Grade_Differentiation_Mod Diff\nOrigin: Tubulovillous,Dysplasia_Grade_Differentiation_Mod: Well Diff \nOrigin: Tubulovillous,Dysplasia_Grade_Differentiation_T + V,Dysplasia_Grade_Differentiation_Well Diff
0,23002,0.042553,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,23002,0.042553,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,23002,0.042553,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
3,23002,0.042553,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,23002,0.042553,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [None]:
# Target variable: Size_cm (continuous regression target)
target_variable = 'Size_cm'

# Morphological targets (classification tasks)
morphological_targets = [col for col in combined_data.columns if 'Paris_' in col or 'Diagnosis_' in col]

print("Target variable for regression:", target_variable)
print("Morphological classification targets:", morphological_targets)


Target variable for regression: Size_cm
Morphological classification targets: ['Paris_0-IIa', 'Paris_0-IIa + IIc', 'Paris_0-IIa + Is', 'Paris_0-IIa /c', 'Paris_0-IIb', 'Paris_0-Ip', 'Paris_0-Ips', 'Paris_0-Is', 'Paris_0-lps', 'Paris_0-ls', 'Diagnosis_Adenocarcinoma', 'Diagnosis_Hyperplastic', 'Diagnosis_Inflammatory', 'Diagnosis_Serrated', 'Diagnosis_Serrated, Hyperplastic', 'Diagnosis_T + V', 'Diagnosis_Traditional Serrated Adenoma', 'Diagnosis_Tubular', 'Diagnosis_Villous']


In [None]:
import numpy as np

# Parameters for simulation
n_time_points = 5  # Number of time points
growth_rate = 0.05  # Growth rate per time point
max_size = 5.0  # Maximum size to prevent unrealistic growth

# Initialize a list to store simulated data
simulated_data = []

# Loop through each unique Patient_ID
for patient_id, group in prepared_data.groupby('Patient_ID'):
    initial_row = group.iloc[0]  # Use the first record for each patient

    # Create time steps for the patient
    for t in range(n_time_points):
        row = initial_row.copy()
        row['Time_Point'] = t  # Add time point

        # Simulate size growth
        row['Size_cm'] = min(row['Size_cm'] * (1 + growth_rate * t), max_size)

        # Simulate morphological changes (e.g., transitions in Paris classifications)
        if t > 0:
            paris_cols = [col for col in morphological_targets if 'Paris_' in col]
            random_paris = np.random.choice(paris_cols, size=1)
            row[random_paris[0]] = 1  # Assign a new Paris classification

        # Append to the simulated dataset
        simulated_data.append(row)

# Combine simulated data into a new DataFrame
simulated_data = pd.DataFrame(simulated_data)

# Check the structure of the simulated data
print(f"Simulated dataset contains {simulated_data.shape[0]} rows and {simulated_data.shape[1]} columns.")
simulated_data.head()


Simulated dataset contains 515 rows and 66 columns.


Unnamed: 0,Patient_ID,Size_cm,Polyp_Location_ Descending,Polyp_Location_ Sigmoid,Polyp_Location_ Splenic Flexure,Polyp_Location_Anastomosis,Polyp_Location_Asc / Transverse ?,Polyp_Location_Ascending,Polyp_Location_Cecum,Polyp_Location_Cecum\nKissing Polyps,...,Diagnosis_Traditional Serrated Adenoma,Diagnosis_Tubular,Diagnosis_Villous,Dysplasia_Grade_Differentiation_HGD,Dysplasia_Grade_Differentiation_LGD,Dysplasia_Grade_Differentiation_Mod Diff\nOrigin: Tubulovillous,Dysplasia_Grade_Differentiation_Mod: Well Diff \nOrigin: Tubulovillous,Dysplasia_Grade_Differentiation_T + V,Dysplasia_Grade_Differentiation_Well Diff,Time_Point
0,23002.0,0.042553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
0,23002.0,0.044681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
0,23002.0,0.046809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
0,23002.0,0.048936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0
0,23002.0,0.051064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0


In [None]:
# Save the simulated dataset for later use
output_path = '/content/drive/MyDrive/simulated_data.csv'
simulated_data.to_csv(output_path, index=False)

print(f"Simulated dataset saved at {output_path}.")


Simulated dataset saved at /content/drive/MyDrive/simulated_data.csv.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



# Step 2.1: Splitting the Dataset
# Define the target variable and morphological classification targets
target_variable = 'Size_cm'
morphological_targets = [col for col in simulated_data.columns if 'Paris_' in col or 'Diagnosis_' in col]

# Add lagged features for Size_cm
def add_lag_features(df, target, lag_steps=2):
    for lag in range(1, lag_steps + 1):
        df[f"{target}_t-{lag}"] = df[target].shift(lag, fill_value=0)
    return df

# Add lag features to the dataset
simulated_data_with_lags = simulated_data.copy()
simulated_data_with_lags = add_lag_features(simulated_data_with_lags, target="Size_cm", lag_steps=2)

# Define features (X) and targets (y)
X = simulated_data_with_lags.drop(columns=[target_variable] + morphological_targets)
y_size = simulated_data_with_lags[target_variable]
y_morphology = simulated_data_with_lags[morphological_targets]

# Perform a train-test split for size prediction and morphology classification
X_train, X_temp, y_size_train, y_size_temp, y_morphology_train, y_morphology_temp = train_test_split(
    X, y_size, y_morphology, test_size=0.3, random_state=42
)

X_val, X_test, y_size_val, y_size_test, y_morphology_val, y_morphology_test = train_test_split(
    X_temp, y_size_temp, y_morphology_temp, test_size=0.5, random_state=42
)

# Step 2.2: Feature Engineering
# Scale the features
#scaler = StandardScaler()
X_train_scaled = X_train
X_val_scaled = X_val
X_test_scaled = X_test
# Display the updated training set structure
print("Updated dataset splits after scaling and adding lag features:")
print(f"Training set: X_train_scaled: {X_train_scaled.shape}, y_size_train: {y_size_train.shape}, y_morphology_train: {y_morphology_train.shape}")
print(f"Validation set: X_val_scaled: {X_val_scaled.shape}, y_size_val: {y_size_val.shape}, y_morphology_val: {y_morphology_val.shape}")
print(f"Test set: X_test_scaled: {X_test_scaled.shape}, y_size_test: {y_size_test.shape}, y_morphology_test: {y_morphology_test.shape}")


Updated dataset splits after scaling and adding lag features:
Training set: X_train_scaled: (360, 48), y_size_train: (360,), y_morphology_train: (360, 19)
Validation set: X_val_scaled: (77, 48), y_size_val: (77,), y_morphology_val: (77, 19)
Test set: X_test_scaled: (78, 48), y_size_test: (78,), y_morphology_test: (78, 19)


In [None]:
# Convert data into sequences for LSTM
def create_sequences(features, target, time_steps=3):
    X, y = [], []
    for i in range(len(features) - time_steps):
        # Create a sequence of features
        X.append(features.iloc[i: i + time_steps].values)
        # Use the target corresponding to the last time step in the sequence
        y.append(target.iloc[i + time_steps])
    return np.array(X), np.array(y)

# Create sequences for regression (Size_cm)
time_steps = 3
X_train_seq, y_size_train_seq = create_sequences(X_train_scaled, y_size_train, time_steps)
X_val_seq, y_size_val_seq = create_sequences(X_val_scaled, y_size_val, time_steps)
X_test_seq, y_size_test_seq = create_sequences(X_test_scaled, y_size_test, time_steps)

print(f"Training data shape (Regression): {X_train_seq.shape}, {y_size_train_seq.shape}")


Training data shape (Regression): (357, 3, 48), (357,)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define the LSTM model
lstm_model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1)  # Output layer for regression
])

# Compile the model
lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = lstm_model.fit(
    X_train_seq, y_size_train_seq,
    validation_data=(X_val_seq, y_size_val_seq),
    epochs=500, batch_size=16, verbose=1
)

# Evaluate the model
lstm_eval = lstm_model.evaluate(X_test_seq, y_size_test_seq, verbose=0)
print(f"LSTM Regression Model Test Loss: {lstm_eval[0]}, Test MAE: {lstm_eval[1]}")


Epoch 1/500
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 91ms/step - loss: 366549.5938 - mae: 469.0813 - val_loss: 671.0646 - val_mae: 24.3196
Epoch 2/500
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 105268.6328 - mae: 238.4593 - val_loss: 1082.4766 - val_mae: 32.8205
Epoch 3/500
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 66706.6562 - mae: 190.1884 - val_loss: 541.7000 - val_mae: 23.0418
Epoch 4/500
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 56179.9531 - mae: 172.1221 - val_loss: 341.0141 - val_mae: 18.1963
Epoch 5/500
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 35332.3711 - mae: 137.4743 - val_loss: 12.8247 - val_mae: 3.2856
Epoch 6/500
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 26326.7793 - mae: 115.5117 - val_loss: 320.4519 - val_mae: 17.8462
Epoch 7/500
[1m23/23[0m [32m━━━━━━━

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, num_labels, nhead=4, num_layers=2, dim_feedforward=128, dropout=0.1):
        super(SimpleTransformer, self).__init__()
        # Transformer Encoder Layer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Fully Connected Layer for Output
        self.fc = nn.Linear(input_dim, num_labels)

    def forward(self, x):
        # Transformer expects input shape: (seq_length, batch_size, input_dim)
        x = x.permute(1, 0, 2)  # Change to (seq_length, batch_size, input_dim)
        x = self.transformer_encoder(x)
        x = x.mean(dim=0)  # Pooling (mean over sequence)
        x = self.fc(x)  # Output layer
        return torch.sigmoid(x)  # Apply sigmoid for multi-label classification

# Model Hyperparameters
input_dim = X_train_seq.shape[2]  # Number of features
num_labels = y_morphology_train_seq.shape[1]  # Number of output labels
nhead = 4
num_layers = 2
dim_feedforward = 128
dropout = 0.1

# Initialize the model
model = SimpleTransformer(input_dim, num_labels, nhead, num_layers, dim_feedforward, dropout)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and Optimizer
loss_fn = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Data Preparation
train_loader = DataLoader(list(zip(X_train_seq, y_morphology_train_seq)), batch_size=16, shuffle=True)
val_loader = DataLoader(list(zip(X_val_seq, y_morphology_val_seq)), batch_size=16, shuffle=False)
test_loader = DataLoader(list(zip(X_test_seq, y_morphology_test_seq)), batch_size=16, shuffle=False)

# Training Loop
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        X_batch, y_batch = batch
        X_batch = X_batch.to(device).float()
        y_batch = y_batch.to(device).float()

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

# Evaluation Loop
model.eval()
test_labels = []
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        X_batch, y_batch = batch
        X_batch = X_batch.to(device).float()
        outputs = model(X_batch)
        test_preds.append(outputs.cpu().numpy())
        test_labels.append(y_batch.numpy())

# Combine predictions and true labels
test_labels = np.vstack(test_labels)
test_preds = np.vstack(test_preds)

# Threshold predictions
threshold = 0.5
test_preds_binary = (test_preds > threshold).astype(int)

# Calculate Evaluation Metrics
accuracy = accuracy_score(test_labels.flatten(), test_preds_binary.flatten())
f1 = f1_score(test_labels, test_preds_binary, average='micro')
roc_auc = roc_auc_score(test_labels, test_preds, average='micro')

print("Test Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"F1-Score (Micro): {f1}")
print(f"ROC-AUC (Micro): {roc_auc}")




Epoch 1, Loss: 0.5453366326249164
Epoch 2, Loss: 0.3825240718281787
Epoch 3, Loss: 0.3576572589252306
Epoch 4, Loss: 0.35416153721187427
Epoch 5, Loss: 0.3524011878863625
Epoch 6, Loss: 0.34911297456077905
Epoch 7, Loss: 0.34688816899838654
Epoch 8, Loss: 0.3473867411198823
Epoch 9, Loss: 0.3412993848323822
Epoch 10, Loss: 0.33860402133153833
Test Evaluation:
Accuracy: 0.8547368421052631
F1-Score (Micro): 0.14107883817427386
ROC-AUC (Micro): 0.7636265493802479


In [None]:
import joblib

# Save the model
torch.save(model.state_dict(), "simple_transformer_model.pth")
joblib.dump(scaler, "scaler.pkl")  # Save the scaler if used


['scaler.pkl']

In [None]:
import numpy as np
import pandas as pd
import torch

def simulate_polyp_evolution_with_debug(initial_features, size_model, morphology_model, time_steps=5):
    """
    Simulate the evolution of polyp characteristics over time with debug logs.

    Args:
        initial_features (np.array): Initial feature vector for the polyp.
        size_model: Trained TensorFlow/Keras regression model for size prediction.
        morphology_model: Trained PyTorch classification model for morphological evolution.
        time_steps (int): Number of future time steps to simulate.

    Returns:
        pd.DataFrame: Simulated polyp evolution over time.
    """
    morphology_feature_count = morphology_model.fc.out_features  # Number of morphology features
    results = []
    current_features = initial_features.copy()

    for t in range(1, time_steps + 1):
        # Predict size
        reshaped_features_tf = current_features.reshape(1, 1, -1)  # Add batch and timestep dimensions
        predicted_size = size_model.predict(reshaped_features_tf)[0, 0]

        # Log current state
        print(f"Time Step {t}: Predicted Size = {predicted_size}")

        # Update size in the feature vector
        current_features[0] = predicted_size

        # Predict morphology
        reshaped_features_torch = torch.tensor(current_features, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
        with torch.no_grad():
            morphology_probs = morphology_model(reshaped_features_torch).numpy()
        morphology_classes = (morphology_probs > 0.5).astype(int)

        # Update morphology features
        morphology_start_index = 1
        morphology_end_index = morphology_start_index + morphology_feature_count
        current_features[morphology_start_index:morphology_end_index] = morphology_classes.flatten()

        # Save results
        results.append({
            "Time_Step": t,
            "Predicted_Size": predicted_size,
            "Morphology_Classes": morphology_classes.flatten().tolist()
        })

    return pd.DataFrame(results)

# Example usage
# Select a sample polyp's initial features
sample_polyp_features = X_test_seq[0, -1, :]  # Last timestep features for a polyp

# Simulate evolution
predicted_evolution = simulate_polyp_evolution(
    initial_features=sample_polyp_features,
    size_model = lstm_model,  # Replace with the actual size regression model
    morphology_model = model,  # Replace with the actual morphology model
    time_steps=5
)

# Display results
print("Predicted Polyp Evolution Over Time:")
print(predicted_evolution)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Predicted Polyp Evolution Over Time:
   Time_Step  Predicted_Size  \
0          1        0.088947   
1          2        0.099268   
2          3        0.099270   
3          4        0.099270   
4          5        0.099270   

                                  Morphology_Classes  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [None]:
import pandas as pd

# Morphology labels based on the dataset
morphology_labels = [
    "Paris_0-IIa", "Paris_0-IIa + IIc", "Paris_0-IIa + Is", "Paris_0-IIa /c",
    "Paris_0-IIb", "Paris_0-Ip", "Paris_0-Ips", "Paris_0-Is", "Paris_0-lps", "Paris_0-ls",
    "Diagnosis_Adenocarcinoma", "Diagnosis_Hyperplastic", "Diagnosis_Inflammatory",
    "Diagnosis_Serrated", "Diagnosis_Serrated, Hyperplastic", "Diagnosis_T + V",
    "Diagnosis_Traditional Serrated Adenoma", "Diagnosis_Tubular", "Diagnosis_Villous"
]

def decode_morphology_classes(morphology_classes, morphology_labels):
    """
    Decode morphology classes into clinical terms.

    Args:
        morphology_classes (list): Binary list (0s and 1s) indicating active classes.
        morphology_labels (list): List of clinical terms for morphology classes.

    Returns:
        str: Decoded clinical description.
    """
    # Identify active classes
    active_indices = [i for i, value in enumerate(morphology_classes) if value == 1]
    active_terms = [morphology_labels[i] for i in active_indices]

    # Create a descriptive statement
    if active_terms:
        return ", ".join(active_terms)
    else:
        return "no significant morphological changes"

def generate_clinical_statements(predicted_evolution, morphology_labels):
    """
    Generate clinical statements from simulation results.

    Args:
        predicted_evolution (pd.DataFrame): DataFrame with simulation results.
        morphology_labels (list): List of clinical terms for morphology classes.

    Returns:
        list: List of clinical statements for each time step.
    """
    statements = []
    for _, row in predicted_evolution.iterrows():
        time_step = row["Time_Step"]
        size = row["Predicted_Size"]
        morphology_classes = row["Morphology_Classes"]

        # Decode morphology classes
        morphology_desc = decode_morphology_classes(morphology_classes, morphology_labels)

        # Generate clinical statement
        statement = (
            f"At time step {time_step}, the polyp size is {size:.2f} cm with morphology: {morphology_desc}."
        )
        statements.append(statement)

    return statements

# Example usage
# Assuming `predicted_evolution` is the DataFrame from the simulation
clinical_statements = generate_clinical_statements(predicted_evolution, morphology_labels)

# Print the clinical statements
print("Clinical Statements:")
for statement in clinical_statements:
    print(statement)


Clinical Statements:
At time step 1, the polyp size is 0.09 cm with morphology: Diagnosis_Tubular.
At time step 2, the polyp size is 0.10 cm with morphology: Diagnosis_Tubular.
At time step 3, the polyp size is 0.10 cm with morphology: Diagnosis_Tubular.
At time step 4, the polyp size is 0.10 cm with morphology: Diagnosis_Tubular.
At time step 5, the polyp size is 0.10 cm with morphology: Diagnosis_Tubular.
