# RCT Analysis: Preprocessing

This notebook implements a step-by-step approach for randomized control trial (RCT) data analysis, focusing on progression-free survival (PFS) as the outcome measure.

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import plotly.express as px

## 2. Load and Process Data

In [2]:
# Load trial data from two sheets
df_1 = pd.read_excel("~/Downloads/20250521_Trials for dev.xlsx", sheet_name="trials by arm", skiprows=2)

# Fill NA values with 0 for specific columns
na_fill_cols = ['brain_metastase_yes', 'disease_stage_recurrent', 'disease_stage_III', 
                'disease_stage_IV', 'EGFR_wild', 'no_smoker_percent']
df_1[na_fill_cols] = df_1[na_fill_cols].fillna(0)

# Load and process second data sheet
df_2 = pd.read_excel("~/Downloads/20250521_Trials for dev.xlsx", sheet_name="250529_NSCLC", skiprows=2)
df_2 = df_2[df_2['to keep'] == 1.0]  # Keep only rows marked to keep
df_2 = df_2.rename(columns={'arm_n': 'Population'})  # Rename column for consistency
df_2['NCT_ID'] = df_2['NCT_ID'].ffill()  # Forward fill NCT_ID values

# Combine the two datasets
df = pd.concat([df_1, df_2], ignore_index=True, axis=0)

# Handle age columns - fill age_median with age_clean where missing
df['age_median'] = df['age_median'].fillna(df['age_clean'])

# Display the combined dataset shape
print(f"Combined dataset shape: {df.shape}")

Combined dataset shape: (79, 52)


## 3. Select Relevant Columns for Analysis

In [3]:
# Define columns to keep for analysis
cols = [
    "NCT_ID",                  # Trial identifier
    "Arm",                     # Treatment arm 
    'Population',              # Number of patients
    "intervention",            # Treatment description
    'RCT_with_control_inter',  # Has control arm
    
    # Patient demographics
    'gender_male_percent',     # Percentage of male patients
    'age_median',              # Median age
    'no_smoker_percent',       # Percentage of non-smokers
    'ecog_1',                  # ECOG performance status
    
    # Disease characteristics
    'brain_metastase_yes',     # Brain metastases
    'disease_stage_recurrent', # Recurrent disease
    'disease_stage_III',       # Stage III disease
    'disease_stage_IV',        # Stage IV disease
    'EGFR_wild',               # EGFR wild type
    'EGFR_positive_mutation',  # EGFR mutation positive
    
    # Outcome and quality flags
    'PFS_median_months',       # Primary outcome - progression-free survival
    'PFS_median_CI',           # Confidence interval for PFS
    'CI',                      # Confidence interval name for PFS
    'need_to_be_dropped',      # Quality control flag
    
    # Treatment classification
    "First-in-Class",          # Novel mechanism of action
    "Next-Generation",         # Improved version of existing drug
    
    # Drug class classification scheme 1
    'EGFR_TKI',                # EGFR tyrosine kinase inhibitor
    'Platinum_Chemotherapy',   # Platinum-based chemotherapy
    'Anti_VEGF',               # Anti-vascular endothelial growth factor
    'PD1_PDL1_Inhibitor',      # PD-1/PD-L1 inhibitor
    'Antimetabolite',          # Antimetabolite chemotherapy
    'Taxane',                  # Taxane chemotherapy
    'Antibody',                # Monoclonal antibody
    'Placebo_Supportive-Care', # Placebo or best supportive care
    
    # Drug class classification scheme 2
    'Chemotherapy',            # Any chemotherapy
    'Targeted_Therapy',        # Any targeted therapy
    'Immunotherapy',           # Any immunotherapy
    'Anti-angiogenic_Other',   # Other anti-angiogenic therapy
    
    # Additional information
    'subgroup (Y/N)',          # Indicates subgroup analysis
    'Subgroup characteristics'  # Subgroup description


]

# Create the training dataset
training_df = df[cols].copy()

# Add control arm indicator
training_df["is_arm_control"] = (training_df["Arm"] == 'Control').astype(int)

# Remove rows that need to be dropped
training_df = training_df.loc[training_df["need_to_be_dropped"] != 1, :]

# Remove specific trial that should be excluded (consider only arms of major populations, less data)
training_df = training_df.drop([65, 66], axis=0)
training_df = training_df[~training_df['NCT_ID'].isin(['NCT02590965'])]

training_df = training_df.replace("need help", None)
training_df = training_df.replace("?", None)
training_df = training_df.replace("??", None)

# Special handling for EGFR_wild column
if "EGFR_wild" in training_df.columns:
    training_df["EGFR_wild"] = training_df["EGFR_wild"].fillna(0)

# Display training dataset information
print(f"Training dataset shape: {training_df.shape}")
print(f"Number of unique trials: {training_df.NCT_ID.nunique()}")
training_df.head()

Training dataset shape: (57, 36)
Number of unique trials: 27


Unnamed: 0,NCT_ID,Arm,Population,intervention,RCT_with_control_inter,gender_male_percent,age_median,no_smoker_percent,ecog_1,brain_metastase_yes,...,Taxane,Antibody,Placebo_Supportive-Care,Chemotherapy,Targeted_Therapy,Immunotherapy,Anti-angiogenic_Other,subgroup (Y/N),Subgroup characteristics,is_arm_control
0,NCT01364012,Intervention,138,bevacizumab + platinum doublet chemo (carbopla...,1.0,54.0,57.0,50.0,75.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,,,0
1,NCT01364012,Control,138,placebo + platinum doublet chemo (carboplatin ...,1.0,56.0,56.0,50.0,80.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,,1
2,NCT01469000,Intervention,126,pemetrexed + gefitinib,1.0,35.0,62.0,64.0,69.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,,0
3,NCT01469000,Control,65,gefitinib,1.0,37.0,62.0,72.0,68.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,1
4,NCT02099058,Intervention,28,telisotuzumab vedotin + erlotinib,0.0,32.0,60.0,0.0,71.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,,,0


## 4. Feature Engineering

In [4]:
def add_features(df):
    """Add engineered features to the dataframe.
    
    Args:
        df: Input dataframe with raw features
        
    Returns:
        Dataframe with added engineered features
    """
    # Create a copy to avoid modifying the original
    df = df.copy()
    
    # Handle special characters in the data
    df = df.replace("need help", None)
    df = df.replace("?", None)
    df = df.replace("??", None)
    
    # 1. Treatment combinations
    df['combo_therapy'] = ((df['Chemotherapy'] + df['Targeted_Therapy'] + 
                          df['Immunotherapy'] + df['Anti-angiogenic_Other']) > 1).astype(int)
    
    # 2. EGFR status interaction with treatments
    df['egfr_targeted'] = df['EGFR_positive_mutation'] * df['Targeted_Therapy']
    df['egfr_tki_use'] = df['EGFR_positive_mutation'] * df['EGFR_TKI']
    
    # 3. Patient risk profile
    df['high_risk_profile'] = ((df['brain_metastase_yes'] > 0) | 
                              (df['disease_stage_IV'] > 0)).astype(int)
    
    # 4. Treatment novelty score (combines First-in-Class and Next-Generation)
    df['novelty_score'] = df['First-in-Class'] + df['Next-Generation']
    
    # 5. Patient demographics composite
    df['elderly_male'] = ((df['gender_male_percent'] > 60) & 
                         (df['age_median'] > 60)).astype(int)
    
    # 6. Trial size category (might affect reliability)
    df['large_trial'] = (df['Population'] > df['Population'].median()).astype(int)
    
    # 7. Treatment complexity
    treatment_cols = ['EGFR_TKI', 'Anti_VEGF', 'PD1_PDL1_Inhibitor', 
                      'Antimetabolite', 'Taxane', 'Antibody']
    df['treatment_complexity'] = df[treatment_cols].sum(axis=1)
    
    # 8. Calculate percentage of smokers
    df['smoker_percent'] = 100 - df['no_smoker_percent']
    
    return df

# Apply feature engineering
training_df = add_features(training_df)

# Display the first few rows with the new features
print("Dataset with engineered features:")
training_df[['NCT_ID', 'Arm', 'combo_therapy', 'egfr_targeted', 'high_risk_profile', 
              'novelty_score', 'elderly_male', 'large_trial', 'treatment_complexity', 
              'smoker_percent']].head()

Dataset with engineered features:


Unnamed: 0,NCT_ID,Arm,combo_therapy,egfr_targeted,high_risk_profile,novelty_score,elderly_male,large_trial,treatment_complexity,smoker_percent
0,NCT01364012,Intervention,1,0.0,1,1.0,0,0,3.0,50.0
1,NCT01364012,Control,0,0.0,1,0.0,0,0,1.0,50.0
2,NCT01469000,Intervention,1,100.0,1,1.0,0,0,2.0,36.0
3,NCT01469000,Control,0,100.0,1,1.0,0,0,1.0,28.0
4,NCT02099058,Intervention,0,100.0,1,1.0,0,0,2.0,100.0


## 5. Prepare Data for Modeling, EDA (Helper functions...)

In [5]:
from utilities.utils import prepare_data, run_fillna

ftr = ['gender_male_percent',
 'age_median',
 'no_smoker_percent',
 'ecog_1',
 'smoker_percent',
 'brain_metastase_yes',
 'disease_stage_recurrent',
 'disease_stage_III',
 'disease_stage_IV',
 'EGFR_wild',
 'EGFR_positive_mutation',
 'EGFR_TKI',
 'Platinum_Chemotherapy',
 'Anti_VEGF',
 'PD1_PDL1_Inhibitor',
 'Antimetabolite',
 'Taxane',
 'Antibody',
 'Placebo_Supportive-Care',
 'Chemotherapy',
 'Targeted_Therapy',
 'Immunotherapy',
 'Anti-angiogenic_Other',
 'combo_therapy',
 'treatment_complexity',
 'RCT_with_control_inter',
 'is_arm_control',
 'Population',
 'First-in-Class',
 'Next-Generation']

# Prepare data for modeling
x_train = prepare_data(training_df, ftr)
x_train = run_fillna(x_train)

# Verify data preparation
print(f"Final feature matrix shape: {x_train.shape}")
print(f"Any remaining missing values: {x_train.isnull().any().any()}")
x_train.head()

Final feature matrix shape: (57, 30)
Any remaining missing values: False


Unnamed: 0,gender_male_percent,age_median,no_smoker_percent,ecog_1,smoker_percent,brain_metastase_yes,disease_stage_recurrent,disease_stage_III,disease_stage_IV,EGFR_wild,...,Targeted_Therapy,Immunotherapy,Anti-angiogenic_Other,combo_therapy,treatment_complexity,RCT_with_control_inter,is_arm_control,Population,First-in-Class,Next-Generation
0,54.0,57.0,50.0,75.0,50.0,0.0,3.0,6.0,91.0,73.0,...,0.0,0.0,1.0,1.0,3.0,1.0,0.0,138.0,1.0,0.0
1,56.0,56.0,50.0,80.0,50.0,0.0,2.0,7.0,91.0,74.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,138.0,0.0,0.0
2,35.0,62.0,64.0,69.0,36.0,0.0,17.0,0.0,83.0,0.0,...,1.0,0.0,0.0,1.0,2.0,1.0,0.0,126.0,0.0,1.0
3,37.0,62.0,72.0,68.0,28.0,0.0,12.0,0.0,88.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,65.0,1.0,0.0
4,32.0,60.0,0.0,71.0,100.0,0.0,0.0,0.0,100.0,0.0,...,1.0,0.0,0.0,0.0,2.0,0.0,0.0,28.0,1.0,0.0


## 6. Exploratory Data Analysis

In [6]:
# Define target variable
target_col = "PFS_median_months"

# Calculate and sort correlations with target
correlations = x_train.assign(target=training_df[target_col]).corr()["target"].abs().sort_values(ascending=False)

# Display top correlations
print(f"Top 10 features correlated with {target_col}:")
correlations.head(10)

Top 10 features correlated with PFS_median_months:


target                     1.000000
EGFR_TKI                   0.463644
smoker_percent             0.451801
no_smoker_percent          0.451801
Next-Generation            0.436662
disease_stage_recurrent    0.426289
combo_therapy              0.424210
is_arm_control             0.416556
Anti-angiogenic_Other      0.392182
Anti_VEGF                  0.384324
Name: target, dtype: float64

In [7]:
# Create histogram of target variable
fig = px.histogram(training_df, x=target_col, nbins=30, title=f"Histogram of {target_col}")
fig.show()

# Summary statistics of target variable
print(f"\nSummary statistics for {target_col}:")
print(training_df[target_col].describe())


Summary statistics for PFS_median_months:
count    57.000000
mean      7.447368
std       2.601792
min       4.200000
25%       5.500000
50%       7.100000
75%       9.000000
max      15.800000
Name: PFS_median_months, dtype: float64


In [8]:
training_df

Unnamed: 0,NCT_ID,Arm,Population,intervention,RCT_with_control_inter,gender_male_percent,age_median,no_smoker_percent,ecog_1,brain_metastase_yes,...,is_arm_control,combo_therapy,egfr_targeted,egfr_tki_use,high_risk_profile,novelty_score,elderly_male,large_trial,treatment_complexity,smoker_percent
0,NCT01364012,Intervention,138.0,bevacizumab + platinum doublet chemo (carbopla...,1.0,54.0,57.0,50.0,75.0,0.0,...,0,1,0.0,0.0,1,1.0,0,0,3.0,50.0
1,NCT01364012,Control,138.0,placebo + platinum doublet chemo (carboplatin ...,1.0,56.0,56.0,50.0,80.0,0.0,...,1,0,0.0,0.0,1,0.0,0,0,1.0,50.0
2,NCT01469000,Intervention,126.0,pemetrexed + gefitinib,1.0,35.0,62.0,64.0,69.0,0.0,...,0,1,100.0,100.0,1,1.0,0,0,2.0,36.0
3,NCT01469000,Control,65.0,gefitinib,1.0,37.0,62.0,72.0,68.0,0.0,...,1,0,100.0,100.0,1,1.0,0,0,1.0,28.0
4,NCT02099058,Intervention,28.0,telisotuzumab vedotin + erlotinib,0.0,32.0,60.0,0.0,71.0,0.0,...,0,0,100.0,100.0,1,1.0,0,0,2.0,100.0
5,NCT03515837,Intervention,245.0,pembrolizumab + pemtrexed + platinum-based chemo,1.0,38.0,62.0,66.0,71.0,20.8,...,0,1,0.0,0.0,1,2.0,0,1,3.0,34.0
6,NCT03515837,Control,247.0,placebo + pemtrexed + platinum-based chemo,1.0,39.0,64.0,66.0,62.8,19.0,...,1,0,0.0,0.0,1,0.0,0,1,1.0,34.0
7,NCT03736837,Intervention,57.0,anlotinib + icontinib,0.0,44.0,62.0,68.0,59.6,36.8,...,0,0,0.0,100.0,1,2.0,0,0,2.0,32.0
8,NCT04129502,Intervention,179.0,mobocertinib,1.0,40.0,64.0,54.0,53.0,33.0,...,0,0,100.0,100.0,1,1.0,0,1,1.0,46.0
9,NCT04129502,Control,175.0,platinum-based chemothrapy,1.0,34.0,62.0,61.0,58.0,31.0,...,1,0,0.0,0.0,1,1.0,0,0,1.0,39.0


In [9]:
training_df.to_csv("processed_training_data.csv", index=False)

## 7. Summary and Next Steps

We have successfully:
1. Loaded and preprocessed the clinical trial data
2. Selected relevant features and engineered new ones
3. Prepared the data for modeling by handling missing values
4. Conducted initial exploratory data analysis

The `training_df` dataset is now ready for modeling approaches, including:
- Feature Selection
- Linear regression models
- Tree-based models (Random Forest, LightGBM)
- Average treatment effect (ATE) estimation
- Similarity-based modeling