In [1]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install lifelines -q

In [None]:
# Location on the mounted Drive of the experiment 2b test CSV.
TEST_PATH = "/content/drive/MyDrive/1:1_Shriya_Wagholikar/Datasets/experiment_2b/updated_exp2b_rfe_test.csv"

In [None]:
# Load the test CSV from Drive into a DataFrame and preview its first rows.
import pandas as pd
test_data = pd.read_csv(TEST_PATH)
test_data.head()

In [None]:
import pickle

# Pickled survival model saved for experiment 2b.
MODEL_PATH = "/content/drive/MyDrive/1:1_Shriya_Wagholikar/Results/experiment_2b/FE_RFE_survival_model"

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load models that come from a trusted source.
with open(MODEL_PATH, mode="rb") as model_file:
    cph_final = pickle.load(model_file)

In [None]:
from lifelines.utils import concordance_index

# Partial hazards predicted by the loaded model for every row of the test set.
test_predictions = cph_final.predict_partial_hazard(test_data)

# The hazards are negated before scoring: a higher hazard should rank as a
# shorter disease-free time, while concordance_index ranks by the score itself.
test_c_index = concordance_index(
    event_times=test_data['Disease Free (Months)'],
    predicted_scores=-test_predictions,
    event_observed=test_data['DFS_STATUS_ENCODED'],
)
print(f"Test C-index: {test_c_index}")

Risk Scores vs Patient

In [None]:
# Order the predicted partial hazards from lowest to highest.
sorted_risk_score = test_predictions.sort_values()
# One bar per test row, in ascending order of risk score.
sorted_risk_score.plot.bar(
    ylabel="Risk Score",
    title="Risk Score vs Each Individual",
);

Hazard Ratio vs Covariate Plot

In [None]:
# Hazard ratios exposed by the loaded model (plotted per covariate in the next cell).
hazard_ratios = cph_final.hazard_ratios_
# Ascending order, so the bar chart reads from the smallest to the largest ratio.
hazard_ratios_sort = hazard_ratios.sort_values()

In [None]:
import matplotlib.pyplot as plt

# Explicitly sized figure; adjust width and height if the covariate labels overlap.
fig, ax = plt.subplots(figsize=(15, 10))
hazard_ratios_sort.plot.bar(
    ax=ax,
    ylabel="Hazard Ratio",
    title="Hazard Ratio vs. Covariate",
    fontsize=16,
)
# Tip: add log=True to the bar call above when small hazard ratios are hard
# to read on a linear y axis.
plt.xticks(rotation=90, fontsize=16)
plt.tight_layout()
plt.savefig("/content/drive/MyDrive/1:1_Shriya_Wagholikar/Results/experiment_2b/hazardratiovs_covariate.png")
plt.show()

Time vs Survival Probability vs Reoccurrence Time

In [None]:
# Evaluate the predicted survival curves at the disease-free times found in the test set.
observed_times = test_data["Disease Free (Months)"].to_list()
survival_probabilities_original = cph_final.predict_survival_function(test_data, times=observed_times)
# Sort the rows by their index (the time points) before plotting.
survival_probabilities = survival_probabilities_original.sort_index()
survival_probabilities.head()

In [None]:
#Functions
import random
import numpy as np
import matplotlib.colors as mcolors
def generate_random_colors(n, seed=None):
    """Pick ``n`` distinct matplotlib CSS4 color names at random.

    Args:
        n: Number of color names to return.
        seed: Optional seed. When given, the selection is reproducible and the
            global ``random`` state is left untouched; when ``None`` the global
            ``random`` generator is used, as before.

    Returns:
        A list of ``n`` unique color-name strings.

    Raises:
        ValueError: If ``n`` exceeds the number of available CSS4 color names.
    """
    # Get a list of all matplotlib color names
    color_names = list(mcolors.CSS4_COLORS.keys())
    # Check if the requested number of colors is greater than the available colors
    if n > len(color_names):
        raise ValueError(f"Requested number of colors exceeds available colors ({len(color_names)}).")
    # A private generator keeps seeded calls from disturbing the global random state.
    rng = random if seed is None else random.Random(seed)
    # Select n random colors (sampling without replacement, so no name repeats)
    return rng.sample(color_names, n)

def generate_interpolation(id, index, intervals):
    """Interpolate one patient's survival probability at that patient's recorded time.

    Reads the module-level ``test_data`` and ``survival_probabilities`` frames.

    Args:
        id: Column label in ``survival_probabilities`` selecting the curve.
        index: Positional index into ``test_data["Disease Free (Months)"]``
            giving the time at which the curve is evaluated.
        intervals: x-coordinates (time points) matching the rows of the curve.

    Returns:
        The linearly interpolated value of the curve at the selected time.
    """
    observed_time = test_data["Disease Free (Months)"].to_list()[index]
    return np.interp(observed_time, intervals, survival_probabilities[id])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns of survival_probabilities are the patients; its index holds the time points.
patients_id = list(survival_probabilities.columns)
time_intervals = list(survival_probabilities.index)
color_palatte = generate_random_colors(len(patients_id))
time_occured = test_data["Disease Free (Months)"].to_list()
# Event indicator per test row, the same column passed as event_observed to
# concordance_index. It is read positionally so that it lines up with
# time_occured. (The previous lookup used DISEASE_MAPPER, which is never
# defined in this notebook and raised NameError.)
# NOTE(review): assumes 0 encodes "no recurrence observed" - confirm against the encoding step.
event_observed = test_data["DFS_STATUS_ENCODED"].to_list()

plt.figure(figsize=(15, 8))
for index, patient_id in enumerate(patients_id):
    # Skip rows with status 0 so only patients who experienced recurrence are drawn.
    if int(event_observed[index]) == 0:
        continue
    plt.plot(time_intervals, survival_probabilities[patient_id].to_list(), label=f"Patient {patient_id}", color=color_palatte[index])
    # Mark the point on the curve at the patient's recorded disease-free time.
    plt.scatter(x=time_occured[index], y=generate_interpolation(patient_id, index, time_intervals), color=color_palatte[index])
plt.legend(loc="best")
plt.grid(True)
plt.xlabel("Time")
plt.ylabel('Survival Probability')
plt.title("Time vs Survival Probability vs Re-occurence Time ")
plt.savefig("/content/drive/MyDrive/1:1_Shriya_Wagholikar/Results/experiment_2b/timevs_probabilityvs_reoccurence.png")
plt.show()

Plotting Three Random Patients

In [2]:
# Requires the earlier cells to have been run first: they define
# survival_probabilities, test_data, generate_random_colors, random and plt.
# Running this cell on a fresh kernel raises NameError.

# Fix the seed so the same patients and colors are drawn on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Get patient IDs and time intervals
patients_id = list(survival_probabilities.columns)
time_intervals = list(survival_probabilities.index)
# Select 3 random patients (fewer only when the test set has fewer than 3 rows,
# where random.sample would otherwise raise ValueError)
selected_patients = random.sample(patients_id, min(3, len(patients_id)))
color_palette = generate_random_colors(len(selected_patients))
# Get the time occurred values from the test data
time_occured = test_data["Disease Free (Months)"].to_list()
# Create the plot
plt.figure(figsize=(15, 8))
for index, patient_id in enumerate(selected_patients):
    plt.plot(time_intervals, survival_probabilities[patient_id].to_list(),
             label=f"Patient {patient_id}", color=color_palette[index])
    # Position of this patient among all columns, used to look up the recorded time
    patient_index = patients_id.index(patient_id)
    # Plot dots on top of lines (zorder keeps the marker above the curves)
    plt.scatter(x=time_occured[patient_index],
                y=generate_interpolation(patient_id, patient_index, time_intervals),
                color=color_palette[index], edgecolor='black', zorder=5)

plt.legend(loc="best")
plt.grid(True)
plt.xlabel("Time")
plt.ylabel("Survival Probability")
plt.title("Survival Probability vs Re-occurrence Time vs Time for 3 random patients")
plt.savefig("/content/drive/MyDrive/1:1_Shriya_Wagholikar/Results/experiment_2b/3random_patients.png")
plt.show()

NameError: name 'survival_probabilities' is not defined

Predict Survival Function

In [None]:
# Predict survival functions for the test data. Unlike the earlier call, no
# `times` argument is passed here. The result is iterated column by column
# (one survival curve per column) in the prediction loop further down.
survival_functions = cph_final.predict_survival_function(test_data)

In [None]:
# Survival-probability cut-off used by the prediction loop: a patient is flagged
# at the first time point where the predicted survival probability drops below 0.70
# (which corresponds to a 30% chance of recurrence).
THRESHOLD = 0.70

In [None]:
predictions = []
time_period = []
for _patient, survival_function in survival_functions.items():
    # Time points at which this patient's survival probability is below the threshold.
    below_threshold = survival_function[survival_function < THRESHOLD]
    # First such time point, or None when the curve never drops below the threshold.
    # NOTE(review): "first" relies on the curve's index being in ascending time order - confirm.
    time_point = below_threshold.index[0] if not below_threshold.empty else None
    # Label as "Re-occurence" if such a time point exists, otherwise "No-occurence"
    prediction = "Re-occurence" if time_point is not None else "No-occurence"
    predictions.append(prediction)
    time_period.append(time_point)
# make copy of the original dataframe
pred_test_df = test_data.copy()
# attach the predictions; the column name is derived from THRESHOLD so the two
# cannot drift apart (round() avoids float truncation such as 6.999... -> 6)
column_name = "Re-occurence_{}%".format(round(100 - THRESHOLD * 100))
column_time_name = "Re-occurence-time"
pred_test_df[column_name] = predictions
pred_test_df[column_time_name] = time_period
pred_test_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Recurrence risk in percent implied by THRESHOLD (0.70 survival -> 30%).
risk_percent = round(100 - THRESHOLD * 100)

# plt.figure sets the canvas size; the previous plt.plot(figsize=...) call
# drew nothing and left the figure at its default size.
plt.figure(figsize=(5, 5))
# Pass the frame by keyword so the call does not depend on positional-argument order.
sns.countplot(data=pred_test_df, x=column_name, palette="muted")
plt.ylabel("Frequency")
plt.xlabel(f"Re-occurence at {risk_percent}%")
plt.title(f"Number of Patients with Re-occurence at {risk_percent}%")
plt.grid(True)
plt.savefig("/content/drive/MyDrive/1:1_Shriya_Wagholikar/Results/experiment_2b/30percent_reoccurence.png")
plt.show();

In [None]:
# Absolute gap between the recorded disease-free time and the predicted
# threshold-crossing time; rows without a predicted time stay missing.
# NOTE(review): every row is included, whatever its DFS_STATUS_ENCODED value -
# confirm that comparing against censored rows is intended.
observed_months = pred_test_df["Disease Free (Months)"]
predicted_months = pred_test_df[column_time_name]
pred_test_df["error_time"] = (observed_months - predicted_months).abs()
print("Mean Time Error: {}".format(pred_test_df["error_time"].mean()))