In [None]:
!pip install lifelines

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from lifelines import CoxPHFitter
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from lifelines import KaplanMeierFitter
import warnings
warnings.filterwarnings('ignore')

# Load the raw colorectal-cancer patient table from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/colorectal-cancer-patients/Colorectal Cancer Patient Data.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

<div>
<div>ID_REF - Patient ID.</div>
<div>Age (in years) - Patient's age at the time of diagnosis (numeric variable).</div>
<div>Dukes Stage - The stage of colorectal cancer according to the Dukes classification (categorical variable: A, B, C, D).</div>
<div>Gender - The sex of the patient (categorical variable: Male or Female).</div>
<div>Location - Tumor location (categorical variable: Left, Right, Colon, Rectum).</div>
<div>DFS (in months) - Recurrence-free survival in months (numeric variable).</div>
<div>DFS event - Survival event (target: 1 = recurrence or death, 0 = no recurrence).</div>
<div>Adj_Radio - Receipt of radiation therapy (1 = received, 0 = did not receive).</div>
<div>Adj_Chem - Receipt of chemotherapy (1 = received, 0 = did not receive).</div>
<br />
<div>Target variable: DFS event</div>
<div>Time variable: DFS (in months)</div>
</div>

In [None]:
# Drop the patient identifier and the 'Unnamed: 0' column (presumably the row
# counter written when the CSV was saved -- verify against the raw file).
df_cleaned = df.drop(columns=['Unnamed: 0', 'ID_REF'])

# Positional rename to identifier-friendly names; the order must match the
# remaining CSV columns exactly.
df_cleaned.columns = ['Age', 'Dukes_Stage', 'Gender', 'Location', 'DFS_months', 'DFS_event', 'Adj_Radio', 'Adj_Chem']

# Remove incomplete rows, then renumber from 0. Without the reset, any dropped
# row leaves a gap in the index, and frames built later from positional arrays
# (e.g. the PCA output) would no longer align with df_cleaned by label.
df_cleaned = df_cleaned.dropna().reset_index(drop=True)

kmf = KaplanMeierFitter()

# Raw age column; it is not used by the fit below, which is unstratified.
age_group = df_cleaned['Age']
dfs_months = df_cleaned['DFS_months']
dfs_event = df_cleaned['DFS_event']

# Single Kaplan-Meier fit over the whole cohort (no grouping variable).
kmf.fit(durations=dfs_months, event_observed=dfs_event)

<div>
<div>Overall Kaplan-Meier survival curve for the whole cohort (not stratified by age or any other variable)</div>
</div>

In [None]:
# Plot the cohort-wide Kaplan-Meier estimate held by `kmf`.
# An explicit Axes is used so every label is attached to this figure only.
fig, ax = plt.subplots(figsize=(10, 6))
kmf.plot_survival_function(ax=ax)
# The fitted curve is not stratified by age, so the title says "whole cohort".
ax.set_title('Survival analysis (Kaplan-Meier) for the whole cohort')
# Durations come from the DFS column, which is measured in months.
ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival probability')
ax.grid(True)
plt.show()

<div>
<div>Overall analysis shows that survival decreases over time, which is expected in patients with colorectal cancer.</div>
</div>

<div>
<div>Graph of survival analysis depending on the selected groups</div>
</div>

In [None]:
# One-hot encode the categorical covariates (first level of each is the baseline).
# 'Location_Rectum_group' is a helper column that a later plotting cell adds to
# df_cleaned; it is excluded here so that re-running this cell after that one
# does not leak a duplicate of the rectum indicator into the feature matrix.
features_source = df_cleaned.drop(columns=['Location_Rectum_group'], errors='ignore')
df_encoded = pd.get_dummies(features_source, columns=['Dukes_Stage', 'Gender', 'Location'], drop_first=True)

# Feature matrix excludes both the event flag and the follow-up time.
X = df_encoded.drop(columns=['DFS_event', 'DFS_months'])
y = df_encoded['DFS_event']

# Univariate ANOVA F-test of every feature against the event flag.
# NOTE(review): this treats DFS_event as a plain class label and ignores
# follow-up time / censoring, so it is only a rough screening step.
selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X, y)

scores = pd.DataFrame({
    'Feature': X.columns,
    'Score': selector.scores_,
    'P-value': selector.pvalues_
})

# Smallest p-value first, so the feature closest to significance heads the table.
print("Features:")
print(scores.sort_values(by='P-value', ascending=True))

<div>
<div>Since none of the variables turned out to be statistically significant, we will plot the survival rate by the variable closest to significance, which in this case is tumor location in the rectum.</div>
<div><br />We will categorize the patients into two groups: those with tumors located in the rectum and those with tumors in other locations, and analyze their survival probabilities over time.</div>
</div>

In [None]:
# Kaplan-Meier curves for rectal tumours (1) versus every other location (0).
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(10, 6))

# Build the 0/1 indicator once and reuse it for both the median and the column.
is_rectum = df_cleaned['Location'].eq('Rectum').astype(int)
median_location_rectum = is_rectum.median()
df_cleaned['Location_Rectum_group'] = is_rectum

# Sorted group order keeps the legend and curve colours stable regardless of
# which location happens to appear first in the data.
for group in sorted(df_cleaned['Location_Rectum_group'].unique()):
    mask = df_cleaned['Location_Rectum_group'] == group
    # .loc selects rows and column in one step instead of chained indexing.
    kmf.fit(
        df_cleaned.loc[mask, 'DFS_months'],
        event_observed=df_cleaned.loc[mask, 'DFS_event'],
        label=f'Location_Rectum={group}',
    )
    kmf.plot_survival_function(ax=ax)

ax.set_title('Kaplan-Meier Survival Analysis by Location (Rectum)')
# Durations come from the DFS column, which is measured in months.
ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.grid(True)
plt.show()

<div>
<div>Groups of patients with tumors in the rectum and in other locations show differences in survival.</div>
<div><br />Patients with tumors in the rectum have a lower survival rate.</div>
</div>

<div>
<div>Application of PCA for dimensionality reduction</div>
</div>

In [None]:
# Reduce the encoded covariates to two principal components.
# X is passed in as-is (no scaling step), so the component directions follow
# the raw variance of each column.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# One row per component, one column per original feature.
pca_components = pd.DataFrame(
    data=pca.components_,
    index=['PCA1', 'PCA2'],
    columns=X.columns,
)

# Show the loadings with features as rows for easier reading.
print("\nContributions of baseline traits to PCA1 and PCA2:")
print(pca_components.transpose())

<div>
<div>PCA1:</div>
<div>Age (0.999833): Age has the strongest positive influence on PCA1.</div>
<div>Adj_Radio (0.009006): Receipt of radiation therapy contributes positively to PCA1, albeit minimally.</div>
<div>Adj_Chem (0.002962): Receipt of chemotherapy also contributes positively to PCA1.</div>
<div>Dukes_Stage_B (-0.010304): Patients classified as Dukes Stage B show a slight negative influence on PCA1.</div>
<div>Dukes_Stage_C (0.005166): Dukes Stage C has a minor positive influence on PCA1.</div>
<div>Dukes_Stage_D (-0.003641): Dukes Stage D shows a slight negative influence on PCA1.</div>
<div>Gender_Male (0.000284): Male gender has a negligible positive influence on PCA1.</div>
<div>Location_Left (0.001755): Tumor location on the left side contributes positively to PCA1.</div>
<div>Location_Rectum (0.006442): Tumor location in the rectum contributes positively to PCA1.</div>
<div>Location_Right (-0.007264): Tumor location on the right side has a negative influence on PCA1.</div>
<br />
<div>PCA2:</div>
<div>Location_Right (0.592757): Tumor location on the right side has the strongest positive influence on PCA2.</div>
<div>Adj_Chem (0.246288): Receipt of chemotherapy contributes positively to PCA2.</div>
<div>Gender_Male (0.106829): Male gender contributes positively to PCA2.</div>
<div>Dukes_Stage_B (0.137560): Dukes Stage B shows a moderate positive influence on PCA2.</div>
<div>Dukes_Stage_C (-0.452607): Dukes Stage C has the strongest negative influence on PCA2.</div>
<div>Dukes_Stage_D (0.159971): Dukes Stage D contributes positively to PCA2.</div>
<div>Location_Left (-0.409850): Tumor location on the left side has a strong negative influence on PCA2.</div>
<div>Location_Rectum (-0.202928): Tumor location in the rectum also contributes negatively to PCA2.</div>
<div>Adj_Radio (-0.343331): Receipt of radiation therapy has a notable negative impact on PCA2.</div>
</div>

In [None]:
# Attach the survival outcome to the PCA scores.
# The frame reuses X's index: X was derived row-for-row from df_cleaned, so the
# label-based column assignments below line up with the correct patients even
# when dropna() left gaps in df_cleaned's index. With a fresh 0..n-1 index the
# outcome columns would be shifted or filled with NaN in that case.
df_pca = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'], index=X.index)
df_pca['DFS_months'] = df_cleaned['DFS_months']
df_pca['DFS_event'] = df_cleaned['DFS_event']

kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(10, 6))

# Median split on the first component: 'Group 1' is strictly above the median,
# 'Group 2' is at or below it.
median_pca1 = df_pca['PCA1'].median()
df_pca['PCA1_group'] = np.where(df_pca['PCA1'] > median_pca1, 'Group 1', 'Group 2')

# Sorted order keeps legend entries and colours stable across runs.
for group in sorted(df_pca['PCA1_group'].unique()):
    mask = df_pca['PCA1_group'] == group
    # .loc selects rows and column in one step instead of chained indexing.
    kmf.fit(
        df_pca.loc[mask, 'DFS_months'],
        event_observed=df_pca.loc[mask, 'DFS_event'],
        label=f'PCA {group}',
    )
    kmf.plot_survival_function(ax=ax)

ax.set_title('Kaplan-Meier Survival Analysis based on PCA groups')
ax.set_xlabel('Time (months)')
ax.set_ylabel('Survival Probability')
ax.grid(True)
plt.show()

<div>
<div>Patients in Group 2 (PCA1 at or below the median) show better survival probability over time compared to Group 1 (PCA1 above the median).</div>
</div>