In [None]:
import pandas as pd
import os
# Load the four source tables exported as CSV files.
participants_df = pd.read_csv('Participants.csv')
diagnosis_df = pd.read_csv('Diagnosis.csv')
survival_df = pd.read_csv('Survival.csv')
studies_df = pd.read_csv('Studies.csv')

# Left-join the diagnosis and survival tables onto the participants via
# 'Participant Id', so every participant row is kept even without a match.
cancer_df = (
    participants_df
    .merge(diagnosis_df, on='Participant Id', how='left')
    .merge(survival_df, on='Participant Id', how='left')
)

# Attach the study-level columns via 'Study ID'.
combined_df = cancer_df.merge(studies_df, on='Study ID', how='left')

# Clean the 'Diagnosis' and 'Anatomic Site' columns by removing the leading
# code and colon. Splitting on the FIRST colon only and keeping the last piece
# means that:
#   * a label that itself contains a colon is no longer truncated, and
#   * a value with no colon at all is kept unchanged instead of becoming NaN
#     (taking element [1] of an unsplit value silently produced NaN).
for col in ['Diagnosis', 'Anatomic Site']:
    combined_df[col] = combined_df[col].str.split(':', n=1).str[-1].str.strip()

# # Convert age columns from months to "years and months"
# def convert_months_to_years_months(months):
#     years = months // 12
#     remaining_months = months % 12
#     return f"{years} years {remaining_months} months"

# age_columns = ['Age at Diagnosis', 'Age at Last Contact', 'Age at Death']  # Example column names

# for col in age_columns:
#     combined_df[col] = combined_df[col].apply(convert_months_to_years_months)

# Drop the redundant 'Study ID_x' / 'Study ID_y' columns (presumably the suffixed
# duplicates produced by the participant-level merges; only the plain 'Study ID'
# used as the join key above is kept). Reassigning instead of inplace=True keeps
# the step a plain expression.
combined_df = combined_df.drop(columns=['Study ID_x', 'Study ID_y'])

# Define the output path (current working directory).
output_filename = 'child_cancer_cleaned.csv'
output_path = os.path.join(os.getcwd(), output_filename)

# Save the combined dataframe to a CSV file
combined_df.to_csv(output_path, index=False)

# Print the full path where the file is saved
print(f"File saved to: {output_path}")

# Statistical Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter

import os
# Read the cleaned CSV produced by the data-preparation step.
file_path = 'child_cancer_cleaned.csv'
data = pd.read_csv(file_path)

# Columns the analyses below rely on; every one of them must hold a usable number.
# NOTE(review): 'Event-Free Survival Status' is coerced to numeric here. If that
# column is textual in the CSV, every value becomes NaN and the dropna below
# removes all rows; confirm the column's encoding.
critical_columns = [
    'Age at Diagnosis (days)',
    'Age at Last Known Survival Status',
    'Age at Event-Free Survival Status',
    'Event-Free Survival Status',
]

# Convert all critical columns in one pass; values that cannot be parsed become NaN.
data[critical_columns] = data[critical_columns].apply(pd.to_numeric, errors='coerce')

# Keep only the rows in which every critical column survived the conversion.
data = data.dropna(subset=critical_columns)


data.describe()

# Age distribution at diagnosis

In [None]:
# Histogram (30 bins) of the age at diagnosis, drawn on the current axes.
ax = plt.gca()
ax.hist(data['Age at Diagnosis (days)'], bins=30, edgecolor='black')
ax.set_xlabel('Age at Diagnosis (days)')
ax.set_ylabel('Number of Patients')
ax.set_title('Age Distribution at Diagnosis')
plt.show()

# Kaplan-Meier Survival Analysis

In [None]:
# Kaplan-Meier survival curves, one per age-at-diagnosis group.
kmf = KaplanMeierFitter()

# Define age groups (bin edges in days, labels in years).
age_bin_edges = [0, 365, 365*5, 365*10, 365*15, 365*20]
age_bin_labels = ['<1 year', '1-5 years', '5-10 years', '10-15 years', '15-20 years']

# include_lowest=True keeps an age of exactly 0 days in the first band; without it
# the first interval is (0, 365] and such rows get a missing group.
# assign() builds a new frame instead of writing a column into `data`, which was
# itself produced by dropna() in the loading cell.
data = data.assign(**{
    'Age Group': pd.cut(
        data['Age at Diagnosis (days)'],
        bins=age_bin_edges,
        labels=age_bin_labels,
        include_lowest=True,
    )
})

# NOTE(review): the duration passed to the fitter is an age column
# ('Age at Last Known Survival Status'), not time elapsed since diagnosis, and it
# is paired with 'Event-Free Survival Status' rather than with
# 'Age at Event-Free Survival Status'. Confirm this pairing is intended.

# Plot every group on one explicit Axes. Iterating the labels (rather than
# .unique()) gives a stable young-to-old legend order and never yields NaN for
# ages outside the bins; groups with no patients are skipped so the fitter is
# never handed an empty selection.
fig, ax = plt.subplots()
for group in age_bin_labels:
    mask = data['Age Group'] == group
    if not mask.any():
        continue
    kmf.fit(
        data.loc[mask, 'Age at Last Known Survival Status'],
        event_observed=data.loc[mask, 'Event-Free Survival Status'],
        label=group,
    )
    kmf.plot_survival_function(ax=ax)

ax.set_title('Kaplan-Meier Survival Curves by Age Group')
ax.set_xlabel('Days')
ax.set_ylabel('Survival Probability')
plt.show()


# Cox proportional hazards model

In [None]:
# Build the modelling frame: the age covariate plus the event and duration columns,
# with incomplete rows removed.
# NOTE(review): the duration column is an age and the event column is the
# event-free status; confirm this is the intended duration/event pairing.
cox_columns = [
    'Age at Diagnosis (days)',
    'Event-Free Survival Status',
    'Age at Last Known Survival Status',
]
cph_data = data[cox_columns].dropna()

# Fit the Cox proportional-hazards model and plot the fitted model.
cph = CoxPHFitter()
cph.fit(
    cph_data,
    duration_col='Age at Last Known Survival Status',
    event_col='Event-Free Survival Status',
)
cph.plot()
plt.title('Cox Proportional-Hazards Model')
plt.show()


# Survival status by diagnosis type


In [None]:
# Count patients for each (diagnosis, survival status) pair, then pivot the
# statuses into columns; combinations that never occur are filled with 0.
status_counts = data.groupby('Diagnosis')['Last Known Survival Status'].value_counts()
survival_by_diagnosis = status_counts.unstack().fillna(0)

# One stacked bar per diagnosis, segmented by survival status.
survival_by_diagnosis.plot.bar(stacked=True)
plt.xlabel('Diagnosis')
plt.ylabel('Number of Patients')
plt.title('Survival Status by Diagnosis Type')
plt.legend(title='Survival Status')
plt.show()


# Evaluating the Effectiveness of GenAI in Personalized Treatment Plans for Pediatric Cancer

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# NOTE(review): this reads 'child_cancer_cleaned1.csv'; the only export visible in
# this notebook writes 'child_cancer_cleaned.csv'. Confirm where this file comes from.
file_path = 'child_cancer_cleaned1.csv'
data = pd.read_csv(file_path)

# Clean the data by removing rows with a NaN in ANY column.
cleaned_data = data.dropna()


def plot_survival_counts(frame, column, filename, horizontal=False, figsize=(10, 6)):
    """Draw, save and show a count plot of `column` split by survival status.

    Replaces four copy-pasted plotting blocks that differed only in the column,
    the output file and (for the diagnosis plot) the orientation.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot; must contain `column` and 'Last Known Survival Status'.
    column : str
        Categorical column to count. Also used in the title and axis label.
    filename : str
        Path the figure is saved to before it is shown.
    horizontal : bool, default False
        If True, categories go on the y-axis and are ordered by descending
        frequency (used for columns with many long labels).
    figsize : tuple, default (10, 6)
        Figure size passed to matplotlib.
    """
    plt.figure(figsize=figsize)
    if horizontal:
        sns.countplot(
            y=column,
            hue='Last Known Survival Status',
            data=frame,
            order=frame[column].value_counts().index,
        )
        plt.xlabel('Count')
        plt.ylabel(column)
    else:
        sns.countplot(x=column, hue='Last Known Survival Status', data=frame)
        plt.xlabel(column)
        plt.ylabel('Count')
    plt.title(f'Survival Rate by {column}')
    plt.legend(title='Survival Status')
    plt.savefig(filename)
    plt.show()


# Visualization 1: Survival Rate by Race
plot_survival_counts(cleaned_data, 'Race', 'survival_rate_by_race.png')

# Visualization 2: Survival Rate by Ethnicity
plot_survival_counts(cleaned_data, 'Ethnicity', 'survival_rate_by_ethnicity.png')

# Visualization 3: Survival Rate by Sex at Birth
plot_survival_counts(cleaned_data, 'Sex at Birth', 'survival_rate_by_sex.png')

# Visualization 4: Survival Rate by Diagnosis (horizontal, most frequent first)
plot_survival_counts(
    cleaned_data,
    'Diagnosis',
    'survival_rate_by_diagnosis.png',
    horizontal=True,
    figsize=(12, 8),
)

# Visualization 5: Age at Diagnosis vs. Survival Status
plt.figure(figsize=(10, 6))
sns.boxplot(x='Last Known Survival Status', y='Age at Diagnosis (days)', data=cleaned_data)
plt.title('Age at Diagnosis vs. Survival Status')
plt.xlabel('Survival Status')
plt.ylabel('Age at Diagnosis (days)')
plt.savefig('age_at_diagnosis_vs_survival_status.png')
plt.show()


# Leveraging Machine Learning to Predict Outcomes in Pediatric Cancer Patients

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
# plot_confusion_matrix was removed from scikit-learn in version 1.2, which made
# this import (and so the whole cell) fail; ConfusionMatrixDisplay replaces it.
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Load the dataset
# NOTE(review): this reads 'child_cancer_cleaned1.csv'; the only export visible in
# this notebook writes 'child_cancer_cleaned.csv'. Confirm where this file comes from.
file_path = 'child_cancer_cleaned1.csv'
data = pd.read_csv(file_path)

# Clean the data by removing rows with a NaN in ANY column. .copy() yields an
# independent frame so the label-encoding assignments below do not write into a
# slice of `data` (which triggers pandas' SettingWithCopyWarning).
cleaned_data = data.dropna().copy()

# Verify that the data is cleaned. info() prints its report itself and returns
# None, so it is called directly; wrapping it in print() emitted a stray "None".
cleaned_data.info()

# Descriptive statistics
desc_stats = cleaned_data.describe(include='all')
print(desc_stats)

# Visualizations 1-3: survival-status counts by Race, Ethnicity and Sex at Birth.
# The three plots differ only in the column and the output file.
demographic_plots = [
    ('Race', 'survival_rate_by_race.png'),
    ('Ethnicity', 'survival_rate_by_ethnicity.png'),
    ('Sex at Birth', 'survival_rate_by_sex.png'),
]
for column, filename in demographic_plots:
    plt.figure(figsize=(10, 6))
    sns.countplot(x=column, hue='Last Known Survival Status', data=cleaned_data)
    plt.title(f'Survival Rate by {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.legend(title='Survival Status')
    plt.savefig(filename)
    plt.show()

# Encode categorical variables as integer codes, keeping each fitted encoder so
# the codes can be mapped back to their original labels later.
label_encoders = {}
for column in ['Race', 'Ethnicity', 'Sex at Birth', 'Diagnosis', 'Anatomic Site', 'Last Known Survival Status']:
    le = LabelEncoder()
    cleaned_data[column] = le.fit_transform(cleaned_data[column])
    label_encoders[column] = le

# Define features and target
# NOTE(review): every remaining column is used as a feature. Any non-numeric
# column left in the CSV will make fit() fail, and any outcome-derived column
# would leak the target; confirm the column list of the input file.
features = cleaned_data.drop(['Participant Id', 'Last Known Survival Status', 'Age at Last Known Survival Status'], axis=1)
target = cleaned_data['Last Known Survival Status']

# Split the data (70% train / 30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Visualization 4: Confusion Matrix
# The matrix is computed over the classes the model was trained on, and those
# codes are mapped back to their labels, so the tick labels always line up with
# the rows/columns even if a class is absent from the test split. Drawing on an
# explicit Axes avoids the blank extra figure the previous plt.figure() call left.
status_encoder = label_encoders['Last Known Survival Status']
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
fig, ax = plt.subplots(figsize=(10, 6))
ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=status_encoder.inverse_transform(model.classes_),
).plot(cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
fig.savefig('confusion_matrix.png')
plt.show()

# Visualization 5: Feature Importance (ten largest importances of the fitted forest)
fig, ax = plt.subplots(figsize=(10, 6))
feature_importances = pd.Series(model.feature_importances_, index=features.columns)
feature_importances.nlargest(10).plot(kind='barh', ax=ax)
ax.set_title('Top 10 Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.savefig('feature_importance.png')
plt.show()


# Cox regression

In [None]:
import pandas as pd
from lifelines import CoxPHFitter, KaplanMeierFitter
import matplotlib.pyplot as plt
# LabelEncoder is used below but was not imported in this cell; it only worked
# when an earlier cell had already imported it into the kernel.
from sklearn.preprocessing import LabelEncoder

# Load the dataset
# NOTE(review): this reads 'child_cancer_cleaned1.csv'; the only export visible in
# this notebook writes 'child_cancer_cleaned.csv'. Confirm where this file comes from.
file_path = 'child_cancer_cleaned1.csv'
data = pd.read_csv(file_path)

# Clean the data by removing rows with a NaN in ANY column. .copy() yields an
# independent frame so the column assignments below do not write into a slice of
# `data` (which triggers pandas' SettingWithCopyWarning).
cleaned_data = data.dropna().copy()

# Prepare data for Cox regression analysis
# Event is 1 where the last known status is exactly 'Dead', otherwise 0.
cleaned_data['Event'] = (cleaned_data['Last Known Survival Status'] == 'Dead').astype(int)
# Duration column: a copy of 'Age at Last Known Survival Status' under a new name.
# NOTE(review): this is an age, not time elapsed since diagnosis; confirm that
# age is the intended time scale for the models below.
cleaned_data['Age at Last Known Survival Status (days)'] = cleaned_data['Age at Last Known Survival Status']

# Encode categorical variables as integer codes.
# NOTE(review): these codes enter the Cox model as ordinary numeric covariates,
# which imposes an arbitrary ordering on nominal categories; confirm this is
# intended (one-hot encoding is the usual alternative).
label_encoders = {}
for column in ['Race', 'Ethnicity', 'Sex at Birth', 'Diagnosis', 'Anatomic Site']:
    le = LabelEncoder()
    cleaned_data[column] = le.fit_transform(cleaned_data[column])
    label_encoders[column] = le

# Fit the Cox proportional hazards model on the covariates plus duration and event.
cox_columns = [
    'Age at Diagnosis (days)',
    'Race',
    'Ethnicity',
    'Sex at Birth',
    'Diagnosis',
    'Anatomic Site',
    'Age at Last Known Survival Status (days)',
    'Event',
]
cph = CoxPHFitter()
cph.fit(
    cleaned_data[cox_columns],
    duration_col='Age at Last Known Survival Status (days)',
    event_col='Event',
)
cox_summary = cph.summary
cph.print_summary()

# Kaplan-Meier analysis over the whole cleaned cohort
kmf = KaplanMeierFitter()
kmf.fit(durations=cleaned_data['Age at Last Known Survival Status (days)'], event_observed=cleaned_data['Event'])

# Plot the Kaplan-Meier estimate on an explicit Axes and save it
fig, ax = plt.subplots(figsize=(10, 6))
kmf.plot_survival_function(ax=ax)
ax.set_title('Kaplan-Meier Survival Curve')
ax.set_xlabel('Days')
ax.set_ylabel('Survival Probability')
fig.savefig('kaplan_meier_survival_curve.png')
plt.show()

# Save the summary of Cox regression analysis
cox_summary.to_csv('cox_regression_summary.csv')


# Complete Code for Kaplan-Meier Curves by Diagnosis

# To perform a comprehensive survival analysis on different factors using the Kaplan-Meier method and Cox proportional hazards model, follow the steps below:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter
from sklearn.preprocessing import LabelEncoder

# Load the dataset
# NOTE(review): this reads 'child_cancer_cleaned1.csv'; the only export visible in
# this notebook writes 'child_cancer_cleaned.csv'. Confirm where this file comes from.
file_path = 'child_cancer_cleaned1.csv'
data = pd.read_csv(file_path)

# Clean the data by removing rows with a NaN in ANY column. .copy() yields an
# independent frame so the column assignments below do not write into a slice of
# `data` (which triggers pandas' SettingWithCopyWarning).
cleaned_data = data.dropna().copy()

# Prepare data for survival analysis
# Event is 1 where the last known status is exactly 'Dead', otherwise 0.
cleaned_data['Event'] = (cleaned_data['Last Known Survival Status'] == 'Dead').astype(int)
# NOTE(review): the duration is an age, not time elapsed since diagnosis; confirm
# that age is the intended time scale.
cleaned_data['Age at Last Known Survival Status (days)'] = cleaned_data['Age at Last Known Survival Status']

# Select top 10 diagnoses with the most occurrences
top_10_diagnoses = cleaned_data['Diagnosis'].value_counts().nlargest(10).index

# Kaplan-Meier Analysis for top 10 diagnoses: one curve per diagnosis on a shared Axes.
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(12, 8))

for diagnosis in top_10_diagnoses:
    mask = cleaned_data['Diagnosis'] == diagnosis
    # .loc selects rows and column in one step instead of chained indexing.
    kmf.fit(
        durations=cleaned_data.loc[mask, 'Age at Last Known Survival Status (days)'],
        event_observed=cleaned_data.loc[mask, 'Event'],
        label=diagnosis,
    )
    kmf.plot_survival_function(ax=ax)

ax.set_title('Kaplan-Meier Survival Curves by Top 10 Diagnoses')
ax.set_xlabel('Days')
ax.set_ylabel('Survival Probability')
ax.legend(title='Diagnosis')
fig.savefig('kaplan_meier_by_top_10_diagnoses.png')
plt.show()

# Encode categorical variables for Cox Proportional Hazards Model
# NOTE(review): the integer codes enter the model as ordinary numeric covariates,
# imposing an arbitrary ordering on nominal categories; confirm this is intended.
encoded_data = cleaned_data.copy()
label_encoders = {}
for column in ['Race', 'Ethnicity', 'Sex at Birth', 'Diagnosis', 'Anatomic Site']:
    le = LabelEncoder()
    encoded_data[column] = le.fit_transform(encoded_data[column])
    label_encoders[column] = le

# Cox Proportional Hazards Model for top 10 diagnoses
# encoded_data shares cleaned_data's index, so the row filter is built from the
# original labels directly rather than by encoding the top-10 labels and matching codes.
cph = CoxPHFitter()
top_10_encoded_data = encoded_data[cleaned_data['Diagnosis'].isin(top_10_diagnoses)]
cox_columns = [
    'Age at Diagnosis (days)',
    'Race',
    'Ethnicity',
    'Sex at Birth',
    'Diagnosis',
    'Anatomic Site',
    'Age at Last Known Survival Status (days)',
    'Event',
]
cph.fit(
    top_10_encoded_data[cox_columns],
    duration_col='Age at Last Known Survival Status (days)',
    event_col='Event',
)
cph.print_summary()

# Save the summary of Cox regression analysis
cox_summary = cph.summary
cox_summary.to_csv('cox_regression_summary_top_10_diagnoses.csv')


# Effect of anatomic site on survival

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter
from sklearn.preprocessing import LabelEncoder

# Load the dataset
# NOTE(review): this reads 'child_cancer_cleaned1.csv'; the only export visible in
# this notebook writes 'child_cancer_cleaned.csv'. Confirm where this file comes from.
file_path = 'child_cancer_cleaned1.csv'
data = pd.read_csv(file_path)

# Clean the data by removing rows with a NaN in ANY column. .copy() yields an
# independent frame so the column assignments below do not write into a slice of
# `data` (which triggers pandas' SettingWithCopyWarning).
cleaned_data = data.dropna().copy()

# Prepare data for survival analysis
# Event is 1 where the last known status is exactly 'Dead', otherwise 0.
cleaned_data['Event'] = (cleaned_data['Last Known Survival Status'] == 'Dead').astype(int)
# NOTE(review): the duration is an age, not time elapsed since diagnosis; confirm
# that age is the intended time scale.
cleaned_data['Age at Last Known Survival Status (days)'] = cleaned_data['Age at Last Known Survival Status']

# Select top 10 anatomic sites with the most occurrences
top_10_sites = cleaned_data['Anatomic Site'].value_counts().nlargest(10).index

# Kaplan-Meier Analysis for top 10 anatomic sites: one curve per site on a shared Axes.
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(12, 8))

for site in top_10_sites:
    mask = cleaned_data['Anatomic Site'] == site
    # .loc selects rows and column in one step instead of chained indexing.
    kmf.fit(
        durations=cleaned_data.loc[mask, 'Age at Last Known Survival Status (days)'],
        event_observed=cleaned_data.loc[mask, 'Event'],
        label=site,
    )
    kmf.plot_survival_function(ax=ax)

ax.set_title('Kaplan-Meier Survival Curves by Top 10 Anatomic Sites')
ax.set_xlabel('Days')
ax.set_ylabel('Survival Probability')
ax.legend(title='Anatomic Site')
fig.savefig('kaplan_meier_by_top_10_anatomic_sites.png')
plt.show()

# Encode categorical variables for Cox Proportional Hazards Model
# NOTE(review): the integer codes enter the model as ordinary numeric covariates,
# imposing an arbitrary ordering on nominal categories; confirm this is intended.
encoded_data = cleaned_data.copy()
label_encoders = {}
for column in ['Race', 'Ethnicity', 'Sex at Birth', 'Diagnosis', 'Anatomic Site']:
    le = LabelEncoder()
    encoded_data[column] = le.fit_transform(encoded_data[column])
    label_encoders[column] = le

# Cox Proportional Hazards Model for top 10 anatomic sites
# encoded_data shares cleaned_data's index, so the row filter is built from the
# original labels directly rather than by encoding the top-10 labels and matching codes.
cph = CoxPHFitter()
top_10_encoded_data = encoded_data[cleaned_data['Anatomic Site'].isin(top_10_sites)]
cox_columns = [
    'Age at Diagnosis (days)',
    'Race',
    'Ethnicity',
    'Sex at Birth',
    'Diagnosis',
    'Anatomic Site',
    'Age at Last Known Survival Status (days)',
    'Event',
]
cph.fit(
    top_10_encoded_data[cox_columns],
    duration_col='Age at Last Known Survival Status (days)',
    event_col='Event',
)
cph.print_summary()

# Save the summary of Cox regression analysis
cox_summary = cph.summary
cox_summary.to_csv('cox_regression_summary_top_10_anatomic_sites.csv')
