In [2]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Load the cleaned rCBV measurements workbook into a DataFrame
patient_data = pd.read_excel(io='cleaned_rCBVmeasurements.xlsx')

In [4]:
# Preview the first five rows of the patient table
print(patient_data.head(n=5))

In [None]:
# Convert the columns used in the analysis to a numeric dtype
# (pd.to_numeric raises if a value cannot be parsed).
patient_data['time_to_recurrence'] = pd.to_numeric(patient_data['time_to_recurrence'])
patient_data['days_used_for_calculation'] = pd.to_numeric(patient_data['days_used_for_calculation'])

# Binary feature: 1 when rcbv_ratio is above 1.5, otherwise 0.
# NOTE: rows with a missing rcbv_ratio compare False and are therefore labelled 0.
patient_data['elevated_perfusion'] = np.where(patient_data['rcbv_ratio'] > 1.5, 1, 0)
print(patient_data['elevated_perfusion'].value_counts())

# Boxplot of days_used_for_calculation for the two perfusion groups.
# The subtitle names the actual grouping rule (rcbv_ratio threshold of 1.5).
plt.figure(figsize=(10, 6))
sns.boxplot(data=patient_data, x='elevated_perfusion', y='days_used_for_calculation')
plt.title('Boxplot of Time to Recurrence for Patients\n(rCBV ratio <= 1.5 vs rCBV ratio > 1.5)')
plt.xlabel('Elevated perfusion')
plt.ylabel('Time to Recurrence (days)')
plt.show()

In [16]:
# Preview the table, then report how many patients fall into each
# treatment and operation category
print(patient_data.head())
for column_name in ('treatment', 'operation'):
    print(patient_data[column_name].value_counts())
# print(patient_data['sex'].value_counts())

In [10]:
# Read the second dataset (treatment.csv) into its own DataFrame
treatment_data_path = 'treatment.csv'
treatment_data = pd.read_csv(filepath_or_buffer=treatment_data_path)

In [11]:
# Rescale 'rectime' into 'rectime_days'; the 1.083 factor was calculated from experimental data
treatment_data['rectime_days'] = treatment_data['rectime'].mul(1.083)

In [30]:
# Distinct values present in the 'theta' and 'r_d' columns, shown as a pair
tuple(treatment_data[column].unique() for column in ('theta', 'r_d'))

In [52]:
# Split recurrence times by the sign of kappa, keeping only rows with
# r_d >= 0.1 and 0.3 < theta <= 0.9 (the same range filter for both groups).
in_parameter_range = (
    (treatment_data['r_d'] >= 0.1)
    & (treatment_data['theta'] > 0.3)
    & (treatment_data['theta'] <= .9)
)
group1_kappa_positive = treatment_data.loc[
    (treatment_data['kappa'] > 0) & in_parameter_range, 'rectime_days']
group2_kappa_negative = treatment_data.loc[
    (treatment_data['kappa'] < 0) & in_parameter_range, 'rectime_days']

# Series and tick labels, in plotting order
labels_kappa = ['Kappa > 0', 'Kappa < 0']
boxplot_data_kappa = [group1_kappa_positive, group2_kappa_negative]

# Draw one box per kappa group
plt.figure(figsize=(10, 6))
sns.boxplot(data=boxplot_data_kappa)
plt.xticks(ticks=[0, 1], labels=labels_kappa)
plt.title('Comparison of Recurrence Time for Kappa > 0 and Kappa < 0')
plt.ylabel('Recurrence Time')
plt.show()


In [53]:
# Patient outcome split by the binary perfusion flag (0 = not elevated, 1 = elevated)
data_for_boxplot = [patient_data[patient_data['elevated_perfusion'] == 0]['days_used_for_calculation'],
                    patient_data[patient_data['elevated_perfusion'] == 1]['days_used_for_calculation']]

labels_rcbv = ['No elevated perfusion', 'elevated perfusion']

# Plot the two patient groups and the two kappa groups side by side on one axes.
# Relies on boxplot_data_kappa / labels_kappa from the kappa boxplot cell.
plt.figure(figsize=(10, 6))
sns.boxplot(data=data_for_boxplot+boxplot_data_kappa)
# The subtitle describes all four groups actually shown, not only the patient ones.
plt.title('Boxplot of Time to Recurrence\n(Patients by elevated perfusion vs treatment data by kappa sign)')
plt.xlabel('Group')
plt.ylabel('Time to Recurrence (days)')
plt.xticks([0, 1, 2, 3], labels_rcbv+labels_kappa)
plt.show()

In [49]:
# TODO: create barplots, two groups of two bars next to each other:
# no elevation vs kappa > 0, and elevation vs kappa < 0.
# NOTE(review): the call below passes no data, so this cell is still a placeholder.
sns.barplot()

In [46]:
# Preprocess the data for regression (Linear Regression and Random Forest Regressor)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.linear_model import LinearRegression


# Model inputs and the regression target
target = 'days_used_for_calculation'
features = ['sex', 'age', 'rcbv_ratio', 'operation']

X, y = patient_data[features], patient_data[target]

# Report only the feature columns that contain at least one missing value
null_counts = X.isnull().sum()
missing_values = null_counts[null_counts > 0]

print(missing_values)

In [47]:
from sklearn.preprocessing import FunctionTransformer

# Numeric columns: passed through unchanged (FunctionTransformer without a func is the identity)
numeric_features = ['age', 'rcbv_ratio']
numeric_transformer = Pipeline(steps=[('identity', FunctionTransformer())])

# Categorical columns: one-hot encoded, binary categories collapsed to a single column
categorical_features = ['sex', 'operation']
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore', drop='if_binary'))]
)

# Column-wise preprocessing: numeric block first, then the encoded categorical block
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# Fit the preprocessing on the training rows only, then reuse it on the test rows
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

X_train_transformed.shape, X_test_transformed.shape

In [48]:
# Create the Linear Regression model and fit it on the training data (fit returns the estimator)
linear_regression = LinearRegression().fit(X_train_transformed, y_train)

# Predict the held-out rows
y_pred = linear_regression.predict(X_test_transformed)

# Test-set error: Mean Squared Error (MSE) and Mean Absolute Error (MAE)
mse, mae = mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")

In [50]:
# Scatter of rcbv_ratio against days_used_for_calculation with a fitted line (no confidence band)
sns.regplot(data=patient_data, x='rcbv_ratio', y='days_used_for_calculation', ci=None)

In [51]:
# Build a 10-tree Random Forest Regressor (seeded) and train it in one step
random_forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train_transformed, y_train)

# Predictions for the held-out rows
y_pred = random_forest.predict(X_test_transformed)

# Test-set error metrics
mse, mae = mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

mse, mae

In [52]:
# Extract the feature importances from the trained model
importances = random_forest.feature_importances_

# Names of the one-hot encoded columns. They are stored under a new name so the
# original `categorical_features` list is not overwritten; overwriting it made
# this cell fail on a second run, because the encoded names were then passed
# back in as the encoder's input feature names.
encoded_categorical_features = (
    preprocessor.named_transformers_['cat']['onehot']
    .get_feature_names_out(categorical_features)
)

# Transformed column order follows the ColumnTransformer: numeric block, then encoded categoricals.
# `features` is intentionally rebound to the transformed names; later cells read it.
features = np.concatenate([numeric_features, encoded_categorical_features])

# Sort the feature importances in descending order
indices = np.argsort(importances)[::-1]

# Plot the feature importances as a bar chart
plt.figure(figsize=(10, 6))
plt.title("Feature importances")
plt.bar(range(len(features)), importances[indices])
plt.xticks(range(len(features)), features[indices], rotation=90)
plt.show()

In [53]:
# plot the first five trees of the random forest
from sklearn.tree import plot_tree

# Alias the fitted forest and draw its first five trees, truncated to depth 2
model = random_forest
tree_feature_names = list(features)
for tree_index in range(5):
    plt.figure(figsize=(10, 6))
    plot_tree(model.estimators_[tree_index], feature_names=tree_feature_names,
              filled=True, max_depth=2)
    plt.show()

# plt.savefig('tree_continuous_rcbv.png', dpi=300)

In [54]:
from sklearn.inspection import PartialDependenceDisplay

# Column position of 'rcbv_ratio' among the transformed feature names
feature_names_list = list(features)
feature_idx = feature_names_list.index('rcbv_ratio')

# Partial dependence of the forest's prediction on that single column.
# NOTE: the name `display` shadows IPython's display() helper for the rest of the session.
display = PartialDependenceDisplay.from_estimator(
    random_forest,
    X_train_transformed,
    features=[feature_idx],
    feature_names=features,
)

In [18]:
# Feature set that uses the binary 'elevated_perfusion' flag instead of the raw ratio
features = ['sex', 'age', 'operation', 'elevated_perfusion']

X, y = patient_data[features], patient_data[target]

# Numeric branch: standardise 'age'
numeric_features = ['age']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode, binary categories collapsed to a single column
categorical_features = ['sex', 'operation', 'elevated_perfusion']
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore', drop='if_binary'))]
)

# Column-wise preprocessing: numeric block first, then the encoded categorical block
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 30% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Fit the preprocessing on the training rows, then apply it to the test rows
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# 100-tree Random Forest Regressor, trained on the transformed training rows
random_forest = RandomForestRegressor(n_estimators=100, random_state=0)
random_forest.fit(X_train_transformed, y_train)

# Importances reported by the fitted forest, plus their descending sort order
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Recover the transformed column names from the fitted preprocessor
# (both input-name variables are rebound here: numeric names, then encoded names)
numeric_features = preprocessor.transformers_[0][2]
onehot_step = preprocessor.named_transformers_['cat']['onehot']
categorical_features = onehot_step.get_feature_names_out(categorical_features)
features = np.concatenate([numeric_features, categorical_features])

# Test-set error metrics
y_pred = random_forest.predict(X_test_transformed)
mse, mae = mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")

# Bar chart of the importances, largest first
bar_positions = range(len(features))
plt.figure(figsize=(10, 6))
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, features[indices], rotation=90)
plt.show()

In [None]:
# The feature set defined just above replaces 'rcbv_ratio' with the binary
# 'elevated_perfusion' flag, so looking up 'rcbv_ratio' raised ValueError.
# Find the one-hot encoded column derived from 'elevated_perfusion' instead
# (its suffix depends on the categories seen by the encoder).
# NOTE(review): assumes `features`, `random_forest` and `X_train_transformed`
# still come from the elevated_perfusion model cell — confirm run order.
perfusion_columns = [
    position for position, name in enumerate(features)
    if str(name).startswith('elevated_perfusion')
]
if not perfusion_columns:
    raise ValueError("No 'elevated_perfusion' column found in the transformed feature names")
feature_idx = perfusion_columns[0]

# Create the PartialDependenceDisplay, labelling the axis with the transformed feature names
display = PartialDependenceDisplay.from_estimator(
    random_forest, X_train_transformed, features=[feature_idx], feature_names=features)

## Now let's repeat the same process for the synthetic dataset

In [73]:
# Read treatment.csv again as the synthetic dataset and preview its first five rows
synth_data = pd.read_csv(filepath_or_buffer='treatment.csv')
synth_data.head(n=5)

In [85]:
# Define the features and the target variable
features = ['kappa', 'theta', 'r_d']
target = 'rectime_clean'

# 'rectime_clean' starts as a copy of 'rectime'; missing values are filled below
synth_data['rectime_clean'] = synth_data['rectime'].copy()
nan_rectime = synth_data['rectime'].isna()

# If there are any NaN values, replace them with the maximum of 'rectime'.
# The filled Series is assigned back to the column: a chained
# `synth_data[col].fillna(..., inplace=True)` acts on a temporary object and does
# not update the DataFrame under pandas copy-on-write (pandas 2.x warns about it).
if nan_rectime.sum() > 0:
    max_rectime = synth_data['rectime'].max()
    synth_data['rectime_clean'] = synth_data['rectime_clean'].fillna(max_rectime)

X = synth_data[features]
y = synth_data[target]


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize the Random Forest model
random_forest = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit the model on the training data
random_forest.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = random_forest.predict(X_test)

# Calculate and print the Mean Squared Error (MSE) and Mean Absolute Error (MAE)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")

In [86]:
# Importances reported by the fitted forest and their descending sort order
importances = random_forest.feature_importances_
indices = np.flip(np.argsort(importances))

# Bar chart of the importances, largest first, labelled with the feature names
bar_positions = range(len(features))
sorted_feature_names = np.array(features)[indices]

plt.figure(figsize=(10, 6))
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices])
plt.xticks(bar_positions, sorted_feature_names, rotation=90)
plt.show()

In [91]:
# Get the index of the kappa feature among the synthetic-data features
feature_idx = list(features).index('kappa')

# Create the PartialDependenceDisplay on the data the current forest was trained on.
# The forest above is fitted on X_train (kappa, theta, r_d); X_train_transformed is
# the preprocessed patient matrix left over from earlier cells and does not match
# this model's inputs.
display = PartialDependenceDisplay.from_estimator(
    random_forest, X_train, features=[feature_idx], feature_names=features)

In [94]:
# Alias the fitted forest and draw its first five trees, truncated to depth 2
model = random_forest
tree_feature_names = list(features)
for tree_index in range(5):
    plt.figure(figsize=(10, 6))
    plot_tree(model.estimators_[tree_index], feature_names=tree_feature_names,
              filled=True, max_depth=2)
    plt.show()


In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination down to a single feature.
# The inner forest is seeded (random_state=0, as for the other models in this
# notebook) so the selected feature is the same on every run.
rfe = RFE(estimator=RandomForestRegressor(random_state=0), n_features_to_select=1)

# Fit the selector on the full feature matrix and target
rfe = rfe.fit(X, y)

# Boolean mask marking the feature(s) that were kept
selected_features = rfe.support_

# Print the names of the selected features
print('Selected features:')
print(np.array(features)[selected_features])