In [None]:
### Start of NOTEBOOK ###

In [None]:
# header files
import os
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
print("Header files loaded!")

In [None]:
# Collect the per-case feature files of each cohort.
# glob.glob() gives no ordering guarantee (it depends on the file system), so
# the paths are sorted to make the row order of the feature matrices
# reproducible across machines and runs.
# NOTE(review): feature rows are later matched to clinical rows purely by
# position - confirm that the sorted file order agrees with the row order of
# tcga_upmc_clinical.csv for each cohort.
tcga_oc_files = sorted(glob.glob("survival_analysis_results/ovary/tcga/*"))
upmc_oc_files = sorted(glob.glob("survival_analysis_results/ovary/upmc/*"))
print(len(tcga_oc_files))
print(len(upmc_oc_files))

In [None]:
# train  - clinical data (output) and features (input)
# Clinical values are read by column position from the shared clinical CSV;
# only rows whose column 1 equals "TCGA" are kept.
# NOTE(review): the meaning of each column (3 = event flag, 4 = survival time,
# 7 = age, 8 = debulking, 9 = BRCA, 10/11/12 = CR/PR/PD) is inferred from the
# variable names - confirm against the CSV header.
train_os_event = []
train_os_days = []
train_age = []
train_debulking = []
train_brca = []
train_cr = []
train_pr = []
train_pd = []
with open("survival_analysis_results/ovary/tcga_upmc_clinical.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    clinical_reader = csv.reader(csvfile)
    next(clinical_reader, None)  # skip the header row (no-op on an empty file)
    for row in clinical_reader:
        if row[1] != "TCGA":
            continue
        train_os_days.append(float(row[4]))
        train_age.append(float(row[7]))
        train_debulking.append(int(row[8]))
        train_brca.append(int(row[9]))
        train_cr.append(int(row[10]))
        train_pr.append(int(row[11]))
        train_pd.append(int(row[12]))
        # Only the exact string "True" counts as an observed event.
        train_os_event.append(row[3] == "True")

# One feature vector per TCGA feature file, in the order of tcga_oc_files.
# NOTE(review): these vectors are paired with the clinical rows above by
# position only - verify that both orders refer to the same cases.
tcga_features = []
for file in tcga_oc_files:
    # Open the path exactly as glob returned it, so this also works on
    # platforms whose path separator is not "/".
    with open(file, newline='') as csvfile:
        # Every value of every row is converted to float and appended, so a
        # file with several rows is flattened into a single vector.
        file_features = [float(value) for row in csv.reader(csvfile) for value in row]
    tcga_features.append(file_features)
print(len(tcga_features))
print(len(train_os_event))
print(len(train_os_days))

In [None]:
# test - clinical data from UPMC (output) and features (input)
# Clinical values are read by column position from the shared clinical CSV;
# only rows whose column 1 equals "UPMC" are kept.
# NOTE(review): the meaning of each column (3 = event flag, 4 = survival time,
# 7 = age, 8 = debulking, 9 = BRCA, 10/11/12 = CR/PR/PD) is inferred from the
# variable names - confirm against the CSV header.
test_os_event_1 = []
test_os_days_1 = []
test_age_1 = []
test_debulking_1 = []
test_brca_1 = []
test_cr_1 = []
test_pr_1 = []
test_pd_1 = []
with open("survival_analysis_results/ovary/tcga_upmc_clinical.csv", newline='', encoding = "ISO-8859-1") as csvfile:
    clinical_reader = csv.reader(csvfile)
    next(clinical_reader, None)  # skip the header row (no-op on an empty file)
    for row in clinical_reader:
        if row[1] != "UPMC":
            continue
        test_os_days_1.append(float(row[4]))
        test_age_1.append(float(row[7]))
        test_debulking_1.append(int(row[8]))
        test_brca_1.append(int(row[9]))
        test_cr_1.append(int(row[10]))
        test_pr_1.append(int(row[11]))
        test_pd_1.append(int(row[12]))
        # Only the exact string "True" counts as an observed event.
        test_os_event_1.append(row[3] == "True")

# One feature vector per UPMC feature file, in the order of upmc_oc_files.
# NOTE(review): these vectors are paired with the clinical rows above by
# position only - verify that both orders refer to the same cases.
upmc_features = []
for file in upmc_oc_files:
    # Open the path exactly as glob returned it, so this also works on
    # platforms whose path separator is not "/".
    with open(file, newline='') as csvfile:
        # Every value of every row is converted to float and appended, so a
        # file with several rows is flattened into a single vector.
        file_features = [float(value) for row in csv.reader(csvfile) for value in row]
    upmc_features.append(file_features)
print(len(upmc_features))
print(len(test_os_event_1))
print(len(test_os_days_1))

In [None]:
# model building (train on TCGA and test on either UPMC/UCLA)
# Turn the loaded Python lists into numpy arrays, cohort by cohort.
train_features, test_features = np.array(tcga_features), np.array(upmc_features)
train_censor, train_days = np.array(train_os_event), np.array(train_os_days)
test_censor, test_days = np.array(test_os_event_1), np.array(test_os_days_1)

# Pair every event flag with its survival time as an [event, time] entry.
train_y = [[event, days] for event, days in zip(train_censor, train_days)]
print(len(train_y))
test_y = [[event, days] for event, days in zip(test_censor, test_days)]
print(len(test_y))

In [None]:
# train and validate
# Risk-group labels; the risk-group cell further down appends to these lists.
test_group = []
train_group = []

# Survival target as a structured array: boolean event flag first, time second.
dt = [('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in train_y], dtype=dt)

# Fit the scaler on the training cohort only and apply the same scaling to
# the test cohort, so no test information leaks into the model.
scaler = MinMaxScaler()
features_train = scaler.fit_transform(train_features)
features_test = scaler.transform(test_features)
features_train_df = pd.DataFrame(features_train)
features_test_df = pd.DataFrame(features_test)

estimator = CoxnetSurvivalAnalysis()
estimator.fit(features_train_df, y_train)

# Predict once per cohort and reuse the scores for the metrics below.
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

# concordance_index_censored returns a tuple whose first entry is the C-index.
# Keep the two cohorts in separate names so neither result is overwritten.
test_score = concordance_index_censored(test_censor, test_days, test_risk_scores)[0]
train_score = concordance_index_censored(train_censor, train_days, train_risk_scores)[0]
score = train_score  # `score` previously ended up holding the train C-index; kept for compatibility
print("Train C-index = " + str(train_score))
print("Test C-index = " + str(test_score))

In [None]:
# get risk groups - Median threshold common choice
# The threshold is the median of the TRAINING risk scores and is applied
# unchanged to both cohorts: 1 = score above the median, 0 = otherwise.
median = np.median(train_risk_scores)

# Build both lists from scratch so that re-running this cell does not append
# a second copy of the labels to lists created in an earlier cell.
train_group = [1 if risk > median else 0 for risk in train_risk_scores]
test_group = [1 if risk > median else 0 for risk in test_risk_scores]

# As before, the counters describe the test cohort once the cell has run.
count_high = sum(test_group)
count_low = len(test_group) - count_high
print(median)

In [None]:
# Dump the training survival times as one "; "-separated line.
print("; ".join(str(days) for days in train_days))

In [None]:
# Encode the training event flags as 0/1 and dump them as one "; "-separated line.
a = [0 if status == False else 1 for status in train_censor]
print("; ".join(str(code) for code in a))

In [None]:
# Dump the training risk scores as one "; "-separated line.
print("; ".join(str(risk) for risk in train_risk_scores))

In [None]:
# Dump the training risk-group labels as one "; "-separated line.
print("; ".join(str(label) for label in train_group))

In [None]:
# Show how many test survival times there are, then dump them "; "-separated.
test_days_count = len(test_days)
print(test_days_count)
print("; ".join(str(days) for days in test_days))

In [None]:
# Encode the test event flags as 0/1, show the count, then dump them "; "-separated.
a = [0 if status == False else 1 for status in test_censor]
print(len(a))
print("; ".join(str(code) for code in a))

In [None]:
# Dump the test risk scores as one "; "-separated line.
print("; ".join(str(risk) for risk in test_risk_scores))

In [None]:
# Show how many test risk-group labels there are, then dump them "; "-separated.
test_group_count = len(test_group)
print(test_group_count)
print("; ".join(str(label) for label in test_group))

In [None]:
### END of NOTEBOOK ###

In [None]:
# Additional work: find prognostic features from model trained above
# estimator.coef_ is indexed as [feature][position along the coefficient path].
# For every feature, look at the FIRST non-zero coefficient along its path:
#   - count  : number of features that have any non-zero coefficient
#   - i      : indices of features whose first non-zero coefficient exceeds
#              COEF_THRESHOLD in absolute value
# The path length is taken from the array itself rather than assumed to be
# 100, so the cell neither fails on a shorter path nor skips columns of a
# longer one.
COEF_THRESHOLD = 0.001
count = 0
i = []
coefficient_path = np.asarray(estimator.coef_)
for index1, feature_path in enumerate(coefficient_path):
    # Same test as `> 0 or < 0`: NaN entries are not treated as non-zero.
    nonzero_positions = np.flatnonzero((feature_path > 0) | (feature_path < 0))
    if nonzero_positions.size == 0:
        continue  # this feature is never selected along the path
    count += 1
    first_nonzero = feature_path[nonzero_positions[0]]
    print(str(index1) + " " + str(first_nonzero))
    if first_nonzero > COEF_THRESHOLD or first_nonzero < -COEF_THRESHOLD:
        i.append(index1)
print()
print(i)
print("Prognostic features count = " + str(count))