In [5]:
"""Feature Data 01"""
from lifelines.utils import concordance_index
import pandas as pd

ci_list=[]
type_list=[]

# Base: Survival data
data_surv = pd.read_csv(filepath_or_buffer='./data/00-TCGA-CDR.csv', encoding='utf-8')

# Extract tumor type in type
tumor_types = data_surv['type'].astype('category').cat.categories.tolist()

for i in range(len(tumor_types)):
    # Extract matched tumor type datasets
    data_tumor = data_surv.loc[data_surv['type'] == tumor_types[i]]
    type_list.append(tumor_types[i])

    # --------

    # Feature data 01
    f01_cnv = pd.read_csv(filepath_or_buffer='./data/01-CNV burden scores - seg_based_scores.tsv', sep='\\t', encoding='utf-8')

    # Remove last three characters in Sample column
    f01_cnv['Sample'] = f01_cnv['Sample'].str[:-3]

    # Extract feature rows by survival bcr_patient_barcode and remain main-feature
    f01_cnv_bytype = f01_cnv[f01_cnv['Sample'].isin(data_tumor['bcr_patient_barcode'])]
    f01_cnv_bytype.rename(columns={'Sample':'bcr_patient_barcode'}, inplace=True)
    f01_cnv_mfeature = f01_cnv_bytype.drop(columns=['n_segs', 'n_extrema'])

    # ---------

    # Preprocessing
    # Common elements between survival - feature
    common_elements = list(set(data_tumor['bcr_patient_barcode']).intersection(f01_cnv_mfeature['bcr_patient_barcode']))

    # Extract common elements from both dataframes
    data_tumor_common = data_tumor[data_tumor['bcr_patient_barcode'].isin(common_elements)]

    # Concat two dataframes by bcr value
    data_merged = pd.merge(data_tumor_common, f01_cnv_mfeature, on='bcr_patient_barcode')

    data_cindex = data_merged.loc[:, ['bcr_patient_barcode', 'OS.time', 'OS', 'frac_altered']]

    data_cindex.fillna(0, inplace=True)

    # concordance_index must have same shape
    c_index = concordance_index(
        event_times=data_cindex['OS.time'],
        event_observed=data_cindex['OS'],
        predicted_scores=-data_cindex['frac_altered'])

    ci_list.append(c_index)
df_result = pd.DataFrame({'tumor_type': type_list, 'frac_altered': ci_list})
df_result.to_csv(path_or_buf='./data/cindex_feature_01.csv', encoding='utf-8')

  f01_cnv = pd.read_csv(filepath_or_buffer='./data/01-CNV burden scores - seg_based_scores.tsv', sep='\\t', encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f01_cnv_bytype.rename(columns={'Sample':'bcr_patient_barcode'}, inplace=True)
  f01_cnv = pd.read_csv(filepath_or_buffer='./data/01-CNV burden scores - seg_based_scores.tsv', sep='\\t', encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f01_cnv_bytype.rename(columns={'Sample':'bcr_patient_barcode'}, inplace=True)
  f01_cnv = pd.read_csv(filepath_or_buffer='./data/01-CNV burden scores - seg_based_scores.tsv', sep='\\t', encoding='utf-8')
A value is trying to be set on a copy of a slice f

In [2]:
"""Feature Data 02"""
from lifelines.utils import concordance_index
import pandas as pd

ci_list=[]
type_list=[]

# Base: Survival data
data_surv = pd.read_csv(filepath_or_buffer='./data/00-TCGA-CDR.csv', encoding='utf-8')

# Extract tumor type in type
tumor_types = data_surv['type'].astype('category').cat.categories.tolist()

for i in range(len(tumor_types)):
    # Extract matched tumor type datasets
    data_tumor = data_surv.loc[data_surv['type'] == tumor_types[i]]
    type_list.append(tumor_types[i])

    # Feature data 02
    f02_loh = pd.read_csv(filepath_or_buffer='./data/02-Aneuploidy and LOG Scores - ABSOLUTE_scores.tsv', sep='\\t', encoding='utf-8')

    # Remove last three characters in Sample column
    f02_loh['Sample'] = f02_loh['Sample'].str[:-3]

    # Extract feature rows by survival bcr_patient_barcode and remain main-feature
    f02_loh_bytype = f02_loh[f02_loh['Sample'].isin(data_tumor['bcr_patient_barcode'])]
    f02_loh_bytype.rename(columns={'Sample':'bcr_patient_barcode'}, inplace=True)
    f02_loh_mfeature = f02_loh_bytype.loc[:, ['bcr_patient_barcode', 'LOH_frac_altered']]
    #f02_loh_mfeature.fillna(0, inplace=True)

    # Preprocessing
    # Common elements between survival - feature
    common_elements = list(set(data_tumor['bcr_patient_barcode']).intersection(f02_loh_mfeature['bcr_patient_barcode']))

    # Extract common elements from both dataframes
    data_tumor_common = data_tumor[data_tumor['bcr_patient_barcode'].isin(common_elements)]

    # Concat two dataframes by bcr value
    data_merged = pd.merge(data_tumor_common, f02_loh_mfeature, on='bcr_patient_barcode')

    data_cindex = data_merged.loc[:, ['bcr_patient_barcode', 'OS.time', 'OS', 'LOH_frac_altered']]

    data_cindex.fillna(0, inplace=True)

    # concordance_index must have same shape
    c_index = concordance_index(
        event_times=data_cindex['OS.time'],
        event_observed=data_cindex['OS'],
        predicted_scores=-data_cindex['LOH_frac_altered'])

    ci_list.append(c_index)
df_result = pd.DataFrame({'tumor_type': type_list, 'LOH_frac_altered': ci_list})
df_result.to_csv(path_or_buf='./data/cindex_feature_02.csv', encoding='utf-8')

  f02_loh = pd.read_csv(filepath_or_buffer='./data/02-Aneuploidy and LOG Scores - ABSOLUTE_scores.tsv', sep='\\t', encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f02_loh_bytype.rename(columns={'Sample':'bcr_patient_barcode'}, inplace=True)
  f02_loh = pd.read_csv(filepath_or_buffer='./data/02-Aneuploidy and LOG Scores - ABSOLUTE_scores.tsv', sep='\\t', encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f02_loh_bytype.rename(columns={'Sample':'bcr_patient_barcode'}, inplace=True)
  f02_loh = pd.read_csv(filepath_or_buffer='./data/02-Aneuploidy and LOG Scores - ABSOLUTE_scores.tsv', sep='\\t', encoding='utf-8')
A value is trying to be set o

In [1]:
"""Feature Data 03+04"""
from lifelines.utils import concordance_index
import pandas as pd

ci_purity_ls=[]
ci_ploidy_ls=[]
type_list=[]

# Base: Survival data
data_surv = pd.read_csv(filepath_or_buffer='./data/00-TCGA-CDR.csv', encoding='utf-8')

# Extract tumor type in type
tumor_types = data_surv['type'].astype('category').cat.categories.tolist()

for i in range(len(tumor_types)):
    # Extract matched tumor type datasets
    data_tumor = data_surv.loc[data_surv['type'] == tumor_types[i]]
    type_list.append(tumor_types[i])

    # Feature data 03: purity
    # Feature data 04: ploidy
    f34_puriploi = pd.read_csv(filepath_or_buffer='./data/03+04-ABSOLUTE ploidy file - TCGA_mastercalls.abs_tables_JSedit.fixed.txt', sep='\\t', usecols=['array', 'purity', 'ploidy'], encoding='utf-8')

    # Remove last three characters in Sample column
    f34_puriploi['array'] = f34_puriploi['array'].str[:-3]

    # Extract feature rows by survival bcr_patient_barcode and remain main-feature
    f34_puriploi_bytype = f34_puriploi[f34_puriploi['array'].isin(data_tumor['bcr_patient_barcode'])]
    f34_puriploi_bytype.rename(columns={'array':'bcr_patient_barcode'}, inplace=True)
    f34_puriploi_mfeature = f34_puriploi_bytype.loc[:, ['bcr_patient_barcode', 'purity', 'ploidy']]
    #f34_puriploi_mfeature.fillna(0, inplace=True)

    # ---------
    # Preprocessing
    # Check common elements between survival - feature
    common_elements = list(set(data_tumor['bcr_patient_barcode']).intersection(f34_puriploi_mfeature['bcr_patient_barcode']))

    # Extract common elements from both dataframes
    data_tumor_common = data_tumor[data_tumor['bcr_patient_barcode'].isin(common_elements)]

    # Concat two dataframes by bcr value
    data_merged = pd.merge(data_tumor_common, f34_puriploi_mfeature, on='bcr_patient_barcode')
    data_cindex = data_merged.loc[:, ['bcr_patient_barcode', 'OS.time', 'OS', 'purity', 'ploidy']]
    data_cindex.fillna(0, inplace=True)

    # concordance_index must have same shape
    # c-index for purity
    ci_purity = concordance_index(
        event_times=data_cindex['OS.time'],
        event_observed=data_cindex['OS'],
        predicted_scores=-data_cindex['purity'])
    ci_purity_ls.append(ci_purity)

    # c-index for ploidy
    ci_ploidy = concordance_index(
        event_times=data_cindex['OS.time'],
        event_observed=data_cindex['OS'],
        predicted_scores=-data_cindex['ploidy'])
    ci_ploidy_ls.append(ci_ploidy)

df_result = pd.DataFrame({'tumor_type':type_list, 'ci_purity':ci_purity_ls, 'ci_ploidy':ci_ploidy_ls})
df_result.to_csv(path_or_buf='./data/cindex_feature_03+04.csv', encoding='utf-8')

  f34_puriploi = pd.read_csv(filepath_or_buffer='./data/03+04-ABSOLUTE ploidy file - TCGA_mastercalls.abs_tables_JSedit.fixed.txt', sep='\\t', usecols=['array', 'purity', 'ploidy'], encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f34_puriploi_bytype.rename(columns={'array':'bcr_patient_barcode'}, inplace=True)
  f34_puriploi = pd.read_csv(filepath_or_buffer='./data/03+04-ABSOLUTE ploidy file - TCGA_mastercalls.abs_tables_JSedit.fixed.txt', sep='\\t', usecols=['array', 'purity', 'ploidy'], encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f34_puriploi_bytype.rename(columns={'array':'bcr_patient_barcode'}, inplace=True)
  f34_puriploi = pd.

In [2]:
"""Feature Data 05"""
from lifelines.utils import concordance_index
import pandas as pd

ci_silent_ls=[]
ci_nonsilent_ls=[]
type_list=[]

# Base: Survival data
data_surv = pd.read_csv(filepath_or_buffer='./data/00-TCGA-CDR.csv', encoding='utf-8')

# Extract tumor type in type
tumor_types = data_surv['type'].astype('category').cat.categories.tolist()

for i in range(len(tumor_types)):
    # Extract matched tumor type datasets
    data_tumor = data_surv.loc[data_surv['type'] == tumor_types[i]]
    type_list.append(tumor_types[i])

    # Feature data 05-1: Silent per Mb
    # Feature data 05-2: Non-silent per Mb
    f05_silent = pd.read_csv(filepath_or_buffer='./data/05-Mutation Load - mutation-load_updated.txt', sep='\\t', usecols=['Patient_ID', 'Silent per Mb', 'Non-silent per Mb'], encoding='utf-8')

    # Extract feature rows by survival bcr_patient_barcode and remain main-feature
    f05_silent_bytype = f05_silent[f05_silent['Patient_ID'].isin(data_tumor['bcr_patient_barcode'])]
    f05_silent_bytype.rename(columns={'Patient_ID':'bcr_patient_barcode'}, inplace=True)
    f05_silent_mfeature = f05_silent_bytype.loc[:, ['bcr_patient_barcode', 'Silent per Mb', 'Non-silent per Mb']]

    # ---------
    # Preprocessing
    # Check common elements between survival - feature
    common_elements = list(set(data_tumor['bcr_patient_barcode']).intersection(f05_silent_mfeature['bcr_patient_barcode']))

    # Extract common elements from both dataframes
    data_tumor_common = data_tumor[data_tumor['bcr_patient_barcode'].isin(common_elements)]

    # Concat two dataframes by bcr value
    data_merged = pd.merge(data_tumor_common, f05_silent_mfeature, on='bcr_patient_barcode')
    data_cindex = data_merged.loc[:, ['bcr_patient_barcode', 'OS.time', 'OS', 'Silent per Mb', 'Non-silent per Mb']]
    data_cindex.fillna(0, inplace=True)

    # concordance_index must have same shape
    # c-index for purity
    ci_silent = concordance_index(
        event_times=data_cindex['OS.time'],
        event_observed=data_cindex['OS'],
        predicted_scores=-data_cindex['Silent per Mb'])
    ci_silent_ls.append(ci_silent)

    # c-index for ploidy
    ci_nonsilent = concordance_index(
        event_times=data_cindex['OS.time'],
        event_observed=data_cindex['OS'],
        predicted_scores=-data_cindex['Non-silent per Mb'])
    ci_nonsilent_ls.append(ci_nonsilent)

df_result = pd.DataFrame({'tumor_type':type_list, 'ci_silent':ci_silent_ls, 'ci_non-silent':ci_nonsilent_ls})
df_result.to_csv(path_or_buf='./data/cindex_feature_05.csv', encoding='utf-8')

  f05_silent = pd.read_csv(filepath_or_buffer='./data/05-Mutation Load - mutation-load_updated.txt', sep='\\t', usecols=['Patient_ID', 'Silent per Mb', 'Non-silent per Mb'], encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f05_silent_bytype.rename(columns={'Patient_ID':'bcr_patient_barcode'}, inplace=True)
  f05_silent = pd.read_csv(filepath_or_buffer='./data/05-Mutation Load - mutation-load_updated.txt', sep='\\t', usecols=['Patient_ID', 'Silent per Mb', 'Non-silent per Mb'], encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f05_silent_bytype.rename(columns={'Patient_ID':'bcr_patient_barcode'}, inplace=True)
  f05_silent = pd.read_csv(fil

In [2]:
"""Feature Data 06"""
from lifelines.utils import concordance_index
import pandas as pd

ci_hrd_ls=[]
type_list=[]

# Base: Survival data
data_surv = pd.read_csv(filepath_or_buffer='./data/00-TCGA-CDR.csv', encoding='utf-8')

# Extract tumor type in type
tumor_types = data_surv['type'].astype('category').cat.categories.tolist()

for i in range(len(tumor_types)):
    # Extract matched tumor type datasets
    data_tumor = data_surv.loc[data_surv['type'] == tumor_types[i]]
    type_list.append(tumor_types[i])

    # Feature data 06: HRD
    f06_hrd = pd.read_csv(filepath_or_buffer='./data/06-Homologous Repair Deficiency - TCGA.HRD_withSampleID.txt', sep='\\t', usecols=['sampleID', 'HRD'], encoding='utf-8')
    
    # Remove last three characters in Sample column
    f06_hrd['sampleID'] = f06_hrd['sampleID'].str[:-3]

    # Extract feature rows by survival bcr_patient_barcode and remain main-feature
    f06_hrd_bytype = f06_hrd[f06_hrd['sampleID'].isin(data_tumor['bcr_patient_barcode'])]
    f06_hrd_bytype.rename(columns={'sampleID':'bcr_patient_barcode'}, inplace=True)
    f06_hrd_mfeature = f06_hrd_bytype.loc[:, ['bcr_patient_barcode', 'HRD']]

    # ---------
    # Preprocessing
    # Check common elements between survival - feature
    common_elements = list(set(data_tumor['bcr_patient_barcode']).intersection(f06_hrd_mfeature['bcr_patient_barcode']))

    # Extract common elements from both dataframes
    data_tumor_common = data_tumor[data_tumor['bcr_patient_barcode'].isin(common_elements)]

    # Concat two dataframes by bcr value
    data_merged = pd.merge(data_tumor_common, f06_hrd_mfeature, on='bcr_patient_barcode')
    data_cindex = data_merged.loc[:, ['bcr_patient_barcode', 'OS.time', 'OS', 'HRD']]
    data_cindex.fillna(0, inplace=True)

    # concordance_index must have same shape
    # c-index for HRD
    ci_hrd = concordance_index(
        event_times=data_cindex['OS.time'],
        event_observed=data_cindex['OS'],
        predicted_scores=-data_cindex['HRD'])
    ci_hrd_ls.append(ci_hrd)

df_result = pd.DataFrame({'tumor_type':type_list, 'ci_hrd':ci_hrd_ls})
df_result.to_csv(path_or_buf='./data/cindex_feature_06.csv', encoding='utf-8')

  f06_hrd = pd.read_csv(filepath_or_buffer='./data/06-Homologous Repair Deficiency - TCGA.HRD_withSampleID.txt', sep='\\t', usecols=['sampleID', 'HRD'], encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f06_hrd_bytype.rename(columns={'sampleID':'bcr_patient_barcode'}, inplace=True)
  f06_hrd = pd.read_csv(filepath_or_buffer='./data/06-Homologous Repair Deficiency - TCGA.HRD_withSampleID.txt', sep='\\t', usecols=['sampleID', 'HRD'], encoding='utf-8')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f06_hrd_bytype.rename(columns={'sampleID':'bcr_patient_barcode'}, inplace=True)
  f06_hrd = pd.read_csv(filepath_or_buffer='./data/06-Homologous Repair Deficiency

In [None]:
# feature 마다 한번에 묶는것 비추
# feature - survival 사이에 공통 데이터 갯수 차이가 조금 있음.
# 따로 해서, c-index 구해야함.

In [1]:
# Merge all
import pandas as pd

df_f1 = pd.read_csv(filepath_or_buffer='./data/cindex_feature_01.csv')
df_f2 = pd.read_csv(filepath_or_buffer='./data/cindex_feature_02.csv')
df_f3 = pd.read_csv(filepath_or_buffer='./data/cindex_feature_03+04.csv')
df_f5 = pd.read_csv(filepath_or_buffer='./data/cindex_feature_05.csv')
df_f6 = pd.read_csv(filepath_or_buffer='./data/cindex_feature_06.csv')

df_merged12 = pd.merge(df_f1, df_f2, on='tumor_type')
df_merged35 = pd.merge(df_f3, df_f5, on='tumor_type')
df_merged1235 = pd.merge(df_merged12, df_merged35, on='tumor_type')
df_merged = pd.merge(df_merged1235, df_f6, on='tumor_type')

df_merged.to_csv(path_or_buf='./data/cindex_overall.csv', encoding='utf-8')

In [None]:
# predict_partial_hazard model
from lifelines.utils import concordance_index
from lifelines import CoxPHFitter

# numeric data
numeric_list = data_cindex.select_dtypes(include=[int, float]).columns.tolist()
data_cindex_numeric = data_cindex.loc[:, numeric_list]

#C-index
# Survival Analysis에서 가장 많이 사용하는 정확도 지표입니다. 대상의 정확한 생존 시간을 평가하지 않고,
# 대신 여러 대상의 생존 시간(또는 위험)을 상대적으로 비교합니다. 즉, 사망 순서를 잘 예측하는지 판단합니다.

cph = CoxPHFitter().fit(data_cindex_numeric, duration_col='OS.time', event_col='OS')
c_index = concordance_index(
    event_times=data_cindex['OS.time'],
    predicted_scores=-cph.predict_partial_hazard(data_cindex_numeric),
    event_observed=data_cindex['OS'])
cph.check_assumptions(data_cindex_numeric, p_value_threshold=0.05, show_plots=True)
cph.print_summary()

#adding a penalizer to the model,
#ex: CoxPHFitter(penalizer=0.1).fit(…) until the model converges.
#In the print_summary(), the coefficients that have high collinearity will have large (absolute) magnitude in the coefs column.

In [None]:
# Feature data 05: Mutation Load
f05_muload = pd.read_csv(filepath_or_buffer='./data/05-Mutation Load - mutation-load_updated.txt', sep='\\t', encoding='utf-8')

# Drop unuse column
f05_muload.drop(['Tumor_Sample_ID'], inplace=True)

# ----------