In [2]:
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test 

# Base directory that is string-concatenated in front of file names wherever
# PATH is used in this notebook, so it must end with a trailing slash.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
# PATH = '/lclhome/mmamu009/FIU/PhD/BioInfo/lncRNA/DeepCC/Data/survival/'
PATH = '/lclhome/mmamu009/FIU/PhD/BioInfo/lncRNA/SubType/Data/GDC/'

In [4]:
# Find the samples shared by the survival table and the expression table, and attach the survival columns

def seprate_ids_multi(cancer_names=None):
    """Attach survival labels to the lncRNA expression table of each cohort.

    For every cohort code the function reads
    ``../Data/survival/TCGA-<code>.survival.tsv.gz`` and
    ``../Data/TCGA_<code>_htseq_fpkm_sorted_lncRNA-reduced-37-T.csv``, keeps the
    rows whose id (``sample`` in the survival table, ``Ensembl_ID`` in the
    expression table) occurs in both, adds the columns ``E`` (from ``_EVENT``)
    and ``T`` (from ``_TIME_TO_EVENT``) to the expression rows, and writes the
    result, sorted by ``Ensembl_ID``, to
    ``../Data/survival/TCGA_<code>_htseq_fpkm_sorted_lncRNA-reduced-37-T-with-survival.csv``.

    Parameters
    ----------
    cancer_names : iterable of str, optional
        Cohort codes to process. Defaults to the eight cohorts
        BLCA, CESC, COAD, HNSC, KIRP, LGG, LIHC and LUAD.

    Raises
    ------
    ValueError
        If a shared sample id occurs more than once in a survival table, since
        the survival values for that sample would then be ambiguous.
    """
    if cancer_names is None:
        cancer_names = ['BLCA', 'CESC', 'COAD', 'HNSC', 'KIRP', 'LGG', 'LIHC', 'LUAD']

    for i in cancer_names:
        df_survival = pd.read_csv('../Data/survival/TCGA-' + i + '.survival.tsv.gz',
                                  compression='gzip', header=0, sep='\t', quotechar='"')
        df_lncRNA = pd.read_csv('../Data/TCGA_' + i + '_htseq_fpkm_sorted_lncRNA-reduced-37-T.csv')

        # Ids present in both tables.
        key = set(df_lncRNA['Ensembl_ID']) & set(df_survival['sample'])

        df_survival = df_survival.loc[df_survival['sample'].isin(key)]
        if df_survival['sample'].duplicated().any():
            raise ValueError('duplicate sample ids in the survival table for ' + i)
        survival_by_sample = df_survival.set_index('sample')

        # .copy() so the column assignments below act on an independent frame.
        df_lncRNA = (df_lncRNA.loc[df_lncRNA['Ensembl_ID'].isin(key)]
                     .sort_values(by=['Ensembl_ID'])
                     .copy())

        # Look the survival values up by sample id rather than relying on both
        # tables having the same length and order after sorting; this stays
        # correct even when an id appears on several expression rows.
        sample_ids = df_lncRNA['Ensembl_ID'].tolist()
        df_lncRNA['E'] = survival_by_sample.loc[sample_ids, '_EVENT'].to_numpy()
        df_lncRNA['T'] = survival_by_sample.loc[sample_ids, '_TIME_TO_EVENT'].to_numpy()

        df_lncRNA.to_csv('../Data/survival/TCGA_' + i + '_htseq_fpkm_sorted_lncRNA-reduced-37-T-with-survival.csv',
                         index=False)
        print(i, 'finish')

def seprate_ids_single(fname, data_dir=None):
    """Attach the clinical survival endpoints to one expression table.

    Reads ``<dir>TCGA-Clinical-BRCA.csv`` and ``<dir><fname>genes-exp.csv``,
    keeps the rows whose id (``sample`` in the clinical table, ``Ensembl_ID``
    in the expression table) occurs in both, copies the columns ``OS``,
    ``OS.time``, ``DSS``, ``DSS.time``, ``DFI``, ``DFI.time``, ``PFI`` and
    ``PFI.time`` onto the expression rows, and writes the result, sorted by
    ``Ensembl_ID``, to ``<dir><fname>survival.csv``.

    Parameters
    ----------
    fname : str
        File-name prefix of the expression table.
    data_dir : str, optional
        Directory prefix (including the trailing separator). Defaults to the
        module-level ``PATH``.

    Raises
    ------
    ValueError
        If a shared sample id occurs more than once in the clinical table,
        since the survival values for that sample would then be ambiguous.
    """
    base = PATH if data_dir is None else data_dir

    df_survival = pd.read_csv(base + 'TCGA-Clinical-BRCA.csv', header=0)
    df_lncRNA = pd.read_csv(base + fname + 'genes-exp.csv')

    # Ids present in both tables.
    key = set(df_lncRNA['Ensembl_ID']) & set(df_survival['sample'])

    df_survival = df_survival.loc[df_survival['sample'].isin(key)]
    if df_survival['sample'].duplicated().any():
        raise ValueError('duplicate sample ids in the clinical table')
    survival_by_sample = df_survival.set_index('sample')

    # .copy() so the column assignments below act on an independent frame.
    df_lncRNA = (df_lncRNA.loc[df_lncRNA['Ensembl_ID'].isin(key)]
                 .sort_values(by=['Ensembl_ID'])
                 .copy())

    # Look the endpoint values up by sample id rather than relying on both
    # tables having the same length and order after sorting.
    sample_ids = df_lncRNA['Ensembl_ID'].tolist()
    for column in ('OS', 'OS.time', 'DSS', 'DSS.time',
                   'DFI', 'DFI.time', 'PFI', 'PFI.time'):
        df_lncRNA[column] = survival_by_sample.loc[sample_ids, column].to_numpy()

    df_lncRNA.to_csv(base + fname + 'survival.csv', index=False)
    print('finish')

# Run the single-table merge when this code executes as the main module.
if __name__== "__main__":
    # File-name prefix; seprate_ids_single appends 'genes-exp.csv' to read and
    # 'survival.csv' to write.
    filename = 'RF-BRCA-20k-257-'
    seprate_ids_single(filename)


finish


In [None]:
# Split the LGG samples into two halves, stratified by the event indicator 'E'.
df_lncRNA = pd.read_csv('../Data/survival/TCGA_LGG_htseq_fpkm_sorted_lncRNA-reduced-37-T-with-survival.csv')

# 'E' is the event indicator that is later passed to lifelines as
# event_observed: 1 means the event (death) was observed, 0 means censored.
df_dead = df_lncRNA.loc[df_lncRNA['E'] == 1]
df_alive = df_lncRNA.loc[df_lncRNA['E'] == 0]

print('dead sample:', df_dead.shape)
print('alive sample:', df_alive.shape)

# Fixed random_state keeps the split reproducible across runs.
df_dead_h_1 = df_dead.sample(frac=0.5, replace=False, random_state=1)
df_alive_h_1 = df_alive.sample(frac=0.5, replace=False, random_state=1)

# The second half is every row not drawn into the first half. Rows are removed
# by index label so that a repeated 'Ensembl_ID' cannot discard extra rows.
df_dead_h_2 = df_dead.drop(index=df_dead_h_1.index)
df_alive_h_2 = df_alive.drop(index=df_alive_h_1.index)

print('1st half of dead sample:', df_dead_h_1.shape)
print('2nd half of dead sample:', df_dead_h_2.shape)

print('1st half of alive sample:', df_alive_h_1.shape)
print('2nd half of alive sample:', df_alive_h_2.shape)

# Recombine both groups per half and restore the original row order.
df_h_1 = pd.concat([df_dead_h_1, df_alive_h_1], axis=0, join='inner').sort_index()
df_h_2 = pd.concat([df_dead_h_2, df_alive_h_2], axis=0, join='inner').sort_index()

print("first half", df_h_1.shape)
print("2nd half", df_h_2.shape)


In [None]:
# Script for Kaplan-Meier Analysis for single gene developed by Md Abdullah Al Mamun
lncRNAs =[]
def survival_analysis(gene=None, threshold=0.3):
    """Kaplan-Meier curves and log-rank test for one gene, split at a cut-off.

    Reads ``PATH + "TCGA_CESC_htseq_fpkm_sorted_lncRNA-reduced-37-T-with-survival.csv"``,
    splits the samples into a 'High' group (expression >= ``threshold``) and a
    'Low' group, plots both Kaplan-Meier curves on one axes, prints the
    log-rank test summary and returns the test result.

    Parameters
    ----------
    gene : str, optional
        Name of the expression column to analyse. Defaults to the first gene
        column of the table.
    threshold : float, optional
        Expression cut-off separating 'High' from 'Low'. Defaults to 0.3.

    Returns
    -------
    The object returned by ``lifelines.statistics.logrank_test``.

    Raises
    ------
    KeyError
        If ``gene`` is not one of the gene columns.
    ValueError
        If the table has no gene columns, or if the cut-off puts every sample
        into the same group, so no comparison is possible.
    """
    df = pd.read_csv(PATH + "TCGA_CESC_htseq_fpkm_sorted_lncRNA-reduced-37-T-with-survival.csv")

    # Samples without a duration or an event flag carry no survival information;
    # drop them instead of turning them into fabricated time-0 / censored records.
    df = df.dropna(subset=['T', 'E'])

    # assumes the first column is the sample id and the last two are the
    # survival columns, with every column in between a gene — TODO confirm
    gene_columns = df.columns[1:-2]
    if len(gene_columns) == 0:
        raise ValueError('no gene columns found in the survival table')
    if gene is None:
        gene = gene_columns[0]
    elif gene not in gene_columns:
        raise KeyError(gene)

    T = df['T']
    E = df['E']

    # Missing expression values are treated as 0.0.
    groups = df[gene].fillna(0.0).values
    ix = (groups >= threshold)
    if ix.all() or not ix.any():
        raise ValueError('threshold %r puts every sample into one group for %s'
                         % (threshold, gene))

    kmf = KaplanMeierFitter()

    kmf.fit(T[~ix], E[~ix], label='Low')
    ax = kmf.plot()

    kmf.fit(T[ix], E[ix], label='High')
    ax = kmf.plot(ax=ax)
    ax.set_title('%s (cut-off %s)' % (gene, threshold))

    # NOTE(review): alpha is forwarded unchanged; how it is interpreted depends
    # on the installed lifelines version — confirm against its documentation.
    results = logrank_test(T[ix], T[~ix], E[ix], E[~ix], alpha=.99)

    results.print_summary()
    return results

# Run the Kaplan-Meier analysis when this code executes as the main module.
if __name__== "__main__":
    survival_analysis()