Q5. For each tumor type, calculate mutual information (I(X;Y)) between the sample feature and each of the variables in the molecular data file. Then, select the top 100 variables.

Q6. Compare similarity and difference in the top 100 variables among tumor types

Q7. For each tumor type, average the values from the top 100 variables with the Mis as relative weights, generating a single vector. What is C-index for this vector for right-censored overall survival data? Do you observer similar C-index values between Q1 and here? Explain why similar or different?

In [6]:
# MI(RPPA, HRD)
# tumor type 별로 추출해서, 타겟 df 랑 sample ID 맞춤.
# 열 돌려가며 for 문으로 계산
import pandas as pd
from sklearn import metrics

# Initialize variables
error_ttype=[]

# read csv files
# RPPA hadling
df_rppa = pd.read_csv(filepath_or_buffer='./data/TCGA-RPPA-pancan-clean.txt', sep='\\t', encoding='utf-8')
df_rppa['SampleID'] = df_rppa['SampleID'].str[:12]
df_rppa.fillna(0, inplace=True)

# Base: HRD data (modified and matched sampleID to tumor in TCGA-CDR data)
df_hrd = pd.read_csv(filepath_or_buffer='./data/06-HRD_modify.txt', sep='\\t', usecols=['SampleID', 'TumorType', 'HRD'], encoding='utf-8')
df_hrd.fillna(0, inplace=True)

# Extract by tumor
rppa_ttypes = df_rppa['TumorType'].astype('category').cat.categories.tolist()
# --> 31
# len(df_hrd['TumorType'].astype('category').cat.categories.tolist())
# --> 34

# Extract RPPA features
rppa_flist = df_rppa.columns[2:].tolist()

# Main loop
for tumor in rppa_ttypes:
    # Initialize lists
    ttype_list=[]
    f_list=[]
    mi_list=[]
    # Extract matched tumor type datasets
    rppa_tumor = df_rppa.loc[df_rppa['TumorType'] == str(tumor)]
    hrd_tumor = df_hrd[df_hrd['TumorType'] == str(tumor)]
    # ttype_list.append(str(tumor))

    # merge dataframes
    merged_bytype = pd.merge(rppa_tumor, hrd_tumor, on='SampleID')
    
    # Check NaN data in dataframe
    # check_na = merged_gbm.isna().sum()
    try:
        for feature in rppa_flist:
            ttype_list.append(str(tumor))
            f_list.append(feature)
            mi = metrics.mutual_info_score(merged_bytype['HRD'], merged_bytype[str(feature)])
            mi_list.append(mi)
    except:
        error_ttype.append(str(tumor))
        continue

    df_result = pd.DataFrame({'TumorType':ttype_list, 'Feature':f_list, 'mi_score':mi_list})
    df_result.to_csv(path_or_buf='./result/mi_score_'+str(tumor)+'.csv', encoding='utf-8')
        

  df_rppa = pd.read_csv(filepath_or_buffer='./data/TCGA-RPPA-pancan-clean.txt', sep='\\t', encoding='utf-8')
  df_hrd = pd.read_csv(filepath_or_buffer='./data/06-HRD_modify.txt', sep='\\t', usecols=['SampleID', 'TumorType', 'HRD'], encoding='utf-8')


In [3]:
# Example for GBM
# Extract matched tumor type datasets
rppa_gbm = df_rppa.loc[df_rppa['TumorType'] == 'GBM']
hrd_gbm = df_hrd[df_hrd['TumorType'] == 'GBM']

# merge dataframes
merged_gbm = pd.merge(rppa_gbm, hrd_gbm, on='SampleID')

In [None]:
# MI(RPPA, Ploidy)
# tumor type 별로 추출해서, 타겟 df 랑 sample ID 맞춤.
# 열 돌려가며 for 문으로 계산
import pandas as pd
from sklearn import metrics

# Initialize variables
error_ttype=[]

# read csv files
# RPPA hadling
df_rppa = pd.read_csv(filepath_or_buffer='./data/TCGA-RPPA-pancan-clean.txt', sep='\\t', encoding='utf-8')
df_rppa['SampleID'] = df_rppa['SampleID'].str[:12]
df_rppa.fillna(0, inplace=True)

# ploidy data
df_ploidy = pd.read_csv(filepath_or_buffer='./data/03+04-ABSOLUTE ploidy file - TCGA_mastercalls.abs_tables_JSedit.fixed.txt', sep='\\t', usecols=['array', 'plody'], encoding='utf-8')
df_ploidy['SampleID'] = df_ploidy['SampleID'].str[:-3]
df_ploidy.fillna(0, inplace=True)

# Extract by tumor
rppa_ttypes = df_rppa['TumorType'].astype('category').cat.categories.tolist()
# --> 31
# len(df_hrd['TumorType'].astype('category').cat.categories.tolist())
# --> 34

# Extract RPPA features
rppa_flist = df_rppa.columns[2:].tolist()

# Main loop
for tumor in rppa_ttypes:
    # Initialize lists
    ttype_list=[]
    f_list=[]
    mi_list=[]
    # Extract matched tumor type datasets
    rppa_tumor = df_rppa.loc[df_rppa['TumorType'] == str(tumor)]
    ploidy_tumor = df_ploidy[df_ploidy['TumorType'] == str(tumor)]
    # ttype_list.append(str(tumor))

    # merge dataframes
    merged_bytype = pd.merge(rppa_tumor, hrd_tumor, on='SampleID')
    
    # Check NaN data in dataframe
    # check_na = merged_gbm.isna().sum()
    try:
        for feature in rppa_flist:
            ttype_list.append(str(tumor))
            f_list.append(feature)
            mi = metrics.mutual_info_score(merged_bytype['HRD'], merged_bytype[str(feature)])
            mi_list.append(mi)
    except:
        error_ttype.append(str(tumor))
        continue

    df_result = pd.DataFrame({'TumorType':ttype_list, 'Feature':f_list, 'mi_score':mi_list})
    df_result.to_csv(path_or_buf='./result/mi_score_'+str(tumor)+'.csv', encoding='utf-8')
        