In [47]:
from lifelines.utils.sklearn_adapter import sklearn_adapter

from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines.datasets import load_rossi
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import scipy.io as sio
import os
import numpy as np
import pandas as pd
import math
import natsort as ns
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif, SelectFdr, f_classif


In [48]:
# Folder holding the two eLife-23421 figure-2 source-data workbooks.
local_path_luad_lusc = '/Users/rinading/Desktop/UCLA/WQE/data/'

# Each workbook is opened and parsed once; passing a list of sheet names makes
# pd.read_excel return a dict {sheet name -> DataFrame}. header/index_col are
# applied to every requested sheet, exactly as in the one-sheet-per-call form.
_sheet_names = ['imaging', 'expression', 'clinical']

# Development workbook (fig2-data1).
_dev_sheets = pd.read_excel(
    os.path.join(local_path_luad_lusc, 'elife-23421-fig2-data1-v3.xlsx'),
    sheet_name=_sheet_names,
    header=0,
    index_col=0,
)
df_radiomic_dev = _dev_sheets['imaging']
df_genomic_dev = _dev_sheets['expression']
df_clinical_dev = _dev_sheets['clinical']

# Test workbook (fig2-data2).
_test_sheets = pd.read_excel(
    os.path.join(local_path_luad_lusc, 'elife-23421-fig2-data2-v3.xlsx'),
    sheet_name=_sheet_names,
    header=0,
    index_col=0,
)
df_radiomic_test = _test_sheets['imaging']
df_genomic_test = _test_sheets['expression']
df_clinical_test = _test_sheets['clinical']

# df_genomic_dev = df_genomic_dev.T
# df_genomic_test = df_genomic_test.T



In [49]:
# Discard patients whose clinical record is incomplete.
# Development set: a row is removed only when the survival event or the survival
# time is missing; gaps in other clinical columns are tolerated.
df_clinical_dev_no_missing = df_clinical_dev.dropna(
    axis=0,
    subset=['surv.clinical.Event', 'surv.clinical.Time.Months'],
)
# Test set: a row is removed when ANY clinical column is missing.
# NOTE(review): this is stricter than the rule used for the development set —
# confirm the asymmetry is intended.
df_clinical_test_no_missing = df_clinical_test.dropna(axis=0, how='any')

In [50]:
# Strip the three columns that are not used as radiomic features from both the
# development and the test radiomic tables.
df_radiomic_dev = df_radiomic_dev.drop(
    ['radiomics.Structure.Name', 'radiomics.Radiomics.Function', 'radiomics.Mapped.Structure.Name'],
    axis='columns',
)
df_radiomic_test = df_radiomic_test.drop(
    ['radiomics.Structure.Name', 'radiomics.Radiomics.Function', 'radiomics.Mapped.Structure.Name'],
    axis='columns',
)

In [51]:
# Exclude patients whose recorded survival time is exactly 0.
df_clinical_dev_no_missing = df_clinical_dev_no_missing.loc[
    df_clinical_dev_no_missing['surv.clinical.Time.Months'].ne(0)
]
df_clinical_test_no_missing = df_clinical_test_no_missing.loc[
    df_clinical_test_no_missing['surv.clinical.Time.Months'].ne(0)
]

In [52]:
def _matching_positions(wanted_labels, available_labels):
    """Return the positions in ``available_labels`` of every label in ``wanted_labels``.

    The result is ordered by ``wanted_labels``; when a label occurs several times
    in ``available_labels`` all of its positions are emitted in ascending order.
    Labels that are absent contribute nothing. Runs in a single pass over each
    input instead of comparing every pair of labels.
    """
    positions_by_label = {}
    for position, label in enumerate(available_labels):
        if label != label:
            # NaN-like labels never compare equal to anything (not even to
            # themselves), so they can never be a match; keep them out of the
            # lookup table so dict identity shortcuts cannot pair them up.
            continue
        positions_by_label.setdefault(label, []).append(position)

    matches = []
    for label in wanted_labels:
        matches.extend(positions_by_label.get(label, ()))
    return matches


def get_available_patients(df_clinical, df_features):
    """Locate the rows of ``df_features`` that belong to patients in ``df_clinical``.

    Patients are identified by the row index of both frames.

    Parameters
    ----------
    df_clinical : pandas.DataFrame
        Frame whose row index holds the patient labels to look for.
    df_features : pandas.DataFrame
        Frame whose row index holds the patient labels to search in.

    Returns
    -------
    list of int
        Row positions in ``df_features`` (usable with ``.iloc``), ordered by the
        rows of ``df_clinical``.
    """
    return _matching_positions(df_clinical.index, df_features.index)


def get_available_patients_genomic(df_clinical, df_features):
    """Locate the columns of ``df_features`` that belong to patients in ``df_clinical``.

    Same lookup as ``get_available_patients``, except that ``df_features`` keeps
    its patient labels in the column labels rather than in the row index.

    Parameters
    ----------
    df_clinical : pandas.DataFrame
        Frame whose row index holds the patient labels to look for.
    df_features : pandas.DataFrame
        Frame whose column labels hold the patient labels to search in.

    Returns
    -------
    list of int
        Column positions in ``df_features`` (usable with ``.iloc[:, ...]``),
        ordered by the rows of ``df_clinical``.
    """
    return _matching_positions(df_clinical.index, df_features.columns)

In [53]:
# Restrict the radiomic and genomic tables (development and test sets) to the
# patients that still have a clinical record.

# Step 1 - find where those patients sit in each feature table. The radiomic
# tables are searched by row index, the genomic tables by column label.
radiomic_patients_with_clinical_dev_indices = get_available_patients(df_clinical_dev_no_missing, df_radiomic_dev)
radiomic_patients_with_clinical_test_indices = get_available_patients(df_clinical_test_no_missing, df_radiomic_test)
genomic_patients_with_clinical_dev_indices = get_available_patients_genomic(df_clinical_dev_no_missing, df_genomic_dev)
genomic_patients_with_clinical_test_indices = get_available_patients_genomic(df_clinical_test_no_missing, df_genomic_test)

# Step 2 - select by position: rows for radiomic data, columns for genomic data.
df_radiomic_dev_no_missing = df_radiomic_dev.iloc[radiomic_patients_with_clinical_dev_indices]
df_radiomic_test_no_missing = df_radiomic_test.iloc[radiomic_patients_with_clinical_test_indices]
df_genomic_dev_no_missing = df_genomic_dev.iloc[:, genomic_patients_with_clinical_dev_indices]
df_genomic_test_no_missing = df_genomic_test.iloc[:, genomic_patients_with_clinical_test_indices]



In [57]:
# Shape check of the filtered development genomic table; its columns are the
# patients retained by the positional column selection that built this frame.
df_genomic_dev_no_missing.shape

(21766, 223)

In [55]:
#df_clinical_test_no_missing.to_excel('/Users/rinading/Desktop/clinical_test.xlsx')

In [56]:
# Write the filtered genomic tables (development, then test) to Excel files.
for _frame, _destination in (
    (df_genomic_dev_no_missing, '/Users/rinading/Desktop/genomic_dev.xlsx'),
    (df_genomic_test_no_missing, '/Users/rinading/Desktop/genomic_test.xlsx'),
):
    _frame.to_excel(_destination)

In [None]:
# The dev and test sets do not carry the same set of radiomic features, so both
# tables are reduced to the features they share, in development-set column order.
# Reducing only the test table is not enough: any feature present in dev but
# absent from test would leave the two tables with different columns, and a
# scaler fitted on dev could then not be applied to test.

# Column position(s) of every feature name in the test table.
_test_positions_by_feature = {}
for _position, _feature in enumerate(df_radiomic_test_no_missing.columns):
    _test_positions_by_feature.setdefault(_feature, []).append(_position)

common_radiomic_dev_indices = []  # dev columns that also exist in test
common_radiomic_indices = []      # the matching test columns, in dev order
for _position, _feature in enumerate(df_radiomic_dev_no_missing.columns):
    _matches = _test_positions_by_feature.get(_feature)
    if _matches:
        common_radiomic_dev_indices.append(_position)
        common_radiomic_indices.extend(_matches)

# When every dev feature exists in test, the dev table is left with all of its
# columns and only the test table actually shrinks.
df_radiomic_dev_no_missing = df_radiomic_dev_no_missing.iloc[:, common_radiomic_dev_indices]
df_radiomic_test_no_missing = df_radiomic_test_no_missing.iloc[:, common_radiomic_indices]


In [None]:
# Min-max scale the radiomic features. The scaler learns each feature's range
# from the development set only (mapped to 0..1) and is then applied unchanged
# to the test set.
radiomic_scaler = MinMaxScaler(feature_range=(0, 1))
radiomic_scaler.fit(df_radiomic_dev_no_missing)
radiomic_normed_train = radiomic_scaler.transform(df_radiomic_dev_no_missing)
radiomic_normed_test = radiomic_scaler.transform(df_radiomic_test_no_missing)

# Wrap the scaled values back into DataFrames that carry the feature names.
# Only the column labels are restored; the new frames get a default row index.
df_radiomic_dev_no_missing_normed = pd.DataFrame(
    radiomic_normed_train,
    columns=df_radiomic_dev_no_missing.columns,
)
df_radiomic_test_no_missing_normed = pd.DataFrame(
    radiomic_normed_test,
    columns=df_radiomic_test_no_missing.columns,
)

# Replace missing values with the per-feature mean of the same table.
# NOTE(review): the test set is imputed with its own means rather than the
# development-set means — confirm this is intended.
df_radiomic_dev_no_missing_normed = df_radiomic_dev_no_missing_normed.fillna(
    df_radiomic_dev_no_missing_normed.mean()
)
df_radiomic_test_no_missing_normed = df_radiomic_test_no_missing_normed.fillna(
    df_radiomic_test_no_missing_normed.mean()
)



### Converting gene IDs to symbols


In [60]:
# Load the gene-ID table; its 'ID' column is what gets sent to the gene lookup.
ID = pd.read_excel(
    io='/Users/rinading/Desktop/UCLA/WQE/data/GSEA/geneids.xlsx',
)
# Display the table as the cell output.
ID

Unnamed: 0,ID
0,3643
1,84263
2,7171
3,2934
4,11052
...,...
21761,26782
21762,26779
21763,26778
21764,26777


In [61]:
import mygene
# Look up every gene ID through MyGene.info, requesting only the 'symbol' and
# 'name' fields. The cell output shows the client sending the IDs in batches.
mg = mygene.MyGeneInfo()
result = mg.getgenes(
    ID.loc[:, 'ID'].values,
    fields='symbol,name',
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-21766...done.


In [77]:
# Collect one gene symbol per lookup result, in the same order as `result`.
symbols = []
for hit in result:
    try:
        symbols.append(hit['symbol'])
    except (KeyError, TypeError):
        # This hit has no usable 'symbol' field: store NaN so the list stays
        # aligned with `result`. Only these two errors are caught, so a
        # KeyboardInterrupt or an unrelated bug is no longer silently swallowed.
        symbols.append(np.nan)

# Save the symbols as a single-column sheet without the row index.
pd.DataFrame(symbols).to_excel('/Users/rinading/Desktop/UCLA/WQE/data/GSEA/genesymbols.xlsx', index = False)

In [80]:
# Keep the gene symbols as a one-column DataFrame for later use.
pd_symbols = pd.DataFrame(data=symbols)