In [1]:
from concrete_autoencoder import ConcreteAutoencoderFeatureSelector
from keras.layers import Dense, Dropout, LeakyReLU
import numpy as np

def f(x):
    x = Dense(300)(x)
    x = LeakyReLU(0.1)(x)
    x = Dropout(0.1)(x)
    x = Dense(300)(x)
    x = LeakyReLU(0.1)(x)
    x = Dropout(0.1)(x)
    x = Dense(19962)(x)
    return x

selector = ConcreteAutoencoderFeatureSelector(K = 100, output_function = f, num_epochs = 8000, learning_rate=0.002, start_temp=10, min_temp=0.1, )

(60000, 784) (60000, 10)
(10000, 784) (10000, 10)


In [2]:
def get_expression_matrix(sample_sheet, data_type, data_directory):
    overall_df = None
    for i, row in sample_sheet.iterrows():
        sample_subdir = row["File ID"]
        sample_filename = row["File Name"]
        patient_id = row["Sample ID"]

        filename = f"{data_directory}/{sample_subdir}/{sample_filename}"

        sample_df = pd.read_csv(filename, sep='\t',skiprows=[0])
    
        sample_df = sample_df[ (sample_df['gene_type'] == 'protein_coding')] 

        if overall_df is None:
            overall_df = sample_df[["gene_name"]]

        sample_df = sample_df[[data_type]]
    
        sample_df.columns = [patient_id]

        duplicated_cols = set(overall_df.columns) & set(sample_df.columns)

        sample_df = sample_df.drop(columns=duplicated_cols)

        overall_df = overall_df.join(sample_df) # TODO investigate duplicate sample IDs

    return overall_df    

def get_expression_matrices(sample_sheet_path, expression_directory):
    sample_sheet = pd.read_csv(sample_sheet_path, sep="\t") 

    sample_sheet_tumors = sample_sheet[sample_sheet["Sample Type"] == "Primary Tumor"]
    sample_sheet_normals = sample_sheet[sample_sheet["Sample Type"] == "Solid Tissue Normal"]

    tumor_tpm = get_expression_matrix(sample_sheet_tumors, "tpm_unstranded", expression_directory)
    normal_tpm = get_expression_matrix(sample_sheet_normals, "tpm_unstranded", expression_directory)

    tumor_fpkm = get_expression_matrix(sample_sheet_tumors, "fpkm_unstranded", expression_directory)
    normal_fpkm = get_expression_matrix(sample_sheet_normals, "fpkm_unstranded", expression_directory)

    tumor_fpkm_uq = get_expression_matrix(sample_sheet_tumors, "fpkm_uq_unstranded", expression_directory)
    normal_fpkm_uq = get_expression_matrix(sample_sheet_normals, "fpkm_uq_unstranded", expression_directory)

    return ((tumor_tpm, normal_tpm), (tumor_fpkm, normal_fpkm), (tumor_fpkm_uq, normal_fpkm_uq))

import numpy as np  

def log2_normalize(expression_dataframe):
    expression_dataframe.iloc[:, 1:] = expression_dataframe.iloc[:, 1:].applymap(lambda x: 1 + np.log2(x) if x != 0 else 0)    

In [3]:
import pandas as pd
data = get_expression_matrices(r"C:\Users\saebert\Downloads\gdc_sample_sheet.2023-04-17.tsv", r"C:\Users\saebert\Downloads\gdc_download_20230418_013130.187133") 


In [4]:
# now do the same for KIRP:

kirp_tpm_tumor = data[0][0]
kirp_tpm_normal = data[0][1]

kirp_fpkm_tumor = data[1][0]
kirp_fpkm_normal = data[1][1]

kirp_fpkm_uq_tumor = data[2][0]
kirp_fpkm_uq_normal = data[2][1]

log2_normalize(kirp_tpm_tumor)
log2_normalize(kirp_tpm_normal)
log2_normalize(kirp_fpkm_tumor)
log2_normalize(kirp_fpkm_normal)
log2_normalize(kirp_fpkm_uq_tumor)
log2_normalize(kirp_fpkm_uq_normal)

In [38]:
def depth2(data):
    #input is a pandas data frame with gene expression values
    dt = data.T
    result = dt.apply(lambda col: (col-col.mean())/col.std())
    result = result.replace(np.nan,0)
    result = result.apply(lambda col: abs(col)).T
    return result.std()

In [39]:
kirp_tpm_tumor_full = kirp_tpm_tumor.copy()

In [51]:
kirp_tpm_filtered = kirp_tpm_tumor.iloc[list(selector.get_indices())]

In [53]:
kirp_depth_normal = depth2(kirp_tpm_tumor_full.drop(labels=["gene_name"], axis=1))
kirp_depth_normal

TCGA-B3-4103-01A    0.572552
TCGA-A4-A5Y0-01A    0.700691
TCGA-2Z-A9JM-01A    0.405085
TCGA-UZ-A9PR-01A    0.523241
TCGA-HE-A5NF-01A    0.463544
                      ...   
TCGA-G7-6789-01A    0.653829
TCGA-BQ-7048-01A    0.659886
TCGA-A4-8518-01A    0.679474
TCGA-B9-A69E-01A    0.605951
TCGA-UZ-A9PJ-01A    0.467040
Length: 290, dtype: float64

In [55]:
kirp_depth_fs = depth2(kirp_tpm_filtered.drop(labels=["gene_name"], axis=1))
kirp_depth_fs

TCGA-B3-4103-01A    0.451194
TCGA-A4-A5Y0-01A    0.742447
TCGA-2Z-A9JM-01A    0.353507
TCGA-UZ-A9PR-01A    0.549914
TCGA-HE-A5NF-01A    0.441748
                      ...   
TCGA-G7-6789-01A    0.683079
TCGA-BQ-7048-01A    0.510632
TCGA-A4-8518-01A    0.646797
TCGA-B9-A69E-01A    0.668511
TCGA-UZ-A9PJ-01A    0.480557
Length: 290, dtype: float64

In [49]:
kirp_tpm

NameError: name 'kirp_tpm' is not defined

In [40]:
selected_gene_names

['DAGLA',
 'PAK6',
 'IL16',
 'CNTLN',
 'AFF2',
 'SMC4',
 'CEND1',
 'SVEP1',
 'GHR',
 'UGT1A9',
 'HJURP',
 'EDA',
 'PAMR1',
 'RASD2',
 'MT1H',
 'C19orf18',
 'UBAP1L',
 'BFSP1',
 'FAM47E',
 'COL14A1',
 'NBEAL1',
 'C18orf54',
 'KLHDC7B',
 'MFAP3',
 'MAP1LC3C',
 'SLC22A24',
 'SLC16A6',
 'PLPPR5',
 'MST1R',
 'RNF32',
 'HDAC8',
 'STXBP6',
 'TMCC2',
 'SYNPO2',
 'CYP4F3',
 'SCART1',
 'SMLR1',
 'CERKL',
 'RNFT2',
 'TTBK1',
 'GLOD5',
 'DNHD1',
 'TROAP',
 'CCDC151',
 'KBTBD4',
 'ZNF337',
 'PRODH2',
 'NCF1',
 'AC079447.1',
 'CAPN3',
 'SSC4D',
 'C2CD4B',
 'DCAF12L1',
 'CNTN4',
 'CP',
 'PIK3CG',
 'LHFPL3',
 'GGACT',
 'WFDC12',
 'C5AR2',
 'MYRIP',
 'CHST9',
 'SPA17',
 'TRIM10',
 'NCMAP',
 'SMOC1',
 'SAA2',
 'PID1',
 'PRODH',
 'PPP1R14D',
 'DMRTA1',
 'SPA17',
 'P2RY8',
 'RBM20',
 'CHGB',
 'TRIM10',
 'CCDC151',
 'PET100',
 'CLGN',
 'SCEL',
 'SIRPB2',
 'CCRL2',
 'TMCC2',
 'AADACL4',
 'TMEM158',
 'RTN4RL1',
 'CCDC151',
 'ADGRF1',
 'CLGN',
 'DENND2A',
 'PLEKHG6',
 'SLC7A9',
 'GGACT',
 'CAPN13',
 'CPXM1',


In [47]:
gene_rows_to_keep = selector.get_indices()

In [48]:
gene_rows_to_keep

array([ 6865,  7452, 13037,   602,  9784,  4198, 15108, 11348,  4091,
       18318,  5339, 10121,  9045,  2238, 17364, 13847, 18482,  5676,
       16079, 15820,  8454, 11756,  6198,   538, 16589, 16565,  3578,
        4698, 11058,  3207,  8801, 12267,  6619, 13052, 15549, 17682,
       18759, 15924,  6925,  8702, 12816, 14243,  6990, 16661,  5334,
        6231, 18582, 10085, 18354,  1965,  8757, 17386, 16944,  8481,
         619,  3180, 15707,  6878, 12208,  6874, 12506,  9584,   870,
       17228, 15091, 16860,  6795,  9554,  2152, 11581, 13746,   870,
       14619, 17033,  1817, 17228, 16661, 18044,  9493,  7130, 16173,
        5151,  6619, 17195, 18551, 15406, 16661,  9515,  9493,  8789,
         214,   405,  6878, 10709,  1785, 16290,  6025,  5252,  4709,
       18632], dtype=int64)

In [43]:
kirp_filtered = kirp_tpm_tumor.loc[kirp_tpm_tumor["gene_name"].isin(selected_gene_names)]

In [46]:
len(selected_gene_names)

100

In [44]:
kirp_filtered

Unnamed: 0,gene_name,TCGA-B3-4103-01A,TCGA-A4-A5Y0-01A,TCGA-2Z-A9JM-01A,TCGA-UZ-A9PR-01A,TCGA-HE-A5NF-01A,TCGA-DW-7836-01A,TCGA-5P-A9KE-01A,TCGA-GL-8500-01A,TCGA-G7-6797-01A,...,TCGA-BQ-5880-01A,TCGA-BQ-7062-01A,TCGA-UZ-A9PK-01A,TCGA-A4-A7UZ-01A,TCGA-BQ-5893-01A,TCGA-G7-6789-01A,TCGA-BQ-7048-01A,TCGA-A4-8518-01A,TCGA-B9-A69E-01A,TCGA-UZ-A9PJ-01A
219,PLEKHG6,2.768290,3.260507,4.712937,5.483964,0.921132,4.639290,1.676177,3.398049,2.379954,...,-1.423526,3.644341,3.628960,4.559223,-1.110293,4.839718,-2.105318,-0.987360,0.211884,5.515845
412,SLC7A9,3.383276,4.510582,4.067449,2.619413,3.944203,0.158337,0.782576,3.147242,0.809661,...,6.149731,3.643695,2.063641,-0.211565,-0.911044,-0.947306,5.772926,5.519938,2.402231,-1.013334
545,MFAP3,4.640887,2.375012,2.561595,4.126643,2.770998,3.440500,1.469782,3.612164,5.050754,...,1.565305,3.218006,2.917050,2.325214,3.718614,3.428893,3.994417,2.140975,2.931267,3.884422
609,CNTLN,3.298746,2.546808,2.860367,3.633919,2.108558,3.422018,1.641361,3.267985,4.229096,...,1.030548,2.923073,2.469469,2.427499,3.370890,2.694435,2.677486,0.805705,1.732313,3.969510
626,CP,1.087463,-2.509635,0.724912,0.942308,0.635244,0.965101,-0.069755,-3.499810,-0.988504,...,-0.384584,1.434775,1.458487,6.308099,2.194213,7.813035,1.877587,0.038717,-2.104077,0.657366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37493,UBAP1L,3.394020,-0.653809,1.496207,3.056341,0.315914,0.422664,-0.636212,1.452174,1.951439,...,-1.931260,1.494057,1.920065,2.177344,0.249506,0.402067,0.299127,-0.860923,1.000577,1.826274
38765,TMEM158,1.015498,-1.968040,1.281669,1.256890,0.591871,2.635336,-1.143054,-0.864596,-0.617056,...,1.521352,2.413215,1.733615,3.212818,3.997834,3.880960,0.119821,0.167615,-0.147522,1.083520
39301,PRODH2,3.372646,5.261756,2.646762,-1.049308,3.944634,-1.498994,-0.788365,3.585059,1.640343,...,4.413161,2.587797,1.399007,2.038296,-1.620601,1.509037,2.893556,6.612181,4.584950,-2.671164
41407,PCDHGA6,3.156850,0.638213,0.098689,3.805065,-0.671623,1.623773,-0.294774,1.371503,3.615911,...,-1.085589,1.628634,2.232784,0.261110,-1.374075,-0.009553,-1.001732,0.183455,-0.287360,1.814345


In [19]:
kirp_tpm_tumor.shape

(19962, 291)

In [5]:
kirp_transposed = kirp_tpm_tumor.transpose()

In [50]:
kirp_transposed.shape

(291, 19962)

In [42]:
kirp_transposed.index

Index(['gene_name', 'TCGA-B3-4103-01A', 'TCGA-A4-A5Y0-01A', 'TCGA-2Z-A9JM-01A',
       'TCGA-UZ-A9PR-01A', 'TCGA-HE-A5NF-01A', 'TCGA-DW-7836-01A',
       'TCGA-5P-A9KE-01A', 'TCGA-GL-8500-01A', 'TCGA-G7-6797-01A',
       ...
       'TCGA-BQ-5880-01A', 'TCGA-BQ-7062-01A', 'TCGA-UZ-A9PK-01A',
       'TCGA-A4-A7UZ-01A', 'TCGA-BQ-5893-01A', 'TCGA-G7-6789-01A',
       'TCGA-BQ-7048-01A', 'TCGA-A4-8518-01A', 'TCGA-B9-A69E-01A',
       'TCGA-UZ-A9PJ-01A'],
      dtype='object', length=291)

In [32]:
kirp_train.shape

(250, 19962)

In [33]:
kirp_test.shape

(41, 19962)

In [9]:
kirp_train.columns

Int64Index([    4,     5,     6,     7,     8,     9,    10,    11,    12,
               13,
            ...
            60647, 60648, 60649, 60651, 60653, 60654, 60659, 60661, 60662,
            60663],
           dtype='int64', length=19962)

In [43]:
kirp_transposed

Unnamed: 0,4,5,6,7,8,9,10,11,12,13,...,60647,60648,60649,60651,60653,60654,60659,60661,60662,60663
gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,ACTL10,AC119733.1,AC020765.6,AC010980.1,AC004233.4,AL451106.1,AC008763.4,AC006486.3,AL391628.1,AP006621.6
TCGA-B3-4103-01A,6.645341,2.403322,7.693605,4.061396,2.317594,4.734265,4.415651,7.171889,5.5418,6.065999,...,0.0,0.0,-3.140507,0.990011,0.0,0.0,0.0,0.0,-2.859352,0.576087
TCGA-A4-A5Y0-01A,6.015573,-0.664288,6.14954,2.049073,0.489543,0.997545,0.544782,6.919995,6.114417,5.266442,...,0.0,0.0,0.0,-3.673003,-3.227016,0.0,0.0,0.0,-4.629501,0.956354
TCGA-2Z-A9JM-01A,6.211032,0.000577,7.27472,3.076833,1.057693,5.904113,5.597859,7.795955,5.018367,5.018616,...,0.0,0.0,0.0,1.656176,-2.563199,0.0,0.0,0.0,-4.70275,0.872632
TCGA-UZ-A9PR-01A,6.412737,-3.483581,7.306384,4.154454,1.813114,4.17495,7.011943,7.720798,5.249058,6.139318,...,0.0,0.0,0.0,-0.098815,-0.461959,0.0,0.0,0.0,-6.179188,0.029983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-G7-6789-01A,7.105716,0.0,5.995466,3.553975,1.473839,2.835964,6.247259,7.353516,3.353634,5.239337,...,0.0,0.0,0.0,-1.032094,0.0,0.0,0.0,0.0,-3.075824,0.151079
TCGA-BQ-7048-01A,6.140509,-3.576906,6.38819,2.885379,1.189034,3.3097,4.604664,7.4429,3.543397,7.382806,...,0.0,0.0,0.0,-1.998846,0.0,0.0,0.0,0.0,-3.279284,1.594549
TCGA-A4-8518-01A,6.14773,-0.391002,6.112225,1.593497,-0.166179,2.88409,2.582122,6.552684,4.408807,3.515763,...,0.0,0.0,0.0,-1.6878,0.0,0.0,0.0,0.0,0.0,-0.867226
TCGA-B9-A69E-01A,5.018661,0.0,6.380733,2.182756,0.851679,4.923472,2.508074,6.990996,3.657091,6.890734,...,0.0,0.0,0.0,-3.17397,-3.143054,0.0,0.0,0.0,-4.86986,1.106884


In [44]:
kirp_transposed.index

Index(['gene_name', 'TCGA-B3-4103-01A', 'TCGA-A4-A5Y0-01A', 'TCGA-2Z-A9JM-01A',
       'TCGA-UZ-A9PR-01A', 'TCGA-HE-A5NF-01A', 'TCGA-DW-7836-01A',
       'TCGA-5P-A9KE-01A', 'TCGA-GL-8500-01A', 'TCGA-G7-6797-01A',
       ...
       'TCGA-BQ-5880-01A', 'TCGA-BQ-7062-01A', 'TCGA-UZ-A9PK-01A',
       'TCGA-A4-A7UZ-01A', 'TCGA-BQ-5893-01A', 'TCGA-G7-6789-01A',
       'TCGA-BQ-7048-01A', 'TCGA-A4-8518-01A', 'TCGA-B9-A69E-01A',
       'TCGA-UZ-A9PJ-01A'],
      dtype='object', length=291)

In [18]:
kirp_transposed.loc["gene_name"]

4            TSPAN6
5              TNMD
6              DPM1
7             SCYL3
8          C1orf112
            ...    
60654    AL451106.1
60659    AC008763.4
60661    AC006486.3
60662    AL391628.1
60663    AP006621.6
Name: gene_name, Length: 19962, dtype: object

In [6]:
kt_2 = kirp_transposed.copy()

kt_2 = kt_2.drop("gene_name")
kt_2

Unnamed: 0,4,5,6,7,8,9,10,11,12,13,...,60647,60648,60649,60651,60653,60654,60659,60661,60662,60663
TCGA-B3-4103-01A,6.645341,2.403322,7.693605,4.061396,2.317594,4.734265,4.415651,7.171889,5.5418,6.065999,...,0.0,0.0,-3.140507,0.990011,0.0,0.0,0.0,0.0,-2.859352,0.576087
TCGA-A4-A5Y0-01A,6.015573,-0.664288,6.14954,2.049073,0.489543,0.997545,0.544782,6.919995,6.114417,5.266442,...,0.0,0.0,0.0,-3.673003,-3.227016,0.0,0.0,0.0,-4.629501,0.956354
TCGA-2Z-A9JM-01A,6.211032,0.000577,7.27472,3.076833,1.057693,5.904113,5.597859,7.795955,5.018367,5.018616,...,0.0,0.0,0.0,1.656176,-2.563199,0.0,0.0,0.0,-4.70275,0.872632
TCGA-UZ-A9PR-01A,6.412737,-3.483581,7.306384,4.154454,1.813114,4.17495,7.011943,7.720798,5.249058,6.139318,...,0.0,0.0,0.0,-0.098815,-0.461959,0.0,0.0,0.0,-6.179188,0.029983
TCGA-HE-A5NF-01A,5.763847,0.0,6.284033,2.174215,0.552082,6.024856,4.375429,7.297773,4.211168,4.234624,...,0.0,0.0,0.0,2.09788,0.0,0.0,0.0,0.0,-3.224317,-0.290886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-G7-6789-01A,7.105716,0.0,5.995466,3.553975,1.473839,2.835964,6.247259,7.353516,3.353634,5.239337,...,0.0,0.0,0.0,-1.032094,0.0,0.0,0.0,0.0,-3.075824,0.151079
TCGA-BQ-7048-01A,6.140509,-3.576906,6.38819,2.885379,1.189034,3.3097,4.604664,7.4429,3.543397,7.382806,...,0.0,0.0,0.0,-1.998846,0.0,0.0,0.0,0.0,-3.279284,1.594549
TCGA-A4-8518-01A,6.14773,-0.391002,6.112225,1.593497,-0.166179,2.88409,2.582122,6.552684,4.408807,3.515763,...,0.0,0.0,0.0,-1.6878,0.0,0.0,0.0,0.0,0.0,-0.867226
TCGA-B9-A69E-01A,5.018661,0.0,6.380733,2.182756,0.851679,4.923472,2.508074,6.990996,3.657091,6.890734,...,0.0,0.0,0.0,-3.17397,-3.143054,0.0,0.0,0.0,-4.86986,1.106884


In [7]:
from sklearn.model_selection import train_test_split

kirp_train_2, kirp_test_2 = train_test_split(kt_2, test_size=0.2, random_state=42)

In [40]:
kirp_train_2.index

Index(['TCGA-5P-A9K8-01A', 'TCGA-G7-7502-01A', 'TCGA-A4-7996-01A',
       'TCGA-IA-A40Y-01A', 'TCGA-SX-A7SP-01A', 'TCGA-BQ-7061-01A',
       'TCGA-5P-A9KC-01A', 'TCGA-Y8-A8RY-01A', 'TCGA-P4-A5E7-01A',
       'TCGA-BQ-7046-01A',
       ...
       'TCGA-DW-7839-01A', 'TCGA-2Z-A9JD-01A', 'TCGA-BQ-5888-01A',
       'TCGA-UZ-A9PJ-01A', 'TCGA-UZ-A9PL-01A', 'TCGA-BQ-5894-01A',
       'TCGA-MH-A560-01A', 'TCGA-EV-5903-01A', 'TCGA-2Z-A9JT-01A',
       'TCGA-BQ-7050-01A'],
      dtype='object', length=232)

In [41]:
kirp_train.index

Index(['gene_name', 'TCGA-B3-4103-01A', 'TCGA-A4-A5Y0-01A', 'TCGA-2Z-A9JM-01A',
       'TCGA-UZ-A9PR-01A', 'TCGA-HE-A5NF-01A', 'TCGA-DW-7836-01A',
       'TCGA-5P-A9KE-01A', 'TCGA-GL-8500-01A', 'TCGA-G7-6797-01A',
       ...
       'TCGA-P4-A5E8-01A', 'TCGA-HE-A5NJ-01A', 'TCGA-A4-8310-01A',
       'TCGA-B1-A656-01A', 'TCGA-5P-A9K2-01A', 'TCGA-G7-6793-01A',
       'TCGA-WN-A9G9-01A', 'TCGA-A4-A48D-01A', 'TCGA-HE-A5NH-01A',
       'TCGA-B9-4617-01A'],
      dtype='object', length=250)

In [9]:
# now convert to the necessary format, and 

kirp_train_np = kirp_train_2.to_numpy()
kirp_test_np = kirp_test_2.to_numpy()


kirp_train_np = np.delete(kirp_train_np, (0), axis=0)
kirp_test_np = np.delete(kirp_test_np, (0), axis=0)


In [58]:
kirp_train_np.shape

(250, 19962)

In [36]:
kirp_test_np.shape

(57, 19962)

In [10]:
kirp_train_np = kirp_train_np.astype(np.float32)
kirp_test_np = kirp_test_np.astype(np.float32)

In [11]:
selector.fit(kirp_train_np, kirp_train_np, kirp_test_np, kirp_test_np)



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 19962)]           0         
                                                                 
 concrete_select (ConcreteSe  (None, 100)              1996201   
 lect)                                                           
                                                                 
 dense (Dense)               (None, 300)               30300     
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 300)               0         
                                                                 
 dropout (Dropout)           (None, 300)               0         
                                                                 
 dense_1 (Dense)             (None, 300)               90300     
                                                             

<concrete_autoencoder.ConcreteAutoencoderFeatureSelector at 0x18c81203850>

In [12]:
selector.get_indices()

array([ 6865,  7452, 13037,   602,  9784,  4198, 15108, 11348,  4091,
       18318,  5339, 10121,  9045,  2238, 17364, 13847, 18482,  5676,
       16079, 15820,  8454, 11756,  6198,   538, 16589, 16565,  3578,
        4698, 11058,  3207,  8801, 12267,  6619, 13052, 15549, 17682,
       18759, 15924,  6925,  8702, 12816, 14243,  6990, 16661,  5334,
        6231, 18582, 10085, 18354,  1965,  8757, 17386, 16944,  8481,
         619,  3180, 15707,  6878, 12208,  6874, 12506,  9584,   870,
       17228, 15091, 16860,  6795,  9554,  2152, 11581, 13746,   870,
       14619, 17033,  1817, 17228, 16661, 18044,  9493,  7130, 16173,
        5151,  6619, 17195, 18551, 15406, 16661,  9515,  9493,  8789,
         214,   405,  6878, 10709,  1785, 16290,  6025,  5252,  4709,
       18632], dtype=int64)

In [None]:
gene_names_list = list(kirp_transposed.loc["gene_name"])

indices_to_keep = selector.get_indices()


selected_gene_names = list()

for index in indices_to_keep:
    selected_gene_names.append(gene_names_list[index])

selected_gene_names

with open("kirp_selected_genes.txt", "w+") as kirp_top100:
    for gene in selected_gene_names:
        kirp_top100.write(gene)
        kirp_top100.write("\n")


In [37]:
selected_gene_names

['DAGLA',
 'PAK6',
 'IL16',
 'CNTLN',
 'AFF2',
 'SMC4',
 'CEND1',
 'SVEP1',
 'GHR',
 'UGT1A9',
 'HJURP',
 'EDA',
 'PAMR1',
 'RASD2',
 'MT1H',
 'C19orf18',
 'UBAP1L',
 'BFSP1',
 'FAM47E',
 'COL14A1',
 'NBEAL1',
 'C18orf54',
 'KLHDC7B',
 'MFAP3',
 'MAP1LC3C',
 'SLC22A24',
 'SLC16A6',
 'PLPPR5',
 'MST1R',
 'RNF32',
 'HDAC8',
 'STXBP6',
 'TMCC2',
 'SYNPO2',
 'CYP4F3',
 'SCART1',
 'SMLR1',
 'CERKL',
 'RNFT2',
 'TTBK1',
 'GLOD5',
 'DNHD1',
 'TROAP',
 'CCDC151',
 'KBTBD4',
 'ZNF337',
 'PRODH2',
 'NCF1',
 'AC079447.1',
 'CAPN3',
 'SSC4D',
 'C2CD4B',
 'DCAF12L1',
 'CNTN4',
 'CP',
 'PIK3CG',
 'LHFPL3',
 'GGACT',
 'WFDC12',
 'C5AR2',
 'MYRIP',
 'CHST9',
 'SPA17',
 'TRIM10',
 'NCMAP',
 'SMOC1',
 'SAA2',
 'PID1',
 'PRODH',
 'PPP1R14D',
 'DMRTA1',
 'SPA17',
 'P2RY8',
 'RBM20',
 'CHGB',
 'TRIM10',
 'CCDC151',
 'PET100',
 'CLGN',
 'SCEL',
 'SIRPB2',
 'CCRL2',
 'TMCC2',
 'AADACL4',
 'TMEM158',
 'RTN4RL1',
 'CCDC151',
 'ADGRF1',
 'CLGN',
 'DENND2A',
 'PLEKHG6',
 'SLC7A9',
 'GGACT',
 'CAPN13',
 'CPXM1',
