In [1]:
import numpy as np
import pandas as pd

In [2]:
bfile = "data/GSE66499_series_matrix.txt"

# Read the whole bronchial series-matrix file into memory. The context manager
# closes the handle as soon as the read finishes instead of leaving it to the
# garbage collector.
with open(bfile, "r") as bmat_fh:
    bmat_raw = bmat_fh.read()

# Split the raw text on "!"; later cells pick records out of this list by
# position, so the split itself must stay unchanged.
bmat_list = bmat_raw.split("!")

In [3]:
nfile = "data/GSE80796_series_matrix.txt"

# Read the whole nasal series-matrix file into memory. The context manager
# closes the handle as soon as the read finishes instead of leaving it to the
# garbage collector.
with open(nfile, "r") as nmat_fh:
    nmat_raw = nmat_fh.read()

# Split the raw text on "!"; later cells pick records out of this list by
# position, so the split itself must stay unchanged.
nmat_list = nmat_raw.split("!")

# Matrix Parsing

## Bronchial RNA

In [4]:
# Sample accessions: take the last tab-separated field of record 20, strip the
# double quotes and split on spaces.
# NOTE(review): index 20 is positional — presumably the sample-ID record of
# this particular file; confirm if the input file changes.
geoid_field = bmat_list[20].split("\t")[-1]
geoid_tokens = geoid_field.replace('"', '').split(" ")
# The final token is discarded (it is not a sample accession).
bmat_geoids = np.array(geoid_tokens[:-1])
bmat_geoids[:5]

array(['GSM1623452', 'GSM1623453', 'GSM1623454', 'GSM1623455',
       'GSM1623456'], dtype='<U10')

In [5]:
# Per-sample labels from record 46: drop quotes and newlines, split on tabs and
# skip the first field (the record's own name).
# NOTE(review): index 46 is positional — presumably the cancer-status
# characteristics row of this file; confirm if the input file changes.
status_fields = bmat_list[46].replace('"', '').replace('\n', '').split("\t")[1:]

# A sample is labelled 1 when the last space-separated word of its field is "Y",
# otherwise 0.
bmat_cancer = np.array([int(field.split(" ")[-1] == "Y") for field in status_fields])
bmat_cancer[:5]

array([0, 0, 1, 1, 1])

In [6]:
# Record 70 is split into lines; the first two lines and the last one are
# skipped, leaving only the rows that the next cell parses as data.
# NOTE(review): presumably the skipped lines are the table marker, the column
# header and a trailing empty line — confirm against the file.
table_lines = bmat_list[70].split("\n")
matrix_data = table_lines[2:-1]

# Dimensions of the expression matrix: samples x probes.
n_geoid, n_rna = len(bmat_geoids), len(matrix_data)

In [7]:
# Fill a (probes x samples) matrix row by row. Each data row is tab-separated:
# the first field is the quoted probe ID, the remaining fields are parsed as
# floats, one per sample.
series_matrix = np.zeros((n_rna, n_geoid))
probe_names = []
for row_idx in range(n_rna):
    probe_field, *value_fields = matrix_data[row_idx].split("\t")
    probe_names.append(probe_field.replace('"', ''))
    series_matrix[row_idx] = [float(value) for value in value_fields]

In [8]:
# Transpose so that samples are rows and probes are columns; the row index is
# the sample accession, named "geoid".
matrix_df = pd.DataFrame(
    data=series_matrix.T,
    index=pd.Index(bmat_geoids, name="geoid"),
    columns=probe_names,
)

# Put the 0/1 cancer label in front of the probe columns, then export.
matrix_df.insert(loc=0, column="cancer", value=bmat_cancer)
matrix_df.to_csv("data/matrix_bronchial.csv", header=True, index=True)

matrix_df.head()

Unnamed: 0_level_0,cancer,7892501,7892502,7892503,7892504,7892505,7892506,7892507,7892508,7892509,...,8180409,8180410,8180411,8180412,8180413,8180414,8180415,8180416,8180417,8180418
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM1623452,0,4.216093,5.842999,4.980203,10.059506,5.019846,6.643097,6.017031,7.155189,10.78814,...,10.194038,10.516945,7.357735,8.375515,8.307193,10.754339,7.41112,6.186312,8.777097,9.784806
GSM1623453,0,4.477295,5.527915,5.12564,10.46744,5.182849,6.045693,6.547059,8.011207,10.814677,...,10.524005,10.533004,7.381779,8.307119,8.211558,10.586939,7.437256,6.320433,8.776844,9.862342
GSM1623454,1,4.192968,5.609544,4.603893,10.565574,4.886351,5.849225,6.652074,8.215739,10.908662,...,10.353227,10.69312,7.223452,8.467328,8.398528,10.904259,7.763389,6.184391,9.064446,9.73441
GSM1623455,1,3.857432,5.239925,4.875725,10.213407,4.475636,5.498195,7.000071,7.02144,10.936842,...,10.227046,10.476882,7.238507,8.365352,8.294859,11.067606,7.176992,6.300595,8.884312,9.684322
GSM1623456,1,4.17941,4.91378,4.688398,10.426099,5.461521,5.458753,6.344151,6.90163,10.81366,...,10.198178,10.340738,7.29537,8.304812,8.318317,10.599784,7.251437,6.336659,8.9342,9.608108


## Nasal RNA

In [9]:
# Sample accessions: take the last tab-separated field of record 11, strip the
# double quotes and split on spaces.
# NOTE(review): index 11 is positional — presumably the sample-ID record of
# this particular file; confirm if the input file changes.
geoid_field = nmat_list[11].split("\t")[-1]
geoid_tokens = geoid_field.replace('"', '').split(" ")
# The final token is discarded (it is not a sample accession).
nmat_geoids = np.array(geoid_tokens[:-1])
nmat_geoids[:5]

array(['GSM2137106', 'GSM2137107', 'GSM2137108', 'GSM2137109',
       'GSM2137110'], dtype='<U10')

In [10]:
# Per-sample labels from record 36: drop quotes and newlines, split on tabs and
# skip the first field (the record's own name).
# NOTE(review): index 36 is positional — presumably the diagnosis
# characteristics row of this file; confirm if the input file changes.
status_fields = nmat_list[36].replace('"', '').replace('\n', '').split("\t")[1:]

# A sample is labelled 1 when the text after the last ": " in its field is
# exactly "Lung Cancer", otherwise 0.
nmat_cancer = np.array([int(field.split(": ")[-1] == "Lung Cancer") for field in status_fields])
nmat_cancer[:5]

array([1, 1, 1, 0, 1])

In [11]:
# Record 69 is split into lines; the first two lines and the last one are
# skipped, leaving only the rows that the next cell parses as data.
# NOTE(review): presumably the skipped lines are the table marker, the column
# header and a trailing empty line — confirm against the file.
table_lines = nmat_list[69].split("\n")
matrix_data = table_lines[2:-1]

# Dimensions of the expression matrix: samples x probes.
n_geoid, n_rna = len(nmat_geoids), len(matrix_data)

In [12]:
# Fill a (probes x samples) matrix row by row. Each data row is tab-separated:
# the first field is the quoted probe ID, the remaining fields are parsed as
# floats, one per sample.
series_matrix = np.zeros((n_rna, n_geoid))
probe_names = []
for row_idx in range(n_rna):
    probe_field, *value_fields = matrix_data[row_idx].split("\t")
    probe_names.append(probe_field.replace('"', ''))
    series_matrix[row_idx] = [float(value) for value in value_fields]

In [13]:
# Transpose so that samples are rows and probes are columns; the row index is
# the sample accession, named "geoid".
matrix_df = pd.DataFrame(
    data=series_matrix.T,
    index=pd.Index(nmat_geoids, name="geoid"),
    columns=probe_names,
)

# Put the 0/1 cancer label in front of the probe columns, then export.
matrix_df.insert(loc=0, column="cancer", value=nmat_cancer)
matrix_df.to_csv("data/matrix_nasal.csv", header=True, index=True)

matrix_df.head()

Unnamed: 0_level_0,cancer,7892501,7892502,7892503,7892504,7892505,7892506,7892507,7892508,7892509,...,8180408,8180409,8180410,8180411,8180413,8180414,8180415,8180416,8180417,8180418
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM2137106,1,2.288567,3.467242,3.106184,8.791999,4.01269,4.081984,5.291053,5.464147,8.951847,...,7.769016,8.964655,8.973874,5.470847,7.023548,9.563756,5.953739,4.374953,7.687563,8.691736
GSM2137107,1,1.995199,3.453598,2.908943,8.919501,4.205525,3.219758,4.841385,5.042885,9.129324,...,7.607348,8.747328,8.850202,5.861981,7.032782,10.309471,6.441748,4.434606,7.554528,8.881209
GSM2137108,1,3.982334,3.869297,2.948449,8.461618,4.468499,4.008897,4.830632,5.670238,8.984328,...,8.181692,9.125575,8.873725,5.270558,6.407999,8.712529,6.395317,4.754459,7.727718,8.677669
GSM2137109,0,1.931912,3.657101,4.390043,8.623355,3.803331,3.329403,4.317178,5.997588,9.332712,...,7.820397,9.069256,9.384279,5.380734,6.846073,9.381491,5.983618,4.18522,7.542234,8.736329
GSM2137110,1,1.437287,3.41754,3.199383,8.74883,2.403188,3.964574,4.415696,5.164858,9.07493,...,7.767819,8.805099,9.094576,5.550305,6.500735,9.403628,6.037162,4.37895,7.52563,8.802997
