# PreProcessing data
### (validation, standardization, imputation, augmentation, deduplication)

# Defining Features for GEO data (validation set)

In [133]:
%run "..\Model\DataHelpers.ipynb"

In [134]:
%run "..\Model\DataHelpers.ipynb"

# Gene symbols of the "statistical" feature set (25 symbols).
featuresStatistical = [
    'MCF2L2', 'FOXC1', 'YBX1', 'IGF2BP2', 'FSCN1',
    'GABRP', 'SOX10', 'CENPA', 'PADI2', 'CDC20',
    'BCL11A', 'HAPLN3', 'ANP32E', 'SFT2D2', 'B3GNT5',
    'ANKS6', 'FOXA1', 'PSAT1', 'TBX19', 'CDCA7',
    'CENPW', 'UGT8', 'RGMA', 'EN1', 'FAM171A1',
]

# Gene symbols of the "LASSO" feature set (27 symbols).
# FOXA1 and FOXC1 also appear in featuresStatistical.
featuresLASSO = [
    'CD1A', 'CSF2RB', 'EPCAM', 'ERBB2', 'ESR1',
    'EZH2', 'FGB', 'FOXA1', 'FOXC1', 'GATA3',
    'LAMA2', 'LMNA', 'MDGA2', 'OBSCN', 'OGN',
    'PGR', 'SELL', 'SRC', 'TACSTD2', 'TBC1D22B',
    'TFF1', 'TGFB3', 'UBE2C', 'VTCN1', 'WTAP',
    'YES1', 'YOD1',
]

# Columns kept for the validation set: both gene lists plus the target/metadata
# columns (constTargetAndMetadata is not defined in the visible cells —
# presumably it comes from DataHelpers.ipynb; TODO confirm).
# FOXA1 and FOXC1 occur in both gene lists, so repeats are dropped (first
# occurrence wins, order otherwise unchanged); selecting DataFrame columns with
# a list that repeats a label would return that column twice.
featuresValidation = list(dict.fromkeys(featuresStatistical + featuresLASSO + constTargetAndMetadata))

# Define GEO data

In [135]:
# Per-dataset description of the two GEO series: the series-matrix file and the
# platform annotation file (each with the number of rows to skip at the top and
# bottom), plus the processing note recorded for the series.
geoData = {
    'GSE65216': {
        'dataSetName': 'GSE65216',
        'dataSetFile': 'GSE65216-GPL570_series_matrix.txt',
        'dataSetSkipRows': 67,
        'dataSetSkipFooter': 1,
        'platformName': 'GPL570',
        'platformFile': 'GPL570-55999.txt',
        'platformSkipRows': 16,
        'platformSkipFooter': 0,
        'processing': 'Data were analyzed using standard AffyCDF or Brainarray HGU133Plus2_Hs_ENTREZG version 13 custom chipset definition file. Data were normalized using GC-RMA. Batch and hybridization effects were corrected using a linear model in which they were treated as fixed effects. Samples with technical replicates were subsequently averaged.',
    },
    'GSE76250': {
        'dataSetName': 'GSE76250',
        'dataSetFile': 'GSE76250_series_matrix.txt',
        'dataSetSkipRows': 74,
        'dataSetSkipFooter': 1,
        'platformName': 'GPL17586',
        'platformFile': 'GPL17586-45144.txt',
        'platformSkipRows': 15,
        'platformSkipFooter': 1,
        'processing': 'The data were analyzed with Robust Multichip Analysis (RMA) algorithm using Affymetrix default analysis settings and global scaling as normalization method (ExpressionConsole 1.3.1) probe group file: HTA-2_0.r1.pgf meta-probeset file: HTA-2_0.r1.Psrs.mps',
    },
}

# Define synonyms for TCGA-BRCA based on KEGG and HEP

In [136]:
# Per dataset: GEO column name -> replacement name. These mappings are applied
# to the GEO column labels in the renaming cell further down.
geoData['GSE65216']['synonyms'] = {
    'IGJ': 'JCHAIN',
    'PRH1-PRR4': 'PRR4',
    'FIGF': 'VEGFD',
}

geoData['GSE76250']['synonyms'] = {
    'DARC': 'ACKR1',
    'IGJ': 'JCHAIN',
    'WDR67': 'TBC1D31',
    'FIGF': 'VEGFD',
    'IL8': 'CXCL8',
}

# Get files to load TCGA-BRCA RNA-SEQ

In [81]:
import pandas as pd

# gene_type values kept when the per-patient gene files are filtered.
targetGenes = ['protein_coding']

# Marked as wrong by the author (original note: "FOUT"); not read by any of the
# visible cells.
variant = 'research_logScaled'

# Clinical table: one row per gene-count file, with the TNBC flag and the
# patient uuid.
df = pd.read_csv('../Data/clinical.csv')
geneFiles = df.loc[:, ['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Load data - TCGA-BRCA

In [57]:
# Build one single-row frame per patient file and concatenate them once after
# the loop. Calling pd.concat inside the loop copies every previously loaded
# row again on each iteration.
patientFrames = []
total_files = len(geneFiles)
for idx, row in geneFiles.iterrows():
    # Address the columns by name instead of by position.
    file_name = row['file']
    file_path = f"../Data/{file_name}"

    print(f"File {idx+1}/{total_files} - {file_name}", end="\r")

    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])

    # If the first line of the file was the header, skipping it lost the column
    # names; read the file again without skipping.
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')

    # Keep only the targeted gene types and the value column of interest.
    dfTarget = dfGenes[dfGenes['gene_type'].isin(targetGenes)][['gene_name', 'stranded_first']]

    # Pivot to a single row with one column per gene.
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # TNBC status and patient uuid come straight from the current row, so no
    # second lookup by file name into the clinical table is needed.
    dfNewGenes["tnbc"] = row['tnbc']
    dfNewGenes['case_id'] = row['bcr_patient_uuid']

    patientFrames.append(dfNewGenes)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no files. Index labels are left as produced by the per-file frames (no
# ignore_index), matching what the incremental concat produced.
dfPatientGenes = pd.concat(patientFrames) if patientFrames else pd.DataFrame()

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Loading data - GEO

In [137]:
import os

# CSV files holding the two GEO datasets.
geoData_GSE65216_file = os.path.join('../Data', 'geoData_GSE65216.csv')
geoData_GSE76250_file = os.path.join('../Data', 'geoData_GSE76250.csv')

print("***- Reading GEO data - Start")
# Comma is read_csv's default separator.
geoData['GSE65216']['data'] = pd.read_csv(geoData_GSE65216_file)
geoData['GSE76250']['data'] = pd.read_csv(geoData_GSE76250_file)
print("***- Reading GEO data - End")

***- Reading GEO data - Start
***- Reading GEO data - End


# Feature columns - GEO

In [138]:
print("***- Determine feature columns - Start")
# Every column except the last two (tnbc / case_id) is treated as a feature.
onlyFeatureColumns_GSE76250 = geoData['GSE76250']['data'].shape[1] - 2
geoData['GSE76250']['featureColumns'] = geoData['GSE76250']['data'].columns[:onlyFeatureColumns_GSE76250]

onlyFeatureColumns_GSE65216 = geoData['GSE65216']['data'].shape[1] - 2
geoData['GSE65216']['featureColumns'] = geoData['GSE65216']['data'].columns[:onlyFeatureColumns_GSE65216]
print("***- Determine feature columns - End")

***- Determine feature columns - Start
***- Determine feature columns - End


# Deduplication TCGA-BRCA - Check rows

In [60]:
# Show every row that exactly repeats an earlier row. The recorded output has
# only a header and no rows, i.e. there are no duplicated rows.
dfPatientGenes[dfPatientGenes.duplicated()]

gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AC020765.6,AC010980.1,AC004233.4,AL451106.1,AC008763.4,AC006486.3,AL391628.1,AP006621.6,tnbc,case_id


# Deduplication GEO - Check rows

In [139]:
print(f"***- Checking dupplicates - Start")

for k in geoData.keys():
    # Repeated column labels.
    # NOTE(review): the loaded frames contain labels such as 'DUX4L2.6' and
    # 'FRG2.1', which look like pandas' automatic renaming of repeated header
    # names in read_csv. If so, this label check can never fire on data loaded
    # from CSV — TODO confirm against the raw export.
    duplicateColumnCount = int(geoData[k]['data'].columns.duplicated().sum())

    # Rows that exactly repeat an earlier row (the check the section title asks
    # for; previously only the column labels were inspected).
    duplicateRowCount = int(geoData[k]['data'].duplicated().sum())

    if duplicateColumnCount == 0 and duplicateRowCount == 0:
        print(f'***- Dataset {k} has no duplicates')
    else:
        print(f'***- Dataset {k} has duplicates ({duplicateRowCount} rows, {duplicateColumnCount} columns)')

print(f"***- Checking dupplicates - End")

***- Checking dupplicates - Start
***- Dataset GSE65216 has no duplicates
***- Dataset GSE76250 has no duplicates
***- Checking dupplicates - End


# Deduplication TCGA-BRCA - Check columns ( 24 duplicates )
### Duplicate columns removed based on sum(column) == 0

In [62]:
# Column labels that repeat an earlier label.
duplCol = dfPatientGenes.columns[dfPatientGenes.columns.duplicated()]
numColB4 = len(dfPatientGenes.columns)
print(f'Total number of columns    : {len(dfPatientGenes.columns)}')
print(f'Number of duplicate columns: {len(duplCol)}')

print(f'Removing duplicates - Start')

# Keep the first column for every label. A boolean column mask selects the same
# columns as the previous transpose / drop_duplicates(keep='first') / transpose
# sequence, without transposing the whole frame twice (which also turned every
# column into object dtype). .copy() makes the result an independent frame.
# NOTE(review): the section title mentions removal based on sum(column) == 0;
# the code keeps the first occurrence regardless of the values — confirm that
# this is the intended rule.
dfPatientGenes = dfPatientGenes.loc[:, ~dfPatientGenes.columns.duplicated(keep='first')].copy()

print(f'Removing duplicates - End')

print(f'Total number of columns    : {len(dfPatientGenes.columns)}')
print(f'Check                      : {numColB4}-{len(duplCol)} = {len(dfPatientGenes.columns)}')

Total number of columns    : 19964
Number of duplicate columns: 24
Removing duplicates - Start
Removing duplicates - End
Total number of columns    : 19940
Check                      : 19964-24 = 19940


# Imputation - TBD -> impute with the value 0?

In [63]:
# TBD

# Standardization TCGA-BRCA - Log transformation - Manage outliers

In [64]:
# Reference: Artificial Intelligence with Python, 2nd edition. Packt Publishing Ltd,
# ISBN 978-1-83921-953-5, p. 87-88.
import numpy as np

# The last two columns are tnbc / case_id; everything before them is a gene.
onlyFeatureColumns = dfPatientGenes.shape[1] - 2
allGeneColumns = dfPatientGenes.columns[:onlyFeatureColumns]

# The gene columns are object-typed at this point; make them numeric first.
dfPatientGenes[allGeneColumns] = dfPatientGenes[allGeneColumns].astype(float)

print('Apply LOG - Start')
# Natural log of (value + 1), so that a value of 0 maps to 0.
# Re-running this cell applies the log a second time.
dfPatientGenes[allGeneColumns] = np.log(dfPatientGenes[allGeneColumns] + 1)
print('Apply LOG - End')

Apply LOG - Start
Apply LOG - End


# Standardization TCGA-BRCA - Removing mean and Scaling

In [65]:
from sklearn.preprocessing import StandardScaler

print('Apply Scaling - Start')
# Fit the scaler on the TCGA-BRCA gene columns, then replace those columns with
# their standardised values (zero mean, unit variance per column).
scaler = StandardScaler().fit(dfPatientGenes[allGeneColumns])
dfPatientGenes[allGeneColumns] = scaler.transform(dfPatientGenes[allGeneColumns])
print('Apply Scaling - End')

Apply Scaling - Start
Apply Scaling - End


# Standardization GEO - Removing mean and Scaling

# EVIN

In [18]:
# Scratch check: run the log2_to_ln helper on the OR4F5 column of GSE76250.
# log2_to_ln is not defined in the visible cells — presumably it comes from
# DataHelpers.ipynb (TODO confirm).
kevintest = log2_to_ln(geoData['GSE76250']['data']['OR4F5'])

In [20]:
# Not the last expression of the cell, so this summary is computed but not shown.
kevintest.describe()

# Range and spread of the converted GEO values for OR4F5.
min_value, max_value, std_dev = kevintest.min(), kevintest.max(), kevintest.std()

print(f"Minimum: {min_value}")
print(f"Maximum: {max_value}")
print(f"Standard Deviation: {std_dev}")

Minimum: 1.0001871213966815
Maximum: 2.4840752192450513
Standard Deviation: 0.1721494450796082


In [17]:
# Not the last expression of the cell, so this summary is computed but not shown.
dfPatientGenes['OR4F5'].describe()

# Range and spread of the TCGA-BRCA values for OR4F5, for comparison with GEO.
min_value, max_value, std_dev = (
    dfPatientGenes['OR4F5'].min(),
    dfPatientGenes['OR4F5'].max(),
    dfPatientGenes['OR4F5'].std(),
)

print(f"Minimum: {min_value}")
print(f"Maximum: {max_value}")
print(f"Standard Deviation: {std_dev}")

Minimum: 0
Maximum: 4
Standard Deviation: 0.1687690058365982


In [104]:
# Show the feature column labels of GSE76250 (all columns except the last two).
geoData['GSE76250']['featureColumns']

Index(['DDX11L1', 'MIR1302-11', 'OR4F5', 'OTTHUMG00000002525', 'LOC100132287',
       'LOC101060495', 'OR4F29', 'OTTHUMG00000156971', 'OTTHUMG00000002330',
       'OTTHUMG00000002408',
       ...
       'DUX4L2.6', 'DUX4L2.7', 'DUX4L2.8', 'DUX4L2.9', 'DUX4L2.10',
       'DUX4L2.11', 'DUX4L2.12', 'DUX4L2.13', 'FRG2.1', 'DUX4.2'],
      dtype='object', length=32670)

In [106]:
# Debug snapshot 001: the GSE76250 feature columns as loaded, before any
# conversion or scaling (MCF2L2 is used as the spot-check gene).
print('Writing DataFrame to CSV - Start')
geoData['GSE76250']['data'].loc[:, geoData['GSE76250']['featureColumns']].to_csv(
    '../Data/validationSet_test_001_geenWijziging.csv',
    index=False,
)
print('Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End


In [124]:
# Spot check: MCF2L2 values of GSE65216 ahead of the log2_to_ln conversion cell.
geoData['GSE65216']['data']['MCF2L2']

0     3.201138
1     3.239352
2     2.769365
3     2.939457
4     3.198275
        ...   
61    2.810779
62    2.825427
63    2.892950
64    2.824327
65    2.809392
Name: MCF2L2, Length: 66, dtype: float64

In [125]:
# Spot check: MCF2L2 values of GSE76250 ahead of the log2_to_ln conversion cell.
geoData['GSE76250']['data']['MCF2L2']

0      3.115142
1      2.861696
2      2.896250
3      3.107124
4      3.045224
         ...   
193    2.909149
194    2.857584
195    2.874800
196    2.992598
197    2.815506
Name: MCF2L2, Length: 198, dtype: float64

In [66]:
# Display the GSE76250 feature columns ahead of the log2_to_ln conversion cell.
geoData['GSE76250']['data'][geoData['GSE76250']['featureColumns']]

Unnamed: 0,DDX11L1,MIR1302-11,OR4F5,OTTHUMG00000002525,LOC100132287,LOC101060495,OR4F29,OTTHUMG00000156971,OTTHUMG00000002330,OTTHUMG00000002408,...,DUX4L2.6,DUX4L2.7,DUX4L2.8,DUX4L2.9,DUX4L2.10,DUX4L2.11,DUX4L2.12,DUX4L2.13,FRG2.1,DUX4.2
0,4.293023,4.086924,2.090081,1.866271,6.142883,4.370827,1.254193,4.834084,3.878490,1.675614,...,6.374212,6.374212,6.374212,6.393057,6.393057,6.393057,6.374212,6.374212,2.615489,6.274078
1,4.226858,3.601991,1.662459,1.966973,6.089811,4.874371,1.501630,4.986944,3.755139,1.977632,...,5.852711,5.852711,5.852711,5.947252,5.947252,5.947252,5.852711,5.852711,2.416140,5.777761
2,4.185904,3.632204,1.863902,2.164858,6.023621,4.810844,1.019085,4.842264,3.873162,1.967044,...,5.812860,5.812860,5.812860,5.815567,5.815567,5.815567,5.812860,5.812860,2.371899,5.815431
3,4.730009,3.969706,1.996188,2.803360,7.198605,5.775214,1.420217,4.952729,3.932943,2.640858,...,6.180747,6.180747,6.180747,6.190111,6.190111,6.190111,6.180747,6.180747,2.627649,6.019647
4,4.526884,3.785156,2.077839,1.834897,6.309876,5.106549,1.195496,5.034830,4.049428,2.223506,...,6.210926,6.210926,6.210926,6.321991,6.321991,6.321991,6.210926,6.210926,2.450051,6.208675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,4.251326,3.595412,1.647896,1.846134,5.991062,5.362765,1.415470,5.225954,3.768028,2.102454,...,5.846236,5.846236,5.846236,5.840122,5.840122,5.840122,5.846236,5.846236,2.554812,5.716691
194,4.352772,3.550632,1.712088,1.852272,6.251459,4.185341,1.134545,4.870184,3.828825,1.831424,...,5.818895,5.818895,5.818895,5.804369,5.804369,5.804369,5.818895,5.818895,2.363534,5.774248
195,4.381121,3.624632,1.667431,1.960412,6.324233,4.569757,1.333372,4.768807,3.779686,1.784886,...,5.864415,5.864415,5.864415,5.845787,5.845787,5.845787,5.864415,5.864415,2.378769,5.843131
196,4.470076,4.000274,1.984008,2.127924,6.136254,4.127996,1.017029,4.836334,3.873482,2.049178,...,6.045414,6.045414,6.045414,6.372333,6.372333,6.372333,6.045414,6.045414,2.593521,5.924782


In [126]:
# Pass the feature columns of every GEO dataset through the log2_to_ln helper
# (same call as before, applied per dataset in geoData's key order:
# GSE65216, then GSE76250).
# NOTE(review): in the recorded outputs the converted MCF2L2 values equal the
# raw values times ~0.6931 (ln 2) — presumably a pure rescaling; confirm
# against the helper's definition.
for k in geoData.keys():
    geoData[k]['data'][geoData[k]['featureColumns']] = (
        geoData[k]['data'][geoData[k]['featureColumns']].transform(log2_to_ln)
    )
print('DONE!!!')

DONE!!!


In [108]:
# Debug snapshot 002: the GSE76250 feature columns after the log2_to_ln
# conversion (MCF2L2 is used as the spot-check gene).
print('Writing DataFrame to CSV - Start')
geoData['GSE76250']['data'].loc[:, geoData['GSE76250']['featureColumns']].to_csv(
    '../Data/validationSet_test_002_logConvert.csv',
    index=False,
)
print('Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End


In [127]:
# Spot check: MCF2L2 values of GSE65216 following the log2_to_ln conversion cell.
geoData['GSE65216']['data']['MCF2L2']

0     2.218859
1     2.245348
2     1.919577
3     2.037477
4     2.216875
        ...   
61    1.948284
62    1.958437
63    2.005240
64    1.957674
65    1.947322
Name: MCF2L2, Length: 66, dtype: float64

In [128]:
# Spot check: MCF2L2 values of GSE76250 following the log2_to_ln conversion cell.
geoData['GSE76250']['data']['MCF2L2']

0      2.159252
1      1.983577
2      2.007528
3      2.153694
4      2.110788
         ...   
193    2.016468
194    1.980726
195    1.992660
196    2.074311
197    1.951560
Name: MCF2L2, Length: 198, dtype: float64

In [86]:
# Display the GSE76250 feature columns following the log2_to_ln conversion cell.
geoData['GSE76250']['data'][geoData['GSE76250']['featureColumns']]

Unnamed: 0,DDX11L1,MIR1302-11,OR4F5,OTTHUMG00000002525,LOC100132287,LOC101060495,OR4F29,OTTHUMG00000156971,OTTHUMG00000002330,OTTHUMG00000002408,...,DUX4L2.6,DUX4L2.7,DUX4L2.8,DUX4L2.9,DUX4L2.10,DUX4L2.11,DUX4L2.12,DUX4L2.13,FRG2.1,DUX4.2
0,2.975697,2.832840,1.448734,1.293600,4.257922,3.029626,0.869340,3.350732,2.688364,1.161447,...,4.418267,4.418267,4.418267,4.431329,4.431329,4.431329,4.418267,4.418267,1.812919,4.348859
1,2.929835,2.496710,1.152329,1.363402,4.221135,3.378657,1.040851,3.456686,2.602864,1.370790,...,4.056790,4.056790,4.056790,4.122321,4.122321,4.122321,4.056790,4.056790,1.674741,4.004839
2,2.901448,2.517652,1.291958,1.500565,4.175256,3.334623,0.706376,3.356402,2.684671,1.363451,...,4.029168,4.029168,4.029168,4.031044,4.031044,4.031044,4.029168,4.029168,1.644075,4.030950
3,3.278592,2.751591,1.383652,1.943141,4.989693,4.003073,0.984419,3.432970,2.726108,1.830503,...,4.284167,4.284167,4.284167,4.290658,4.290658,4.290658,4.284167,4.284167,1.821347,4.172501
4,3.137797,2.623670,1.440248,1.271854,4.373673,3.539590,0.828655,3.489878,2.806850,1.541217,...,4.305086,4.305086,4.305086,4.382070,4.382070,4.382070,4.305086,4.305086,1.698246,4.303526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,2.946795,2.492150,1.142234,1.279643,4.152688,3.717185,0.981129,3.622355,2.611798,1.457310,...,4.052302,4.052302,4.052302,4.048064,4.048064,4.048064,4.052302,4.052302,1.770861,3.962508
194,3.017112,2.461111,1.186729,1.283897,4.333181,2.901057,0.786407,3.375754,2.653939,1.269446,...,4.033351,4.033351,4.033351,4.023282,4.023282,4.023282,4.033351,4.033351,1.638277,4.002404
195,3.036762,2.512403,1.155775,1.358854,4.383624,3.167514,0.924223,3.305485,2.619879,1.237189,...,4.064903,4.064903,4.064903,4.051991,4.051991,4.051991,4.064903,4.064903,1.648837,4.050150
196,3.098421,2.772779,1.375210,1.474965,4.253327,2.861309,0.704951,3.352291,2.684893,1.420382,...,4.190362,4.190362,4.190362,4.416965,4.416965,4.416965,4.190362,4.190362,1.797692,4.106746


In [88]:
# Same display as the previous cell (GSE76250 feature columns); the recorded
# output is identical.
geoData['GSE76250']['data'][geoData['GSE76250']['featureColumns']]

Unnamed: 0,DDX11L1,MIR1302-11,OR4F5,OTTHUMG00000002525,LOC100132287,LOC101060495,OR4F29,OTTHUMG00000156971,OTTHUMG00000002330,OTTHUMG00000002408,...,DUX4L2.6,DUX4L2.7,DUX4L2.8,DUX4L2.9,DUX4L2.10,DUX4L2.11,DUX4L2.12,DUX4L2.13,FRG2.1,DUX4.2
0,2.975697,2.832840,1.448734,1.293600,4.257922,3.029626,0.869340,3.350732,2.688364,1.161447,...,4.418267,4.418267,4.418267,4.431329,4.431329,4.431329,4.418267,4.418267,1.812919,4.348859
1,2.929835,2.496710,1.152329,1.363402,4.221135,3.378657,1.040851,3.456686,2.602864,1.370790,...,4.056790,4.056790,4.056790,4.122321,4.122321,4.122321,4.056790,4.056790,1.674741,4.004839
2,2.901448,2.517652,1.291958,1.500565,4.175256,3.334623,0.706376,3.356402,2.684671,1.363451,...,4.029168,4.029168,4.029168,4.031044,4.031044,4.031044,4.029168,4.029168,1.644075,4.030950
3,3.278592,2.751591,1.383652,1.943141,4.989693,4.003073,0.984419,3.432970,2.726108,1.830503,...,4.284167,4.284167,4.284167,4.290658,4.290658,4.290658,4.284167,4.284167,1.821347,4.172501
4,3.137797,2.623670,1.440248,1.271854,4.373673,3.539590,0.828655,3.489878,2.806850,1.541217,...,4.305086,4.305086,4.305086,4.382070,4.382070,4.382070,4.305086,4.305086,1.698246,4.303526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,2.946795,2.492150,1.142234,1.279643,4.152688,3.717185,0.981129,3.622355,2.611798,1.457310,...,4.052302,4.052302,4.052302,4.048064,4.048064,4.048064,4.052302,4.052302,1.770861,3.962508
194,3.017112,2.461111,1.186729,1.283897,4.333181,2.901057,0.786407,3.375754,2.653939,1.269446,...,4.033351,4.033351,4.033351,4.023282,4.023282,4.023282,4.033351,4.033351,1.638277,4.002404
195,3.036762,2.512403,1.155775,1.358854,4.383624,3.167514,0.924223,3.305485,2.619879,1.237189,...,4.064903,4.064903,4.064903,4.051991,4.051991,4.051991,4.064903,4.064903,1.648837,4.050150
196,3.098421,2.772779,1.375210,1.474965,4.253327,2.861309,0.704951,3.352291,2.684893,1.420382,...,4.190362,4.190362,4.190362,4.416965,4.416965,4.416965,4.190362,4.190362,1.797692,4.106746


In [140]:
# Fresh scaler for the GEO data. Rebinding `scaler` discards the instance that
# was fitted on the TCGA-BRCA gene columns in the earlier scaling cell.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [141]:
# Standardise the feature columns of each GEO dataset on that dataset's own
# statistics. fit_transform refits the scaler on every call, so after this cell
# `scaler` holds the fit for GSE65216 (the last call).
print('Apply Scaling - Start')
geoData['GSE76250']['data'][geoData['GSE76250']['featureColumns']] = scaler.fit_transform(
    geoData['GSE76250']['data'][geoData['GSE76250']['featureColumns']]
)
geoData['GSE65216']['data'][geoData['GSE65216']['featureColumns']] = scaler.fit_transform(
    geoData['GSE65216']['data'][geoData['GSE65216']['featureColumns']]
)
print('Apply Scaling - End')

Apply Scaling - Start
Apply Scaling - End


In [113]:
# Debug snapshot 003: the GSE76250 feature columns after standard scaling
# (MCF2L2 is used as the spot-check gene).
print('Writing DataFrame to CSV - Start')
geoData['GSE76250']['data'].loc[:, geoData['GSE76250']['featureColumns']].to_csv(
    '../Data/validationSet_test_003_scaling.csv',
    index=False,
)
print('Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End


In [131]:
# Spot check: MCF2L2 values of GSE65216 following the scaling cell.
geoData['GSE65216']['data']['MCF2L2']

0     2.906656
1     3.251919
2    -0.994381
3     0.542396
4     2.880789
        ...   
61   -0.620202
62   -0.487861
63    0.122201
64   -0.497801
65   -0.632735
Name: MCF2L2, Length: 66, dtype: float64

In [132]:
# Spot check: MCF2L2 values of GSE76250 following the scaling cell.
geoData['GSE76250']['data']['MCF2L2']

0      1.609710
1     -0.749142
2     -0.427544
3      1.535085
4      0.958975
         ...   
193   -0.307491
194   -0.787413
195   -0.627182
196    0.469178
197   -1.179038
Name: MCF2L2, Length: 198, dtype: float64

In [91]:
# Display the GSE76250 feature columns following the scaling cell.
geoData['GSE76250']['data'][geoData['GSE76250']['featureColumns']]

Unnamed: 0,DDX11L1,MIR1302-11,OR4F5,OTTHUMG00000002525,LOC100132287,LOC101060495,OR4F29,OTTHUMG00000156971,OTTHUMG00000002330,OTTHUMG00000002408,...,DUX4L2.6,DUX4L2.7,DUX4L2.8,DUX4L2.9,DUX4L2.10,DUX4L2.11,DUX4L2.12,DUX4L2.13,FRG2.1,DUX4.2
0,-0.452357,2.879262,0.921711,-0.794152,-0.150037,-1.127452,-0.105342,-0.838236,-0.247608,-1.397034,...,2.229380,2.229380,2.229380,2.065073,2.065073,2.065073,2.229380,2.229380,0.705922,2.056506
1,-0.721871,-0.666290,-0.804442,-0.385977,-0.314934,0.000546,1.096420,-0.275093,-0.945955,-0.276484,...,-0.099085,-0.099085,-0.099085,0.266808,0.266808,0.266808,-0.099085,-0.099085,-0.599015,0.001646
2,-0.888691,-0.445389,0.008710,0.416109,-0.520589,-0.141762,-1.247225,-0.808101,-0.277772,-0.315768,...,-0.277017,-0.277017,-0.277017,-0.264376,-0.264376,-0.264376,-0.277017,-0.277017,-0.888617,0.157608
3,1.327642,2.022231,0.542700,3.004146,3.130133,2.018540,0.701010,-0.401142,0.060676,2.184225,...,1.365572,1.365572,1.365572,1.246440,1.246440,1.246440,1.365572,1.365572,0.785521,1.003107
4,0.500242,0.672908,0.872295,-0.921320,0.368816,0.520652,-0.390424,-0.098678,0.720152,0.635761,...,1.500319,1.500319,1.500319,1.778411,1.778411,1.778411,1.500319,1.500319,-0.377034,1.785724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,-0.622204,-0.714391,-0.863227,-0.875773,-0.621751,1.094606,0.677954,0.605432,-0.872984,0.186632,...,-0.127995,-0.127995,-0.127995,-0.165327,-0.165327,-0.165327,-0.127995,-0.127995,0.308731,-0.251197
194,-0.208978,-1.041797,-0.604108,-0.850894,0.187313,-1.542963,-0.686454,-0.705242,-0.528784,-0.818946,...,-0.250071,-0.250071,-0.250071,-0.309545,-0.309545,-0.309545,-0.250071,-0.250071,-0.943374,-0.012898
195,-0.093503,-0.500752,-0.784372,-0.412571,0.413424,-0.681826,0.279218,-1.078720,-0.806983,-0.991612,...,-0.046827,-0.046827,-0.046827,-0.142476,-0.142476,-0.142476,-0.046827,-0.046827,-0.843646,0.272292
196,0.268843,2.245727,0.493533,0.266405,-0.170634,-1.671422,-1.257210,-0.829947,-0.275960,-0.011033,...,0.761320,0.761320,0.761320,1.981478,1.981478,1.981478,0.761320,0.761320,0.562120,0.610345


# Rename GEO columns to adhere to TCGA-BRCA synonyms

In [142]:
total = len(geoData.keys())
counter = 1
print("***- Renaming gene symbol to TCGA-BRCA from GEO - Start")

for k in geoData.keys():
    # Apply the dataset's synonym table one entry at a time, in place.
    # The 'featureColumns' entry computed earlier is not updated here, so it
    # still holds the labels from before the rename.
    for oldSymbol, newSymbol in geoData[k]['synonyms'].items():
        geoData[k]['data'].rename(columns={oldSymbol: newSymbol}, inplace=True)

    print(f"{counter}/{total} - Dataset {k} - Renamed Gene Symbols")
    counter += 1

print("***- Renaming gene symbol to TCGA-BRCA from GEO - End")

***- Renaming gene symbol to TCGA-BRCA from GEO - Start
1/2 - Dataset GSE65216 - Renamed Gene Symbols
2/2 - Dataset GSE76250 - Renamed Gene Symbols
***- Renaming gene symbol to TCGA-BRCA from GEO - End


# Combine GEO data into 1 set

In [143]:
# Stack the validation columns of both GEO datasets (GSE65216 rows first, then
# GSE76250); each dataset keeps its own row labels.
combinedData = pd.concat(
    [geoData[name]['data'][featuresValidation] for name in ('GSE65216', 'GSE76250')]
)

# Write train/test set

In [21]:
# Persist the pre-processed TCGA-BRCA frame; row labels are not written.
print('Writing DataFrame to CSV - Start')
dfPatientGenes.to_csv('../Data/geneDataPreProcessed.csv', index=False)
print('Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End


# Write validation set

In [144]:
# Persist the combined GEO validation frame; row labels are not written.
print('Writing DataFrame to CSV - Start')
combinedData.to_csv('../Data/validationSet.csv', index=False)
print('Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End
