# PreProcessing data
### (validation, standardization, imputation, augmentation, deduplication)

# Defining Features for GEO data (validation set)

In [32]:
%run "..\Model\DataHelpers.ipynb"

# Gene symbols used as model features for the GEO validation set (27 in total).
# The name suggests they were chosen by a LASSO selection step -- that step is not
# part of this notebook.
featuresLASSO = [
    'CD1A', 'CSF2RB', 'EPCAM', 'ERBB2', 'ESR1',
    'EZH2', 'FGB', 'FOXA1', 'FOXC1', 'GATA3',
    'LAMA2', 'LMNA', 'MDGA2', 'OBSCN', 'OGN',
    'PGR', 'SELL', 'SRC', 'TACSTD2', 'TBC1D22B',
    'TFF1', 'TGFB3', 'UBE2C', 'VTCN1', 'WTAP',
    'YES1', 'YOD1',
]

# 'CD1A','CSF2RB','EPCAM','ERBB2','ESR1',
# 'EZH2','FGB','FOXA1','FOXC1','GATA3',
# 'LAMA2','LMNA','MDGA2','OBSCN','OGN',
# 'PGR','SELL','SRC','TACSTD2','TBC1D22B',
# 'TFF1','TGFB3','UBE2C','VTCN1','WTAP',
# 'YES1','YOD1'


# Columns kept in the validation set: the selected gene features followed by
# `constTargetAndMetadata`, which is not defined in this notebook (presumably the
# target / metadata column names provided by DataHelpers -- TODO confirm).
featuresValidation = featuresLASSO + constTargetAndMetadata

# Define GEO data

In [33]:
# Registry of the GEO series used for validation, keyed by series accession.
# Each entry names the series-matrix file and its platform annotation file, carries
# skip-row / skip-footer counts for both (not consumed in this notebook; presumably
# used by the loader that produced the CSVs -- TODO confirm), and keeps the
# processing note of the series verbatim.
geoData = {
    'GSE65216': {
        'dataSetName': 'GSE65216',
        'dataSetFile': 'GSE65216-GPL570_series_matrix.txt',
        'dataSetSkipRows': 67,
        'dataSetSkipFooter': 1,
        'platformName': 'GPL570',
        'platformFile': 'GPL570-55999.txt',
        'platformSkipRows': 16,
        'platformSkipFooter': 0,
        'processing': 'Data were analyzed using standard AffyCDF or Brainarray HGU133Plus2_Hs_ENTREZG version 13 custom chipset definition file. Data were normalized using GC-RMA. Batch and hybridization effects were corrected using a linear model in which they were treated as fixed effects. Samples with technical replicates were subsequently averaged.',
    },
    'GSE76250': {
        'dataSetName': 'GSE76250',
        'dataSetFile': 'GSE76250_series_matrix.txt',
        'dataSetSkipRows': 74,
        'dataSetSkipFooter': 1,
        'platformName': 'GPL17586',
        'platformFile': 'GPL17586-45144.txt',
        'platformSkipRows': 15,
        'platformSkipFooter': 1,
        'processing': 'The data were analyzed with Robust Multichip Analysis (RMA) algorithm using Affymetrix default analysis settings and global scaling as normalization method (ExpressionConsole 1.3.1) probe group file: HTA-2_0.r1.pgf meta-probeset file: HTA-2_0.r1.Psrs.mps',
    },
}

# Define synonyms for TCGA-BRCA based on KEGG and HEP

In [34]:
# Per-dataset mapping from the gene symbol found in the GEO columns (key) to the
# symbol it is renamed to later in this notebook (value).
geoData['GSE65216']['synonyms'] = dict([
    ('IGJ', 'JCHAIN'),
    ('PRH1-PRR4', 'PRR4'),
    ('FIGF', 'VEGFD'),
])

geoData['GSE76250']['synonyms'] = dict([
    ('DARC', 'ACKR1'),
    ('IGJ', 'JCHAIN'),
    ('WDR67', 'TBC1D31'),
    ('FIGF', 'VEGFD'),
    ('IL8', 'CXCL8'),
])

# Loading data - GEO

In [35]:
import os

# Locations of the per-series CSV exports, relative to this notebook.
geoData_GSE65216_file = os.path.join('../Data', 'geoData_GSE65216.csv')
geoData_GSE76250_file = os.path.join('../Data', 'geoData_GSE76250.csv')

print(f"***- Reading GEO data - Start")
# Read each CSV into a DataFrame and attach it to its registry entry under 'data'.
for dataSetName, csvFile in (
    ('GSE65216', geoData_GSE65216_file),
    ('GSE76250', geoData_GSE76250_file),
):
    geoData[dataSetName]['data'] = pd.read_csv(csvFile, sep=',')
print(f"***- Reading GEO data - End")

***- Reading GEO data - Start
***- Reading GEO data - End


# Feature columns - GEO

In [36]:
print(f"***- Determine feature columns - Start")
# Everything except the last two columns is treated as a feature column; the
# original note names the trailing pair as tnbc / case_id (positional assumption --
# verify against the CSV layout).
columns_GSE76250 = geoData['GSE76250']['data'].columns
onlyFeatureColumns_GSE76250 = len(columns_GSE76250) - 2
geoData['GSE76250']['featureColumns'] = columns_GSE76250[:onlyFeatureColumns_GSE76250]

columns_GSE65216 = geoData['GSE65216']['data'].columns
onlyFeatureColumns_GSE65216 = len(columns_GSE65216) - 2
geoData['GSE65216']['featureColumns'] = columns_GSE65216[:onlyFeatureColumns_GSE65216]
print(f"***- Determine feature columns - End")

***- Determine feature columns - Start
***- Determine feature columns - End


# Deduplication GEO - Check for duplicate columns

In [37]:
print(f"***- Checking dupplicates - Start")

# Report, per dataset, whether any column label occurs more than once.
# Only column labels are inspected here; rows are not compared.
# (The unused total/counter bookkeeping was dropped: nothing in this cell read it.)
for k, entry in geoData.items():
    hasDuplicateColumns = entry['data'].columns.duplicated().any()
    if hasDuplicateColumns:
        print(f'***- Dataset {k} has duplicates')
    else:
        print(f'***- Dataset {k} has no duplicates')

print(f"***- Checking dupplicates - End")

***- Checking dupplicates - Start
***- Dataset GSE65216 has no duplicates
***- Dataset GSE76250 has no duplicates
***- Checking dupplicates - End


# Standardization GEO - Removing mean and Scaling

In [38]:
from sklearn.preprocessing import StandardScaler
# One scaler instance, reused below; each fit_transform call refits it, so after
# the scaling cell it holds the statistics of the last dataset it was fitted on.
scaler = StandardScaler()

In [39]:
print(f'Apply Scaling - Start')
# Standardize the feature columns of each dataset independently: the scaler is
# refitted per dataset, and the scaled values overwrite the originals in place.
for dataSetName in ('GSE76250', 'GSE65216'):
    entry = geoData[dataSetName]
    featureColumns = entry['featureColumns']
    entry['data'][featureColumns] = scaler.fit_transform(entry['data'][featureColumns])
print(f'Apply Scaling - End')

Apply Scaling - Start
Apply Scaling - End


# Rename GEO columns to adhere to TCGA-BRCA synonyms

In [40]:
total = len(geoData)
print(f"***- Renaming gene symbol to TCGA-BRCA from GEO - Start")

# Apply each dataset's synonym table to its DataFrame columns, one symbol at a
# time and in place; symbols that are not present as columns are left untouched
# by rename's default behavior.
for counter, k in enumerate(geoData, start=1):
    frame = geoData[k]['data']
    for geoSymbol, tcgaSymbol in geoData[k]['synonyms'].items():
        frame.rename(columns={geoSymbol: tcgaSymbol}, inplace=True)

    print(f"{counter}/{total} - Dataset {k} - Renamed Gene Symbols")

print(f"***- Renaming gene symbol to TCGA-BRCA from GEO - End")

***- Renaming gene symbol to TCGA-BRCA from GEO - Start
1/2 - Dataset GSE65216 - Renamed Gene Symbols
2/2 - Dataset GSE76250 - Renamed Gene Symbols
***- Renaming gene symbol to TCGA-BRCA from GEO - End


# Combine GEO data into 1 set

In [41]:
# Restrict both datasets to the validation columns and stack them row-wise
# (GSE65216 first). The original row indices are kept, so index labels may repeat.
validationFrames = [
    geoData[dataSetName]['data'][featuresValidation]
    for dataSetName in ('GSE65216', 'GSE76250')
]
combinedData = pd.concat(validationFrames)

# Write validation set

In [42]:
# Persist the combined validation set as CSV; the row index is not written.
validationSetFile = f'../Data/validationSet.csv'

print(f'Writing DataFrame to CSV - Start')
combinedData.to_csv(validationSetFile, index=False)
print(f'Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End
