# PreProcessing data
### (validation, standardization, imputation, augmentation, deduplication)

# Defining Features for GEO data (validation set)

In [3]:
%run "..\Model\DataHelpers.ipynb"

# Gene symbols that make up the "statistical" feature set.
featuresStatistical = [ 'MCF2L2', 'FOXC1' , 'YBX1'  , 'IGF2BP2', 'FSCN1'   ,
                        'GABRP' , 'SOX10' , 'CENPA' , 'PADI2'  , 'CDC20'   ,
                        'BCL11A', 'HAPLN3', 'ANP32E', 'SFT2D2' , 'B3GNT5'  ,
                        'ANKS6' , 'FOXA1' , 'PSAT1' , 'TBX19'  , 'CDCA7'   ,
                        'CENPW' , 'UGT8'  , 'RGMA'  , 'EN1'    , 'FAM171A1' ]

# Gene symbols that make up the "research papers" feature set.
featuresResearchPapers = [ 'BRCA1'  , 'BRCA2', 'CD274' , 'MKI67' , 'PDCD1',
                           'PIK3CA' , 'TP53' , 'LRPPRC', 'DCLK1' , 'TOP2A',
                           'TACSTD2', 'ROR1' , 'TTN'   , 'CTLA4' , 'EGFR' ,
                           'EPCAM'  , 'MYC'  , 'PTEN'  , 'CDK6'  , 'DDX3X',
                           'SRC'    , 'YES1' , 'FYN'   , 'TBC1D1', 'FOXC1',
                           'EZH2'   , 'LAG3' , 'GATA3' , 'CCND1' , 'PRR4' ,
                           'YOD1' ]

# Columns selected for the GEO validation set: both gene lists followed by
# constTargetAndMetadata (not defined in this notebook; presumably provided by
# the %run of DataHelpers.ipynb - confirm).
# 'FOXC1' is listed in both gene sets. dict.fromkeys keeps the first occurrence
# of every name and preserves order, so selecting these columns from a
# DataFrame never returns the same column twice.
featuresValidation = list(dict.fromkeys(
    featuresStatistical + featuresResearchPapers + constTargetAndMetadata
))

# Define GEO data

In [4]:
# Registry of the GEO series used in this notebook, keyed by accession.
# Each entry records the series matrix file, the platform annotation file,
# the header/footer row counts to skip for each, and the processing
# description text of the series.
geoData = {
    'GSE65216': {
        'dataSetName': 'GSE65216',
        'dataSetFile': 'GSE65216-GPL570_series_matrix.txt',
        'dataSetSkipRows': 67,
        'dataSetSkipFooter': 1,
        'platformName': 'GPL570',
        'platformFile': 'GPL570-55999.txt',
        'platformSkipRows': 16,
        'platformSkipFooter': 0,
        'processing': 'Data were analyzed using standard AffyCDF or Brainarray HGU133Plus2_Hs_ENTREZG version 13 custom chipset definition file. Data were normalized using GC-RMA. Batch and hybridization effects were corrected using a linear model in which they were treated as fixed effects. Samples with technical replicates were subsequently averaged.',
    },
    'GSE76250': {
        'dataSetName': 'GSE76250',
        'dataSetFile': 'GSE76250_series_matrix.txt',
        'dataSetSkipRows': 74,
        'dataSetSkipFooter': 1,
        'platformName': 'GPL17586',
        'platformFile': 'GPL17586-45144.txt',
        'platformSkipRows': 15,
        'platformSkipFooter': 1,
        'processing': 'The data were analyzed with Robust Multichip Analysis (RMA) algorithm using Affymetrix default analysis settings and global scaling as normalization method (ExpressionConsole 1.3.1) probe group file: HTA-2_0.r1.pgf meta-probeset file: HTA-2_0.r1.Psrs.mps',
    },
}

# Define synonyms for TCGA-BRCA based on KEGG and HEP

In [5]:
# Per-dataset mapping from the gene symbol found in the GEO data (key) to the
# symbol it is renamed to later in this notebook (value).
geoData['GSE65216'].update(synonyms={
    'IGJ': 'JCHAIN',
    'PRH1-PRR4': 'PRR4',
    'FIGF': 'VEGFD',
})

geoData['GSE76250'].update(synonyms={
    'DARC': 'ACKR1',
    'IGJ': 'JCHAIN',
    'WDR67': 'TBC1D31',
    'FIGF': 'VEGFD',
    'IL8': 'CXCL8',
})

# Get files to load TCGA-BRCA RNA-SEQ

In [6]:
import pandas as pd

# gene_type values to keep when the per-sample gene files are filtered.
targetGenes = ['protein_coding']

# NOTE(review): the original author flagged this value as wrong ("FOUT");
# it is not read by any cell visible in this notebook.
variant = 'research_logScaled'

# Clinical table: one row per sample file.
df = pd.read_csv('../Data/clinical.csv')

# Only the file path, the tnbc label and the patient uuid are needed for loading.
geneFiles = df.loc[:, ['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Load data - TCGA-BRCA

In [7]:
# Build dfPatientGenes: one row per file listed in geneFiles, with one column
# per selected gene (value taken from 'stranded_first') plus the 'tnbc' label
# and the patient uuid as 'case_id'.
#
# The single-row frames are collected in a list and concatenated once after
# the loop; calling pd.concat inside the loop copies the whole accumulated
# frame on every iteration.
patientFrames = []
total_files = len(geneFiles)
for position, (idx, row) in enumerate(geneFiles.iterrows(), start=1):
    file_name = row['file']
    # The clinical table stores paths with backslashes; forward slashes are
    # accepted on Windows as well and make the path usable on other systems.
    file_path = f"../Data/{file_name}".replace('\\', '/')

    print(f"File {position}/{total_files} - {file_name}", end="\r")

    # First attempt skips the first line of the file.
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])

    # If the skipped first line was the header itself, the column names are
    # gone; read the file again without skipping anything.
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')

    # Keep only the targeted gene types and the value column we want.
    dfTarget = dfGenes.loc[dfGenes['gene_type'].isin(targetGenes),
                           ['gene_name', 'stranded_first']]

    # Turn the (gene, value) rows into a single row with one column per gene.
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Label and patient reference come straight from the current geneFiles
    # row, so no additional lookup in the clinical table is needed.
    dfNewGenes["tnbc"] = row['tnbc']
    dfNewGenes['case_id'] = row['bcr_patient_uuid']

    patientFrames.append(dfNewGenes)

# pd.concat raises on an empty list, so fall back to an empty frame.
dfPatientGenes = pd.concat(patientFrames) if patientFrames else pd.DataFrame()

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Loading data - GEO

In [8]:
import os

# CSV files holding the two GEO datasets.
geoData_GSE65216_file = os.path.join('../Data', 'geoData_GSE65216.csv')
geoData_GSE76250_file = os.path.join('../Data', 'geoData_GSE76250.csv')

print(f"***- Reading GEO data - Start")
# Read each CSV and store the frame under the 'data' key of its dataset entry.
for geo_name, geo_csv in (('GSE65216', geoData_GSE65216_file),
                          ('GSE76250', geoData_GSE76250_file)):
    geoData[geo_name]['data'] = pd.read_csv(geo_csv, sep=',')
print(f"***- Reading GEO data - End")

***- Reading GEO data - Start
***- Reading GEO data - End


# Feature columns - GEO

In [9]:
print(f"***- Determine feature columns - Start")
# Everything except the last two columns is treated as a feature; the original
# comment names the trailing two as tnbc / case_id.
gse76250_frame = geoData['GSE76250']['data']
onlyFeatureColumns_GSE76250 = gse76250_frame.shape[1] - 2
geoData['GSE76250']['featureColumns'] = gse76250_frame.columns[:onlyFeatureColumns_GSE76250]

gse65216_frame = geoData['GSE65216']['data']
onlyFeatureColumns_GSE65216 = gse65216_frame.shape[1] - 2
geoData['GSE65216']['featureColumns'] = gse65216_frame.columns[:onlyFeatureColumns_GSE65216]
print(f"***- Determine feature columns - End")

***- Determine feature columns - Start
***- Determine feature columns - End


# Deduplication TCGA-BRCA - Check rows

In [10]:
# Show rows that are exact copies of an earlier row; the recorded run
# returned an empty frame, i.e. no duplicated rows.
dfPatientGenes.loc[dfPatientGenes.duplicated()]

gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AC020765.6,AC010980.1,AC004233.4,AL451106.1,AC008763.4,AC006486.3,AL391628.1,AP006621.6,tnbc,case_id


# Deduplication GEO - Check rows

In [11]:
total = len(geoData)
counter = 1
print(f"***- Checking dupplicates - Start")

# NOTE(review): this inspects duplicated column labels, while the section
# heading talks about rows - confirm which check is intended.
for k, dataset in geoData.items():
    if dataset['data'].columns.duplicated().any():
        print(f'***- Dataset {k} has duplicates')
    else:
        print(f'***- Dataset {k} has no duplicates')

    counter += 1

print(f"***- Checking dupplicates - End")

***- Checking dupplicates - Start
***- Dataset GSE65216 has no duplicates
***- Dataset GSE76250 has no duplicates
***- Checking dupplicates - End


# Deduplication TCGA-BRCA - Check columns ( 24 duplicates )
### Duplicate columns removed based on sum(column) == 0

In [12]:
# Column labels that repeat an earlier label (every occurrence after the first).
duplCol = dfPatientGenes.columns[dfPatientGenes.columns.duplicated()]
numColB4 = len(dfPatientGenes.columns)
print(f'Total number of columns    : {len(dfPatientGenes.columns)}')
print(f'Number of duplicate columns: {len(duplCol)}')

print(f'Removing duplicates - Start')

# Keep the first column for every label and drop the later repeats.
# A boolean column mask gives the same keep-first result as transposing,
# de-duplicating on the label and transposing back, without copying the
# whole wide frame twice.
# NOTE(review): the section heading says duplicates are removed based on
# sum(column) == 0, but the kept column is simply the first occurrence
# regardless of its values - confirm the dropped columns are really empty.
isFirstOccurrence = ~dfPatientGenes.columns.duplicated(keep='first')
# .copy() gives an independent frame so later column assignments do not
# trigger chained-assignment warnings.
dfPatientGenes = dfPatientGenes.loc[:, isFirstOccurrence].copy()

print(f'Removing duplicates - End')

print(f'Total number of columns    : {len(dfPatientGenes.columns)}')
print(f'Check                      : {numColB4}-{len(duplCol)} = {len(dfPatientGenes.columns)}')

# Sanity check: exactly the repeated labels were removed.
assert len(dfPatientGenes.columns) == numColB4 - len(duplCol)

Total number of columns    : 19964
Number of duplicate columns: 24
Removing duplicates - Start
Removing duplicates - End
Total number of columns    : 19940
Check                      : 19964-24 = 19940


# Imputation - TBD -> use 0 as value?

In [13]:
# TBD

# Standardization TCGA-BRCA - Log transformation - Manage outliers

In [14]:
# Source: Artificial Intelligence with Python, 2nd edition. Packt Publishing Ltd, ISBN 978-1-83921-953-5.
# p. 87-88
import numpy as np
# All columns except the trailing two (tnbc / case_id) are gene columns.
onlyFeatureColumns = dfPatientGenes.shape[1] - 2
allGeneColumns = dfPatientGenes.columns[:onlyFeatureColumns]

# The gene columns are object-typed at this point; cast them to float first.
dfPatientGenes[allGeneColumns] = dfPatientGenes[allGeneColumns].astype(float)

print(f'Apply LOG - Start')
# Natural log of (value + 1); the +1 keeps zero values finite.
dfPatientGenes[allGeneColumns] = np.log(dfPatientGenes[allGeneColumns] + 1)
print(f'Apply LOG - End')

Apply LOG - Start
Apply LOG - End


# Standardization TCGA-BRCA - Removing mean and Scaling

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

print(f'Apply Scaling - Start')
# Fit on the gene columns, then replace them with their standardized values.
# NOTE(review): the scaler is fit on every row of dfPatientGenes; no
# train/test split is visible before this point - confirm that is intended.
scaler.fit(dfPatientGenes[allGeneColumns])
dfPatientGenes[allGeneColumns] = scaler.transform(dfPatientGenes[allGeneColumns])
print(f'Apply Scaling - End')

Apply Scaling - Start
Apply Scaling - End


# Standardization GEO - Removing mean and Scaling

In [16]:
print(f'Apply Scaling - Start')
# Each GEO dataset is standardized on its own feature columns.
# NOTE(review): the shared `scaler` is re-fit for every dataset, so after this
# cell it holds the GSE65216 fit rather than the TCGA-BRCA one.
for geo_name in ('GSE76250', 'GSE65216'):
    geo_frame = geoData[geo_name]['data']
    geo_features = geoData[geo_name]['featureColumns']
    geo_frame[geo_features] = scaler.fit_transform(geo_frame[geo_features])
print(f'Apply Scaling - End')

Apply Scaling - Start
Apply Scaling - End


# Rename GEO columns to adhere to TCGA-BRCA synonyms

In [17]:
total = len(geoData)
counter = 1
print(f"***- Renaming gene symbol to TCGA-BRCA from GEO - Start")

for k, dataset in geoData.items():
    # Apply the dataset's synonym table one symbol at a time, in place.
    for geo_symbol, tcga_symbol in dataset['synonyms'].items():
        dataset['data'].rename(columns={geo_symbol: tcga_symbol}, inplace=True)

    print(f"{counter}/{total} - Dataset {k} - Renamed Gene Symbols")
    counter += 1

print(f"***- Renaming gene symbol to TCGA-BRCA from GEO - End")

***- Renaming gene symbol to TCGA-BRCA from GEO - Start
1/2 - Dataset GSE65216 - Renamed Gene Symbols
2/2 - Dataset GSE76250 - Renamed Gene Symbols
***- Renaming gene symbol to TCGA-BRCA from GEO - End


# Combine GEO data into 1 set

In [18]:
# Stack the validation columns of both GEO datasets (GSE65216 first) into one frame.
combinedData = pd.concat(
    [geoData[geo_name]['data'][featuresValidation] for geo_name in ('GSE65216', 'GSE76250')]
)

# Write train/test set

In [19]:
print(f'Writing DataFrame to CSV - Start')
# Persist the preprocessed TCGA-BRCA frame; the row index is not written.
dfPatientGenes.to_csv(path_or_buf=f'../Data/geneDataPreProcessed.csv', index=False)
print(f'Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End


# Write validation set

In [20]:
print(f'Writing DataFrame to CSV - Start')
# Persist the combined GEO validation frame; the row index is not written.
combinedData.to_csv(path_or_buf=f'../Data/validationSet.csv', index=False)
print(f'Writing DataFrame to CSV - End')

Writing DataFrame to CSV - Start
Writing DataFrame to CSV - End
