# PreProcessing data
### (validation, standardization, imputation, augmentation, deduplication)

# Get the list of RNA-Seq files to load

In [381]:
import pandas as pd

# Gene biotypes (values of the per-file `gene_type` column) to keep.
targetGenes = ['protein_coding']

# Label for this preprocessing variant.
# NOTE(review): not referenced in the visible cells - confirm it is still needed.
variant = 'research_logScaled'

# Clinical table: one row per RNA-Seq file with its TNBC label and patient UUID.
df = pd.read_csv('../Data/clinical.csv')
geneFiles = df.loc[:, ['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Load only protein coding

In [382]:
# Build one row per RNA-Seq file: the `stranded_first` value of every targeted
# gene as columns, plus the `tnbc` label and the patient UUID (`case_id`).
#
# Per-file rows are collected in a list and concatenated once after the loop;
# calling pd.concat inside the loop re-copies the growing frame on every
# iteration, which is quadratic in the number of files.
patientFrames = []
total_files = len(geneFiles)
for file_no, row in enumerate(geneFiles.itertuples(index=False), start=1):
    file_name = row.file
    # The clinical table stores paths with backslashes; forward slashes are
    # accepted on Windows as well, so normalise to keep the notebook portable.
    file_path = f"../Data/{file_name}".replace("\\", "/")

    print(f"File {file_no}/{total_files} - {file_name}", end="\r")

    # First attempt: skip the first line of the file.
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])

    # If the first line was actually the header row, skipping it dropped the
    # column names - load the file again without skipping anything.
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')

    # Keep only the targeted gene types and the value column we want.
    isTargetGene = dfGenes['gene_type'].isin(targetGenes)
    dfTarget = dfGenes.loc[isTargetGene, ['gene_name', 'stranded_first']]

    # Pivot to a single-row frame with one column per gene.
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Label and patient UUID come straight from the row being iterated, so no
    # second lookup in `df` is needed (and a file listed twice cannot cause a
    # length-mismatch error).
    dfNewGenes["tnbc"] = row.tnbc
    dfNewGenes['case_id'] = row.bcr_patient_uuid

    patientFrames.append(dfNewGenes)

# ignore_index gives every patient row a unique label instead of repeating 0.
if patientFrames:
    dfPatientGenes = pd.concat(patientFrames, ignore_index=True)
else:
    dfPatientGenes = pd.DataFrame()

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Deduplication - Check rows

In [384]:
# Show rows that are exact duplicates of an earlier row.
# An empty result means there are no duplicate rows.
duplicateRowMask = dfPatientGenes.duplicated()
dfPatientGenes[duplicateRowMask]

gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AC020765.6,AC010980.1,AC004233.4,AL451106.1,AC008763.4,AC006486.3,AL391628.1,AP006621.6,tnbc,case_id


# Deduplication - Check columns ( 24 duplicates )
### Duplicate columns removed based on sum(column) == 0

In [385]:
# Some gene names occur more than once as a column label. Keep the first
# occurrence of each label and drop the later ones.
#
# A boolean column mask is used instead of transposing the frame twice:
# transposing a frame that mixes gene values with tnbc / case_id coerces every
# column to `object` and copies the whole (very wide) table.
# NOTE(review): the section heading says duplicates are removed based on
# sum(column) == 0, but the selection here is purely "keep first" - confirm
# which rule is intended.
isDuplicateColumn = dfPatientGenes.columns.duplicated(keep='first')
duplCol = dfPatientGenes.columns[isDuplicateColumn]
numColB4 = len(dfPatientGenes.columns)
print(f'Total number of columns    : {len(dfPatientGenes.columns)}')
print(f'Number of duplicate columns: {len(duplCol)}')

print('Removing duplicates - Start')

# .copy() so later in-place column assignments do not operate on a slice.
dfPatientGenes = dfPatientGenes.loc[:, ~isDuplicateColumn].copy()

print('Removing duplicates - End')

print(f'Total number of columns    : {len(dfPatientGenes.columns)}')
print(f'Check                      : {numColB4}-{len(duplCol)} = {len(dfPatientGenes.columns)}')

# Fail loudly if the bookkeeping above does not add up.
assert len(dfPatientGenes.columns) == numColB4 - len(duplCol)
assert dfPatientGenes.columns.is_unique

Total number of columns    : 19964
Number of duplicate columns: 24
Removing duplicates - Start
Removing duplicates - End
Total number of columns    : 19940
Check                      : 19964-24 = 19940


# Imputation - TBD -> use a value of 0?

In [4]:
# TBD

# Standardization - Log transformation - Manage outliers

In [386]:
# Source: Artificial Intelligence with Python, 2nd edition. Packt Publishing Ltd, ISBN 978-1-83921-953-5.
# p. 87-88
import numpy as np

# Everything except the last two columns (tnbc / case_id) is treated as a gene
# feature. Assumes those two columns are still the last ones in the frame.
onlyFeatureColumns = len(dfPatientGenes.columns) - 2
allGeneColumns = dfPatientGenes.columns[:onlyFeatureColumns]

# Cast the gene columns to float (they were object before).
geneValuesAsFloat = dfPatientGenes[allGeneColumns].astype(float)
dfPatientGenes[allGeneColumns] = geneValuesAsFloat

print('Apply LOG - Start')
# log(x + 1): the +1 keeps zero values finite.
shiftedGeneValues = dfPatientGenes[allGeneColumns] + 1
dfPatientGenes[allGeneColumns] = np.log(shiftedGeneValues)
print('Apply LOG - End')

Apply LOG - Start
Apply LOG - End


# Standardization - Scaling and removing mean

In [388]:
from sklearn.preprocessing import StandardScaler

# Standardize every gene column (remove the mean, scale to unit variance) and
# write the result back into the same columns of dfPatientGenes.
scaler = StandardScaler()
scaledGeneValues = scaler.fit_transform(dfPatientGenes[allGeneColumns])
dfPatientGenes[allGeneColumns] = scaledGeneValues

# Check data

In [390]:
# Per-gene summary statistics of the scaled values, used to check the
# standardization. The scaling step wrote its result back into dfPatientGenes,
# so that is the frame to inspect; `scaledLogDfPatientGenes` is never defined
# in this notebook and would raise NameError on a fresh kernel.
scaledGeneValues = dfPatientGenes[allGeneColumns]
dfMin = scaledGeneValues.min()
dfMax = scaledGeneValues.max()
dfMean = scaledGeneValues.mean()
dfStd = scaledGeneValues.std()

In [391]:
# One row per gene with its min / max / mean / std after scaling.
summaryColumns = {'min': dfMin, 'max': dfMax, 'mean': dfMean, 'std': dfStd}
pd.DataFrame(summaryColumns)

Unnamed: 0_level_0,min,max,mean,std
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TSPAN6,-7.869879,3.248239,2.072719e-16,1.000512
TNMD,-1.415365,5.239736,2.109083e-16,1.000512
DPM1,-9.465894,3.572000,-8.345423e-16,1.000512
SCYL3,-3.699373,3.151134,7.981788e-16,1.000512
C1orf112,-4.012077,3.526979,1.838175e-15,1.000512
...,...,...,...,...
AL451106.1,0.000000,0.000000,0.000000e+00,0.000000
AC008763.4,-0.273882,9.299309,3.090897e-17,1.000512
AC006486.3,0.000000,0.000000,0.000000e+00,0.000000
AL391628.1,-2.125367,2.842208,3.072716e-16,1.000512


In [392]:
# Persist the preprocessed table; the row index is not written to the file.
outputCsvPath = '../Data/geneDataPreProcessed.csv'
print('Writing DataFrame to CSV - Start')
dfPatientGenes.to_csv(outputCsvPath, index=False)
print('Writing DataFrame to CSV - End')