# Extract genes based on research, featureset 2 (performed best out of 5 featuresets defined)

In [81]:
import pandas as pd

# Tag that ends up in the name of the exported feature file.
variant = 'research_logScaled'

# Genes of interest for this feature set (order defines the output column order).
targetGenes = [
    'BRCA1', 'BRCA2', 'CD274', 'MKI67', 'PDCD1', 'PIK3CA', 'TP53', 'LRPPRC',
    'YOD1', 'DCLK1', 'TOP2A', 'TACSTD2', 'ROR1', 'TTN', 'CTLA4', 'EGFR',
    'EPCAM', 'MYC', 'PTEN', 'CDK6', 'DDX3X', 'SRC', 'YES1', 'FYN',
    'TBC1D1', 'MALAT1', 'FOXC1', 'LAG3', 'GATA3', 'CCND1', 'PRR4',
]

# Clinical table: one row per expression file.
df = pd.read_csv('../Data/clinical.csv')

# Keep only what the extraction loop needs: the relative file path,
# the TNBC label and the patient identifier.
geneFiles = df.loc[:, ['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Iterate over files and collect gene data

In [76]:
# Build one single-row frame per expression file and combine them once at the
# end; concatenating inside the loop re-copies the accumulated data every pass.
patientFrames = []
total_files = len(geneFiles)
for file_number, (_, row) in enumerate(geneFiles.iterrows(), start=1):
    # Read the values by column name rather than by position so that a
    # reordered geneFiles frame cannot silently hand us the wrong field.
    file_name = row['file']
    file_path = f"../Data/{file_name}"

    # Progress counter is independent of the frame's index labels.
    print(f"File {file_number}/{total_files} - {file_name}", end="\r")

    # First attempt: skip the first line of the file.
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])

    # If the first line of the file was the header itself, skipping it lost the
    # column names; load the file again without skipping anything.
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')

    # Keep only the targeted genes and the single value column we use.
    dfTarget = dfGenes.loc[dfGenes['gene_name'].isin(targetGenes),
                           ['gene_name', 'stranded_first']]

    # Pivot to a single-row frame with one column per gene.
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Label and patient id come straight from the row being processed; looking
    # them up again by file name would break if a file name were ever repeated.
    dfNewGenes["tnbc"] = row['tnbc']
    dfNewGenes['case_id'] = row['bcr_patient_uuid']

    patientFrames.append(dfNewGenes)

# One concat for all patients. ignore_index gives every patient a unique row
# label instead of all rows sharing label 0. An empty input yields an empty frame.
dfPatientGenes = (
    pd.concat(patientFrames, ignore_index=True) if patientFrames else pd.DataFrame()
)

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Checking data

In [34]:
# Per-gene summary of the extracted values, used as a quick sanity check.
targetCounts = dfPatientGenes[targetGenes]
dfMin, dfMax, dfMean = targetCounts.min(), targetCounts.max(), targetCounts.mean()

# One row per gene, one column per statistic.
scores = pd.DataFrame({
    'min': dfMin,
    'max': dfMax,
    'mean': dfMean,
})
scores

Unnamed: 0_level_0,min,max,mean
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BRCA1,0,4119,682.292733
BRCA2,4,2626,374.777892
CD274,0,3325,158.169908
MKI67,0,22360,3265.894575
PDCD1,0,1205,62.91607
PIK3CA,135,8166,1462.908905
TP53,1,13646,2628.141249
LRPPRC,10,32167,6601.310133
YOD1,0,4388,503.117707
DCLK1,0,34399,1579.774821


# Apply the natural logarithm, log(x + 1), to flatten skewed data / outliers

In [77]:
# Source: Artificial Intelligence with Python, 2nd edition. Packt Publishing Ltd, ISBN 978-1-83921-953-5.
# p. 87-88
import numpy as np

# log(x + 1) per gene; the +1 keeps zero values finite.
# The result is written to a new frame and dfPatientGenes is left untouched,
# so re-running this cell cannot apply the logarithm a second time.
logDfPatientGenes = (dfPatientGenes[targetGenes] + 1).transform(np.log)
logDfPatientGenes

gene_name,BRCA1,BRCA2,CD274,MKI67,PDCD1,PIK3CA,TP53,LRPPRC,YOD1,DCLK1,...,SRC,YES1,FYN,TBC1D1,MALAT1,FOXC1,LAG3,GATA3,CCND1,PRR4
0,7.178545,5.743003,4.094345,8.273592,2.197225,7.261927,7.698483,8.513185,5.645447,9.634562,...,8.100161,7.561122,6.045005,6.274762,8.903543,3.367296,3.091042,10.029900,11.234679,1.791759
0,5.921578,5.463832,4.875197,7.663877,4.532599,6.432940,6.148468,8.188133,5.303305,5.891644,...,6.741701,7.124478,6.336826,6.889591,8.612685,4.127134,5.814131,8.933532,8.245647,2.564949
0,4.787492,4.962845,3.218876,6.280396,3.637586,5.666427,6.695799,7.553287,5.187386,6.089045,...,6.947937,6.287859,6.317165,6.253829,7.345365,4.663439,3.583519,8.760139,8.667680,2.197225
0,5.293305,5.257495,4.442651,7.243513,3.737670,6.685861,7.556951,8.246696,6.098074,6.539586,...,7.893199,7.240650,6.638568,7.120444,8.872067,5.866468,4.025352,9.365719,10.200773,3.218876
0,6.082219,5.379897,4.454347,7.291656,3.295837,6.849066,7.572503,8.102586,5.746203,5.056246,...,7.355641,7.297091,6.622736,7.126891,8.550628,5.283204,3.784190,9.786054,9.396156,1.945910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,3.465736,3.218876,5.855072,3.806662,1.945910,5.638355,7.192934,7.656810,4.615121,4.442651,...,6.194405,6.905753,6.052089,6.276643,8.445053,5.323010,4.304065,8.468003,7.703008,1.386294
0,5.910797,5.438079,4.820282,7.336286,4.852030,7.174724,7.748460,8.191186,6.059123,6.654153,...,7.093405,7.459339,7.338888,7.452402,8.532279,5.438079,4.653960,9.235131,11.126483,2.772589
0,5.365976,5.164786,4.682131,7.236339,4.043051,6.822197,7.411556,7.839526,5.605802,6.582025,...,7.137278,6.911747,7.090077,7.600902,8.691146,5.602119,4.343805,8.956866,9.582387,2.890372
0,4.499810,4.043051,3.761200,5.283204,3.367296,6.061457,7.763446,7.689371,5.147494,5.402677,...,7.005789,6.467699,6.469250,7.325808,7.637716,6.556778,4.672829,8.474286,7.509335,0.693147


# Scale data

In [78]:
from sklearn.preprocessing import StandardScaler

# Standardise every gene column of the log-transformed frame.
scaler = StandardScaler()
scaledValues = scaler.fit_transform(logDfPatientGenes[targetGenes])

# Work on a deep copy so the log-transformed frame stays available unchanged.
# NOTE(review): logDfPatientGenes is built from the gene columns only, so the
# tnbc / case_id columns added during extraction are not part of this frame
# (nor of anything exported from it) - confirm that this is intended.
scaledLogDfPatientGenes = logDfPatientGenes.copy(deep=True)
scaledLogDfPatientGenes[targetGenes] = scaledValues
scaledLogDfPatientGenes

gene_name,BRCA1,BRCA2,CD274,MKI67,PDCD1,PIK3CA,TP53,LRPPRC,YOD1,DCLK1,...,SRC,YES1,FYN,TBC1D1,MALAT1,FOXC1,LAG3,GATA3,CCND1,PRR4
0,1.107231,0.147813,-0.623536,0.601675,-0.981546,0.221388,0.028479,-0.216788,-0.245303,2.385990,...,0.953866,-0.045424,-0.815878,-1.481479,0.311162,-1.611753,-1.208212,0.490418,1.653559,-0.359890
0,-0.309597,-0.198682,0.259723,0.020687,0.894867,-1.257088,-2.049999,-0.741921,-0.613335,-0.419255,...,-0.948984,-0.634114,-0.412512,-0.635479,-0.042728,-1.029436,1.179804,-0.265456,-0.969475,0.554964
0,-1.587917,-0.820484,-1.613819,-1.297614,0.175747,-2.624145,-1.316061,-1.767540,-0.738025,-0.271307,...,-0.660101,-1.762061,-0.439688,-1.510283,-1.584689,-0.618429,-0.776334,-0.385000,-0.599119,0.119864
0,-1.017775,-0.454777,-0.229550,-0.379873,0.256161,-0.806011,-0.161307,-0.647311,0.241574,0.066365,...,0.663966,-0.477490,0.004568,-0.317827,0.272863,0.303536,-0.388868,0.032508,0.746252,1.328702
0,-0.128527,-0.302857,-0.216320,-0.333998,-0.098840,-0.514938,-0.140452,-0.880126,-0.136923,-1.045369,...,-0.089013,-0.401395,-0.017315,-0.308957,-0.118234,-0.143460,-0.600356,0.322303,0.040158,-0.177496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-3.077774,-2.985017,1.368104,-3.654801,-1.183471,-2.674211,-0.649432,-1.600294,-1.353593,-1.505247,...,-1.715603,-0.929004,-0.806086,-1.478890,-0.246689,-0.112954,-0.144450,-0.586409,-1.445670,-0.839644
0,-0.321750,-0.230645,0.197605,-0.291471,1.151521,0.065864,0.095496,-0.736990,0.199675,0.152231,...,-0.456338,-0.182649,0.972578,0.138944,-0.140560,-0.024768,0.162391,-0.057524,1.558612,0.800647
0,-0.935861,-0.569844,0.041337,-0.386708,0.501527,-0.562858,-0.356272,-1.305110,-0.287948,0.098173,...,-0.394882,-0.920922,0.628661,0.343279,0.052736,0.100947,-0.109600,-0.249369,0.203585,0.940010
0,-1.912186,-1.962089,-1.000371,-2.247825,-0.041425,-1.919619,0.115592,-1.547691,-0.780935,-0.785726,...,-0.579065,-1.519597,-0.229470,-0.035249,-1.228982,0.832569,0.178938,-0.582077,-1.615628,-1.659789


# Send dataframe to a csv

In [82]:
# Write the scaled features next to the input data; row labels are not exported.
outputPath = f'../Data/patient_genes_{variant}.csv'
scaledLogDfPatientGenes.to_csv(outputPath, index=False)