# Extract genes based on research, featureset 2 (performed best out of 5 featuresets defined)

In [83]:
import pandas as pd

variant = 'research_logScaled'
targetGenes = [ 'BRCA1', 'BRCA2', 'CD274','MKI67','PDCD1','PIK3CA','TP53','LRPPRC','YOD1','DCLK1',
                'TOP2A','TACSTD2','ROR1','TTN','CTLA4','EGFR','EPCAM','MYC','PTEN','CDK6','DDX3X',
                'SRC','YES1','FYN','TBC1D1','MALAT1','FOXC1','LAG3','GATA3', 'CCND1', 'PRR4' ]
df = pd.read_csv('../Data/clinical.csv')
geneFiles = df[['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Iterate over files and collect gene data

In [84]:
dfPatientGenes = pd.DataFrame()
total_files = len(geneFiles)
for idx, row in geneFiles.iterrows():
    file_name = row.iloc[0]
    file_path = f"../Data/{file_name}"
    
    print(f"File {idx+1}/{total_files} - {file_name}", end="\r")
    
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])
    
    # Columns may have been dropped if first line in file were columns, load data again without dropping first line
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')
    
    #dfGenes
    
    # Gather all targeted genes we want with the values we want
    dfTarget = dfGenes[dfGenes['gene_name'].isin(targetGenes)][['gene_name', 'stranded_first']]

    ## Remove duplicates. Second row (duplicate) has 'PAR_Y', value 0. Ergo keep first.
    #dfTarget.drop_duplicates(subset=['gene_name'], keep='first', inplace=True)

    # Transform the dfTarget to a single row dataframe with all genes as columns
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Add tnbc status
    dfNewGenes["tnbc"] = df["tnbc"].loc[df['file'] == file_name].values

    # Add the patient uuid for reference and set it as the index
    patientID = df['bcr_patient_uuid'].loc[df['file'] == file_name]
    dfNewGenes['case_id'] = patientID.values
    
    # Concat the data to the final dataframe before saving it to a file
    dfPatientGenes = pd.concat([dfPatientGenes, dfNewGenes])

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Checking data

In [85]:
dfMin = dfPatientGenes[targetGenes].min()
dfMax = dfPatientGenes[targetGenes].max()
dfMean = dfPatientGenes[targetGenes].mean()

# Features into outputset
scores = pd.DataFrame({'min':dfMin, 'max':dfMax, 'mean': dfMean})
scores

Unnamed: 0_level_0,min,max,mean
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BRCA1,0,4119,682.292733
BRCA2,4,2626,374.777892
CD274,0,3325,158.169908
MKI67,0,22360,3265.894575
PDCD1,0,1205,62.91607
PIK3CA,135,8166,1462.908905
TP53,1,13646,2628.141249
LRPPRC,10,32167,6601.310133
YOD1,0,4388,503.117707
DCLK1,0,34399,1579.774821


# Apply natural logarithm in base e to flatten skewed data / outliers

In [86]:
# Source: Artificial Intelligence with Python, 2nd edition. Packt Publishing Ltd, ISBN 978-1-83921-953-5.
# p. 87-88
import numpy as np
dfPatientGenes[targetGenes] = (dfPatientGenes[targetGenes] + 1).transform(np.log)
logDfPatientGenes = dfPatientGenes.copy(deep=True)
logDfPatientGenes

gene_name,FYN,BRCA1,FOXC1,TBC1D1,LAG3,CDK6,GATA3,CCND1,PRR4,EPCAM,...,YES1,YOD1,TACSTD2,ROR1,PDCD1,SRC,DDX3X,MALAT1,tnbc,case_id
0,6.045005,7.178545,3.367296,6.274762,3.091042,5.894403,10.029900,11.234679,1.791759,8.877800,...,7.561122,5.645447,8.881975,3.806662,2.197225,8.100161,9.291644,8.903543,False,6E7D5EC6-A469-467C-B748-237353C23416
0,6.336826,5.921578,4.127134,6.889591,5.814131,7.080026,8.933532,8.245647,2.564949,9.087608,...,7.124478,5.303305,8.054840,3.891820,4.532599,6.741701,8.887515,8.612685,False,55262FCB-1B01-4480-B322-36570430C917
0,6.317165,4.787492,4.663439,6.253829,3.583519,4.927254,8.760139,8.667680,2.197225,8.626048,...,6.287859,5.187386,7.696667,3.135494,3.637586,6.947937,8.101375,7.345365,False,427D0648-3F77-4FFC-B52C-89855426D647
0,6.638568,5.293305,5.866468,7.120444,4.025352,5.934894,9.365719,10.200773,3.218876,8.729721,...,7.240650,6.098074,9.213436,4.912655,3.737670,7.893199,9.107421,8.872067,False,C31900A4-5DCD-4022-97AC-638E86E889E4
0,6.622736,6.082219,5.283204,7.126891,3.784190,5.880533,9.786054,9.396156,1.945910,7.358194,...,7.297091,5.746203,9.049937,5.030438,3.295837,7.355641,8.997395,8.550628,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,6.052089,3.465736,5.323010,6.276643,4.304065,6.028279,8.468003,7.703008,1.386294,7.821643,...,6.905753,4.615121,8.756997,3.828641,1.945910,6.194405,7.610853,8.445053,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
0,7.338888,5.910797,5.438079,7.452402,4.653960,6.614726,9.235131,11.126483,2.772589,7.641084,...,7.459339,6.059123,8.636397,5.278115,4.852030,7.093405,9.223257,8.532279,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
0,7.090077,5.365976,5.602119,7.600902,4.343805,6.188264,8.956866,9.582387,2.890372,8.323123,...,6.911747,5.605802,8.621013,5.768321,4.043051,7.137278,8.978787,8.691146,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
0,6.469250,4.499810,6.556778,7.325808,4.672829,5.552960,8.474286,7.509335,0.693147,8.103192,...,6.467699,5.147494,9.579349,3.912023,3.367296,7.005789,8.311153,7.637716,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Scale data

In [87]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaledLogDfPatientGenes = logDfPatientGenes.copy(deep=True)
scaledLogDfPatientGenes[targetGenes] = scaler.fit_transform(logDfPatientGenes[targetGenes])
scaledLogDfPatientGenes

gene_name,FYN,BRCA1,FOXC1,TBC1D1,LAG3,CDK6,GATA3,CCND1,PRR4,EPCAM,...,YES1,YOD1,TACSTD2,ROR1,PDCD1,SRC,DDX3X,MALAT1,tnbc,case_id
0,-0.815878,1.107231,-1.611753,-1.481479,-1.208212,-0.472844,0.490418,1.653559,-0.359890,0.235306,...,-0.045424,-0.245303,0.007767,-0.944244,-0.981546,0.953866,0.120404,0.311162,False,6E7D5EC6-A469-467C-B748-237353C23416
0,-0.412512,-0.309597,-1.029436,-0.635479,1.179804,0.652670,-0.265456,-0.969475,0.554964,0.480431,...,-0.634114,-0.613335,-0.782062,-0.861011,0.894867,-0.948984,-0.542458,-0.042728,False,55262FCB-1B01-4480-B322-36570430C917
0,-0.439688,-1.587917,-0.618429,-1.510283,-0.776334,-1.390959,-0.385000,-0.599119,0.119864,-0.058825,...,-1.762061,-0.738025,-1.124080,-1.600238,0.175747,-0.660101,-1.831903,-1.584689,False,427D0648-3F77-4FFC-B52C-89855426D647
0,0.004568,-1.017775,0.303536,-0.317827,-0.388868,-0.434405,0.032508,0.746252,1.328702,0.062300,...,-0.477490,0.241574,0.324277,0.136744,0.256161,0.663966,-0.181762,0.272863,False,C31900A4-5DCD-4022-97AC-638E86E889E4
0,-0.017315,-0.128527,-0.143460,-0.308957,-0.600356,-0.486010,0.322303,0.040158,-0.177496,-1.540099,...,-0.401395,-0.136923,0.168153,0.251864,-0.098840,-0.089013,-0.362231,-0.118234,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-0.806086,-3.077774,-0.112954,-1.478890,-0.144450,-0.345755,-0.586409,-1.445670,-0.839644,-0.998637,...,-0.929004,-1.353593,-0.111574,-0.922762,-1.183471,-1.715603,-2.636468,-0.246689,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
0,0.972578,-0.321750,-0.024768,0.138944,0.162391,0.210959,-0.057524,1.558612,0.800647,-1.209589,...,-0.182649,0.199675,-0.226735,0.493941,1.151521,-0.456338,0.008233,-0.140560,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
0,0.628661,-0.935861,0.100947,0.343279,-0.109600,-0.193881,-0.249369,0.203585,0.940010,-0.412742,...,-0.920922,-0.287948,-0.241426,0.973065,0.501527,-0.394882,-0.392752,0.052736,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
0,-0.229470,-1.912186,0.832569,-0.035249,0.178938,-0.796976,-0.582077,-1.615628,-1.659789,-0.669694,...,-1.519597,-0.780935,0.673687,-0.841265,-0.041425,-0.579065,-1.487820,-1.228982,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Send dataframe to a csv

In [88]:
scaledLogDfPatientGenes.to_csv(f'../Data/patient_genes_{variant}.csv', index=False)