# Extract genes based on research, featureset 2 (performed best out of 5 featuresets defined)

In [16]:
import pandas as pd

variant = 'RESEARCH_HUMAN'
targetGenes = [ 'BRCA1', 'BRCA2', 'CD274','MKI67','PDCD1','PIK3CA','TP53','LRPPRC','YOD1','DCLK1',
                'TOP2A','TACSTD2','ROR1','TTN','CTLA4','EGFR','EPCAM','MYC','PTEN','CDK6','DDX3X',
                'SRC','YES1','FYN','TBC1D1','MALAT1','FOXC1','LAG3','GATA3', 'CCND1', 'PRR4' ]
df = pd.read_csv('../Data/clinical.csv')
geneFiles = df[['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Iterate over files and collect gene data

In [17]:
dfPatientGenes = pd.DataFrame()
total_files = len(geneFiles)
for idx, row in geneFiles.iterrows():
    file_name = row.iloc[0]
    file_path = f"../Data/{file_name}"
    
    print(f"File {idx+1}/{total_files} - {file_name}", end="\r")
    
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])
    
    # Columns may have been dropped if first line in file were columns, load data again without dropping first line
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')
    
    #dfGenes
    
    # Gather all targeted genes we want with the values we want
    dfTarget = dfGenes[dfGenes['gene_name'].isin(targetGenes)][['gene_name', 'stranded_first']]

    # Transform the dfTarget to a single row dataframe with all genes as columns
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Add tnbc status
    dfNewGenes["tnbc"] = df["tnbc"].loc[df['file'] == file_name].values

    # Add the patient uuid for reference and set it as the index
    patientID = df['bcr_patient_uuid'].loc[df['file'] == file_name]
    dfNewGenes['case_id'] = patientID.values
    
    # Concat the data to the final dataframe before saving it to a file
    dfPatientGenes = pd.concat([dfPatientGenes, dfNewGenes])

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Scale data

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
dfPatientGenes[targetGenes] = scaler.fit_transform(dfPatientGenes[targetGenes])

# Show scaled dataframe

In [28]:
dfPatientGenes

gene_name,FYN,BRCA1,FOXC1,TBC1D1,LAG3,CDK6,GATA3,CCND1,PRR4,EPCAM,...,YES1,YOD1,TACSTD2,ROR1,PDCD1,SRC,DDX3X,MALAT1,tnbc,case_id
0,-0.681170,1.065633,-0.407174,-1.085514,-0.419344,-0.462306,0.217837,1.856713,-0.250565,-0.052354,...,-0.306427,-0.488204,-0.369978,-0.619694,-0.541810,0.766467,-0.112999,-0.062022,False,6E7D5EC6-A469-467C-B748-237353C23416
0,-0.498729,-0.526771,-0.387659,-0.727258,0.412201,0.104864,-0.764522,-0.565830,0.035228,0.282770,...,-0.664935,-0.669251,-0.898486,-0.602657,0.286946,-0.707879,-0.806179,-0.098838,False,55262FCB-1B01-4480-B322-36570430C917
0,-0.512763,-0.956279,-0.361639,-1.094252,-0.382151,-0.616989,-0.842978,-0.498410,-0.128082,-0.371864,...,-1.036096,-0.717825,-1.022135,-0.713395,-0.255692,-0.591040,-1.563948,-0.177189,False,427D0648-3F77-4FFC-B52C-89855426D647
0,-0.244844,-0.822164,-0.215572,-0.524696,-0.329017,-0.451994,-0.498083,0.212922,0.525159,-0.249946,...,-0.584270,-0.130525,-0.000858,-0.232112,-0.216227,0.395472,-0.463818,-0.066542,False,C31900A4-5DCD-4022-97AC-638E86E889E4
0,-0.260154,-0.416424,-0.307825,-0.518342,-0.360897,-0.465744,-0.101383,-0.288501,-0.209737,-1.173840,...,-0.541566,-0.421967,-0.198200,-0.159707,-0.364219,-0.275452,-0.644514,-0.105400,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-0.677342,-1.105673,-0.303094,-1.084719,-0.281196,-0.426557,-0.947954,-0.619604,-0.332220,-0.988661,...,-0.793576,-0.890040,-0.480323,-0.615434,-0.561542,-0.922888,-1.809810,-0.115670,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
0,0.743906,-0.533562,-0.288310,-0.137844,-0.198839,-0.199002,-0.591034,1.595119,0.157711,-1.071140,...,-0.404489,-0.168060,-0.574495,0.023436,0.632261,-0.492870,-0.250827,-0.107264,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
0,0.311408,-0.796699,-0.264064,0.081399,-0.273226,-0.377059,-0.752886,-0.205447,0.239366,-0.663546,...,-0.790413,-0.512491,-0.585712,0.551569,-0.068235,-0.460348,-0.673156,-0.089936,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
0,-0.396665,-1.007209,-0.008002,-0.300687,-0.193525,-0.534492,-0.946004,-0.632745,-0.413875,-0.826303,...,-0.980211,-0.733280,0.577213,-0.598398,-0.344487,-0.553699,-1.415931,-0.166760,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Send dataframe to a csv

In [29]:
dfPatientGenes.to_csv(f'../Data/patient_genes_{variant}.csv', index=False)