In [5]:
import pandas as pd

# Extract genes based on research, notebook 2 (performed best out of 5)
variant = 'research'
targetGenes = [ 'BRCA1', 'BRCA2', 'CD274','MKI67','PDCD1','PIK3CA','TP53','LRPPRC','YOD1','DCLK1',
                'TOP2A','TACSTD2','ROR1','TTN','CTLA4','EGFR ','EPCAM','MYC','PTEN','CDK6','DDX3X',
                'SRC','YES1','FYN','TBC1D1','MALAT1','FOXC1','LAG3','GATA3', 'CCND1', 'PRR4' ]
df = pd.read_csv('../Data/clinical.csv')
geneFiles = df[['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


In [6]:
dfPatientGenes = pd.DataFrame()
total_files = len(geneFiles)
for idx, row in geneFiles.iterrows():
    file_name = row.iloc[0]
    file_path = f"../Data/{file_name}"
    
    print(f"File {idx+1}/{total_files} - {file_name}", end="\r")
    
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])
    
    # Columns may have been dropped if first line in file were columns, load data again without dropping first line
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')
    
    #dfGenes
    
    # Gather all targeted genes we want with the values we want
    dfTarget = dfGenes[dfGenes['gene_name'].isin(targetGenes)][['gene_name', 'stranded_first']]

    # Transform the dfTarget to a single row dataframe with all genes as columns
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Add tnbc status
    dfNewGenes["tnbc"] = df["tnbc"].loc[df['file'] == file_name].values

    # Add the patient uuid for reference and set it as the index
    patientID = df['bcr_patient_uuid'].loc[df['file'] == file_name]
    dfNewGenes['case_id'] = patientID.values
    
    # Concat the data to the final dataframe before saving it to a file
    dfPatientGenes = pd.concat([dfPatientGenes, dfNewGenes])
    
#Send data to a csv
dfPatientGenes.to_csv(f'../Data/patient_genes_{variant}.csv', index=False)

dfPatientGenes

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

gene_name,FYN,BRCA1,FOXC1,TBC1D1,LAG3,CDK6,GATA3,CCND1,PRR4,EPCAM,...,YES1,YOD1,TACSTD2,ROR1,PDCD1,SRC,DDX3X,MALAT1,tnbc,case_id
0,421,1310,28,530,21,362,22694,75710,5,7170,...,1921,282,7200,44,8,3294,10846,7357,False,6E7D5EC6-A469-467C-B748-237353C23416
0,564,372,61,981,334,1187,7581,3810,12,8844,...,1241,200,3148,48,92,846,7240,5500,False,55262FCB-1B01-4480-B322-36570430C917
0,553,119,105,519,35,137,6374,5811,8,5574,...,537,178,2200,22,37,1040,3298,1548,False,427D0648-3F77-4FFC-B52C-89855426D647
0,763,198,352,1236,55,377,11680,26923,24,6183,...,1394,444,10030,135,41,2678,9021,7129,False,C31900A4-5DCD-4022-97AC-638E86E889E4
0,751,437,196,1244,43,357,17783,12041,6,1568,...,1475,312,8517,152,26,1564,8081,5169,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,424,31,204,531,73,414,4759,2214,3,2493,...,997,100,6354,45,6,489,2019,4651,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
0,1538,368,229,1723,104,745,10250,67946,15,2081,...,1735,427,5632,195,127,1203,10129,5075,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
0,1199,213,270,1999,76,486,7760,14506,17,4117,...,1003,271,5546,319,56,1257,7932,5949,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
0,644,89,703,1518,106,257,4789,1824,1,3304,...,643,171,14462,49,28,1102,4068,2074,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37
