# Extract genes based on literature

In [6]:
import pandas as pd

variant = 'literature'
targetGenes = ['TBC1D9', 'GATA3', 'SLC16A6', 'ESR1', 'INPP4B', 'SLC44A4', 'ANXA9', 'AGR2', 'MCCC2', 'TSPAN1', 'STBD1', 'MLPH', 'CACNA2D2',
'RARA', 'STARD3', 'PPP1R14C', 'LDHB', 'MFGE8', 'PSAT1']
df = pd.read_csv('../Data/clinical.csv')
geneFiles = df[['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


In [7]:
dfPatientGenes = pd.DataFrame()
total_files = len(geneFiles)
for idx, row in geneFiles.iterrows():
    file_name = row.iloc[0]
    file_path = f"../Data/{file_name}"
    
    print(f"File {idx+1}/{total_files} - {file_name}", end="\r")
    
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])
    
    # Columns may have been dropped if first line in file were columns, load data again without dropping first line
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')
    
    #dfGenes
    
    # Gather all targeted genes we want with the values we want
    dfTarget = dfGenes[dfGenes['gene_name'].isin(targetGenes)][['gene_name', 'stranded_first']]

    # Transform the dfTarget to a single row dataframe with all genes as columns
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Add tnbc status
    dfNewGenes["tnbc"] = df["tnbc"].loc[df['file'] == file_name].values

    # Add the patient uuid for reference and set it as the index
    patientID = df['bcr_patient_uuid'].loc[df['file'] == file_name]
    dfNewGenes['case_id'] = patientID.values
    
    # Concat the data to the final dataframe before saving it to a file
    dfPatientGenes = pd.concat([dfPatientGenes, dfNewGenes])

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Scale data

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
dfPatientGenes[targetGenes] = scaler.fit_transform(dfPatientGenes[targetGenes])

# Show scaled dataframe

In [9]:
dfPatientGenes

gene_name,CACNA2D2,ESR1,AGR2,GATA3,SLC16A6,TBC1D9,INPP4B,LDHB,MLPH,TSPAN1,...,STARD3,RARA,MCCC2,PSAT1,MFGE8,ANXA9,PPP1R14C,SLC44A4,tnbc,case_id
0,2.136757,-0.624168,-0.627589,0.217837,-0.314595,0.847575,-0.410629,-0.310413,-0.116100,0.959835,...,-0.200094,0.238722,0.018261,-0.355483,-0.307049,2.188644,-0.311336,-0.458428,False,6E7D5EC6-A469-467C-B748-237353C23416
0,0.347756,-0.901611,0.436025,-0.764522,0.406879,-0.818628,-0.726877,-0.472801,0.876574,-0.469553,...,1.460425,-0.574182,-0.480349,0.027124,-0.257054,-0.898170,-0.141145,-0.522576,False,55262FCB-1B01-4480-B322-36570430C917
0,-0.619785,-0.679491,-0.534424,-0.842978,-0.423784,-0.783472,-0.787473,-0.439585,-0.882510,-0.513208,...,-0.255267,0.642472,-0.660073,-0.279432,-0.303270,-0.664321,-0.320717,-0.936029,False,427D0648-3F77-4FFC-B52C-89855426D647
0,-0.635794,-0.738582,0.004143,-0.498083,0.074130,-0.457301,0.750863,-0.266126,0.417837,-0.066909,...,0.572934,-0.130288,0.118625,-0.343722,-0.200227,0.046098,-0.244332,0.397544,False,C31900A4-5DCD-4022-97AC-638E86E889E4
0,-0.388655,-0.548046,-0.413284,-0.101383,-0.200915,-0.556168,0.519980,-0.410060,-0.317074,-0.521897,...,-0.228916,-0.049422,0.369533,-0.303737,-0.079017,-0.797373,-0.256393,-0.587977,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-0.418672,-0.755612,-0.690334,-0.947954,-0.480797,-0.842488,-0.948914,-0.354701,1.598176,-0.672359,...,-0.445282,-0.478841,-0.882444,-0.369595,-0.327396,-0.741733,-0.309996,-0.070785,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
0,-0.391657,-0.713139,-0.342916,-0.591034,0.323605,-0.323850,-0.277938,-0.247673,0.161738,-0.658160,...,-0.222946,-0.330812,0.169608,-0.295112,-0.161205,-0.411925,-0.303296,-0.322615,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
0,-0.058471,-0.757057,-0.477359,-0.752886,-0.454882,-0.687080,-0.704762,-0.144335,-0.286573,-0.432679,...,-0.291911,-0.525160,-0.625443,-0.202597,-0.186566,-0.749797,-0.314016,0.437386,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
0,-0.541741,-0.753961,-0.517344,-0.946004,-0.462483,-0.700406,-0.794550,0.475691,-0.630072,-0.276496,...,-0.294999,-0.422872,-0.843805,-0.338234,0.021337,-0.811888,-0.031258,-0.593239,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Send dataframe to a csv

In [10]:
dfPatientGenes.to_csv(f'../Data/patient_genes_{variant}.csv', index=False)