# Extract genes based on research, featureset 2 (performed best out of 5 featuresets defined)

In [5]:
import pandas as pd

variant = 'research_log'
targetGenes = [ 'BRCA1', 'BRCA2', 'CD274','MKI67','PDCD1','PIK3CA','TP53','LRPPRC','YOD1','DCLK1',
                'TOP2A','TACSTD2','ROR1','TTN','CTLA4','EGFR','EPCAM','MYC','PTEN','CDK6','DDX3X',
                'SRC','YES1','FYN','TBC1D1','MALAT1','FOXC1','LAG3','GATA3', 'CCND1', 'PRR4' ]
df = pd.read_csv('../Data/clinical.csv')
geneFiles = df[['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Iterate over files and collect gene data

In [2]:
dfPatientGenes = pd.DataFrame()
total_files = len(geneFiles)
for idx, row in geneFiles.iterrows():
    file_name = row.iloc[0]
    file_path = f"../Data/{file_name}"
    
    print(f"File {idx+1}/{total_files} - {file_name}", end="\r")
    
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])
    
    # Columns may have been dropped if first line in file were columns, load data again without dropping first line
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')
    
    #dfGenes
    
    # Gather all targeted genes we want with the values we want
    dfTarget = dfGenes[dfGenes['gene_name'].isin(targetGenes)][['gene_name', 'stranded_first']]

    # Transform the dfTarget to a single row dataframe with all genes as columns
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Add tnbc status
    dfNewGenes["tnbc"] = df["tnbc"].loc[df['file'] == file_name].values

    # Add the patient uuid for reference and set it as the index
    patientID = df['bcr_patient_uuid'].loc[df['file'] == file_name]
    dfNewGenes['case_id'] = patientID.values
    
    # Concat the data to the final dataframe before saving it to a file
    dfPatientGenes = pd.concat([dfPatientGenes, dfNewGenes])

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

# Checking data

In [3]:
dfMin = dfPatientGenes[targetGenes].min()
dfMax = dfPatientGenes[targetGenes].max()
dfMean = dfPatientGenes[targetGenes].mean()

# Features into outputset
scores = pd.DataFrame({'min':dfMin, 'max':dfMax, 'mean': dfMean})
scores

Unnamed: 0_level_0,min,max,mean
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BRCA1,0,4119,682.292733
BRCA2,4,2626,374.777892
CD274,0,3325,158.169908
MKI67,0,22360,3265.894575
PDCD1,0,1205,62.91607
PIK3CA,135,8166,1462.908905
TP53,1,13646,2628.141249
LRPPRC,10,32167,6601.310133
YOD1,0,4388,503.117707
DCLK1,0,34399,1579.774821


# Apply natural logarithm in base e to flatten skewed data / outliers

In [6]:
# Source: Artificial Intelligence with Python, 2nd edition. Packt Publishing Ltd, ISBN 978-1-83921-953-5.
# p. 87-88
import numpy as np
dfPatientGenes[targetGenes] = (dfPatientGenes[targetGenes] + 1).transform(np.log)
logDfPatientGenes = dfPatientGenes[targetGenes].copy(deep=True)
logDfPatientGenes

gene_name,BRCA1,BRCA2,CD274,MKI67,PDCD1,PIK3CA,TP53,LRPPRC,YOD1,DCLK1,...,SRC,YES1,FYN,TBC1D1,MALAT1,FOXC1,LAG3,GATA3,CCND1,PRR4
0,2.101514,1.908505,1.628131,2.227171,1.162283,2.111658,2.163149,2.252679,1.893932,2.364109,...,2.208292,2.147231,1.952319,1.984411,2.292893,1.474144,1.408800,2.400610,2.504274,1.026672
0,1.934644,1.866222,1.770740,2.159162,1.710658,2.005921,1.966898,2.217913,1.841074,1.930310,...,2.046621,2.094882,1.992906,2.065544,2.263084,1.634547,1.918998,2.295916,2.224153,1.271150
0,1.755699,1.785548,1.439569,1.985185,1.534194,1.897084,2.040675,2.146316,1.822513,1.958551,...,2.072912,1.986210,1.990223,1.981529,2.121706,1.734031,1.522467,2.278307,2.268788,1.162283
0,1.839486,1.833780,1.694266,2.109427,1.555545,2.039382,2.146744,2.224266,1.959824,2.020167,...,2.185287,2.109079,2.033210,2.094385,2.289709,1.926650,1.614495,2.338504,2.415983,1.439569
0,1.957587,1.853152,1.696413,2.115250,1.457646,2.060395,2.148560,2.208559,1.908980,1.801090,...,2.122937,2.115905,2.031135,2.095178,2.256607,1.837880,1.565317,2.378254,2.341436,1.080418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1.496434,1.439569,1.924989,1.570003,1.080418,1.892864,2.103272,2.158346,1.725463,1.694266,...,1.973304,2.067591,1.953324,1.984670,2.245491,1.844195,1.668474,2.247918,2.163669,0.869742
0,1.933085,1.862230,1.761349,2.120618,1.766789,2.101047,2.168878,2.218245,1.954321,2.035248,...,2.091049,2.135271,2.120930,2.134451,2.254684,1.862230,1.732356,2.325826,2.495392,1.327761
0,1.850968,1.818853,1.737326,2.108556,1.618011,2.056966,2.129607,2.179233,1.887948,2.025780,...,2.096456,2.068349,2.090638,2.151867,2.271213,1.887391,1.675938,2.298262,2.359191,1.358505
0,1.704713,1.618011,1.560500,1.837880,1.474144,1.954651,2.170589,2.162101,1.816045,1.856716,...,2.080165,2.010587,2.010795,2.119360,2.156138,2.022445,1.735688,2.248581,2.141164,0.526589


# Send dataframe to a csv

In [7]:
logDfPatientGenes.to_csv(f'../Data/patient_genes_{variant}.csv', index=False)