# Extract gene expression features and reduce them with PCA

In [1]:
import pandas as pd

# Run label; the PCA cell uses `variant` to name its output file.
variant = 'automated'

# Clinical metadata table. The columns used below are the gene-count file
# path, the TNBC flag and the patient UUID.
df = pd.read_csv('../Data/clinical.csv')

# Keep only what is needed to locate and label each gene-count file.
geneFiles = df.loc[:, ['file', 'tnbc', 'bcr_patient_uuid']]

geneFiles

Unnamed: 0,file,tnbc,bcr_patient_uuid
0,84225715-14a6-423c-a6d6-15558e151f56\253aa5dc-...,False,6E7D5EC6-A469-467C-B748-237353C23416
1,927604f9-a38e-4c3f-b50f-3e0db4daf5ec\1be6a56c-...,False,55262FCB-1B01-4480-B322-36570430C917
2,931442ba-af81-4b68-beca-7285fc44b1df\f2dda955-...,False,427D0648-3F77-4FFC-B52C-89855426D647
3,7b4d770a-2b8c-4ca5-bf51-c4745c5de39a\ae8996bd-...,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,307261f2-f88f-4658-b6d1-98ef946148e2\75d91076-...,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...
972,0a7dc8b9-4196-41d3-ada1-a50cb36bfd2b\3c9c665e-...,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,e0cb738d-854a-4033-b370-79fa28d7cef8\69118aea-...,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,e25bb12b-f88b-41c5-951e-a36d4b94008f\73e13f2d-...,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,ee447251-5c90-426f-a0c7-bd2041189761\06a03e45-...,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


# Iterate over files and collect gene data

In [2]:
# Build one single-row frame per gene-count file and stack them once at the end.
# Calling pd.concat on the growing result inside the loop would re-copy every
# previously loaded row on each iteration.
patient_frames = []
total_files = len(geneFiles)
for file_number, (idx, row) in enumerate(geneFiles.iterrows(), start=1):
    # Read fields by column name so the loop does not depend on column order.
    file_name = row['file']

    # The 'file' values use '\' between directory and file name. Forward slashes
    # are accepted on every platform, so normalise the separator before opening.
    file_path = f"../Data/{file_name}".replace('\\', '/')

    # Progress counter based on position, not on the index label.
    print(f"File {file_number}/{total_files} - {file_name}", end="\r")

    # First attempt: skip the first line of the file.
    dfGenes = pd.read_csv(file_path, sep='\t', skiprows=[0])

    # If the first line was actually the header, it has just been skipped:
    # load the file again without dropping it.
    if "gene_name" not in dfGenes.columns:
        dfGenes = pd.read_csv(file_path, sep='\t')

    # Keep only rows whose gene_name is neither null nor empty.
    has_gene_name = dfGenes['gene_name'].notna() & (dfGenes['gene_name'] != '')
    dfTarget = dfGenes.loc[has_gene_name, ['gene_name', 'stranded_first']]

    # NOTE: repeated gene names are intentionally NOT dropped here, so they
    # stay in the table as repeated column names.

    # Turn the (gene_name, stranded_first) rows into one row with a column per gene.
    dfNewGenes = dfTarget.set_index('gene_name').T.reset_index(drop=True)

    # Label the row with the TNBC status and patient UUID of the row being
    # iterated. Taking them from `row` avoids a second lookup by file name,
    # which would return several values if a file were listed twice.
    dfNewGenes["tnbc"] = row['tnbc']
    dfNewGenes['case_id'] = row['bcr_patient_uuid']

    patient_frames.append(dfNewGenes)

# Single concatenation; an empty file list still yields an empty frame.
dfPatientGenes = pd.concat(patient_frames) if patient_frames else pd.DataFrame()

# Persist the combined table so later cells can reload it without re-reading every file.
dfPatientGenes.to_csv('../Data/patient_genes_all.csv', index=False)

dfPatientGenes

File 977/977 - 404cea41-efa9-44c0-84a1-5dd603d7a2a4\69ed2c6f-9495-475c-b8d6-c1db8f4b0537.rna_seq.augmented_star_gene_counts.tsv

gene_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6,tnbc,case_id
0,314,2,895,726,427,118,461,854,951,1920,...,18,0,0,0,401,0,9,68,False,6E7D5EC6-A469-467C-B748-237353C23416
0,196,2,927,1996,1040,182,874,1056,923,679,...,20,0,0,0,151,0,6,47,False,55262FCB-1B01-4480-B322-36570430C917
0,667,24,644,346,297,199,489,529,412,401,...,17,0,0,0,76,0,2,37,False,427D0648-3F77-4FFC-B52C-89855426D647
0,1490,3,704,1081,638,195,1318,1113,826,1425,...,26,0,1,0,142,0,8,46,False,C31900A4-5DCD-4022-97AC-638E86E889E4
0,1282,6,731,813,464,147,1675,1230,855,879,...,22,0,0,0,123,0,3,33,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,368,9,283,995,614,221,322,848,397,484,...,16,0,0,0,87,0,4,23,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
0,1045,14,804,1142,593,651,2288,1420,1166,719,...,26,0,0,0,197,0,10,40,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
0,914,510,625,815,541,435,2180,1382,1056,896,...,25,0,0,0,71,0,6,25,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
0,1274,7,587,830,452,492,991,1062,357,572,...,7,0,0,1,60,0,1,24,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37


In [9]:
# Apply PCA

# Reload the combined gene table that the previous cell wrote to disk. Reading it
# back lets this cell run without repeating the per-file loop. On a full
# top-to-bottom run the table is already in memory, so this line can be commented
# out to save the extra read of a very wide CSV.
dfPatientGenes = pd.read_csv('../Data/patient_genes_all.csv')
variant = 'automated'

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Separate the gene columns (features) from the label and the patient identifier.
meta_columns = ['tnbc', 'case_id']
X = dfPatientGenes.drop(columns=meta_columns)
y = dfPatientGenes['tnbc']
case_ids = dfPatientGenes['case_id']

# Standardise every feature before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance.
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Report the size of the reduction (recorded run: 60660 features -> 768 components).
n_original = X.shape[1]
n_retained = X_pca.shape[1]
print(f"Original features: {n_original}")
print(f"PCA components retained: {n_retained}")

# Name the components PC1..PCn and re-attach the label and patient id.
pca_columns = [f'PC{k}' for k in range(1, n_retained + 1)]
dfPCA = pd.DataFrame(X_pca, columns=pca_columns).assign(
    tnbc=y.values,
    case_id=case_ids.values,
)

dfPCA.to_csv(f'../Data/patient_genes_{variant}.csv', index=False)

dfPCA

Original features: 60660
PCA components retained: 768


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC761,PC762,PC763,PC764,PC765,PC766,PC767,PC768,tnbc,case_id
0,31.765932,-18.788266,35.663751,40.721270,-10.153662,-9.172647,4.858313,-5.845414,-58.790682,41.595025,...,-0.160965,-1.360216,-0.096551,-1.290682,0.626077,-1.117848,1.212694,-0.974630,False,6E7D5EC6-A469-467C-B748-237353C23416
1,-47.503724,29.627336,-3.156859,31.273387,-0.136287,-5.013486,-4.714762,-9.654402,-22.123084,44.007990,...,0.156278,-0.331804,0.090240,-0.600533,0.854371,-1.363396,1.295262,0.406849,False,55262FCB-1B01-4480-B322-36570430C917
2,-111.873707,62.672829,2.986938,-10.449637,-8.023955,-5.629093,-4.429158,-14.546950,-23.864911,24.600368,...,-0.384897,-1.284424,0.142945,-1.863664,-2.470849,0.884313,-1.502265,1.016221,False,427D0648-3F77-4FFC-B52C-89855426D647
3,-41.217732,13.835600,7.312426,15.532706,-8.144742,16.577972,6.168371,-1.096828,-7.944082,-1.223308,...,-0.962404,1.939822,-1.390015,2.879568,4.483421,2.523240,-2.175289,-1.218501,False,C31900A4-5DCD-4022-97AC-638E86E889E4
4,-44.282093,16.689704,9.210410,-1.091120,-6.055308,19.567431,7.827870,-3.578270,-13.235552,-4.856956,...,-2.291985,0.392051,1.978584,-0.900382,-1.458117,0.746839,2.489626,-2.577971,False,6623FC5E-00BE-4476-967A-CBD55F676EA6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,-108.220507,53.980465,2.064252,17.165304,-13.157578,9.220367,1.772912,-10.244235,-17.959233,-14.598108,...,0.165763,1.374869,5.332714,3.241647,2.260845,1.539722,2.112002,-6.518037,False,5CD79093-1571-4F71-8136-0D84CCABDCAC
973,-9.686508,11.443539,-0.437004,-1.404610,9.924864,67.137988,3.996076,-4.109714,-21.882164,22.561050,...,7.445634,-5.014546,2.319087,11.565369,9.429728,-10.120691,-12.323861,-3.378795,False,F89588E9-CA73-4465-A7FB-7246EDB45E3A
974,-41.685795,25.706570,-3.593552,-11.230308,5.304674,52.942407,5.261883,-2.463057,-6.392234,3.564028,...,2.659014,-2.477460,-9.579771,-1.462379,-3.795677,0.990227,-0.204101,2.912988,False,CA20249F-B7EA-4FD9-9ECB-34F74755AE35
975,-94.585742,33.410058,-16.044123,25.035698,-10.409812,18.695194,3.890929,0.582821,3.729137,-17.769162,...,-0.253686,0.557540,-0.953259,-2.469198,-0.352324,-1.993852,1.033071,-1.049677,False,23F438BD-1DBB-4D46-972F-1E8E74DDBD37
