In [1]:
!conda env create -f environment.yml
!pip install ipykernel
!python -m ipykernel install --user --name JupyterLab --display-name "Python (JupyterLab)"


CondaValueError: prefix already exists: /home/kbanerj1/.conda/envs/JupyterLab

Installed kernelspec JupyterLab in /home/kbanerj1/.local/share/jupyter/kernels/jupyterlab


In [2]:
import os
import pandas as pd
import glob
import time

In [None]:
print("Current Directory:", os.getcwd())

## Reading Gene Data

In [7]:
def read_gene_df(filepath, filename):
    '''
    Function that reads a gene file and processes the gene data with the file name.
    Skips rows with unwanted gene names (e.g., N_unmapped, N_multimapping, N_noFeature, N_ambiguous).
    Retains only the tpm_unstranded column.
    '''
    
    df1 = pd.read_csv(filepath, sep='\t', skiprows=1)
    unwanted_genes = ['N_unmapped', 'N_multimapping', 'N_noFeature', 'N_ambiguous']
    df1 = df1[~df1['gene_id'].isin(unwanted_genes)]
    df1 = df1[['gene_name', 'tpm_unstranded']]
    df1.set_index('gene_name', inplace=True)
    df1 = df1.T
    processed_filename = filename.split('.')[0]
    df1.index = [processed_filename]
    df1 = df1.reset_index()
    df1.rename(columns={'index': 'file_name'}, inplace=True)
    
    return df1

## First Need to download data using gdc-cleint

In [None]:
filelist = glob.glob(os.path.join(os.getcwd(),"data/*/*.tsv"))
filelist

In [6]:
# Get the list of file paths
# filelist = glob.glob("./data/*/*.tsv")
all_dfs = []

for filepath in filelist:
    filename =  os.path.basename(filepath)  # get filename from path
    df = read_gene_df(filepath, filename)
    all_dfs.append(df) 

combined_df = pd.concat(all_dfs, ignore_index=True)
combined_df

gene_name,file_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,AL451106.1,AC092910.4,AC073611.1,AC136977.1,AC078856.1,AC008763.4,AL592295.6,AC006486.3,AL391628.1,AP006621.6
0,e99de328-1950-4739-a59b-e0685c45b94b,29.7759,0.1208,49.7936,4.7416,2.6487,9.8434,13.0292,11.4386,20.3905,...,0.0,0.0000,0.2921,0.0,0.0000,0.0,25.1013,0.0,0.0124,0.7077
1,a6384a57-0d59-4e92-8ca2-07bd22cf906f,56.2901,0.0737,83.4729,6.5013,12.3590,0.0322,0.0068,27.7641,1.1546,...,0.0,1.8124,0.2671,0.0,3.5654,0.0,6.4693,0.0,0.0226,1.4564
2,933a9a9c-26e4-4209-86f3-f18b812bfbdc,66.3745,0.5016,75.2411,5.3781,3.8030,17.9505,1.9869,15.8284,74.2166,...,0.0,0.0000,0.3681,0.0,0.0000,0.0,26.3922,0.0,0.0128,1.1752
3,7412a444-9fad-4d96-a4d4-2c270045919a,57.2746,0.5082,92.0853,5.6865,2.9126,18.4282,27.8222,52.6998,14.0285,...,0.0,0.0000,0.5025,0.0,0.0000,0.0,10.4905,0.0,0.0425,3.2065
4,325f91c9-70b9-4285-a7ff-0b7b6280a8ed,128.4459,7.0286,84.6279,5.7158,5.8681,17.5790,10.0602,70.3848,21.2514,...,0.0,0.0000,0.3641,0.0,0.0000,0.0,8.8325,0.0,0.0166,2.7076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
658,f053a1a1-718a-46bd-a761-8ed3511aeb57,44.7003,0.0803,55.1775,5.8021,2.0247,4.6252,1.1732,4.7872,42.5586,...,0.0,0.0000,0.7070,0.0,0.0000,0.0,22.6909,0.0,0.0616,4.3028
659,602acda6-a90d-42bc-a8aa-6eade2a791c4,47.4886,0.3089,66.6481,5.7443,4.0697,13.6360,9.4849,52.6728,14.5572,...,0.0,0.0000,0.2629,0.0,0.0000,0.0,24.3318,0.0,0.0135,2.9467
660,4b228438-6c20-4c56-8633-e1873d1aee8c,43.6628,0.4114,65.7862,6.5181,2.7338,6.8011,2.2545,12.0243,36.5026,...,0.0,0.0000,0.5861,0.0,0.0000,0.0,34.1121,0.0,0.0710,2.8467
661,04d8f7fc-783b-4c42-89cc-35d603174e5b,25.1427,0.3060,41.0510,7.8615,6.5292,6.5307,14.5350,16.8377,11.6456,...,0.0,11.2918,0.2615,0.0,17.7707,0.0,6.0990,0.0,0.0657,1.2636


In [14]:
## sanity check for columns that have all zeros across rows (samples)
zero_read_genes = combined_df.columns[(combined_df == 0).all(axis=0)]
zero_read_genes

Index(['CD99', 'NME1-NME2', 'Z83844.1', 'MCTS2P', 'CORO7-PAM16', 'ICAM4',
       'AC004837.1', 'AL021546.1', 'PEDS1-UBE2V1', 'VAMP7',
       ...
       'CDR1', 'AC114982.2', 'AC114402.2', 'AC084756.2', 'AL031178.2',
       'ACTL10', 'AC119733.1', 'AL451106.1', 'AC136977.1', 'AC006486.3'],
      dtype='object', name='gene_name', length=3329)

In [15]:
# Drop zero count genes
combined_df = combined_df.drop(columns=zero_read_genes)
combined_df

gene_name,file_name,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,AC010980.1,AC007511.1,AC004233.4,AC092910.4,AC073611.1,AC078856.1,AC008763.4,AL592295.6,AL391628.1,AP006621.6
0,e99de328-1950-4739-a59b-e0685c45b94b,29.7759,0.1208,49.7936,4.7416,2.6487,9.8434,13.0292,11.4386,20.3905,...,1.3224,0.0000,0.0000,0.0000,0.2921,0.0000,0.000,25.1013,0.0124,0.7077
1,a6384a57-0d59-4e92-8ca2-07bd22cf906f,56.2901,0.0737,83.4729,6.5013,12.3590,0.0322,0.0068,27.7641,1.1546,...,0.1100,0.0928,0.0000,1.8124,0.2671,3.5654,0.000,6.4693,0.0226,1.4564
2,99618995-9556-4c65-a643-43c9354395a7,59.3714,0.1389,43.7528,4.6948,1.5906,4.7471,1.9091,16.5347,41.5551,...,0.2072,0.0000,0.1879,0.0000,0.5874,1.1200,0.000,23.7052,0.0568,1.9113
3,5b1cc683-d363-4cb8-b9b2-050c3c541858,30.2841,0.1136,92.6183,5.8728,4.1862,18.3498,1.6605,24.4281,15.9592,...,0.3392,0.0000,0.2307,0.0000,1.9572,1.3748,0.021,9.6998,0.0436,3.9436
4,e7e63873-0099-474a-94b1-160a155c1ea3,16.8758,0.3241,56.2293,4.1407,1.1105,5.6585,2.1326,12.6668,22.7295,...,13.0383,0.0000,0.0940,0.0000,0.4677,0.0000,0.000,28.7837,0.0142,2.6037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,e11c6b6a-61cc-48ef-ac4d-221f27d45091,23.7059,0.4353,44.5254,3.2805,1.9987,3.5011,3.6117,8.3267,14.4904,...,0.5414,0.0000,0.0631,0.0000,0.3221,0.0000,0.000,15.9500,0.0429,1.0381
218,843f0cc6-cd74-4af8-be28-5fb02b532269,89.1621,4.1974,56.5681,8.1008,4.5077,8.1289,19.8376,56.2106,26.5987,...,0.8700,0.0000,0.0000,0.0000,0.1812,0.0000,0.000,30.1781,0.0268,2.5353
219,66cfd7f5-5c5e-4d20-b2b2-c3c592189160,59.8688,0.1680,62.5148,6.2605,1.8694,11.2746,3.9081,5.0972,26.1516,...,0.1463,0.0000,0.0000,0.0000,0.5113,0.0000,0.000,29.6109,0.0064,3.1739
220,e3c51e59-d9ff-425c-ade2-2a396333826a,33.4777,0.2491,53.7671,5.0215,6.3026,8.1720,7.9097,25.1466,13.9373,...,0.1652,0.2092,0.0000,5.1068,0.2473,20.0922,0.000,8.1392,0.0637,1.8238


## Read clinical and sample data

In [17]:
# Load clinical data 
clinical_data = pd.read_csv("/home/talal/Documents/Projects/fyp/Glioma-ML-Classifier-with-ANOVA-Feature-Selection/metadata/clinical.tsv", sep='\t')
clinical_data.head()

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,...,treatment_arm,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type
0,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,no,'--,Immunotherapy (Including Vaccines)
1,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,no,'--,Targeted Molecular Therapy
2,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,no,'--,"Radiation Therapy, NOS"
3,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Adjuvant,yes,'--,Chemotherapy
4,d420e653-3fb2-432b-9e81-81232a80264d,HCM-BROD-0210-C71,HCMI-CMDC,'--,False,Cancer Related,'--,'--,-19586,481,...,'--,'--,'--,'--,'--,'--,Neoadjuvant,no,'--,'--


In [18]:
clinical_data.shape

(1386, 158)

In [19]:
clinical_data.groupby("primary_diagnosis")['case_id'].count()

primary_diagnosis
Astrocytoma, NOS                 118
Astrocytoma, anaplastic          240
Glioblastoma                     434
Gliosarcoma                        6
Mixed glioma                     242
Oligodendroglioma, NOS           202
Oligodendroglioma, anaplastic    144
Name: case_id, dtype: int64

In [20]:
unique_cases_in_clinical = clinical_data['case_submitter_id'].nunique()
case_duplicates = clinical_data['case_submitter_id'].value_counts()
# Display the unique case count and the distribution of duplicates
unique_cases_in_clinical, case_duplicates.head()

(663,
 case_submitter_id
 HCM-BROD-0213-C71    8
 HCM-BROD-0198-C71    7
 HCM-BROD-0106-C71    7
 HCM-BROD-0104-C71    7
 HCM-BROD-0420-C71    7
 Name: count, dtype: int64)

In [21]:
# drop duplicates based on 'case_submitter_id'
clinical_data_dedup = clinical_data.drop_duplicates(subset='case_submitter_id', keep='first')
clinical_data_dedup.shape

(663, 158)

## Simplify labels to broader categories

In [22]:
# simplify labels by mapping complex or specific diagnoses to broader categories - dropping 'mixed gliomas'
diagnosis_map = {
    "Astrocytoma, NOS": "Astrocytoma",
    "Astrocytoma, anaplastic": "Astrocytoma",
    "Oligodendroglioma, NOS": "Oligodendroglioma",
    "Oligodendroglioma, anaplastic": "Oligodendroglioma",
    "Glioblastoma": "Glioblastoma"
}

# create a new 'label' column based on the mapping
clinical_data_dedup.loc[:, 'label'] = clinical_data_dedup['primary_diagnosis'].map(diagnosis_map)
clinical_data_dedup.groupby(["primary_diagnosis", "label"])['case_submitter_id'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_data_dedup.loc[:, 'label'] = clinical_data_dedup['primary_diagnosis'].map(diagnosis_map)


primary_diagnosis              label            
Astrocytoma, NOS               Astrocytoma           59
Astrocytoma, anaplastic        Astrocytoma          120
Glioblastoma                   Glioblastoma         189
Oligodendroglioma, NOS         Oligodendroglioma    101
Oligodendroglioma, anaplastic  Oligodendroglioma     72
Name: case_submitter_id, dtype: int64

In [24]:
sample_data = pd.read_csv("/home/talal/Documents/Projects/fyp/Glioma-ML-Classifier-with-ANOVA-Feature-Selection/metadata/gdc_sample_sheet.tsv", sep='\t')
# Simplify filenames in sample_data for merging
sample_data['filename_short'] = sample_data['File Name'].apply(lambda x: x.split('.')[0])
sample_data = pd.merge(sample_data, combined_df, left_on='filename_short', right_on='file_name', how='inner')

# Retain relevant columns in sample_data
columns_to_retain = ['File ID', 'File Name', 'Data Category', 'Data Type', 'Project ID', 'Case ID', 'Sample ID', 'Sample Type', 'filename_short']
sample_data = sample_data[columns_to_retain]
sample_data.head()

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short
0,4fa35d6c-060b-433b-bc26-8bd7984631cc,9e60d3ee-fcb2-48dc-8f3a-3171d25bf32d.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0210-C71,HCM-BROD-0210-C71-85R,Next Generation Cancer Model,9e60d3ee-fcb2-48dc-8f3a-3171d25bf32d
1,d9c44441-5f47-4532-8ac1-8a0a7a59a6f0,7835cada-4a04-42d0-bed4-e8af440b5913.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0209-C71,HCM-BROD-0209-C71-85A,Next Generation Cancer Model,7835cada-4a04-42d0-bed4-e8af440b5913
2,6807181b-13e7-4f4b-a7d8-9d00f113f6bc,517e37c3-64f8-40fa-8b6c-3c74e423ea1a.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-03266,C3L-03266-01,Primary Tumor,517e37c3-64f8-40fa-8b6c-3c74e423ea1a
3,f23b7ee8-2af9-4db6-9f2e-bbb74ce501c7,c8b55027-80ef-4748-940a-1c97e0e12116.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-03727,C3L-03727-01,Primary Tumor,c8b55027-80ef-4748-940a-1c97e0e12116
4,849a5e37-7da5-4d58-9790-9868792456b0,5b664467-3c27-4bf4-90f8-4f518a153dd8.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-01887,C3L-01887-01,Primary Tumor,5b664467-3c27-4bf4-90f8-4f518a153dd8


In [15]:
sample_data.shape

(707, 9)

In [25]:
# Merge with the clinical data to get the primary diagnosis and label
merged_data_dedup = sample_data.merge(
    clinical_data_dedup[['case_submitter_id', 'primary_diagnosis', 'label']], 
    how='left', 
    left_on='Case ID', 
    right_on='case_submitter_id'
)

merged_data_dedup

Unnamed: 0,File ID,File Name,Data Category,Data Type,Project ID,Case ID,Sample ID,Sample Type,filename_short,case_submitter_id,primary_diagnosis,label
0,4fa35d6c-060b-433b-bc26-8bd7984631cc,9e60d3ee-fcb2-48dc-8f3a-3171d25bf32d.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0210-C71,HCM-BROD-0210-C71-85R,Next Generation Cancer Model,9e60d3ee-fcb2-48dc-8f3a-3171d25bf32d,HCM-BROD-0210-C71,Glioblastoma,Glioblastoma
1,d9c44441-5f47-4532-8ac1-8a0a7a59a6f0,7835cada-4a04-42d0-bed4-e8af440b5913.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,HCMI-CMDC,HCM-BROD-0209-C71,HCM-BROD-0209-C71-85A,Next Generation Cancer Model,7835cada-4a04-42d0-bed4-e8af440b5913,HCM-BROD-0209-C71,Glioblastoma,Glioblastoma
2,6807181b-13e7-4f4b-a7d8-9d00f113f6bc,517e37c3-64f8-40fa-8b6c-3c74e423ea1a.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-03266,C3L-03266-01,Primary Tumor,517e37c3-64f8-40fa-8b6c-3c74e423ea1a,C3L-03266,Glioblastoma,Glioblastoma
3,f23b7ee8-2af9-4db6-9f2e-bbb74ce501c7,c8b55027-80ef-4748-940a-1c97e0e12116.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-03727,C3L-03727-01,Primary Tumor,c8b55027-80ef-4748-940a-1c97e0e12116,C3L-03727,Glioblastoma,Glioblastoma
4,849a5e37-7da5-4d58-9790-9868792456b0,5b664467-3c27-4bf4-90f8-4f518a153dd8.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-01887,C3L-01887-01,Primary Tumor,5b664467-3c27-4bf4-90f8-4f518a153dd8,C3L-01887,Glioblastoma,Glioblastoma
...,...,...,...,...,...,...,...,...,...,...,...,...
217,2761c4d4-00a9-4eb5-9166-af4e56677d07,a93d22ba-5cfa-4359-ae53-3309ced4e12a.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-02041,C3L-02041-02,Primary Tumor,a93d22ba-5cfa-4359-ae53-3309ced4e12a,C3L-02041,Glioblastoma,Glioblastoma
218,0997ec63-6fe0-40fb-b0ff-3ddcb14ec7d2,b7effa07-1b05-412a-a1ef-54d52d795c90.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-00365,C3L-00365-01,Primary Tumor,b7effa07-1b05-412a-a1ef-54d52d795c90,C3L-00365,Glioblastoma,Glioblastoma
219,2c259bc9-5e6a-46c0-b82b-d6f6eb257455,0dbfa70d-2a0d-41c4-aca6-b8df6394bb9c.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-02542,C3L-02542-03,Primary Tumor,0dbfa70d-2a0d-41c4-aca6-b8df6394bb9c,C3L-02542,Glioblastoma,Glioblastoma
220,3ae2f730-7444-4b65-8c75-1f8f18e6047a,27ad3b30-97ff-44a4-89f7-5233a3c3596e.rna_seq.a...,Transcriptome Profiling,Gene Expression Quantification,CPTAC-3,C3L-02970,C3L-02970-03,Primary Tumor,27ad3b30-97ff-44a4-89f7-5233a3c3596e,C3L-02970,Glioblastoma,Glioblastoma


In [28]:
merged_data_dedup.shape

(222, 12)

In [29]:
# Group by both primary_diagnosis and label, then count unique case_submitter_ids
merged_data_dedup.groupby(['primary_diagnosis', 'label'])['case_submitter_id'].count()

primary_diagnosis              label            
Astrocytoma, NOS               Astrocytoma          17
Astrocytoma, anaplastic        Astrocytoma          41
Glioblastoma                   Glioblastoma         72
Oligodendroglioma, NOS         Oligodendroglioma    29
Oligodendroglioma, anaplastic  Oligodendroglioma    25
Name: case_submitter_id, dtype: int64

In [30]:
# Filter out samples with non-empty labels
merged_data_dedup = merged_data_dedup[merged_data_dedup['label'].notna()]
merged_data_dedup.shape

(184, 12)

## Keep samples that have valid labels

In [31]:
# merge combined_df with merged_data_dedup to get 'Sample ID'
combined_df_with_sample_id = pd.merge(combined_df, merged_data_dedup[['filename_short', 'Sample ID']], 
    left_on='file_name', right_on='filename_short', how='left')

# Set 'Sample ID' as index, drop unnecessary columns
combined_df_with_sample_id = combined_df_with_sample_id.set_index('Sample ID')
combined_df_with_sample_id = combined_df_with_sample_id.drop(['filename_short', 'file_name'], axis=1)

# Filter combined_df_with_sample_id to only include non-empty labels
filtered_sample_ids = merged_data_dedup['Sample ID'].dropna().unique()
combined_df_with_sample_id = combined_df_with_sample_id.loc[filtered_sample_ids]

# Display the final shape
combined_df_with_sample_id.shape

(184, 56241)

## Output the Processed Data

In [32]:
# Define extraction directory
extraction_dir = "./processed"
os.makedirs(extraction_dir, exist_ok=True)

data_file_path = os.path.join(extraction_dir, 'data.csv')
combined_df_with_sample_id.to_csv(data_file_path, index=True)
print(f"Data saved to {data_file_path}")

Data saved to ./processed/data.csv


In [33]:
# Extract and save the filtered labels
y = merged_data_dedup.set_index('Sample ID')['label']
labels_file_path = os.path.join(extraction_dir, 'glioma_labels.csv')
y.to_csv(labels_file_path, index=True)
print(f"Labels saved to {labels_file_path}")

Labels saved to ./processed/glioma_labels.csv
