In [1]:
import os
import pandas as pd

In [2]:
# Define the main folder containing all sample subfolders
main_folder = "Datasets/BreastCancer"
output_file = "Datasets/DESeq2Input/breastCancerCounts.csv"

In [3]:
# Dictionary to store expression data
expression_dfs = []  # Store each sample as a DataFrame with gene names

# Iterate over each sample folder
for sample_folder in os.listdir(main_folder):
    sample_path = os.path.join(main_folder, sample_folder)
    
    # Ensure it's a directory
    if os.path.isdir(sample_path):
        
        # Find the RNA-seq data file (excluding the logs folder)
        expression_file = None
        for file in os.listdir(sample_path):
            if file.endswith(".tsv") and "logs" not in file.lower():
                expression_file = os.path.join(sample_path, file)
                break  # We are expecting 1 tsv file

        if expression_file:
            # Read the file (1st row skipped since it contains metadata)
            df = pd.read_csv(expression_file, sep='\t', skiprows=1)

            # Ensure required columns exist
            if "gene_id" not in df.columns or "unstranded" not in df.columns:
                raise ValueError(f"Missing expected columns in {expression_file}.")

            # Rename the expression column to the sample ID (using the folder name)
            df = df[["gene_id", "unstranded"]].rename(columns={"unstranded": sample_folder})

            expression_dfs.append(df)

# Merge all samples using an outer join to keep all genes. We will remove the null elements later
merged_df = expression_dfs[0]
for df in expression_dfs[1:]:
    merged_df = merged_df.merge(df, on="gene_id", how="outer")

In [5]:
merged_df = merged_df[:-4] # Remove the summary statistic rows
merged_df.index = merged_df['gene_id'] # Set index as gene_id
merged_df= merged_df.drop('gene_id', axis = 1) # Remove the gene_id which is present as a separate column
merged_df.dropna() # Remove null unstranded values
merged_df = merged_df.loc[merged_df.iloc[:, 1:].sum(axis=1) > 0] # Remove those genes that have 0 expression across all samples
print(merged_df.head())

                    2c3000b7-4db9-4f00-a82a-ca6802806631  \
gene_id                                                    
ENSG00000000003.15                                  2304   
ENSG00000000005.6                                      7   
ENSG00000000419.13                                  4979   
ENSG00000000457.14                                  1544   
ENSG00000000460.17                                  1723   

                    519b92e0-0dbc-471c-988f-48ce9dc95be3  \
gene_id                                                    
ENSG00000000003.15                                  1033   
ENSG00000000005.6                                      4   
ENSG00000000419.13                                  2240   
ENSG00000000457.14                                  1702   
ENSG00000000460.17                                   587   

                    f2bf29d2-4c33-4810-90e8-88f37ea47bc8  \
gene_id                                                    
ENSG00000000003.15                    

In [6]:
# Save merged dataset
merged_df.to_csv(output_file, index=True)
# Display summary
print(f"Processed {len(expression_dfs)} samples with {merged_df.shape[0]} unique genes.")

Processed 1118 samples with 60660 unique genes.
