In [29]:
import os
import pandas as pd
from pathlib import Path

In [41]:
# Define the main folder containing all sample subfolders
currPath=Path.cwd()
main_folder = currPath.parents[1]/"Datasets/LungNormal"
output_file = currPath.parents[1]/"Datasets/DESeq2Input/lungNormalCounts.csv"
output_file.parent.mkdir(parents=True, exist_ok=True)
print("Main folder absolute path:", main_folder.resolve())
print("Output file absolute path:", output_file.resolve())

Main folder absolute path: /Users/simransodhi/Desktop/CMU/Spring 2025/Bioinformatics (02-604)/Project/Code/Datasets/LungNormal
Output file absolute path: /Users/simransodhi/Desktop/CMU/Spring 2025/Bioinformatics (02-604)/Project/Code/Datasets/DESeq2Input/lungNormalCounts.csv


In [42]:
# Dictionary to store expression data
expression_dfs = []  # Store each sample as a DataFrame with gene names

# Iterate over each sample folder
for sample_folder in os.listdir(main_folder):
    sample_path = os.path.join(main_folder, sample_folder)
    
    # Ensure it's a directory
    if os.path.isdir(sample_path):
        
        # Find the RNA-seq data file (excluding the logs folder)
        expression_file = None
        for file in os.listdir(sample_path):
            if file.endswith(".tsv") and "logs" not in file.lower():
                expression_file = os.path.join(sample_path, file)
                break  # We are expecting 1 tsv file

        if expression_file:
            # Read the file (1st row skipped since it contains metadata)
            df = pd.read_csv(expression_file, sep='\t', skiprows=1)

            # Ensure required columns exist
            if "gene_id" not in df.columns or "unstranded" not in df.columns:
                raise ValueError(f"Missing expected columns in {expression_file}.")

            # Rename the expression column to the sample ID (using the folder name)
            df = df[["gene_id", "unstranded"]].rename(columns={"unstranded": sample_folder})

            expression_dfs.append(df)

# Merge all samples using an outer join to keep all genes. We will remove the null elements later
merged_df = expression_dfs[0]
for df in expression_dfs[1:]:
    merged_df = merged_df.merge(df, on="gene_id", how="outer")

In [43]:
merged_df = merged_df[:-4] # Remove the summary statistic rows
merged_df.index = merged_df['gene_id'] # Set index as gene_id
merged_df= merged_df.drop('gene_id', axis = 1) # Remove the gene_id which is present as a separate column
merged_df.dropna() # Remove null unstranded values
merged_df = merged_df.loc[merged_df.iloc[:, 1:].sum(axis=1) > 0] # Remove those genes that have 0 expression across all samples
print(merged_df.head())

                    29dd3ad7-f2c6-4abf-8e81-6734e4bd8a6a  \
gene_id                                                    
ENSG00000000003.15                                   548   
ENSG00000000005.6                                      1   
ENSG00000000419.13                                   709   
ENSG00000000457.14                                   399   
ENSG00000000460.17                                    92   

                    a4b8b2f2-6a56-4999-bd95-c2c8e4aa3bd0  \
gene_id                                                    
ENSG00000000003.15                                   573   
ENSG00000000005.6                                      1   
ENSG00000000419.13                                   619   
ENSG00000000457.14                                   370   
ENSG00000000460.17                                    68   

                    661b6fed-d56a-452b-adaa-4b48ee9f4dbb  \
gene_id                                                    
ENSG00000000003.15                    

In [44]:
# Save merged dataset
merged_df.to_csv(output_file, index=True)
# Display summary
print(f"Processed {len(expression_dfs)} samples with {merged_df.shape[0]} unique genes.")

Processed 110 samples with 53459 unique genes.
