## Step 1: Load Required Data

In [47]:
import pandas as pd
import os

# Define the dosage to load (modify for other dosages)
dosage = "T10"
base_dir = ""  # Ensure the base directory is correct
deg_results_dir = os.path.join(base_dir, "DEG_Results")
coexpression_dir = os.path.join(base_dir, "CoExpression_Results")
final_enrichment_dir = os.path.join(deg_results_dir, "Final_Enrichment")

# File paths - Ensure correct filenames based on the desired structure
gene_file = os.path.join(deg_results_dir, f"DEG_{dosage}", f"DEG_{dosage}_vs_Control.csv")  # Corrected to dosage vs control
pathway_file = os.path.join(final_enrichment_dir, f"Polygenic_Enrichment_DEG_{dosage}.csv")  # Corrected to dosage vs control enrichment
coexpression_file = os.path.join(coexpression_dir, f"CoExpression_{dosage}.csv")

# Load gene differential expression data (Dosage vs Control)
if os.path.exists(gene_file):
    df_gene = pd.read_csv(gene_file)
    print(f"‚úÖ Loaded {df_gene.shape[0]} genes from {gene_file}")
else:
    print(f"‚ö†Ô∏è Gene file not found: {gene_file}")
    df_gene = None  # Handle missing data case

# Load pathway enrichment data (Dosage vs Control)
if os.path.exists(pathway_file):
    df_pathway = pd.read_csv(pathway_file)
    print(f"‚úÖ Loaded {df_pathway.shape[0]} pathways from {pathway_file}")
else:
    print(f"‚ö†Ô∏è Pathway enrichment file not found: {pathway_file}")
    df_pathway = None

# Load gene-gene coexpression data
if os.path.exists(coexpression_file):
    df_coexpression = pd.read_csv(coexpression_file)
    print(f"‚úÖ Loaded {df_coexpression.shape[0]} co-expression pairs from {coexpression_file}")
else:
    print(f"‚ö†Ô∏è Co-expression file not found: {coexpression_file}")
    df_coexpression = None

# Display sample data for verification
if df_gene is not None:
    print("\nüîπ Sample Gene Data:")
    print(df_gene.head())

if df_pathway is not None:
    print("\nüîπ Sample Pathway Data:")
    print(df_pathway.head())

if df_coexpression is not None:
    print("\nüîπ Sample Coexpression Data:")
    print(df_coexpression.head())


‚úÖ Loaded 495 genes from DEG_Results/DEG_T10/DEG_T10_vs_Control.csv
‚úÖ Loaded 833 pathways from DEG_Results/Final_Enrichment/Polygenic_Enrichment_DEG_T10.csv
‚úÖ Loaded 42 co-expression pairs from CoExpression_Results/CoExpression_T10.csv

üîπ Sample Gene Data:
      Gene       P-Value    Log2FC
0    UQCRB  9.242747e-30 -0.489100
1     SND1  1.263355e-26 -0.427144
2   HNRNPR  1.438665e-31 -0.453004
3  TMEM258  2.261292e-29 -0.450814
4    CAPZB  7.869927e-59 -0.731796

üîπ Sample Pathway Data:
                                                Term       P-value  \
0               cytoplasmic translation (GO:0002181)  5.340766e-94   
1  SRP-dependent cotranslational protein targetin...  1.185206e-93   
2  cotranslational protein targeting to membrane ...  2.626490e-91   
3  nuclear-transcribed mRNA catabolic process, no...  5.909326e-90   
4               protein targeting to ER (GO:0045047)  1.039429e-86   

   Adjusted P-value  Combined Score  \
0      1.407292e-90    29962.885649   

## Step 2: Generate Node Features for Genes

In [83]:
import os
import pandas as pd

# Define directories
deg_dir = "DEG_Results"
coexp_dir = "CoExpression_Results"
output_dir = "Graph_Results/Gene_Features"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load globally relevant genes
global_gene_file = os.path.join(deg_dir, "Filtered_State_Global_DEGs.csv")
df_global_genes = pd.read_csv(global_gene_file)
global_genes = set(df_global_genes["Gene"].str.upper())  # Ensure case consistency

# Get all dosage levels available
dosages = [d for d in os.listdir(deg_dir) if d.startswith("DEG_")]

# Choose a dosage to print sample results for verification
verification_dosage = "DEG_T10"

print("\nüîπ **Starting Node Feature Generation with Overlap Calculation** üîπ\n")

for dosage in dosages:
    dosage_level = dosage.split("_")[-1]  # Extract dosage (e.g., T10, T20)
    
    # Define correct file paths (Ensuring control vs. dosage format)
    deg_file = os.path.join(deg_dir, dosage, f"DEG_{dosage_level}_vs_Control.csv")
    coexpression_file = os.path.join(coexp_dir, f"CoExpression_{dosage_level}.csv")
    pathway_file = os.path.join(deg_dir, "Final_Enrichment", f"Polygenic_Enrichment_DEG_{dosage_level}.csv")

    # Ensure DEG file exists before proceeding
    if not os.path.exists(deg_file):
        print(f"‚ö†Ô∏è Skipping {dosage_level} - Missing DEG file!")
        continue

    # Load DEG data (Control vs Dosage)
    df_gene = pd.read_csv(deg_file)
    df_gene["Gene"] = df_gene["Gene"].str.upper()  # Ensure case consistency

    # Compute Degree in Pathways (number of pathways a gene appears in)
    if os.path.exists(pathway_file):
        df_pathway = pd.read_csv(pathway_file)
        gene_pathway_counts = df_pathway["Genes"].str.split(";").explode().str.upper().value_counts()
        df_gene["Degree in Pathways"] = df_gene["Gene"].map(gene_pathway_counts).fillna(0).astype(int)
    else:
        df_gene["Degree in Pathways"] = 0

    # Load Co-expression Data
    if os.path.exists(coexpression_file):
        df_coexpression = pd.read_csv(coexpression_file)
        coexpressed_genes = set(df_coexpression["Gene1"].str.upper()).union(set(df_coexpression["Gene2"].str.upper()))
    else:
        coexpressed_genes = set()

    # **Compute Overlap Before Filtering**
    original_deg_genes = set(df_gene["Gene"])
    retained_genes = df_gene[
        (df_gene["Degree in Pathways"] > 0) | 
        (df_gene["Gene"].isin(global_genes)) | 
        (df_gene["Gene"].isin(coexpressed_genes))
    ]["Gene"].unique()
    
    # Compute how many genes remain
    overlap_count = len(set(retained_genes) & original_deg_genes)
    
    # Print overlap statistics
    print(f"üìå Dosage: {dosage_level} ‚Üí Original DEG Genes: {len(original_deg_genes)}, Retained Genes: {len(retained_genes)}, Overlapping: {overlap_count}")

    # Apply filtering
    df_gene = df_gene[df_gene["Gene"].isin(retained_genes)]

    # Save the final filtered gene features for this dosage
    output_file = os.path.join(output_dir, f"Filtered_Gene_Features_{dosage_level}.csv")
    df_gene.to_csv(output_file, index=False)

    print(f"‚úÖ Processed {dosage_level}: Saved filtered gene features!")

    # Print sample results for verification
    if dosage == verification_dosage:
        print("\nüîç Sample Verification for", dosage_level)
        print(df_gene.head())

print("\nüéØ **All dosages processed! Filtered gene node features saved in Graph_Results/**")



üîπ **Starting Node Feature Generation with Overlap Calculation** üîπ

üìå Dosage: T80 ‚Üí Original DEG Genes: 309, Retained Genes: 279, Overlapping: 279
‚úÖ Processed T80: Saved filtered gene features!
üìå Dosage: T5 ‚Üí Original DEG Genes: 277, Retained Genes: 251, Overlapping: 251
‚úÖ Processed T5: Saved filtered gene features!
üìå Dosage: T40 ‚Üí Original DEG Genes: 341, Retained Genes: 313, Overlapping: 313
‚úÖ Processed T40: Saved filtered gene features!
üìå Dosage: T1 ‚Üí Original DEG Genes: 302, Retained Genes: 269, Overlapping: 269
‚úÖ Processed T1: Saved filtered gene features!
üìå Dosage: T20 ‚Üí Original DEG Genes: 203, Retained Genes: 176, Overlapping: 176
‚úÖ Processed T20: Saved filtered gene features!
üìå Dosage: T2.5 ‚Üí Original DEG Genes: 432, Retained Genes: 382, Overlapping: 382
‚úÖ Processed T2.5: Saved filtered gene features!
üìå Dosage: T320 ‚Üí Original DEG Genes: 518, Retained Genes: 459, Overlapping: 459
‚úÖ Processed T320: Saved filtered gene featu

In [31]:
import os
import pandas as pd

# Define directory where filtered gene features are stored
graph_results_dir = "Graph_Results/Gene_Features"

# Get all processed dosage files
gene_feature_files = [f for f in os.listdir(graph_results_dir) if f.startswith("Filtered_Gene_Features_")]

# Iterate through each file and compute the number of genes with Degree 0
for file in gene_feature_files:
    dosage_level = file.split("_")[-1].replace(".csv", "")  # Extract dosage level
    file_path = os.path.join(graph_results_dir, file)
    
    # Load gene feature data
    df_gene = pd.read_csv(file_path)
    
    # Ensure 'Degree in Pathways' column is treated as numeric
    df_gene["Degree in Pathways"] = pd.to_numeric(df_gene["Degree in Pathways"], errors='coerce')

    # Count total genes and genes with Degree 0
    total_genes = df_gene.shape[0]
    genes_with_degree_0 = df_gene[df_gene["Degree in Pathways"] == 0].shape[0]

    # Print the results
    print(f"üìå Dosage: {dosage_level} ‚Üí Total Genes: {total_genes}, Genes with Degree 0: {genes_with_degree_0}")

    # Display sample genes with Degree 0 (for verification)
    sample_genes = df_gene[df_gene["Degree in Pathways"] == 0].head(5)
    if not sample_genes.empty:
        print(f"üîç Sample genes with Degree 0 in {dosage_level}:")
        print(sample_genes[["Gene", "Degree in Pathways"]])
        print("-" * 50)

print("‚úÖ Completed Degree 0 analysis for all dosages!")


üìå Dosage: T80 ‚Üí Total Genes: 279, Genes with Degree 0: 8
üîç Sample genes with Degree 0 in T80:
        Gene  Degree in Pathways
28    MT-ND5                   0
45   ATP5EP2                   0
131  S100A16                   0
212  MT-ATP8                   0
226   MT-ND2                   0
--------------------------------------------------
üìå Dosage: T160 ‚Üí Total Genes: 307, Genes with Degree 0: 9
üîç Sample genes with Degree 0 in T160:
        Gene  Degree in Pathways
20    MT-ND5                   0
25   MT-ND4L                   0
36    MT-ND1                   0
122  MT-ATP6                   0
132  S100A16                   0
--------------------------------------------------
üìå Dosage: T320 ‚Üí Total Genes: 459, Genes with Degree 0: 11
üîç Sample genes with Degree 0 in T320:
        Gene  Degree in Pathways
64    MT-ND1                   0
134   MT-ND4                   0
188  MT-ATP6                   0
199  S100A16                   0
228   MT-ND3              

## Step 3: Generate Node Features for pathway

In [58]:
import os
import pandas as pd

# Define directories
deg_dir = "DEG_Results/Final_Enrichment"
output_dir = "Graph_Results/Pathway_Features"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Identify **only dosage** pathway files (excluding merged, state, and vs comparisons)
pathway_files = [
    f for f in os.listdir(deg_dir) 
    if f.startswith("Polygenic_Enrichment_DEG_") and f.endswith(".csv") 
    and "merged" not in f and "State" not in f and "vs" not in f
]

# Process each dosage
for file in pathway_files:
    dosage = file.replace("Polygenic_Enrichment_DEG_", "").replace(".csv", "")  # Extract dosage (e.g., T10, T20)

    # Define pathway file
    pathway_file = os.path.join(deg_dir, file)
    
    # Ensure file exists
    if not os.path.exists(pathway_file):
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing pathway file!")
        continue

    # Load pathway data
    df_pathway = pd.read_csv(pathway_file)

    # Ensure required columns exist
    if not {"Term", "Combined Score", "Genes"}.issubset(df_pathway.columns):
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing required columns!")
        continue

    # Compute Pathway Size (Number of genes per pathway)
    df_pathway["Pathway Size"] = df_pathway["Genes"].apply(lambda x: len(str(x).split(";")))

    # Select required columns
    df_pathway = df_pathway[["Term", "Combined Score", "Pathway Size"]]

    # Save the pathway features for this dosage
    output_file = os.path.join(output_dir, f"Pathway_Features_{dosage}.csv")
    df_pathway.to_csv(output_file, index=False)

    print(f"‚úÖ Processed {dosage}: Saved pathway features!")

print("üéØ All dosages processed! Pathway features saved in `Graph_Results/Pathway_Features/`.")


‚úÖ Processed T2.5: Saved pathway features!
‚úÖ Processed T10: Saved pathway features!
‚úÖ Processed T40: Saved pathway features!
‚úÖ Processed T160: Saved pathway features!
‚úÖ Processed T320: Saved pathway features!
‚úÖ Processed T5: Saved pathway features!
‚úÖ Processed T80: Saved pathway features!
‚úÖ Processed T1: Saved pathway features!
‚úÖ Processed T20: Saved pathway features!
üéØ All dosages processed! Pathway features saved in `Graph_Results/Pathway_Features/`.


## Step 4: Generate Gene-Pathway and Gene-Gene edges

### Gene Pathway Edges

In [64]:
import os
import pandas as pd

# Define directories
deg_dir = "DEG_Results/Final_Enrichment"
gene_feature_dir = "Graph_Results/Gene_Features"
output_dir = "Graph_Results/Gene_Pathway_Edge"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Identify **only dosage** pathway files (excluding merged, state, and vs comparisons)
pathway_files = [
    f for f in os.listdir(deg_dir) 
    if f.startswith("Polygenic_Enrichment_DEG_") and f.endswith(".csv") 
    and "merged" not in f and "State" not in f and "vs" not in f
]

# Process each dosage
for file in pathway_files:
    dosage = file.replace("Polygenic_Enrichment_DEG_", "").replace(".csv", "")  # Extract dosage (e.g., T10, T20)

    # Define pathway file
    pathway_file = os.path.join(deg_dir, file)
    gene_feature_file = os.path.join(gene_feature_dir, f"Filtered_Gene_Features_{dosage}.csv")

    # Ensure files exist
    if not os.path.exists(pathway_file) or not os.path.exists(gene_feature_file):
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing pathway or gene feature file!")
        continue

    # Load pathway data
    df_pathway = pd.read_csv(pathway_file)
    df_genes = pd.read_csv(gene_feature_file)

    # Ensure required columns exist
    if not {"Term", "Combined Score", "Genes"}.issubset(df_pathway.columns):
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing required columns in pathway file!")
        continue

    # Convert gene names to uppercase for consistency
    df_genes["Gene"] = df_genes["Gene"].str.upper()
    valid_genes = set(df_genes["Gene"])

    # Expand gene-pathway relationships
    gene_pathway_edges = []
    
    for _, row in df_pathway.iterrows():
        pathway = row["Term"]
        combined_score = row["Combined Score"]
        
        # Ensure genes are properly split and filtered
        genes = str(row["Genes"]).split(";")
        genes = [gene.strip().upper() for gene in genes if gene.strip()]  # Clean and format genes
        
        # Remove genes that are NOT in `Gene_Features`
        filtered_genes = [gene for gene in genes if gene in valid_genes]

        # Skip pathways with no valid gene connections
        if len(filtered_genes) == 0:
            continue
        
        # Compute edge weight
        edge_weight = combined_score / len(filtered_genes) if len(filtered_genes) > 0 else 0

        # Store edges
        for gene in filtered_genes:
            gene_pathway_edges.append([gene, pathway, edge_weight])

    # Convert to DataFrame
    df_edges = pd.DataFrame(gene_pathway_edges, columns=["Gene", "Pathway", "Edge Weight"])

    # Save edge file if edges exist
    if not df_edges.empty:
        edge_file = os.path.join(output_dir, f"Edges_Gene_Pathway_{dosage}.csv")
        df_edges.to_csv(edge_file, index=False)
        print(f"‚úÖ Processed {dosage}: Saved Gene ‚Üî Pathway edges!")
    else:
        print(f"‚ö†Ô∏è Skipping {dosage} - No valid gene-pathway edges!")

print("üéØ All dosages processed! Gene ‚Üî Pathway edge files saved in Graph_Results/Gene_Pathway_Edge/")


‚úÖ Processed T2.5: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T10: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T40: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T160: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T320: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T5: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T80: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T1: Saved Gene ‚Üî Pathway edges!
‚úÖ Processed T20: Saved Gene ‚Üî Pathway edges!
üéØ All dosages processed! Gene ‚Üî Pathway edge files saved in Graph_Results/Gene_Pathway_Edge/


In [66]:
import os
import pandas as pd

# Define directories
gene_feature_dir = "Graph_Results/Gene_Features"
gene_pathway_edge_dir = "Graph_Results/Gene_Pathway_Edge"

# Get all dosage levels from gene feature files
dosages = [
    f.replace("Filtered_Gene_Features_", "").replace(".csv", "")
    for f in os.listdir(gene_feature_dir) if f.startswith("Filtered_Gene_Features")
]

# Storage for overlap results
overlap_results = []

# Process each dosage
for dosage in dosages:
    print(f"\nüîπ Analyzing Dosage: {dosage}")

    # Define file paths
    gene_feature_file = os.path.join(gene_feature_dir, f"Filtered_Gene_Features_{dosage}.csv")
    gene_pathway_edge_file = os.path.join(gene_pathway_edge_dir, f"Edges_Gene_Pathway_{dosage}.csv")

    # Ensure files exist
    if not os.path.exists(gene_feature_file) or not os.path.exists(gene_pathway_edge_file):
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing required files!")
        continue

    # Load data
    df_genes = pd.read_csv(gene_feature_file)
    df_gene_pathway_edges = pd.read_csv(gene_pathway_edge_file)

    # Extract unique genes from both datasets
    genes_in_features = set(df_genes["Gene"].str.upper())  # Ensure consistency
    genes_in_pathway_edges = set(df_gene_pathway_edges["Gene"].str.upper())

    # Compute overlap
    overlapping_genes = genes_in_features.intersection(genes_in_pathway_edges)

    # Store results
    overlap_results.append({
        "Dosage": dosage,
        "Total Genes (Gene Features)": len(genes_in_features),
        "Total Genes (Pathway Edges)": len(genes_in_pathway_edges),
        "Overlapping Genes": len(overlapping_genes)
    })

    # Print results for verification
    print(f"üìå Dosage: {dosage} ‚Üí Genes in Features: {len(genes_in_features)}, Genes in Pathway Edges: {len(genes_in_pathway_edges)}, Overlapping: {len(overlapping_genes)}")

# Convert to DataFrame for better visualization
df_overlap = pd.DataFrame(overlap_results)

# Display summary
print("\nüîπ **Final Overlap Analysis for All Dosages** üîπ")
print(df_overlap)



üîπ Analyzing Dosage: T80
üìå Dosage: T80 ‚Üí Genes in Features: 279, Genes in Pathway Edges: 271, Overlapping: 271

üîπ Analyzing Dosage: T160
üìå Dosage: T160 ‚Üí Genes in Features: 307, Genes in Pathway Edges: 298, Overlapping: 298

üîπ Analyzing Dosage: T320
üìå Dosage: T320 ‚Üí Genes in Features: 459, Genes in Pathway Edges: 448, Overlapping: 448

üîπ Analyzing Dosage: T5
üìå Dosage: T5 ‚Üí Genes in Features: 251, Genes in Pathway Edges: 242, Overlapping: 242

üîπ Analyzing Dosage: T20
üìå Dosage: T20 ‚Üí Genes in Features: 176, Genes in Pathway Edges: 170, Overlapping: 170

üîπ Analyzing Dosage: T40
üìå Dosage: T40 ‚Üí Genes in Features: 313, Genes in Pathway Edges: 303, Overlapping: 303

üîπ Analyzing Dosage: T10
üìå Dosage: T10 ‚Üí Genes in Features: 430, Genes in Pathway Edges: 419, Overlapping: 419

üîπ Analyzing Dosage: T2.5
üìå Dosage: T2.5 ‚Üí Genes in Features: 382, Genes in Pathway Edges: 370, Overlapping: 370

üîπ Analyzing Dosage: T1
üìå Dosage: T1 ‚

### Gene-Gene Edges

In [67]:
import os
import pandas as pd

# Define directories
gene_feature_dir = "Graph_Results/Gene_Features"
gene_gene_edge_dir = "Graph_Results/Gene_Gene_Edges"
gene_pathway_edge_dir = "Graph_Results/Gene_Pathway_Edges"

# Get all dosage levels
dosages = [f.replace("Filtered_Gene_Features_", "").replace(".csv", "") for f in os.listdir(gene_feature_dir) if f.startswith("Filtered_Gene_Features")]

# Store validation results
validation_results = []

# Process each dosage
for dosage in dosages:
    print(f"\nüîπ Validating Dosage: {dosage}...")

    # File paths
    gene_file = os.path.join(gene_feature_dir, f"Filtered_Gene_Features_{dosage}.csv")
    gene_gene_edge_file = os.path.join(gene_gene_edge_dir, f"Edges_Gene_Gene_{dosage}.csv")
    gene_pathway_edge_file = os.path.join(gene_pathway_edge_dir, f"Edges_Gene_Pathway_{dosage}.csv")

    # Ensure required files exist
    missing_files = [file for file in [gene_file, gene_gene_edge_file, gene_pathway_edge_file] if not os.path.exists(file)]
    if missing_files:
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing files: {missing_files}")
        continue

    # Load data
    df_genes = pd.read_csv(gene_file)
    df_gene_gene_edges = pd.read_csv(gene_gene_edge_file)
    df_gene_pathway_edges = pd.read_csv(gene_pathway_edge_file)

    # Ensure consistent uppercase formatting
    genes_in_features = set(df_genes["Gene"].str.upper())
    genes_in_gene_gene_edges = set(df_gene_gene_edges["Gene1"].str.upper()).union(set(df_gene_gene_edges["Gene2"].str.upper()))
    genes_in_gene_pathway_edges = set(df_gene_pathway_edges["Gene"].str.upper())

    # **Corrected Orphan Node Check**
    # A gene is considered orphaned only if it has NO connections in both gene-gene and gene-pathway edges.
    connected_genes = genes_in_gene_gene_edges | genes_in_gene_pathway_edges
    orphaned_genes = genes_in_features - connected_genes

    # Store results
    validation_results.append({
        "Dosage": dosage,
        "Total Genes": len(genes_in_features),
        "Orphaned Genes (No Connections)": len(orphaned_genes),
    })

    # Print results
    print(f"üìå Dosage: {dosage} ‚Üí Total Genes: {len(genes_in_features)}, Orphaned: {len(orphaned_genes)}")

print("\n‚úÖ **Orphan Gene Validation Completed!**")



üîπ Validating Dosage: T80...
üìå Dosage: T80 ‚Üí Total Genes: 279, Orphaned: 207

üîπ Validating Dosage: T160...
üìå Dosage: T160 ‚Üí Total Genes: 307, Orphaned: 274

üîπ Validating Dosage: T320...
üìå Dosage: T320 ‚Üí Total Genes: 459, Orphaned: 442

üîπ Validating Dosage: T5...
üìå Dosage: T5 ‚Üí Total Genes: 251, Orphaned: 199

üîπ Validating Dosage: T20...
üìå Dosage: T20 ‚Üí Total Genes: 176, Orphaned: 115

üîπ Validating Dosage: T40...
üìå Dosage: T40 ‚Üí Total Genes: 313, Orphaned: 255

üîπ Validating Dosage: T10...
üìå Dosage: T10 ‚Üí Total Genes: 430, Orphaned: 253

üîπ Validating Dosage: T2.5...
üìå Dosage: T2.5 ‚Üí Total Genes: 382, Orphaned: 306

üîπ Validating Dosage: T1...
üìå Dosage: T1 ‚Üí Total Genes: 269, Orphaned: 251

‚úÖ **Orphan Gene Validation Completed!**


In [68]:
import os
import pandas as pd

# Define directories
gene_feature_dir = "Graph_Results/Gene_Features"
gene_gene_edge_dir = "Graph_Results/Gene_Gene_Edges"

# Get all available dosage levels from Gene Features
dosages = [
    f.replace("Filtered_Gene_Features_", "").replace(".csv", "")
    for f in os.listdir(gene_feature_dir)
    if f.startswith("Filtered_Gene_Features")
]


# Process each dosage
for dosage in dosages:
    gene_file = os.path.join(gene_feature_dir, f"Filtered_Gene_Features_{dosage}.csv")
    gene_gene_edge_file = os.path.join(gene_gene_edge_dir, f"Edges_Gene_Gene_{dosage}.csv")

    # Ensure both files exist
    if not os.path.exists(gene_file) or not os.path.exists(gene_gene_edge_file):
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing files!")
        continue

    # Load gene feature data
    df_genes = pd.read_csv(gene_file)
    
    # Filter genes that **do not have Degree 0**
    valid_genes = set(df_genes[df_genes["Degree in Pathways"] > 0]["Gene"].str.upper())

    # Load gene-gene edges
    df_edges = pd.read_csv(gene_gene_edge_file)
    
    # Find genes that appear in the edges file
    genes_in_edges = set(df_edges["Gene1"]).union(set(df_edges["Gene2"]))

    # Find the final set of genes that match both conditions
    final_genes = valid_genes.intersection(genes_in_edges)

    # Print results
    print(f"\nüìå Dosage: {dosage} ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: {len(final_genes)}")
    print(sorted(final_genes)[:10])  # Print first 10 for preview

print("\n‚úÖ Completed analysis for all dosages!")



üìå Dosage: T80 ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: 17
['MTRNR2L1', 'MTRNR2L8', 'RPL23', 'RPL27A', 'RPL30', 'RPL31', 'RPL8', 'RPS11', 'RPS12', 'RPS18']

üìå Dosage: T160 ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: 14
['RPL18', 'RPL23', 'RPL27', 'RPL28', 'RPL30', 'RPL8', 'RPS11', 'RPS14', 'RPS21', 'RPS24']

üìå Dosage: T320 ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: 7
['RPL18', 'RPL23', 'RPL27', 'RPL28', 'RPL37', 'RPS14', 'RPS21']

üìå Dosage: T5 ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: 34
['MTRNR2L8', 'PABPC1', 'RACK1', 'RPL10', 'RPL11', 'RPL13', 'RPL14', 'RPL15', 'RPL23', 'RPL27A']

üìå Dosage: T20 ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: 2
['MTRNR2L1', 'MTRNR2L8']

üìå Dosage: T40 ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: 0
[]

üìå Dosage: T10 ‚Üí Genes in Gene_Features (Degree > 0) & in Gene-Gene Edges: 1
['MTRNR2L8']

üìå Dosage: T2.5 ‚Üí Genes in Ge

### Validation of the data Created

In [69]:
import os
import pandas as pd

# Define the directory containing Gene-Pathway edge files
gene_pathway_edge_dir = "Graph_Results/Gene_Pathway_Edges"

# Get all dosage levels from the available edge files
dosages = [f.replace("Edges_Gene_Pathway_", "").replace(".csv", "") for f in os.listdir(gene_pathway_edge_dir) if f.startswith("Edges_Gene_Pathway")]

# Dictionary to store results
pathway_gene_counts = {}

# Process each dosage
for dosage in dosages:
    file_path = os.path.join(gene_pathway_edge_dir, f"Edges_Gene_Pathway_{dosage}.csv")

    # Load the data
    df_edges = pd.read_csv(file_path)

    # **Fix: Count only unique genes across all pathways**
    unique_genes = df_edges["Gene"].nunique()

    # Store the result
    pathway_gene_counts[dosage] = unique_genes

    print(f"üìå Dosage: {dosage} ‚Üí Total Unique Genes in Pathways: {unique_genes}")

print("\n‚úÖ Completed analysis of unique genes in pathways for all dosages!")


üìå Dosage: T10 ‚Üí Total Unique Genes in Pathways: 1215
üìå Dosage: T80 ‚Üí Total Unique Genes in Pathways: 1465
üìå Dosage: T5 ‚Üí Total Unique Genes in Pathways: 622
üìå Dosage: T320 ‚Üí Total Unique Genes in Pathways: 534
üìå Dosage: T40 ‚Üí Total Unique Genes in Pathways: 884
üìå Dosage: T20 ‚Üí Total Unique Genes in Pathways: 1613
üìå Dosage: T160 ‚Üí Total Unique Genes in Pathways: 1404
üìå Dosage: T2.5 ‚Üí Total Unique Genes in Pathways: 341
üìå Dosage: T1 ‚Üí Total Unique Genes in Pathways: 377

‚úÖ Completed analysis of unique genes in pathways for all dosages!


### Seeing the Orphan Nodes

In [81]:
import os
import pandas as pd

# Define directories
gene_feature_dir = "Graph_Results/Gene_Features"
pathway_feature_dir = "Graph_Results/Pathway_Features"
gene_pathway_edge_dir = "Graph_Results/Gene_Pathway_Edge"
gene_gene_edge_dir = "Graph_Results/Gene_Gene_Edges"

# Storage for orphan analysis results
orphan_gene_results = []
orphan_pathway_results = []

print("\nüîç **Checking for Orphan Genes & Pathways** üîé\n")

# Get all dosage levels based on gene features
dosages = [
    f.replace("Filtered_Gene_Features_", "").replace(".csv", "")
    for f in os.listdir(gene_feature_dir) if f.startswith("Filtered_Gene_Features")
]

# Process each dosage
for dosage in dosages:
    print(f"\nüîπ Validating Dosage: {dosage}...")

    # Define file paths
    gene_feature_file = os.path.join(gene_feature_dir, f"Filtered_Gene_Features_{dosage}.csv")
    pathway_feature_file = os.path.join(pathway_feature_dir, f"Pathway_Features_{dosage}.csv")
    gene_pathway_edge_file = os.path.join(gene_pathway_edge_dir, f"Edges_Gene_Pathway_{dosage}.csv")
    gene_gene_edge_file = os.path.join(gene_gene_edge_dir, f"Edges_Gene_Gene_{dosage}.csv")

    # Ensure all files exist before proceeding
    missing_files = []
    for file in [gene_feature_file, pathway_feature_file, gene_pathway_edge_file, gene_gene_edge_file]:
        if not os.path.exists(file):
            missing_files.append(file)
    
    if missing_files:
        print(f"‚ö†Ô∏è Skipping {dosage} - Missing files: {missing_files}")
        continue

    # Load gene-related data
    df_genes = pd.read_csv(gene_feature_file)
    df_gene_pathway_edges = pd.read_csv(gene_pathway_edge_file)
    df_gene_gene_edges = pd.read_csv(gene_gene_edge_file)

    # Extract unique genes from datasets
    genes_in_features = set(df_genes["Gene"].str.upper())
    genes_in_pathway_edges = set(df_gene_pathway_edges["Gene"].str.upper())
    genes_in_gene_gene_edges = set(df_gene_gene_edges["Gene1"].str.upper()).union(set(df_gene_gene_edges["Gene2"].str.upper()))

    # Compute orphan genes (genes in features but NOT in any edges)
    connected_genes = genes_in_pathway_edges.union(genes_in_gene_gene_edges)
    orphaned_genes = genes_in_features - connected_genes  # Only genes with NO edges

    # Store gene orphan results
    orphan_gene_results.append({
        "Dosage": dosage,
        "Total Genes": len(genes_in_features),
        "Orphaned Genes (No Edges)": len(orphaned_genes)
    })

    print(f"üìå Total Genes: {len(genes_in_features)} | Orphaned Genes (No Connections): {len(orphaned_genes)}")

    # Load pathway-related data
    df_pathway_features = pd.read_csv(pathway_feature_file)
    df_gene_pathway_edges = pd.read_csv(gene_pathway_edge_file)

    # Extract unique pathways
    feature_pathways = set(df_pathway_features["Term"].str.strip().str.upper())
    edge_pathways = set(df_gene_pathway_edges["Pathway"].str.strip().str.upper())

    # Compute orphan pathways (pathways in features but missing in edges)
    orphaned_pathways = feature_pathways - edge_pathways

    # Store pathway orphan results
    orphan_pathway_results.append({
        "Dosage": dosage,
        "Total Pathways": len(feature_pathways),
        "Orphaned Pathways (No Gene Connections)": len(orphaned_pathways)
    })

    print(f"üìå Total Pathways: {len(feature_pathways)} | Orphaned Pathways (No Connections): {len(orphaned_pathways)}")
    print("-" * 60)

# Convert results to DataFrame for final analysis
df_orphan_genes = pd.DataFrame(orphan_gene_results)
df_orphan_pathways = pd.DataFrame(orphan_pathway_results)

# Display final summary
print("\nüîπ **Final Orphan Node Analysis (Genes & Pathways)** üîπ\n")
print(df_orphan_genes)
print("\n")
print(df_orphan_pathways)

print("\n‚úÖ **Orphan Node & Pathway Validation Completed!** ‚úÖ")


üîç **Checking for Orphan Genes & Pathways** üîé


üîπ Validating Dosage: T80...
üìå Total Genes: 279 | Orphaned Genes (No Connections): 2
üìå Total Pathways: 580 | Orphaned Pathways (No Connections): 0
------------------------------------------------------------

üîπ Validating Dosage: T160...
üìå Total Genes: 307 | Orphaned Genes (No Connections): 1
üìå Total Pathways: 459 | Orphaned Pathways (No Connections): 0
------------------------------------------------------------

üîπ Validating Dosage: T320...
üìå Total Genes: 459 | Orphaned Genes (No Connections): 2
üìå Total Pathways: 930 | Orphaned Pathways (No Connections): 0
------------------------------------------------------------

üîπ Validating Dosage: T5...
üìå Total Genes: 251 | Orphaned Genes (No Connections): 1
üìå Total Pathways: 627 | Orphaned Pathways (No Connections): 0
------------------------------------------------------------

üîπ Validating Dosage: T20...
üìå Total Genes: 176 | Orphaned Genes (No Con

### Remove Orphan Nodes

## Step 4 : Process Cell data

In [84]:
import pandas as pd

# Define file path
scRNA_file = "merged_scRNA_metadata_normalized.tsv"

# Load the TSV file
df_scrna = pd.read_csv(scRNA_file, sep="\t")  # Use tab separator for TSV files

# Display basic information
print("üìÇ File Loaded Successfully!")
print("Data Overview:")
print(df_scrna.head())  # Show first few rows

print("\nüìä Column Names:")
print(df_scrna.columns)  # Show all column names

print("\nüîç Data Types:")
print(df_scrna.dtypes)  # Show data types of each column

print("\nüõ†Ô∏è Summary Statistics:")
print(df_scrna.describe(include="all"))  # Get summary statistics for all columns


üìÇ File Loaded Successfully!
Data Overview:
   Unnamed: 0             cell  nGene   nUMI orig.ident  percent.mito State  \
0           0  bc_ctrl_026.335   3766  14192          C      0.120279     C   
1           1  bc_ctrl_263.307   3986  14595          C      0.118602     C   
2           2  bc_ctrl_305.140   3964  12623          C      0.137527     C   
3           3  bc_ctrl_267.123   3863  13211          C      0.123609     C   
4           4  bc_ctrl_057.204   3882  12219          C      0.147475     C   

   A1BG       A2M  A4GALT  ...  ZW10    ZWILCH     ZWINT  ZXDB      ZXDC  \
0   0.0  0.000000     0.0  ...   0.0  1.098612  1.386294   0.0  0.000000   
1   0.0  0.000000     0.0  ...   0.0  0.693147  0.693147   0.0  0.000000   
2   0.0  0.000000     0.0  ...   0.0  0.000000  0.000000   0.0  0.000000   
3   0.0  1.098612     0.0  ...   0.0  0.693147  0.693147   0.0  0.000000   
4   0.0  0.000000     0.0  ...   0.0  0.000000  0.693147   0.0  0.693147   

   ZYG11A    ZYG11B   

### Normalizing nUMI and nGene

In [85]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Load scRNA metadata file
scRNA_file = "merged_scRNA_metadata_normalized.tsv"
df_cells = pd.read_csv(scRNA_file, sep="\t")

# Step 1: Normalize nGene and nUMI using Min-Max Scaling
columns_to_normalize = ["nGene", "nUMI"]
scaler = MinMaxScaler()
df_cells[columns_to_normalize] = scaler.fit_transform(df_cells[columns_to_normalize])

# Step 2: Encode 'State' as numerical values (S1 -> 0, S2 -> 1, ...)
label_encoder = LabelEncoder()
df_cells["State_Encoded"] = label_encoder.fit_transform(df_cells["State"])

# Save the processed cell features
processed_output_file = "Graph_Results/Cell_Features_Normalized.csv"
df_cells.to_csv(processed_output_file, index=False)

print(f"‚úÖ Normalization & Encoding complete! Saved processed cell features to {processed_output_file}")
print(df_cells.head())  # Display sample results


‚úÖ Normalization & Encoding complete! Saved processed cell features to Graph_Results/Cell_Features_Normalized.csv
   Unnamed: 0             cell     nGene      nUMI orig.ident  percent.mito  \
0           0  bc_ctrl_026.335  0.935336  0.971011          C      0.120279   
1           1  bc_ctrl_263.307  0.998283  1.000000          C      0.118602   
2           2  bc_ctrl_305.140  0.991989  0.858150          C      0.137527   
3           3  bc_ctrl_267.123  0.963090  0.900446          C      0.123609   
4           4  bc_ctrl_057.204  0.968526  0.829089          C      0.147475   

  State  A1BG       A2M  A4GALT  ...    ZWILCH     ZWINT  ZXDB      ZXDC  \
0     C   0.0  0.000000     0.0  ...  1.098612  1.386294   0.0  0.000000   
1     C   0.0  0.000000     0.0  ...  0.693147  0.693147   0.0  0.000000   
2     C   0.0  0.000000     0.0  ...  0.000000  0.000000   0.0  0.000000   
3     C   0.0  1.098612     0.0  ...  0.693147  0.693147   0.0  0.000000   
4     C   0.0  0.000000     0.

### Generating the Cell Features

In [96]:
import os
import pandas as pd

# Define directories
cell_feature_file = "Graph_Results/Cell_Features_Normalized.csv"  # Processed cell metadata
deg_dir = "DEG_Results"  # DEG results folder
output_dir = "Graph_Results/Cell_Features"  # Output directory for cell features per dosage

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load the normalized cell feature data
df_cells = pd.read_csv(cell_feature_file)

# Extract core cell features
df_cells = df_cells[["cell", "nGene", "nUMI", "orig.ident", "State"]]  # Keep "State"

# üîπ **Create Correct State Encoding Mapping**
unique_states = sorted(df_cells["State"].unique())  # Get unique states (C, S1, S2, ..., S5)
state_mapping = {state: idx for idx, state in enumerate(unique_states)}  # Assign numeric labels
df_cells["State_Encoded"] = df_cells["State"].map(state_mapping)  # Apply encoding

# Get all available dosage levels (excluding Control)
dosages = sorted([d for d in os.listdir(deg_dir) if d.startswith("DEG_") and d != "DEG_C"])

print("\nüîç **Processing Cell Features for Each Dosage** üîé")

# Process each dosage separately
for dosage in dosages:
    dosage_level = dosage.replace("DEG_", "")  # Extract dosage (e.g., T1, T2.5, ..., T320)
    
    # Define DEG file path for this dosage
    deg_file = os.path.join(deg_dir, dosage, f"DEG_{dosage_level}_vs_Control.csv")
    
    # Ensure the DEG file exists
    if not os.path.exists(deg_file):
        print(f"‚ö†Ô∏è Skipping {dosage_level} - Missing DEG file!")
        continue

    # Load significant genes for this dosage
    df_deg = pd.read_csv(deg_file)
    significant_genes = set(df_deg["Gene"].str.upper())  # Convert to uppercase for consistency

    # ‚úÖ **Fix: Use `orig.ident` for filtering cells for the given dosage**
    df_filtered_cells = df_cells[df_cells["orig.ident"] == dosage_level].copy()  # Filter by orig.ident

    # Retain only the significant genes from DEG
    retained_genes = [gene for gene in significant_genes if gene in df_filtered_cells.columns]

    # Drop all non-significant gene expression columns
    df_filtered_cells = df_filtered_cells[["cell", "nGene", "nUMI", "State_Encoded"] + retained_genes]

    # Save the filtered cell feature file
    output_file = os.path.join(output_dir, f"Cell_Features_{dosage_level}.csv")
    df_filtered_cells.to_csv(output_file, index=False)

    print(f"‚úÖ Processed {dosage_level}: Saved cell node features!")

print("\n‚úÖ **Cell Node Feature Generation Completed!** ‚úÖ")



üîç **Processing Cell Features for Each Dosage** üîé
‚úÖ Processed T1: Saved cell node features!
‚úÖ Processed T10: Saved cell node features!
‚úÖ Processed T160: Saved cell node features!
‚úÖ Processed T2.5: Saved cell node features!
‚úÖ Processed T20: Saved cell node features!
‚úÖ Processed T320: Saved cell node features!
‚úÖ Processed T40: Saved cell node features!
‚úÖ Processed T5: Saved cell node features!
‚úÖ Processed T80: Saved cell node features!

‚úÖ **Cell Node Feature Generation Completed!** ‚úÖ


## Final Cell- Gene edge generation

In [100]:
import os
import pandas as pd

# Define directories
deg_dir = "DEG_Results"
cell_feature_file = "Graph_Results/Cell_Features_Normalized.csv"
output_dir = "Graph_Results/Cell_Gene_Edges"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load full normalized cell feature dataset
df_cells = pd.read_csv(cell_feature_file)

# Identify metadata columns
metadata_cols = ["cell", "nGene", "nUMI", "orig.ident", "percent.mito", "State", "State_Encoded"]

# Get gene expression columns and map to uppercase
gene_expression_cols = [col for col in df_cells.columns if col not in metadata_cols]
gene_expression_cols_upper = {col.upper(): col for col in gene_expression_cols}

# List all dosage folders
dosages = [d.replace("DEG_", "") for d in os.listdir(deg_dir) if d.startswith("DEG_")]

print("\nüîß Generating Cell-Gene Edges Per Dosage (Filtered by DEG & Cells)...")

for dosage in dosages:
    deg_file = os.path.join(deg_dir, f"DEG_{dosage}", f"DEG_{dosage}_vs_Control.csv")

    if not os.path.exists(deg_file):
        print(f"‚ö†Ô∏è Skipping {dosage} - DEG file not found.")
        continue

    # Load DEG genes for the dosage
    df_deg = pd.read_csv(deg_file)
    deg_genes = df_deg["Gene"].str.upper().str.strip().tolist()

    # Map DEG genes to actual expression columns (case-insensitive)
    valid_gene_cols = [gene_expression_cols_upper[gene] for gene in deg_genes if gene in gene_expression_cols_upper]

    if not valid_gene_cols:
        print(f"‚ö†Ô∏è Skipping {dosage} - No matching DEG genes in cell features.")
        continue

    # üîπ Filter cells corresponding only to current dosage
    df_filtered_cells = df_cells[df_cells["orig.ident"] == dosage].copy()

    if df_filtered_cells.empty:
        print(f"‚ö†Ô∏è Skipping {dosage} - No cells found for this dosage in orig.ident.")
        continue

    # Extract only relevant genes + cell column
    df_subset = df_filtered_cells[["cell"] + valid_gene_cols].copy()

    # Melt to long format: (cell, gene, expression)
    df_edges = df_subset.melt(id_vars="cell", var_name="Gene", value_name="Edge Weight")

    # Drop edges with zero expression
    df_edges = df_edges[df_edges["Edge Weight"] > 0]

    # Save final edge file
    output_file = os.path.join(output_dir, f"Cell_Gene_Edges_{dosage}.csv")
    df_edges.to_csv(output_file, index=False)

    print(f"‚úÖ {dosage}: Saved {len(df_edges)} edges for {df_filtered_cells.shape[0]} cells and {len(valid_gene_cols)} genes.")

print("\nüéØ All Cell-Gene Edges (filtered) saved to `Graph_Results/Cell_Gene_Edges/` ‚úÖ")



üîß Generating Cell-Gene Edges Per Dosage (Filtered by DEG & Cells)...
‚úÖ T80: Saved 151137 edges for 909 cells and 309 genes.
‚úÖ T5: Saved 65642 edges for 396 cells and 277 genes.
‚úÖ T40: Saved 157423 edges for 917 cells and 341 genes.
‚úÖ T1: Saved 215492 edges for 1223 cells and 302 genes.
‚úÖ T20: Saved 73755 edges for 640 cells and 203 genes.
‚úÖ T2.5: Saved 176850 edges for 758 cells and 432 genes.
‚úÖ T320: Saved 109557 edges for 560 cells and 518 genes.
‚úÖ T160: Saved 215584 edges for 1338 cells and 343 genes.
‚úÖ T10: Saved 248818 edges for 1027 cells and 495 genes.

üéØ All Cell-Gene Edges (filtered) saved to `Graph_Results/Cell_Gene_Edges/` ‚úÖ


# Graph Construction with Features

In [17]:
import os
import pandas as pd

# Define folders
base_dir = "Graph_Results"
gene_feat_dir = f"{base_dir}/Gene_Features"
gene_pathway_edge_dir = f"{base_dir}/Gene_Pathway_Edge"
cell_gene_edge_dir = f"{base_dir}/Cell_Gene_Edges"

# Output folder (overwrite original)
os.makedirs(gene_feat_dir, exist_ok=True)

# Get all dosage levels
files = [f for f in os.listdir(gene_feat_dir) if f.startswith("Filtered_Gene_Features_")]
dosages = [f.replace("Filtered_Gene_Features_", "").replace(".csv", "") for f in files]

for dosage in dosages:
    try:
        gene_feat_file = os.path.join(gene_feat_dir, f"Filtered_Gene_Features_{dosage}.csv")
        df_gene = pd.read_csv(gene_feat_file)
        genes_in_features = set(df_gene["Gene"].str.upper())

        # Collect genes from edges
        genes_from_edges = set()

        # Gene-Pathway
        gp_file = os.path.join(gene_pathway_edge_dir, f"Edges_Gene_Pathway_{dosage}.csv")
        if os.path.exists(gp_file):
            df_gp = pd.read_csv(gp_file)
            genes_from_edges.update(df_gp["Gene"].str.upper())

        # Cell-Gene
        cg_file = os.path.join(cell_gene_edge_dir, f"Cell_Gene_Edges_{dosage}.csv")
        if os.path.exists(cg_file):
            df_cg = pd.read_csv(cg_file)
            genes_from_edges.update(df_cg["Gene"].str.upper())

        # Filter gene features
        valid_genes = genes_in_features.intersection(genes_from_edges)
        df_filtered = df_gene[df_gene["Gene"].str.upper().isin(valid_genes)]

        dropped_count = len(df_gene) - len(df_filtered)
        print(f"‚úÖ {dosage}: Kept {len(df_filtered)} genes, Dropped {dropped_count} genes")

        # Save (overwrite)
        df_filtered.to_csv(gene_feat_file, index=False)

    except Exception as e:
        print(f"‚ùå Failed for {dosage}: {e}")

print("\nüéØ Finished cleaning all gene feature files (excluding Gene-Gene edges)!")


‚úÖ T80: Kept 279 genes, Dropped 0 genes
‚úÖ T160: Kept 307 genes, Dropped 0 genes
‚úÖ T320: Kept 459 genes, Dropped 0 genes
‚úÖ T5: Kept 251 genes, Dropped 0 genes
‚úÖ T20: Kept 176 genes, Dropped 0 genes
‚úÖ T40: Kept 313 genes, Dropped 0 genes
‚úÖ T10: Kept 430 genes, Dropped 0 genes
‚úÖ T2.5: Kept 382 genes, Dropped 0 genes
‚úÖ T1: Kept 269 genes, Dropped 0 genes

üéØ Finished cleaning all gene feature files (excluding Gene-Gene edges)!


In [104]:
import os
import pandas as pd

cell_feat_dir = "Graph_Results/Cell_Features"
cell_gene_edge_dir = "Graph_Results/Cell_Gene_Edges"

cell_files = [f for f in os.listdir(cell_feat_dir) if f.startswith("Cell_Features_")]
dosages = [f.replace("Cell_Features_", "").replace(".csv", "") for f in cell_files]

for dosage in dosages:
    try:
        cell_file = os.path.join(cell_feat_dir, f"Cell_Features_{dosage}.csv")
        edge_file = os.path.join(cell_gene_edge_dir, f"Cell_Gene_Edges_{dosage}.csv")

        if not os.path.exists(edge_file):
            print(f"‚ö†Ô∏è Skipping {dosage} ‚Äî no cell-gene edges")
            continue

        df_cells = pd.read_csv(cell_file)
        df_edges = pd.read_csv(edge_file)

        valid_cells = set(df_edges["cell"])
        df_filtered = df_cells[df_cells["cell"].isin(valid_cells)]

        df_filtered.to_csv(cell_file, index=False)
        print(f"‚úÖ Cleaned cell features for {dosage}: {len(df_filtered)} cells")

    except Exception as e:
        print(f"‚ùå Failed for {dosage}: {e}")


‚úÖ Cleaned cell features for T5: 396 cells
‚úÖ Cleaned cell features for T20: 640 cells
‚úÖ Cleaned cell features for T2.5: 758 cells
‚úÖ Cleaned cell features for T160: 1338 cells
‚úÖ Cleaned cell features for T10: 1027 cells
‚úÖ Cleaned cell features for T80: 909 cells
‚úÖ Cleaned cell features for T320: 560 cells
‚úÖ Cleaned cell features for T40: 917 cells
‚úÖ Cleaned cell features for T1: 1223 cells


In [18]:
pathway_feat_dir = "Graph_Results/Pathway_Features"
gene_pathway_edge_dir = "Graph_Results/Gene_Pathway_Edge"

pathway_files = [f for f in os.listdir(pathway_feat_dir) if f.startswith("Pathway_Features_")]
dosages = [f.replace("Pathway_Features_", "").replace(".csv", "") for f in pathway_files]

for dosage in dosages:
    try:
        feat_file = os.path.join(pathway_feat_dir, f"Pathway_Features_{dosage}.csv")
        edge_file = os.path.join(gene_pathway_edge_dir, f"Edges_Gene_Pathway_{dosage}.csv")

        if not os.path.exists(edge_file):
            print(f"‚ö†Ô∏è Skipping {dosage} ‚Äî no gene-pathway edges")
            continue

        df_feat = pd.read_csv(feat_file)
        df_edge = pd.read_csv(edge_file)

        valid_pathways = set(df_edge["Pathway"].str.upper().str.strip())
        df_feat["Term"] = df_feat["Term"].str.upper().str.strip()
        df_filtered = df_feat[df_feat["Term"].isin(valid_pathways)]

        df_filtered.to_csv(feat_file, index=False)
        print(f"‚úÖ Cleaned pathway features for {dosage}: {len(df_filtered)} pathways")

    except Exception as e:
        print(f"‚ùå Failed for {dosage}: {e}")


‚úÖ Cleaned pathway features for T20: 387 pathways
‚úÖ Cleaned pathway features for T40: 767 pathways
‚úÖ Cleaned pathway features for T80: 580 pathways
‚úÖ Cleaned pathway features for T160: 459 pathways
‚úÖ Cleaned pathway features for T320: 930 pathways
‚úÖ Cleaned pathway features for T2.5: 889 pathways
‚úÖ Cleaned pathway features for T5: 627 pathways
‚úÖ Cleaned pathway features for T10: 833 pathways
‚úÖ Cleaned pathway features for T1: 808 pathways


In [20]:
print(gene_df.dtypes)
print(cell_df.dtypes)
print(path_df.dtypes)


Gene                   object
P-Value               float64
Log2FC                float64
Degree in Pathways      int64
dtype: object
cell              object
nGene            float64
nUMI             float64
State_Encoded      int64
dtype: object
Term               object
Combined Score    float64
Pathway Size        int64
dtype: object


In [25]:
import os
import pandas as pd
import torch
from torch_geometric.data import HeteroData
import json
from tqdm import tqdm
import networkx as nx
from collections import Counter

# Directories
base_dir = "Graph_Results"
gene_feat_dir = os.path.join(base_dir, "Gene_Features")
cell_feat_dir = os.path.join(base_dir, "Cell_Features")
pathway_feat_dir = os.path.join(base_dir, "Pathway_Features")
gene_pathway_edge_dir = os.path.join(base_dir, "Gene_Pathway_Edge")
cell_gene_edge_dir = os.path.join(base_dir, "Cell_Gene_Edges")

graph_output_dir = os.path.join(base_dir, "HeteroGraphs")
mapping_output_dir = os.path.join(base_dir, "Graph_Mappings")
os.makedirs(graph_output_dir, exist_ok=True)
os.makedirs(mapping_output_dir, exist_ok=True)

# List dosages
dosages = [f.replace("Filtered_Gene_Features_", "").replace(".csv", "")
           for f in os.listdir(gene_feat_dir) if f.endswith(".csv") and "checkpoint" not in f]

for dosage in tqdm(dosages, desc="üîÑ Building Graphs"):
    try:
        data = HeteroData()

        # Load features
        gene_df = pd.read_csv(os.path.join(gene_feat_dir, f"Filtered_Gene_Features_{dosage}.csv"))
        cell_df = pd.read_csv(os.path.join(cell_feat_dir, f"Cell_Features_{dosage}.csv"))
        path_df = pd.read_csv(os.path.join(pathway_feat_dir, f"Pathway_Features_{dosage}.csv"))

        # Normalize pathway names
        path_df["Term"] = path_df["Term"].str.lower().str.strip()

        # Load edges
        gene_path_df = pd.read_csv(os.path.join(gene_pathway_edge_dir, f"Edges_Gene_Pathway_{dosage}.csv"))
        cell_gene_df = pd.read_csv(os.path.join(cell_gene_edge_dir, f"Cell_Gene_Edges_{dosage}.csv"))

        # Index maps
        gene_idx = {g: i for i, g in enumerate(gene_df["Gene"].str.upper())}
        cell_idx = {c: i for i, c in enumerate(cell_df["cell"])}
        path_idx = {p: i for i, p in enumerate(path_df["Term"])}

        # One-hot encode state column (5 states)
        num_states = 5
        one_hot = pd.get_dummies(cell_df["State_Encoded"], prefix='State', prefix_sep='_')
        for s in range(1, num_states + 1):
            col = f"State_{s}"
            if col not in one_hot.columns:
                one_hot[col] = 0
        one_hot = one_hot.sort_index(axis=1)
        cell_features = pd.concat([cell_df[["nGene", "nUMI"]], one_hot], axis=1)

        # ‚úÖ Strict numeric conversions + DEBUG CHECK
        gene_feat_cols = ["Log2FC", "Degree in Pathways"]
        gene_features = gene_df[gene_feat_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
        print(f"\n[DEBUG] {dosage} gene features dtypes:\n", gene_features.dtypes)

        cell_numeric_cols = ["nGene", "nUMI"]
        cell_one_hot_cols = [col for col in cell_features.columns if col.startswith("State_")]
        cell_features_cleaned = cell_features[cell_numeric_cols + cell_one_hot_cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)
        print(f"[DEBUG] {dosage} cell features dtypes (AFTER float cast):\n", cell_features_cleaned.dtypes)

        pathway_feat_cols = ["Combined Score", "Pathway Size"]
        pathway_features = path_df[pathway_feat_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
        print(f"[DEBUG] {dosage} pathway features dtypes:\n", pathway_features.dtypes)

        # ‚úÖ Convert to tensors
        data["gene"].x = torch.tensor(gene_features.astype(float).values, dtype=torch.float)
        data["cell"].x = torch.tensor(cell_features_cleaned.values, dtype=torch.float)
        data["pathway"].x = torch.tensor(pathway_features.astype(float).values, dtype=torch.float)

        edge_summary = {}

        # Gene-Pathway edges
        gp_src, gp_tgt, gp_weight = [], [], []
        gene_path_df["Pathway"] = gene_path_df["Pathway"].str.lower().str.strip()
        for _, row in gene_path_df.iterrows():
            g, p = row["Gene"].upper(), row["Pathway"]
            if g in gene_idx and p in path_idx:
                gp_src.append(gene_idx[g])
                gp_tgt.append(path_idx[p])
                gp_weight.append([float(row["Edge Weight"])])
        if gp_src:
            data["gene", "involved_in", "pathway"].edge_index = torch.tensor([gp_src, gp_tgt], dtype=torch.long)
            data["gene", "involved_in", "pathway"].edge_attr = torch.tensor(gp_weight, dtype=torch.float)
            edge_summary["gene‚Üíinvolved_in‚Üípathway"] = len(gp_src)

        # Cell-Gene edges
        cg_src, cg_tgt, cg_weight = [], [], []
        for _, row in cell_gene_df.iterrows():
            c, g = row["cell"], row["Gene"].upper()
            if c in cell_idx and g in gene_idx:
                cg_src.append(cell_idx[c])
                cg_tgt.append(gene_idx[g])
                cg_weight.append([float(row["Edge Weight"])])
        if cg_src:
            data["cell", "expresses", "gene"].edge_index = torch.tensor([cg_src, cg_tgt], dtype=torch.long)
            data["cell", "expresses", "gene"].edge_attr = torch.tensor(cg_weight, dtype=torch.float)
            edge_summary["cell‚Üíexpresses‚Üígene"] = len(cg_src)

        # Save graph
        torch.save(data, os.path.join(graph_output_dir, f"HeteroGraph_{dosage}.pt"))

        # Analyze graph summary
        G = nx.Graph()
        for ntype in data.node_types:
            G.add_nodes_from([f"{ntype}_{i}" for i in range(data[ntype].num_nodes)], type=ntype)

        for (src, rel, dst), edge_index in data.edge_index_dict.items():
            for i in range(edge_index.size(1)):
                u = f"{src}_{edge_index[0, i].item()}"
                v = f"{dst}_{edge_index[1, i].item()}"
                G.add_edge(u, v, type=rel)

        isolated_nodes = list(nx.isolates(G))
        isolated_by_type = dict(Counter([n.split("_")[0] for n in isolated_nodes]))

        # Save mapping JSON
        with open(os.path.join(mapping_output_dir, f"Graph_Mapping_{dosage}.json"), "w") as f:
            json.dump({
                "dosage": dosage,
                "gene_to_index": gene_idx,
                "cell_to_index": cell_idx,
                "pathway_to_index": path_idx,
                "node_counts": {k: v.num_nodes for k, v in data.items() if isinstance(v, HeteroData)},
                "edge_counts": edge_summary,
                "isolated_node_count": len(isolated_nodes),
                "isolated_node_by_type": isolated_by_type
            }, f, indent=2)

        print(f"‚úÖ Graph & JSON saved for {dosage}")

    except Exception as e:
        print(f"‚ùå Failed for {dosage}: {e}")

print("\nüéØ All graphs and mapping JSONs generated!")


üîÑ Building Graphs:   0%|          | 0/9 [00:00<?, ?it/s]


[DEBUG] T80 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T80 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T80 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  11%|‚ñà         | 1/9 [00:07<00:56,  7.08s/it]

‚úÖ Graph & JSON saved for T80

[DEBUG] T160 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T160 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T160 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  22%|‚ñà‚ñà‚ñè       | 2/9 [00:16<01:01,  8.72s/it]

‚úÖ Graph & JSON saved for T160

[DEBUG] T320 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T320 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T320 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  33%|‚ñà‚ñà‚ñà‚ñé      | 3/9 [00:22<00:43,  7.23s/it]

‚úÖ Graph & JSON saved for T320

[DEBUG] T5 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T5 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T5 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 4/9 [00:25<00:28,  5.77s/it]

‚úÖ Graph & JSON saved for T5

[DEBUG] T20 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T20 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T20 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 5/9 [00:29<00:19,  4.90s/it]

‚úÖ Graph & JSON saved for T20

[DEBUG] T40 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T40 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T40 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 6/9 [00:36<00:17,  5.71s/it]

‚úÖ Graph & JSON saved for T40

[DEBUG] T10 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T10 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T10 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 7/9 [00:48<00:15,  7.62s/it]

‚úÖ Graph & JSON saved for T10

[DEBUG] T2.5 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T2.5 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T2.5 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 8/9 [00:56<00:07,  7.86s/it]

‚úÖ Graph & JSON saved for T2.5

[DEBUG] T1 gene features dtypes:
 Log2FC                float64
Degree in Pathways      int64
dtype: object
[DEBUG] T1 cell features dtypes (AFTER float cast):
 nGene      float64
nUMI       float64
State_1    float64
State_2    float64
State_3    float64
State_4    float64
State_5    float64
dtype: object
[DEBUG] T1 pathway features dtypes:
 Combined Score    float64
Pathway Size        int64
dtype: object


üîÑ Building Graphs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [01:06<00:00,  7.37s/it]

‚úÖ Graph & JSON saved for T1

üéØ All graphs and mapping JSONs generated!





## Inspecting the Graphs created

In [27]:
import os
import torch
import json
from torch_geometric.data import HeteroData
from collections import Counter
from torch_geometric.utils import to_networkx
import networkx as nx

# === Setup ===
base_dir = "Graph_Results"
graph_dir = os.path.join(base_dir, "HeteroGraphs")
map_dir = os.path.join(base_dir, "Graph_Mappings")

# === Inspect Each Graph ===
graph_files = sorted([f for f in os.listdir(graph_dir) if f.endswith(".pt")])

for file in graph_files:
    dosage = file.replace("HeteroGraph_", "").replace(".pt", "")
    print(f"\nüîç Inspecting Graph for Dosage: {dosage}")

    graph_path = os.path.join(graph_dir, file)
    mapping_path = os.path.join(map_dir, f"Graph_Mapping_{dosage}.json")

    # Load graph and mapping
    data = torch.load(graph_path)
    with open(mapping_path, "r") as f:
        mapping = json.load(f)

    # === Node Summary ===
    print("üß¨ Node Types & Counts:")
    for ntype in data.node_types:
        print(f"  - {ntype}: {data[ntype].num_nodes} nodes | Features: {tuple(data[ntype].x.shape)}")

        # Feature stats
        x = data[ntype].x
        print(f"    üìä Feature Mean: {x.mean(dim=0)}")
        print(f"    üìä Feature Std:  {x.std(dim=0)}")
        print(f"    üìä Feature Min:  {x.min(dim=0)[0]}")
        print(f"    üìä Feature Max:  {x.max(dim=0)[0]}")

    # === Edge Summary ===
    print("\nüîó Edge Types & Counts:")
    for rel in data.edge_types:
        eidx = data[rel].edge_index
        print(f"  - {rel[0]} ‚Üí {rel[1]} ‚Üí {rel[2]}: {eidx.shape[1]} edges")

        # Edge attribute stats
        if data[rel].edge_attr is not None:
            eattr = data[rel].edge_attr
            print(f"    üìä Edge Weight Mean: {eattr.mean(dim=0)}")
            print(f"    üìä Edge Weight Std:  {eattr.std(dim=0)}")
            print(f"    üìä Edge Weight Min:  {eattr.min(dim=0)[0]}")
            print(f"    üìä Edge Weight Max:  {eattr.max(dim=0)[0]}")

    # === Isolated Node Summary ===
    print("\n‚ùó Isolated Node Summary:")
    print(f"  - Total Isolated Nodes: {mapping['isolated_node_count']}")
    for t, count in mapping['isolated_node_by_type'].items():
        print(f"    ‚Ä¢ {t}: {count}")

    # === Degree Distribution & Density ===
    nx_graph = to_networkx(data, to_undirected=False)
    degrees = dict(nx_graph.degree())
    degree_counts = Counter(degrees.values())

    print("\nüìà Node Degree Distribution (Top 10):")
    for deg, count in degree_counts.most_common(10):
        print(f"    Degree {deg}: {count} nodes")

    num_nodes = nx_graph.number_of_nodes()
    num_edges = nx_graph.number_of_edges()
    possible_edges = num_nodes * (num_nodes - 1)
    density = num_edges / possible_edges if possible_edges > 0 else 0
    print(f"\n‚öô Graph Size: {num_nodes} nodes, {num_edges} edges")
    print(f"‚öô Approx. Graph Density: {density:.6f}")

    # === Mapping Consistency Check ===
    gene_map_count = len(mapping.get('gene_to_index', {}))
    cell_map_count = len(mapping.get('cell_to_index', {}))
    pathway_map_count = len(mapping.get('pathway_to_index', {}))

    if gene_map_count != data['gene'].num_nodes:
        print(f"‚ö† Gene index mismatch: {gene_map_count} vs {data['gene'].num_nodes}")
    if cell_map_count != data['cell'].num_nodes:
        print(f"‚ö† Cell index mismatch: {cell_map_count} vs {data['cell'].num_nodes}")
    if pathway_map_count != data['pathway'].num_nodes:
        print(f"‚ö† Pathway index mismatch: {pathway_map_count} vs {data['pathway'].num_nodes}")

print("\nüéØ All graphs inspected!")



üîç Inspecting Graph for Dosage: T1
üß¨ Node Types & Counts:
  - gene: 269 nodes | Features: (269, 2)
    üìä Feature Mean: tensor([-0.5032, 28.0409])
    üìä Feature Std:  tensor([ 0.3181, 51.4529])
    üìä Feature Min:  tensor([-1.1831,  0.0000])
    üìä Feature Max:  tensor([  1.7062, 412.0000])
  - cell: 1223 nodes | Features: (1223, 7)
    üìä Feature Mean: tensor([0.2578, 0.1410, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000])
    üìä Feature Std:  tensor([0.1305, 0.0800, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000])
    üìä Feature Min:  tensor([1.4306e-03, 6.4739e-04, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00])
    üìä Feature Max:  tensor([0.6072, 0.3324, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000])
  - pathway: 808 nodes | Features: (808, 2)
    üìä Feature Mean: tensor([175.0227,   9.3354])
    üìä Feature Std:  tensor([339.6205,   9.7208])
    üìä Feature Min:  tensor([8.8987, 2.0000])
    üìä Feature Max:  tensor([3843.4258,   84.0000])

üîó Edge T

### Features Scaling of the Graphs constructed

In [30]:
import os
import torch
import numpy as np

# Directory where graphs are stored
graph_dir = 'Graph_Results/HeteroGraphs'

# Get all graph files
graph_files = sorted([f for f in os.listdir(graph_dir) if f.endswith('.pt')])

print(f"üîç Inspecting {len(graph_files)} graphs...\n")

for file in graph_files:
    graph_path = os.path.join(graph_dir, file)
    data = torch.load(graph_path)

    print(f"üìÇ Graph: {file}")

    for node_type in data.node_types:
        if hasattr(data[node_type], 'x') and data[node_type].x is not None:
            x = data[node_type].x
            if torch.is_floating_point(x):
                arr = x.cpu().numpy()
                mean = np.mean(arr, axis=0)
                std = np.std(arr, axis=0)
                min_val = np.min(arr, axis=0)
                max_val = np.max(arr, axis=0)

                print(f"  üß¨ Node Type: {node_type}")
                print(f"    ‚Üí Mean: {np.round(mean, 4)}")
                print(f"    ‚Üí Std:  {np.round(std, 4)}")
                print(f"    ‚Üí Min:  {np.round(min_val, 4)}")
                print(f"    ‚Üí Max:  {np.round(max_val, 4)}\n")
            else:
                print(f"  ‚ö†Ô∏è Node type {node_type} has non-numeric features, skipping.\n")
        else:
            print(f"  ‚ö†Ô∏è Node type {node_type} has no 'x' features, skipping.\n")

print("\n‚úÖ All graphs inspected!")


üîç Inspecting 9 graphs...

üìÇ Graph: HeteroGraph_T1.pt
  üß¨ Node Type: gene
    ‚Üí Mean: [-0.5032 28.0409]
    ‚Üí Std:  [ 0.3176 51.3571]
    ‚Üí Min:  [-1.1831  0.    ]
    ‚Üí Max:  [  1.7062 412.    ]

  üß¨ Node Type: cell
    ‚Üí Mean: [0.2578 0.141  1.     0.     0.     0.     0.    ]
    ‚Üí Std:  [0.1304 0.0799 0.     0.     0.     0.     0.    ]
    ‚Üí Min:  [1.4e-03 6.0e-04 1.0e+00 0.0e+00 0.0e+00 0.0e+00 0.0e+00]
    ‚Üí Max:  [0.6072 0.3324 1.     0.     0.     0.     0.    ]

  üß¨ Node Type: pathway
    ‚Üí Mean: [175.0227   9.3354]
    ‚Üí Std:  [339.4103   9.7148]
    ‚Üí Min:  [8.8987 2.    ]
    ‚Üí Max:  [3843.4255   84.    ]

üìÇ Graph: HeteroGraph_T10.pt
  üß¨ Node Type: gene
    ‚Üí Mean: [-0.6196 28.7302]
    ‚Üí Std:  [ 0.3118 48.9427]
    ‚Üí Min:  [-1.9525  0.    ]
    ‚Üí Max:  [  1.4631 421.    ]

  üß¨ Node Type: cell
    ‚Üí Mean: [0.2162 0.1043 0.     0.667  0.1422 0.1908 0.    ]
    ‚Üí Std:  [0.1196 0.0677 0.     0.4713 0.3492 0.393  0.   

In [31]:
import os
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler

# === Directories ===
input_dir = 'Graph_Results/HeteroGraphs'
output_dir = 'Graph_Results/HeteroGraphs_ScaledFinal'
os.makedirs(output_dir, exist_ok=True)

graph_files = [f for f in os.listdir(input_dir) if f.endswith('.pt')]

# === Collect all features across graphs ===
node_type_features = {}
edge_type_features = {}
node_max_dims = {}
edge_max_dims = {}

print("üîç Collecting features across all graphs...\n")

for file in graph_files:
    data = torch.load(os.path.join(input_dir, file))

    for node_type in data.node_types:
        arr = data[node_type].x.cpu().numpy()
        node_type_features.setdefault(node_type, []).append(arr)
        node_max_dims[node_type] = max(node_max_dims.get(node_type, 0), arr.shape[1])

    for edge_type in data.edge_types:
        arr = data[edge_type].edge_attr.cpu().numpy()
        edge_type_features.setdefault(edge_type, []).append(arr)
        edge_max_dims[edge_type] = max(edge_max_dims.get(edge_type, 0), arr.shape[1])

# Pad to max dimensions
combined_node_features = {}
for k, v in node_type_features.items():
    max_cols = node_max_dims[k]
    padded = [np.pad(arr, ((0, 0), (0, max_cols - arr.shape[1])), mode='constant') for arr in v]
    combined_node_features[k] = np.vstack(padded)

combined_edge_features = {}
for k, v in edge_type_features.items():
    max_cols = edge_max_dims[k]
    padded = [np.pad(arr, ((0, 0), (0, max_cols - arr.shape[1])), mode='constant') for arr in v]
    combined_edge_features[k] = np.vstack(padded)

# === Define log-transform rules ===
def apply_log_transform(arr, label, onehot_skip=0):
    transformed = arr.copy()
    # Skip last one-hot columns if specified
    continuous_part = transformed[:, :-onehot_skip] if onehot_skip > 0 else transformed

    # Apply log1p to all continuous columns if pathway, involved_in, or expresses
    if 'pathway' in label or 'involved_in' in label or 'expresses' in label:
        continuous_part = np.log1p(continuous_part)
    elif 'gene' in label and continuous_part.shape[1] > 1:
        continuous_part[:, 1] = np.log1p(continuous_part[:, 1])  # Degree in pathways

    if onehot_skip > 0:
        transformed[:, :-onehot_skip] = continuous_part
    else:
        transformed = continuous_part

    return transformed

# === Fit shared scalers ===
node_scalers = {}
edge_scalers = {}

for node_type, arr in combined_node_features.items():
    if node_type == 'cell':
        onehot_skip = 5  # last 5 columns are one-hot state
        transformed = apply_log_transform(arr, node_type, onehot_skip)
        scaler = StandardScaler()
        scaler.fit(transformed[:, :-onehot_skip])
        node_scalers[node_type] = (scaler, onehot_skip)
    else:
        transformed = apply_log_transform(arr, node_type)
        scaler = StandardScaler()
        scaler.fit(transformed)
        node_scalers[node_type] = (scaler, 0)
    print(f"‚úÖ Fitted scaler for node type: {node_type}")

for edge_type, arr in combined_edge_features.items():
    if 'co_expr' in edge_type:
        print(f"‚ö† Skipping scaler fit for co-expression edge type {edge_type} (will be removed)")
        continue
    transformed = apply_log_transform(arr, edge_type)
    scaler = StandardScaler()
    scaler.fit(transformed)
    edge_scalers[edge_type] = scaler
    print(f"‚úÖ Fitted scaler for edge type: {edge_type}")

# === Apply scalers and save ===
for file in graph_files:
    data = torch.load(os.path.join(input_dir, file))
    print(f"\nüîÑ Processing graph: {file}")

    for node_type in data.node_types:
        arr = data[node_type].x.cpu().numpy()
        if arr.shape[1] < node_max_dims[node_type]:
            arr = np.pad(arr, ((0, 0), (0, node_max_dims[node_type] - arr.shape[1])), mode='constant')

        scaler, onehot_skip = node_scalers[node_type]
        transformed = apply_log_transform(arr, node_type, onehot_skip)
        scaled_continuous = scaler.transform(transformed[:, :-onehot_skip]) if onehot_skip > 0 else scaler.transform(transformed)
        if onehot_skip > 0:
            scaled_arr = np.hstack([scaled_continuous, arr[:, -onehot_skip:]])  # keep one-hot as-is
        else:
            scaled_arr = scaled_continuous

        data[node_type].x = torch.tensor(scaled_arr, dtype=torch.float32)
        print(f"‚úÖ Scaled node features for {node_type}")

    for edge_type in list(data.edge_types):
        if 'co_expr' in edge_type:
            print(f"üóë Removing co-expression edge type: {edge_type}")
            del data[edge_type]
            continue

        arr = data[edge_type].edge_attr.cpu().numpy()
        if arr.shape[1] < edge_max_dims[edge_type]:
            arr = np.pad(arr, ((0, 0), (0, edge_max_dims[edge_type] - arr.shape[1])), mode='constant')

        transformed = apply_log_transform(arr, edge_type)
        scaler = edge_scalers[edge_type]
        scaled_arr = scaler.transform(transformed)

        data[edge_type].edge_attr = torch.tensor(scaled_arr, dtype=torch.float32)
        print(f"‚úÖ Scaled edge features for {edge_type}")

    # === Remove isolated gene nodes ===
    if 'gene' in data.node_types:
        total_genes = data['gene'].num_nodes
        gene_mask = torch.zeros(total_genes, dtype=torch.bool)

        for edge_type in data.edge_types:
            src_type, _, tgt_type = edge_type
            edge_idx = data[edge_type].edge_index
            if src_type == 'gene':
                gene_mask[edge_idx[0]] = True
            if tgt_type == 'gene':
                gene_mask[edge_idx[1]] = True

        if gene_mask.sum() < total_genes:
            data['gene'].x = data['gene'].x[gene_mask]
            print(f"üßπ Removed isolated gene nodes: kept {gene_mask.sum().item()} of {total_genes}")

    output_path = os.path.join(output_dir, file)
    torch.save(data, output_path)
    print(f"üíæ Saved cleaned and scaled graph to {output_path}")

print("\nüéØ All graphs processed: features scaled, co-expression removed, isolated gene nodes cleaned!")


üîç Collecting features across all graphs...

‚úÖ Fitted scaler for node type: gene
‚úÖ Fitted scaler for node type: cell
‚úÖ Fitted scaler for node type: pathway
‚úÖ Fitted scaler for edge type: ('gene', 'involved_in', 'pathway')
‚úÖ Fitted scaler for edge type: ('cell', 'expresses', 'gene')

üîÑ Processing graph: HeteroGraph_T20.pt
‚úÖ Scaled node features for gene
‚úÖ Scaled node features for cell
‚úÖ Scaled node features for pathway
‚úÖ Scaled edge features for ('gene', 'involved_in', 'pathway')
‚úÖ Scaled edge features for ('cell', 'expresses', 'gene')
üíæ Saved cleaned and scaled graph to Graph_Results/HeteroGraphs_ScaledFinal/HeteroGraph_T20.pt

üîÑ Processing graph: HeteroGraph_T320.pt
‚úÖ Scaled node features for gene
‚úÖ Scaled node features for cell
‚úÖ Scaled node features for pathway
‚úÖ Scaled edge features for ('gene', 'involved_in', 'pathway')
‚úÖ Scaled edge features for ('cell', 'expresses', 'gene')
üíæ Saved cleaned and scaled graph to Graph_Results/HeteroGraphs

In [32]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt

# Directory where final scaled graphs are saved
scaled_dir = 'Graph_Results/HeteroGraphs_ScaledFinal'
plot_dir = 'Graph_Results/Scaling_Plots_Final'
os.makedirs(plot_dir, exist_ok=True)

graph_files = [f for f in os.listdir(scaled_dir) if f.endswith('.pt')]

# Collect all features
node_type_features = {}
edge_type_features = {}

print("üîç Collecting scaled features for plotting...\n")

for file in graph_files:
    data = torch.load(os.path.join(scaled_dir, file))

    for node_type in data.node_types:
        arr = data[node_type].x.cpu().numpy()
        node_type_features.setdefault(node_type, []).append(arr)

    for edge_type in data.edge_types:
        arr = data[edge_type].edge_attr.cpu().numpy()
        edge_type_features.setdefault(edge_type, []).append(arr)

combined_node_features = {k: np.vstack(v) for k, v in node_type_features.items()}
combined_edge_features = {k: np.vstack(v) for k, v in edge_type_features.items()}

# Plot function
def plot_feature_distributions(data, label, out_dir):
    num_features = data.shape[1]
    for i in range(num_features):
        plt.figure(figsize=(8, 5))
        plt.hist(data[:, i], bins=50, alpha=0.7, color='green', density=True)
        plt.title(f'{label} - Feature {i} (Scaled)')
        plt.xlabel('Value')
        plt.ylabel('Density')
        plt.tight_layout()
        plt.savefig(os.path.join(out_dir, f'{label.replace(" ", "_")}_Feature_{i}.png'))
        plt.close()

# Plot node features
print("üìä Plotting node feature distributions...")
for node_type, arr in combined_node_features.items():
    plot_feature_distributions(arr, f'NodeType_{node_type}', plot_dir)

# Plot edge features
print("üìä Plotting edge feature distributions...")
for edge_type, arr in combined_edge_features.items():
    plot_feature_distributions(arr, f'EdgeType_{edge_type}', plot_dir)

print("\nüéØ All scaled feature distribution plots saved!")


üîç Collecting scaled features for plotting...

üìä Plotting node feature distributions...
üìä Plotting edge feature distributions...

üéØ All scaled feature distribution plots saved!


In [35]:
import os
import json
import torch

# Paths
graph_path = 'Graph_Results/HeteroGraphs_ScaledFinal/HeteroGraph_T10.pt'
mapping_path = 'Graph_Results/Graph_Mappings/Graph_Mapping_T10.json'

# Load data
data = torch.load(graph_path)
with open(mapping_path, 'r') as f:
    mapping = json.load(f)

# Invert mappings
gene_to_index = mapping['gene_to_index']
cell_to_index = mapping['cell_to_index']
pathway_to_index = mapping['pathway_to_index']
index_to_gene = {v: k for k, v in gene_to_index.items()}
index_to_cell = {v: k for k, v in cell_to_index.items()}
index_to_pathway = {v: k for k, v in pathway_to_index.items()}

# Function to display edge info
def inspect_edge(edge_type, edge_idx=0):
    edge_index = data[edge_type].edge_index
    edge_attr = data[edge_type].edge_attr if 'edge_attr' in data[edge_type] else None

    src_idx = edge_index[0, edge_idx].item()
    tgt_idx = edge_index[1, edge_idx].item()
    weight = edge_attr[edge_idx].item() if edge_attr is not None else 'No weight'

    src_type, _, tgt_type = edge_type

    # Resolve names
    if src_type == 'gene':
        src_name = index_to_gene.get(src_idx, 'UNKNOWN')
    elif src_type == 'cell':
        src_name = index_to_cell.get(src_idx, 'UNKNOWN')
    elif src_type == 'pathway':
        src_name = index_to_pathway.get(src_idx, 'UNKNOWN')
    else:
        src_name = f'Node {src_idx}'

    if tgt_type == 'gene':
        tgt_name = index_to_gene.get(tgt_idx, 'UNKNOWN')
    elif tgt_type == 'cell':
        tgt_name = index_to_cell.get(tgt_idx, 'UNKNOWN')
    elif tgt_type == 'pathway':
        tgt_name = index_to_pathway.get(tgt_idx, 'UNKNOWN')
    else:
        tgt_name = f'Node {tgt_idx}'

    print(f"\nüîó Edge type: {edge_type}")
    print(f"   Source index: {src_idx} ‚Üí {src_name}")
    print(f"   Target index: {tgt_idx} ‚Üí {tgt_name}")
    print(f"   Edge weight: {weight}")

# Inspect a few example edges
for etype in data.edge_types:
    inspect_edge(etype, edge_idx=0)  # first edge
    if data[etype].edge_index.size(1) > 1:
        inspect_edge(etype, edge_idx=1)  # second edge



üîó Edge type: ('gene', 'involved_in', 'pathway')
   Source index: 256 ‚Üí RPL4
   Target index: 0 ‚Üí cytoplasmic translation (go:0002181)
   Edge weight: 2.6456947326660156

üîó Edge type: ('gene', 'involved_in', 'pathway')
   Source index: 90 ‚Üí RPL5
   Target index: 0 ‚Üí cytoplasmic translation (go:0002181)
   Edge weight: 2.6456947326660156

üîó Edge type: ('cell', 'expresses', 'gene')
   Source index: 0 ‚Üí bc_10uM_015.021
   Target index: 0 ‚Üí UQCRB
   Edge weight: 0.31109583377838135

üîó Edge type: ('cell', 'expresses', 'gene')
   Source index: 1 ‚Üí bc_10uM_211.084
   Target index: 0 ‚Üí UQCRB
   Edge weight: -0.14393192529678345
