In [1]:
import json
import os
import sqlite3

import networkx as nx
import numpy as np
import pandas as pd

In [4]:
# Load every row of the age-adjusted GTEx edge table into memory.
# try/finally guarantees the connection is released even when the query
# raises, so a failed run does not leave an open handle in the kernel.
conn = sqlite3.connect('GTEx.db')
try:
    df = pd.read_sql("SELECT * FROM GTEx_network_age_adjusted", conn)
finally:
    conn.close()

# Build a single graph from all rows, keeping `weight` as an edge attribute.
# NOTE(review): the table also has a Tissue column, so the same (row, col)
# pair may occur once per tissue; in a plain nx.Graph those collapse into one
# edge and only one weight survives -- confirm a pooled graph is intended.
# NOTE(review): the per-tissue cell coerces `weight` with pd.to_numeric before
# building its graphs; that is not done here -- confirm the column is numeric.
G = nx.from_pandas_edgelist(df, 'row', 'col', ['weight'])

In [None]:
import sqlite3
import pandas as pd
from collections import defaultdict
from classify_gene import classify_gene

# Labels that classify_gene() is expected to put in its 'type' field
# (presumably the full set -- verify against classify_gene). Used below only
# as a soft sanity check; an unexpected label is reported, not rejected.
classifications = {
    'LOC - Uncharacterized Locus',
    'LINC - Long Intergenic Non-Coding RNA',
    'Small Nucleolar RNA (snoRNA)',
    'HGNC Protein-Coding Gene Symbol',
    'Long Non-Coding RNA (lncRNA)',
    'MicroRNA (miRNA)', 'Pseudogene',
    'Unknown/Other',
    'Clone-based Identifier'}

# Run every query up front and release the connection even if one fails.
conn = sqlite3.connect('GTEx.db')
try:
    total = pd.read_sql("SELECT COUNT(*) as count FROM GTEx_network_age_adjusted", conn)

    # Every gene that appears at either end of an edge.
    genes_df = pd.read_sql("""
        SELECT DISTINCT gene FROM (
            SELECT row as gene FROM GTEx_network_age_adjusted
            UNION
            SELECT col as gene FROM GTEx_network_age_adjusted
        )
    """, conn)

    # Edge endpoints are read once so the per-class counts can be computed in
    # pandas. The previous approach pasted a Python tuple into the SQL text
    # once per class: a class with exactly one gene rendered as ('X',), which
    # is not valid SQL, and quoting relied on repr().
    edge_endpoints = pd.read_sql("SELECT row, col FROM GTEx_network_age_adjusted", conn)
finally:
    conn.close()

print(f"Total edges: {total['count'].iloc[0]:,}")
print(f"Unique genes: {len(genes_df):,}")

# classified_genes: gene type -> set of genes; gene_to_class is the inverse.
classified_genes = defaultdict(set)
gene_to_class = {}
for gene in genes_df['gene']:
    gene_class = classify_gene(gene)['type']
    classified_genes[gene_class].add(gene)
    gene_to_class[gene] = gene_class

unexpected_classes = set(classified_genes) - classifications
if unexpected_classes:
    print(f"Warning: unexpected gene types from classify_gene: {sorted(map(str, unexpected_classes))}")

print("\nGene type distribution:")
for gene_class, members in classified_genes.items():
    print(f"  {gene_class}: {len(members):,}")

# An edge is "within" a type when both endpoints map to the same type.
# Each gene has exactly one type, so this matches
# `row IN class AND col IN class` evaluated per class.
row_class = edge_endpoints['row'].map(gene_to_class)
col_class = edge_endpoints['col'].map(gene_to_class)
within_class_counts = row_class[row_class == col_class].value_counts()

print("\n Edge distribution within gene types")
for gene_class in classified_genes:
    # Types with no internal edges are absent from value_counts(); report 0.
    print(f"  {gene_class} space: {int(within_class_counts.get(gene_class, 0)):,}")

Total edges: 2,133,010
Unique genes: 24,813

Gene type distribution:
  HGNC Protein-Coding Gene Symbol: 16,544
  Unknown/Other: 2,553
  Pseudogene: 2,010
  Clone-based Identifier: 2,864
  Long Non-Coding RNA (lncRNA): 12
  LINC - Long Intergenic Non-Coding RNA: 789
  LOC - Uncharacterized Locus: 2
  MicroRNA (miRNA): 33
  Small Nucleolar RNA (snoRNA): 6

 Edge distribution within gene types
  HGNC Protein-Coding Gene Symbol space: 1,530,267
  Unknown/Other space: 11,470
  Pseudogene space: 22,612
  Clone-based Identifier space: 7,514
  Long Non-Coding RNA (lncRNA) space: 0
  LINC - Long Intergenic Non-Coding RNA space: 334
  LOC - Uncharacterized Locus space: 0
  MicroRNA (miRNA) space: 3
  Small Nucleolar RNA (snoRNA) space: 3


In [7]:
# Edges by tissue: number of edges per tissue, largest first.

conn = sqlite3.connect('GTEx.db')
try:
    # The explicit alias pins the result column name to 'Tissue'. Without an
    # AS clause SQLite leaves the reported name unspecified, so selecting
    # `tissue` and then reading row['Tissue'] only worked by accident.
    tissue_edges = pd.read_sql("""
        SELECT Tissue AS Tissue, COUNT(*) AS edge_count
        FROM GTEx_network_age_adjusted
        GROUP BY Tissue
        ORDER BY edge_count DESC
    """, conn)
finally:
    # Release the connection even if the query raises.
    conn.close()

print("\nEdges by tissue:")
for tissue, edge_count in zip(tissue_edges['Tissue'], tissue_edges['edge_count']):
    print(f"  {tissue}: {edge_count:,}")


Edges by tissue:
  Testis: 56,499
  Pituitary: 46,915
  Prostate: 46,104
  Lung: 45,517
  Thyroid: 45,279
  Bladder: 44,947
  Small_Intestine_Terminal_Ileum: 44,881
  Vagina: 44,758
  Spleen: 44,650
  Breast_Mammary_Tissue: 44,487
  Brain_Hypothalamus: 44,392
  Nerve_Tibial: 44,301
  Minor_Salivary_Gland: 44,112
  Kidney_Cortex: 43,785
  Skin_Sun_Exposed: 43,638
  Brain_Cerebellum: 43,434
  Ovary: 43,382
  Brain_Nucleus_accumbens: 43,156
  Brain_Caudate: 43,054
  Skin_Not_Sun_Exposed: 42,981
  Colon_Transverse: 42,976
  Brain_Cerebellar_Hemisphere: 42,950
  Brain_Hippocampus: 42,836
  Brain_Cortex: 42,799
  Stomach: 42,759
  Brain_Frontal_Cortex: 42,749
  Adipose_Visceral: 42,724
  Brain_Anterior_cingulate_cortex: 42,588
  Brain_Amygdala: 42,532
  Colon_Sigmoid: 42,504
  Brain_Spinal_cord: 42,456
  Artery_Coronary: 42,350
  Brain_Substantia_nigra: 42,336
  Adipose_Subcutaneous: 42,257
  Brain_Putamen: 42,047
  Esophagus_Mucosa: 41,983
  Esophagus_Gastroesophageal_Junction: 41,980
  Es

In [3]:
# Create one graph per tissue, then export each graph to GEXF.
folder = "tissue_networks"

conn = sqlite3.connect('GTEx.db')
try:
    # Explicit alias: SQLite does not guarantee the reported name of an
    # un-aliased result column, and the loop below reads 'Tissue'.
    tissues = pd.read_sql("SELECT DISTINCT Tissue AS Tissue FROM GTEx_network_age_adjusted", conn)

    tissue_graphs = {}
    for tissue in tissues['Tissue']:
        # The tissue name is bound as a parameter rather than formatted into
        # the SQL text, so names containing quotes cannot break the query.
        tissue_df = pd.read_sql(
            """
            SELECT row, col, weight
            FROM GTEx_network_age_adjusted
            WHERE Tissue = ?
            """,
            conn,
            params=(tissue,),
        )

        # Values that cannot be parsed as numbers become NaN (errors='coerce').
        tissue_df['weight'] = pd.to_numeric(tissue_df['weight'], errors='coerce')
        G_tissue = nx.from_pandas_edgelist(tissue_df, 'row', 'col', ['weight'])
        tissue_graphs[tissue] = G_tissue
        print(f"Constructed graph for {tissue} with {G_tissue.number_of_nodes():,} nodes and {G_tissue.number_of_edges():,} edges.")
finally:
    # Release the connection even if a query or graph build raises.
    conn.close()

# Export graphs to GEXF format. The output directory is created first so the
# export does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(folder, exist_ok=True)
for tissue, G_tissue in tissue_graphs.items():
    filename = f"{folder}/{tissue.replace(' ', '_')}_network.gexf"
    nx.write_gexf(G_tissue, filename)
    print(f"Exported {tissue} graph to {filename}")


Constructed graph for Adipose_Subcutaneous with 14,479 nodes and 42,257 edges.
Constructed graph for Adipose_Visceral with 14,645 nodes and 42,724 edges.
Constructed graph for Adrenal_Gland with 14,261 nodes and 41,426 edges.
Constructed graph for Artery_Aorta with 14,167 nodes and 41,210 edges.
Constructed graph for Artery_Coronary with 14,557 nodes and 42,350 edges.
Constructed graph for Artery_Tibial with 13,764 nodes and 40,235 edges.
Constructed graph for Bladder with 15,182 nodes and 44,947 edges.
Constructed graph for Brain_Amygdala with 14,559 nodes and 42,532 edges.
Constructed graph for Brain_Anterior_cingulate_cortex with 14,590 nodes and 42,588 edges.
Constructed graph for Brain_Caudate with 14,841 nodes and 43,054 edges.
Constructed graph for Brain_Cerebellar_Hemisphere with 14,840 nodes and 42,950 edges.
Constructed graph for Brain_Cerebellum with 15,062 nodes and 43,434 edges.
Constructed graph for Brain_Cortex with 14,794 nodes and 42,799 edges.
Constructed graph for Br