In [None]:
import json
import sqlite3
from collections import defaultdict
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

from classify_gene import classify_gene

In [None]:
# Read every row of the GTEx_network_age_adjusted table and build a single
# graph whose edges run between the 'row' and 'col' columns.
conn = sqlite3.connect('GTEx.db')
try:
    df = pd.read_sql("SELECT * FROM GTEx_network_age_adjusted", conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

# NOTE(review): other cells filter this table by a Tissue column. nx.Graph keeps
# one edge per (row, col) pair, so if the same pair occurs in several tissues
# only one 'weight' survives here -- confirm that a pooled graph is intended.
G = nx.from_pandas_edgelist(df, 'row', 'col', ['weight'])

In [None]:
import sqlite3
import pandas as pd
from typing import DefaultDict
from classify_gene import classify_gene
# Summarise the network: total edge count, number of distinct genes, how many
# genes fall into each classify_gene() type, and how many edges connect two
# genes of the same type.
conn = sqlite3.connect('GTEx.db')
try:
    total = pd.read_sql("SELECT COUNT(*) as count FROM GTEx_network_age_adjusted", conn)
    genes_df = pd.read_sql("""
        SELECT DISTINCT gene FROM (
            SELECT row as gene FROM GTEx_network_age_adjusted
            UNION
            SELECT col as gene FROM GTEx_network_age_adjusted
        )
    """, conn)
    # Only the two endpoint columns are needed for the within-type edge counts.
    edge_endpoints = pd.read_sql("SELECT row, col FROM GTEx_network_age_adjusted", conn)
finally:
    # Release the database handle even when a query raises.
    conn.close()

print(f"Total edges: {total['count'][0]:,}")
print(f"Unique genes: {len(genes_df):,}")

# Labels this analysis expects from classify_gene(); kept for reference, the
# loop below groups by whatever 'type' classify_gene actually returns.
classifications = {
    'LOC - Uncharacterized Locus',
    'LINC - Long Intergenic Non-Coding RNA',
    'Small Nucleolar RNA (snoRNA)',
    'HGNC Protein-Coding Gene Symbol',
    'Long Non-Coding RNA (lncRNA)',
    'MicroRNA (miRNA)', 'Pseudogene',
    'Unknown/Other',
    'Clone-based Identifier'}

# gene -> type label, plus the inverse grouping type label -> set of genes.
gene_type = {}
classified_genes = defaultdict(set)
for gene in genes_df['gene']:
    gene_type[gene] = classify_gene(gene)['type']
    classified_genes[gene_type[gene]].add(gene)

print("\nGene type distribution:")
for gene_class, members in classified_genes.items():
    print(f"  {gene_class}: {len(members):,}")

# Count edges whose two endpoints share a type. This is done in pandas rather
# than by pasting a Python tuple into "WHERE row IN (...)": a one-gene class
# renders as ('X',), which is invalid SQL, and a quote inside a gene name
# would break the statement.
row_type = edge_endpoints['row'].map(gene_type)
col_type = edge_endpoints['col'].map(gene_type)
within_type_counts = row_type[row_type == col_type].value_counts()

print("\n Edge distribution within gene types")
for gene_class in classified_genes:
    # Types with no internal edge are absent from value_counts(); report 0.
    print(f"  {gene_class} space: {int(within_type_counts.get(gene_class, 0)):,}")

In [None]:
#edges by tissue
# Count the rows of the network table per tissue and print them, largest first.

conn = sqlite3.connect('GTEx.db')
try:
    # Alias the column explicitly: without AS, SQLite does not guarantee the
    # spelling of the result column name, and the loop below reads 'Tissue'.
    tissue_edges = pd.read_sql("""
        SELECT Tissue AS Tissue, COUNT(*) AS edge_count
        FROM GTEx_network_age_adjusted
        GROUP BY Tissue
        ORDER BY edge_count DESC
    """, conn)
finally:
    # Release the database handle even when the query raises.
    conn.close()

print("\nEdges by tissue:")
for tissue_name, edge_count in zip(tissue_edges['Tissue'], tissue_edges['edge_count']):
    print(f"  {tissue_name}: {edge_count:,}")

In [21]:
#create graphs for each tissue
# For every tissue in the table: load its edges, keep only edges whose two
# endpoints classify as protein-coding, build a graph, then export each graph
# to GEXF under `folder`.
PROTEIN_CODING_TYPE = 'HGNC Protein-Coding Gene Symbol'

# Cache of gene -> bool so each gene is classified once across all tissues.
# Assumes classify_gene() returns the same result for the same symbol every
# time -- TODO confirm.
_protein_coding_cache = {}


def _is_protein_coding(gene):
    """Return True when classify_gene(gene)['type'] is the protein-coding label."""
    if gene not in _protein_coding_cache:
        _protein_coding_cache[gene] = classify_gene(gene)['type'] == PROTEIN_CODING_TYPE
    return _protein_coding_cache[gene]


tissue_graphs = {}
conn = sqlite3.connect('GTEx.db')
try:
    # Explicit alias so the result column is reliably named 'Tissue'.
    tissues = pd.read_sql("SELECT DISTINCT Tissue AS Tissue FROM GTEx_network_age_adjusted", conn)

    for tissue in tissues['Tissue']:
        # Bind the tissue name as a parameter instead of formatting it into the
        # SQL text, so names containing quotes cannot break the statement.
        tissue_df = pd.read_sql("""
            SELECT row, col, weight
            FROM GTEx_network_age_adjusted
            WHERE Tissue = ?
        """, conn, params=(tissue,))

        both_protein_coding = (
            tissue_df['row'].map(_is_protein_coding).astype(bool)
            & tissue_df['col'].map(_is_protein_coding).astype(bool)
        )
        # .loc + .assign builds a new frame, avoiding SettingWithCopyWarning
        # from writing 'weight' into a filtered slice. Non-numeric weights
        # become NaN (errors='coerce').
        tissue_df = tissue_df.loc[both_protein_coding].assign(
            weight=lambda frame: pd.to_numeric(frame['weight'], errors='coerce')
        )

        G_tissue = nx.from_pandas_edgelist(tissue_df, 'row', 'col', ['weight'])
        tissue_graphs[tissue] = G_tissue
        print(f"Constructed graph for {tissue} with {G_tissue.number_of_nodes():,} nodes and {G_tissue.number_of_edges():,} edges.")
finally:
    # Release the database handle even when a query or classification raises.
    conn.close()

folder = "tissue_networks"
# write_gexf does not create missing directories, so make sure the target exists.
Path(folder).mkdir(parents=True, exist_ok=True)
#export graphs to gexf format
for tissue, G_tissue in tissue_graphs.items():
    filename = f"{folder}/{tissue.replace(' ', '_')}_network.gexf"
    nx.write_gexf(G_tissue, filename)
    print(f"Exported {tissue} graph to {filename}")


Constructed graph for Adipose_Subcutaneous with 11,836 nodes and 30,514 edges.
Constructed graph for Adipose_Visceral with 12,059 nodes and 31,254 edges.
Constructed graph for Adrenal_Gland with 11,722 nodes and 30,099 edges.
Constructed graph for Artery_Aorta with 11,581 nodes and 29,695 edges.
Constructed graph for Artery_Coronary with 11,911 nodes and 30,320 edges.
Constructed graph for Artery_Tibial with 11,319 nodes and 29,219 edges.
Constructed graph for Bladder with 12,344 nodes and 30,189 edges.
Constructed graph for Brain_Amygdala with 12,069 nodes and 31,323 edges.
Constructed graph for Brain_Anterior_cingulate_cortex with 12,060 nodes and 31,108 edges.
Constructed graph for Brain_Caudate with 12,215 nodes and 31,451 edges.
Constructed graph for Brain_Cerebellar_Hemisphere with 11,729 nodes and 29,636 edges.
Constructed graph for Brain_Cerebellum with 11,885 nodes and 30,027 edges.
Constructed graph for Brain_Cortex with 12,155 nodes and 31,321 edges.
Constructed graph for Br

In [22]:
# Read the metrics CSV and print the values of its 'name' column as a list.
# The path is relative to the working directory; the recorded run of this cell
# failed with FileNotFoundError because the file was not there.
metrics = pd.read_csv('filtered-graph-metrics.csv')
metric_names = list(metrics['name'])
print(metric_names)

FileNotFoundError: [Errno 2] No such file or directory: 'filtered-graph-metrics.csv'

In [None]:
from collections import defaultdict
from classify_gene import classify_gene

# Load the edges of one tissue and report how many distinct genes of each
# classify_gene() type appear as an edge endpoint.
tissue = 'Adipose_Subcutaneous'
conn = sqlite3.connect('GTEx.db')
try:
    # Bind the tissue name as a parameter rather than formatting it into the SQL.
    tissue_df = pd.read_sql("""
        SELECT row, col, weight
        FROM GTEx_network_age_adjusted
        WHERE Tissue = ?
    """, conn, params=(tissue,))
finally:
    # Release the database handle even when the query raises.
    conn.close()

#print the number of genes in each class for the given tissue
classes = defaultdict(set)
seen_genes = set()
# Walk endpoints in row order (row, then col) so classes are first seen, and
# therefore printed, in the same order as a per-row scan, while each gene is
# classified only once.
for endpoint_pair in zip(tissue_df['row'], tissue_df['col']):
    for gene in endpoint_pair:
        if gene in seen_genes:
            continue
        seen_genes.add(gene)
        classes[classify_gene(gene)['type']].add(gene)

for c, members in classes.items():
    print(f"{c}: {len(members)}")
