In [27]:
! pip install binn



In [43]:
import numpy as np
import pandas as pd
import json
import scipy.sparse as sp

# Step 1: Load all data files
print("=== GENOTYPE DATA ===")
genotype_data = np.load('dummy_genotype_matrix.npz', allow_pickle=True)
genotype_matrix = genotype_data['matrix']
snps = genotype_data['snps']
samples = genotype_data['samples']
print(f"Matrix shape: {genotype_matrix.shape}")
print(f"Data type: {genotype_matrix.dtype}")
print(f"Sample values: {genotype_matrix[0, :5]}")

print("\n=== SPARSE CONNECTIVITY MATRIX ===")
sparse_data = np.load('sparse_matrix.npz', allow_pickle=True)
snp_to_gene_matrix = sp.csr_matrix(
   (sparse_data['data'], sparse_data['indices'], sparse_data['indptr']),
   shape=tuple(sparse_data['shape'])
)
print(f"Shape: {snp_to_gene_matrix.shape}")
print(f"Connections: {snp_to_gene_matrix.nnz}")
print(f"Density: {snp_to_gene_matrix.nnz / (snp_to_gene_matrix.shape[0] * snp_to_gene_matrix.shape[1]):.4f}")

print("\n=== SNP INDEX ===")
with open('snp_index.json', 'r') as f:
   snp_index = json.load(f)
print(f"Entries: {len(snp_index)}")
print(f"Sample: {list(snp_index.items())[:3]}")

print("\n=== GENE INDEX ===")
with open('gene_index.json', 'r') as f:
   gene_index = json.load(f)
print(f"Entries: {len(gene_index)}")
print(f"Sample: {list(gene_index.items())[:3]}")

=== GENOTYPE DATA ===
Matrix shape: (50, 100)
Data type: int64
Sample values: [2 0 1 1 1]

=== SPARSE CONNECTIVITY MATRIX ===
Shape: (100, 122)
Connections: 130
Density: 0.0107

=== SNP INDEX ===
Entries: 100
Sample: [('rs114757189', 0), ('rs9439506', 1), ('rs112434103', 2)]

=== GENE INDEX ===
Entries: 122
Sample: [('SDF4', 0), ('LINC02782', 1), ('ENSG00000284616', 2)]


In [44]:
# Step 2: Create BINN-compatible data matrix
samples_fixed = samples[:genotype_matrix.shape[0]]
connectivity_dense = snp_to_gene_matrix.toarray()
gene_data = genotype_matrix @ connectivity_dense

# Filter valid genes (remove ENSG IDs)
valid_genes = {gene: idx for gene, idx in gene_index.items() 
               if not gene.startswith('ENSG') and len(gene) <= 15}
valid_indices = list(valid_genes.values())
valid_gene_names = list(valid_genes.keys())

# Create BINN data matrix (genes as features, samples as rows)
gene_data_filtered = gene_data[:, valid_indices]
data_matrix = pd.DataFrame(
    gene_data_filtered,
    index=samples_fixed,
    columns=valid_gene_names
)

print(f"Data matrix shape: {data_matrix.shape}")
print(f"First 3 genes: {valid_gene_names[:3]}")
print(f"Sample data:\n{data_matrix.iloc[:3, :3]}")

Data matrix shape: (50, 103)
First 3 genes: ['SDF4', 'LINC02782', 'SPSB1']
Sample data:
          SDF4  LINC02782  SPSB1
Sample_0     2          0      1
Sample_1     2          0      2
Sample_2     2          1      2


In [45]:
# Step 3: Create design matrix
# Need target labels - create dummy ones for now
n_samples = len(samples_fixed)
target_labels = np.random.randint(0, 2, n_samples)  # Binary classification

design_matrix = pd.DataFrame({
   'sample': samples_fixed,
   'target': target_labels,
   'group': ['high' if t == 1 else 'low' for t in target_labels]
})

print(f"Design matrix shape: {design_matrix.shape}")
print(f"Sample design matrix:\n{design_matrix.head()}")
print(f"Target distribution: {pd.Series(target_labels).value_counts()}")

Design matrix shape: (50, 3)
Sample design matrix:
     sample  target group
0  Sample_0       1  high
1  Sample_1       0   low
2  Sample_2       1  high
3  Sample_3       0   low
4  Sample_4       1  high
Target distribution: 1    29
0    21
Name: count, dtype: int64


In [46]:
print(f"First 3 genes: {valid_gene_names}")

First 3 genes: ['SDF4', 'LINC02782', 'SPSB1', 'CASZ1', 'PLOD1', 'KAZN', 'PADI6', 'MIR4418', 'GRHL3', 'RSPO1', 'AIRIM', 'MTF1', 'FOXJ3', 'ZSWIM5', 'LINC01738', 'AGBL4', 'BEND5', 'SCP2', 'CDCP2', 'USP24', 'FGGY', 'LINC01739', 'RNU7-62P', 'CACHD1', 'PDE4B', 'DIRAS3', 'NEGR1', 'MSH4', 'LINC01140', 'PKN2-AS1', 'CDC7', 'TGFBR3', 'ABCD3', 'DPYD-AS1', 'RTCA', 'THAP3P1', 'NTNG1', 'VAV3', 'PSRC1', 'GNAI3', 'DENND2D', 'WNT2B', 'SYT6', 'TBX15', 'RNU1-59P', 'LINC00624', 'OTUD7B', 'CRCT1', 'LCE3E', 'S100A7A', 'S100A7', 'NUP210L', 'THBS3-AS1', 'MEF2D', 'PEAR1', 'RP11-85G21.2', 'FCRL5', 'ATP1A4', 'ITLN2', 'PCP4L1', 'NOS1AP', 'SH2D1B', 'UHMK1', 'SELP', 'SELL', 'MYOCOS', 'TNFSF4', 'RALGPS2', 'AL359853.1', 'CACNA1E', 'RNU6-41P', 'LAMC1', 'NR5A2', 'KIF14', 'NAV1', 'SYT2', 'CYB5R1', 'FMOD', 'PRELP', 'LINC01735', 'LINC02769', 'SERTAD4', 'HHAT', 'LPGAT1', 'FLVCR1', 'PROX1', 'KCNK2', 'AC096643.1', 'SLC30A10', 'DNAH14', 'ITPKB-IT1', 'RHOU', 'LINC02815', 'ARID4B', 'EDARADD', 'LGALS8', 'RYR2', 'CHRM3', 'GREM2', 

In [40]:
# Step 4: Create mapping from your SNP→Gene connectivity
mapping_data = []
coo = snp_to_gene_matrix.tocoo()

for i, j in zip(coo.row, coo.col):
    if i < len(snp_index) and j < len(gene_index):
        snp_id = list(snp_index.keys())[i]
        gene_id = list(gene_index.keys())[j]
        if gene_id in valid_gene_names:
            mapping_data.append([
                snp_id, gene_id, 
                f"https://snp.org/{snp_id}", 
                f"{snp_id} maps to {gene_id}",
                "direct", "Homo sapiens"
            ])

mapping = pd.DataFrame(mapping_data, 
    columns=["input", "translation", "url", "name", "x", "species"])

# Step 5: Create pathways - genes connecting to meta-pathways
pathways_data = []
for gene in valid_gene_names:
    # Group genes by function (simple heuristic)
    if any(x in gene for x in ['BRCA', 'ATM', 'TP53']):
        pathways_data.append([gene, "DNA_REPAIR_PATHWAY"])
    elif any(x in gene for x in ['CDK', 'CCND', 'RB1']):
        pathways_data.append([gene, "CELL_CYCLE_PATHWAY"])
    else:
        pathways_data.append([gene, "GENERAL_PATHWAY"])

pathways = pd.DataFrame(pathways_data, columns=["source", "target"])

print(f"Real mapping: {len(mapping)} SNP→Gene connections")
print(f"Functional pathways: {len(pathways)} Gene→Pathway connections")

Real mapping: 109 SNP→Gene connections
Functional pathways: 103 Gene→Pathway connections


In [47]:
# Fix data_matrix - add Protein column
data_matrix['Protein'] = valid_gene_names[0]  # Use first gene as identifier

# Test again
binn = BINN(
    data_matrix=data_matrix,
    mapping=mapping,
    pathways=pathways,
    n_layers=3,
    dropout=0.2
)


[INFO] BINN is on device: cpu


In [48]:
# Debug: Print exact formats
print("=== MAPPING FORMAT ===")
print(f"Columns: {mapping.columns.tolist()}")
print(f"Sample:\n{mapping.head(2)}")

print("\n=== PATHWAYS FORMAT ===")
print(f"Columns: {pathways.columns.tolist()}")
print(f"Sample:\n{pathways.head(2)}")

print("\n=== DATA MATRIX ===")
print(f"Columns: {data_matrix.columns[:5].tolist()}")
print(f"Has Protein col: {'Protein' in data_matrix.columns}")

# Check if your genes appear in mapping
genes_in_mapping = set(mapping['translation'].unique())
genes_in_data = set(data_matrix.columns) - {'Protein'}
print(f"\nGenes in mapping: {len(genes_in_mapping)}")
print(f"Genes in data: {len(genes_in_data)}")
print(f"Overlap: {len(genes_in_mapping & genes_in_data)}")

=== MAPPING FORMAT ===
Columns: ['input', 'translation', 'url', 'name', 'x', 'species']
Sample:
       input translation                         url                name  \
0       SDF4        SDF4       https://gene.org/SDF4       SDF4 function   
1  LINC02782   LINC02782  https://gene.org/LINC02782  LINC02782 function   

        x       species  
0  direct  Homo sapiens  
1  direct  Homo sapiens  

=== PATHWAYS FORMAT ===
Columns: ['source', 'target']
Sample:
      source           target
0       SDF4  GENERAL_PATHWAY
1  LINC02782  GENERAL_PATHWAY

=== DATA MATRIX ===
Columns: ['SDF4', 'LINC02782', 'SPSB1', 'CASZ1', 'PLOD1']
Has Protein col: True

Genes in mapping: 103
Genes in data: 103
Overlap: 103


In [49]:
# Fix: Map genes to genes (not SNPs to genes)
mapping_data = []
for gene in valid_gene_names:
    mapping_data.append([
        gene, gene,  # input=gene, translation=gene
        f"https://gene.org/{gene}",
        f"{gene} function",
        "direct", "Homo sapiens"
    ])

mapping = pd.DataFrame(mapping_data, 
    columns=["input", "translation", "url", "name", "x", "species"])

print(f"Fixed mapping sample:\n{mapping.head(2)}")

# Test again
binn = BINN(
    data_matrix=data_matrix,
    mapping=mapping,
    pathways=pathways,
    n_layers=3,
    dropout=0.2
)

Fixed mapping sample:
       input translation                         url                name  \
0       SDF4        SDF4       https://gene.org/SDF4       SDF4 function   
1  LINC02782   LINC02782  https://gene.org/LINC02782  LINC02782 function   

        x       species  
0  direct  Homo sapiens  
1  direct  Homo sapiens  

[INFO] BINN is on device: cpu


In [50]:
# Code to visualize the mapping translation
print("=== MAPPING TRANSLATION ===")
print("Data matrix columns (first 5):")
print(data_matrix.columns[:5].tolist())

print("\nMapping input→translation (first 5):")
for i, row in mapping.head(5).iterrows():
   print(f"{row['input']} → {row['translation']}")

print("\nCheck alignment:")
data_genes = set(data_matrix.columns) - {'Protein'}
mapping_inputs = set(mapping['input'])
print(f"Data genes: {len(data_genes)}")
print(f"Mapping inputs: {len(mapping_inputs)}")
print(f"Perfect match: {data_genes == mapping_inputs}")

# Show difference if any
missing = data_genes - mapping_inputs
extra = mapping_inputs - data_genes
if missing: print(f"Missing in mapping: {missing}")
if extra: print(f"Extra in mapping: {extra}")

=== MAPPING TRANSLATION ===
Data matrix columns (first 5):
['SDF4', 'LINC02782', 'SPSB1', 'CASZ1', 'PLOD1']

Mapping input→translation (first 5):
SDF4 → SDF4
LINC02782 → LINC02782
SPSB1 → SPSB1
CASZ1 → CASZ1
PLOD1 → PLOD1

Check alignment:
Data genes: 103
Mapping inputs: 103
Perfect match: True


In [51]:
# Compare your approach vs Reactome approach
print("=== YOUR CUSTOM APPROACH ===")
print("Mapping (Gene→Gene):")
print(mapping.head(3)[['input', 'translation']])

print("\nPathways (Gene→Custom_Pathway):")
print(pathways.head(3))

print("\n=== REACTOME APPROACH WOULD BE ===")
print("Mapping (Protein→Reactome_ID):")
reactome_example = pd.DataFrame({
   'input': ['SDF4', 'LINC02782'], 
   'translation': ['R-HSA-12345', 'R-HSA-67890']
})
print(reactome_example)

print("\nPathways (Reactome_ID→Reactome_ID hierarchy):")
reactome_pathways = pd.DataFrame({
   'source': ['R-HSA-12345', 'R-HSA-67890'],
   'target': ['R-HSA-999', 'R-HSA-999']
})
print(reactome_pathways)

=== YOUR CUSTOM APPROACH ===
Mapping (Gene→Gene):
       input translation
0       SDF4        SDF4
1  LINC02782   LINC02782
2      SPSB1       SPSB1

Pathways (Gene→Custom_Pathway):
      source           target
0       SDF4  GENERAL_PATHWAY
1  LINC02782  GENERAL_PATHWAY
2      SPSB1  GENERAL_PATHWAY

=== REACTOME APPROACH WOULD BE ===
Mapping (Protein→Reactome_ID):
       input  translation
0       SDF4  R-HSA-12345
1  LINC02782  R-HSA-67890

Pathways (Reactome_ID→Reactome_ID hierarchy):
        source     target
0  R-HSA-12345  R-HSA-999
1  R-HSA-67890  R-HSA-999


In [52]:
binn.layers

Sequential(
  (Layer_0): Linear(in_features=1, out_features=1, bias=True)
  (BatchNorm_0): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (Dropout_0): Dropout(p=0.2, inplace=False)
  (Tanh_0): Tanh()
  (Layer_1): Linear(in_features=1, out_features=1, bias=True)
  (BatchNorm_1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (Dropout_1): Dropout(p=0.2, inplace=False)
  (Tanh_1): Tanh()
  (Layer_2): Linear(in_features=1, out_features=1, bias=True)
  (BatchNorm_2): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (Dropout_2): Dropout(p=0.2, inplace=False)
  (Tanh_2): Tanh()
  (Output): Linear(in_features=1, out_features=2, bias=True)
)

In [53]:
binn.inputs[0]

'SDF4'

In [54]:
layers = binn.layer_names
layers[0][0]

'SDF4'

In [57]:
from binn import BINN, BINNDataLoader, BINNTrainer
import pandas as pd

In [58]:
data_for_binn = data_matrix.drop('Protein', axis=1).T
data_for_binn['Protein'] = data_for_binn.index

# Then run your training code
binn_dataloader = BINNDataLoader(binn)
dataloaders = binn_dataloader.create_dataloaders(
    data_matrix=data_for_binn,
    design_matrix=design_matrix,
    feature_column="Protein",
    group_column="group",
    sample_column="sample",
    batch_size=32,
    validation_split=0.2,
)

trainer = BINNTrainer(binn)
trainer.fit(dataloaders=dataloaders, num_epochs=50)

Mapping group labels: {'high': 0, 'low': 1}
[Epoch 1/50] Train Loss: 0.7127, Train Accuracy: 0.5312
[Epoch 1/50] Val Loss: 0.6958, Val Accuracy: 0.3000
[Epoch 2/50] Train Loss: 0.6719, Train Accuracy: 0.5000
[Epoch 2/50] Val Loss: 0.6969, Val Accuracy: 0.1000
[Epoch 3/50] Train Loss: 0.7010, Train Accuracy: 0.6094
[Epoch 3/50] Val Loss: 0.6978, Val Accuracy: 0.1000
[Epoch 4/50] Train Loss: 0.7220, Train Accuracy: 0.5625
[Epoch 4/50] Val Loss: 0.6990, Val Accuracy: 0.3000
[Epoch 5/50] Train Loss: 0.6712, Train Accuracy: 0.6562
[Epoch 5/50] Val Loss: 0.7008, Val Accuracy: 0.3000
[Epoch 6/50] Train Loss: 0.7455, Train Accuracy: 0.4844
[Epoch 6/50] Val Loss: 0.7028, Val Accuracy: 0.3000
[Epoch 7/50] Train Loss: 0.6911, Train Accuracy: 0.6562
[Epoch 7/50] Val Loss: 0.7049, Val Accuracy: 0.3000
[Epoch 8/50] Train Loss: 0.7237, Train Accuracy: 0.5469
[Epoch 8/50] Val Loss: 0.7084, Val Accuracy: 0.3000
[Epoch 9/50] Train Loss: 0.7152, Train Accuracy: 0.5156
[Epoch 9/50] Val Loss: 0.7118, Val A