In [5]:
import numpy as np
import pandas as pd
import json
from binn import BINN, BINNDataLoader, BINNTrainer

# TODO: move these to config file later
SAMPLE_SIZE = 200
EPOCHS = 50

def generate_clinical_data(n=200):
    """Generate dummy clinical data for testing"""
    np.random.seed(42)  # for reproducibility
    
    data = {}
    
    # basic demographics
    data['sex'] = np.random.choice(['Male', 'Female'], n, p=[0.5, 0.5])
    data['age'] = np.random.normal(65, 12, n).clip(18, 95)
    data['ethnicity'] = np.random.choice(['White', 'Black', 'Asian', 'Hispanic'], n, p=[0.7, 0.15, 0.1, 0.05])
    
    # socioeconomic stuff
    data['education'] = np.random.choice(['HS', 'College', 'Grad'], n, p=[0.4, 0.4, 0.2])
    data['income'] = np.random.lognormal(np.log(50000), 0.6, n).clip(20000, 200000)
    
    # health metrics
    data['bmi'] = np.random.normal(26, 5, n).clip(18, 45)
    data['family_history'] = np.random.choice(['Yes', 'No'], n, p=[0.3, 0.7])
    data['smoking'] = np.random.choice(['Never', 'Former', 'Current'], n, p=[0.5, 0.3, 0.2])
    data['diabetes'] = np.random.choice([1, 0], n, p=[0.15, 0.85])
    data['hypertension'] = np.random.choice([1, 0], n, p=[0.25, 0.75])
    
    # lab values
    data['cholesterol'] = np.random.normal(200, 40, n).clip(150, 300)
    data['bp_systolic'] = np.random.normal(120, 15, n).clip(90, 180)
    
    samples = [f'P{i:03d}' for i in range(n)]
    
    # create outcome - quick and dirty risk model
    risk = (0.1 + 
            0.2 * (data['age'] > 70) + 
            0.15 * (data['sex'] == 'Male') +
            0.1 * (data['smoking'] == 'Current') +
            0.08 * data['diabetes'] +
            0.05 * (data['bmi'] > 30))
    
    outcome = np.random.binomial(1, np.clip(risk, 0.05, 0.8), n)
    
    df = pd.DataFrame(data, index=samples)
    
    design = pd.DataFrame({
        'sample': samples,
        'target': outcome,
        'group': ['high' if x == 1 else 'low' for x in outcome]
    })
    
    return df, design

def encode_data(df):
    """One-hot encode categoricals, standardize continuous"""
    encoded = pd.DataFrame(index=df.index)
    info = {}
    
    for col in df.columns:
        if df[col].dtype == 'object':
            # categorical - one hot encode
            dummies = pd.get_dummies(df[col], prefix=col)
            encoded = pd.concat([encoded, dummies], axis=1)
            info[col] = {'type': 'cat', 'levels': list(dummies.columns)}
        else:
            # continuous - standardize
            mean_val = df[col].mean()
            std_val = df[col].std()
            encoded[col] = (df[col] - mean_val) / std_val
            info[col] = {'type': 'cont', 'mean': mean_val, 'std': std_val}
    
    return encoded, info

def create_pathways(encoded_df):
    """Create pathway assignments"""
    
    # basic pathway groups that make sense clinically
    groups = {
        'BASELINE_RISK': ['age', 'sex_Male', 'sex_Female'],
        'SOCIAL': ['education_HS', 'education_College', 'education_Grad', 'income'],
        'LIFESTYLE': ['bmi', 'smoking_Never', 'smoking_Former', 'smoking_Current'],
        'GENETICS': ['family_history_Yes', 'family_history_No', 'ethnicity_Black', 'ethnicity_Asian'],
        'CARDIO': ['bp_systolic', 'cholesterol', 'hypertension', 'age'],
        'METABOLIC': ['bmi', 'diabetes', 'cholesterol']
    }
    
    pathway_data = []
    for pathway, features in groups.items():
        for feat in features:
            if feat in encoded_df.columns:
                pathway_data.append([feat, pathway])
    
    # catch any missing features
    assigned = set([x[0] for x in pathway_data])
    for col in encoded_df.columns:
        if col not in assigned:
            pathway_data.append([col, 'OTHER'])  # general bucket
    
    return pd.DataFrame(pathway_data, columns=['source', 'target'])

def create_mapping(encoded_df):
    """Create identity mapping for BINN"""
    mapping_data = []
    for feat in encoded_df.columns:
        mapping_data.append([
            feat, feat, 
            f"https://reactome.org/{feat}",  # dummy URL
            f"Clinical: {feat}", 
            "direct", "human"
        ])
    
    return pd.DataFrame(mapping_data, 
                       columns=["input", "translation", "url", "name", "x", "species"])

def run_binn_integration():
    """Main integration function"""
    
    print("Generating clinical data...")
    clinical_df, design_df = generate_clinical_data(SAMPLE_SIZE)
    print(f"Generated {len(clinical_df)} samples with {len(clinical_df.columns)} variables")
    
    print("Encoding variables...")
    encoded_df, var_info = encode_data(clinical_df)
    print(f"Encoded to {len(encoded_df.columns)} features")
    
    print("Creating pathways...")
    pathways_df = create_pathways(encoded_df)
    print(f"Created {len(pathways_df['target'].unique())} pathway groups")
    
    mapping_df = create_mapping(encoded_df)
    
    # prep for BINN - need this weird Protein column
    data_matrix = encoded_df.copy()
    data_matrix['Protein'] = encoded_df.columns[0]  # just use first column
    
    print("Initializing BINN...")
    binn = BINN(
        data_matrix=data_matrix,
        mapping=mapping_df,
        pathways=pathways_df,
        n_layers=3,
        dropout=0.2
    )
    
    # format data for BINN dataloader - features need to be rows
    data_for_loader = data_matrix.drop('Protein', axis=1).T
    data_for_loader['Protein'] = data_for_loader.index
    
    dataloader = BINNDataLoader(binn)
    dataloaders = dataloader.create_dataloaders(
        data_matrix=data_for_loader,
        design_matrix=design_df,
        feature_column="Protein",
        group_column="group", 
        sample_column="sample",
        batch_size=32,
        validation_split=0.2
    )
    
    print(f"Training for {EPOCHS} epochs...")
    trainer = BINNTrainer(binn)
    trainer.fit(dataloaders=dataloaders, num_epochs=EPOCHS)
    
    return {
        'model': binn,
        'trainer': trainer,
        'data': clinical_df,
        'encoded': encoded_df,
        'pathways': pathways_df,
        'mapping': mapping_df,
        'design': design_df
    }

def save_outputs(results, output_dir='/Users/rohan/Downloads/clinical_binn'):
    """Save key files"""
    import os
    os.makedirs(output_dir, exist_ok=True)
    
    results['data'].to_csv(f'{output_dir}/clinical_data.csv')
    results['encoded'].to_csv(f'{output_dir}/encoded_data.csv') 
    results['pathways'].to_csv(f'{output_dir}/pathways.csv', index=False)
    results['mapping'].to_csv(f'{output_dir}/mapping.csv', index=False)
    results['design'].to_csv(f'{output_dir}/design_matrix.csv', index=False)
    
    print(f"Files saved to {output_dir}/")

def analyze_results(results):
    """Quick analysis of results"""
    pathways = results['pathways']
    
    print("\nPathway summary:")
    for pathway in pathways['target'].unique():
        count = len(pathways[pathways['target'] == pathway])
        print(f"  {pathway}: {count} features")
    
    print(f"\nTotal: {len(results['encoded'].columns)} features across {len(pathways['target'].unique())} pathways")

if __name__ == "__main__":
    print("=== Clinical BINN Integration ===")
    
    # run the integration
    results = run_binn_integration()
    
    # save outputs
    save_outputs(results)
    
    # quick analysis
    analyze_results(results)
    
    print("\nDone!")

=== Clinical BINN Integration ===
Generating clinical data...
Generated 200 samples with 12 variables
Encoding variables...
Encoded to 21 features
Creating pathways...
Created 7 pathway groups
Initializing BINN...

[INFO] BINN is on device: cpu
Mapping group labels: {'high': 0, 'low': 1}
Training for 50 epochs...
[Epoch 1/50] Train Loss: 0.6793, Train Accuracy: 0.5750
[Epoch 1/50] Val Loss: 0.6951, Val Accuracy: 0.4531
[Epoch 2/50] Train Loss: 0.6580, Train Accuracy: 0.5938
[Epoch 2/50] Val Loss: 0.6926, Val Accuracy: 0.4531
[Epoch 3/50] Train Loss: 0.6860, Train Accuracy: 0.6500
[Epoch 3/50] Val Loss: 0.6875, Val Accuracy: 0.4531
[Epoch 4/50] Train Loss: 0.6709, Train Accuracy: 0.6375
[Epoch 4/50] Val Loss: 0.7005, Val Accuracy: 0.4531
[Epoch 5/50] Train Loss: 0.6772, Train Accuracy: 0.6125
[Epoch 5/50] Val Loss: 0.7081, Val Accuracy: 0.4531
[Epoch 6/50] Train Loss: 0.6644, Train Accuracy: 0.6562
[Epoch 6/50] Val Loss: 0.7085, Val Accuracy: 0.4531
[Epoch 7/50] Train Loss: 0.6774, Trai

  dm = dm.reindex(features).fillna(0)


[Epoch 12/50] Train Loss: 0.6776, Train Accuracy: 0.6125
[Epoch 12/50] Val Loss: 0.7498, Val Accuracy: 0.4531
[Epoch 13/50] Train Loss: 0.6574, Train Accuracy: 0.6937
[Epoch 13/50] Val Loss: 0.7476, Val Accuracy: 0.4531
[Epoch 14/50] Train Loss: 0.6926, Train Accuracy: 0.6062
[Epoch 14/50] Val Loss: 0.7465, Val Accuracy: 0.4531
[Epoch 15/50] Train Loss: 0.6779, Train Accuracy: 0.5938
[Epoch 15/50] Val Loss: 0.7514, Val Accuracy: 0.4531
[Epoch 16/50] Train Loss: 0.6683, Train Accuracy: 0.6500
[Epoch 16/50] Val Loss: 0.7520, Val Accuracy: 0.4531
[Epoch 17/50] Train Loss: 0.6834, Train Accuracy: 0.5437
[Epoch 17/50] Val Loss: 0.7517, Val Accuracy: 0.4531
[Epoch 18/50] Train Loss: 0.6798, Train Accuracy: 0.6250
[Epoch 18/50] Val Loss: 0.7514, Val Accuracy: 0.4531
[Epoch 19/50] Train Loss: 0.6807, Train Accuracy: 0.6062
[Epoch 19/50] Val Loss: 0.7506, Val Accuracy: 0.4531
[Epoch 20/50] Train Loss: 0.6585, Train Accuracy: 0.6625
[Epoch 20/50] Val Loss: 0.7488, Val Accuracy: 0.4531
[Epoch 21/

In [4]:
! pwd

/Users/rohan
