# IF2RNA Training on HE2RNA Data

This notebook trains our IF2RNA model on the actual HE2RNA dataset for validation.

In [None]:
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import pickle

# Put '<parent of the current working directory>/src' at the front of the
# module search path before importing the project package below.
# Presumably the notebook is launched from a sub-folder that is a sibling of
# 'src' -- confirm if the working directory differs.
sys.path.insert(0, str(Path.cwd().parent / 'src'))
from if2rna.model import IF2RNA
from if2rna.experiment import IF2RNAExperiment

In [None]:
# Resolve the HE2RNA checkout relative to the notebook's working directory.
he2rna_path = Path.cwd().parent.joinpath('external', 'HE2RNA_code')

# Read at most the first 100 rows of the transcriptome file as tab-separated
# text; anything following a '#' is treated as a comment and dropped.
# NOTE(review): the file read here carries a '.py' extension yet is parsed as
# tabular data -- confirm this is the intended input file.
transcriptome_df = pd.read_csv(
    he2rna_path / 'transcriptome_data.py',
    sep='\t',
    comment='#',
    nrows=100,
)

# Restore the pickled patient splits.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source.
with (he2rna_path / 'patient_splits.pkl').open('rb') as f:
    patient_splits = pickle.load(f)

# Report where the data came from and which split keys are available.
print(f"Loaded data from {he2rna_path}")
print(f"Patient splits keys: {list(patient_splits.keys())}")

In [None]:
# Inspect the HE2RNA checkout: list whatever sits under 'data', or report
# that the folder is absent.
data_dir = he2rna_path / 'data'
if not data_dir.exists():
    print("Data directory not found")
else:
    data_files = [*data_dir.glob('*')]
    print(f"Data directory contents: {[f.name for f in data_files]}")

# Same listing for 'metadata'; a missing folder is skipped without a message.
metadata_dir = he2rna_path / 'metadata'
if metadata_dir.exists():
    metadata_files = [*metadata_dir.glob('*')]
    print(f"Metadata files: {[f.name for f in metadata_files]}")

# Synthetic, TCGA-like stand-in dataset used for demonstration.
# Seed NumPy's global random generator so later random draws are repeatable.
np.random.seed(42)

# Dataset dimensions.
n_samples, n_genes, n_tiles_per_sample = 200, 500, 1000

# Zero-padded identifiers: 'TCGA-0000', ... and 'ENSG00000000', ...
sample_ids = [f"TCGA-{sample_idx:04d}" for sample_idx in range(n_samples)]
gene_ids = [f"ENSG{gene_idx:08d}" for gene_idx in range(n_genes)]

print(f"Using synthetic data: {n_samples} samples, {n_genes} genes")