In [1]:
import os
import sys
sys.path.append("/home/romainlhardy/code/hyperbolic-cancer/PoincareMaps")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from matplotlib.colors import LinearSegmentedColormap, to_hex
from sklearn.decomposition import PCA
from PoincareMaps.data import prepare_data, compute_rfa
from PoincareMaps.model import PoincareEmbedding, PoincareDistance, poincare_root, poincare_translation
from PoincareMaps.rsgd import RiemannianSGD
from PoincareMaps.train import train
from PoincareMaps.visualize import plotPoincareDisc, plot_poincare_disc
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Dataset directory and short dataset name (`dset` is reused in later cells
# to build output file names).
data_dir = "/home/romainlhardy/data/hyperbolic-cancer/bladder"
dset = "bladder"

# Build the path from `data_dir` so the directory is configured in one place only.
file_path = os.path.join(data_dir, "GSM4307111_GEO_processed_BC159-T_3_log2TPM_matrix_final.txt")
df = pd.read_csv(file_path, sep="\t", index_col=0)

# Transpose so that each column of the file becomes one row of the matrix.
# NOTE(review): presumably the file is genes x cells, giving one row per cell — confirm.
expression_matrix = df.values.astype(np.float32).T

# PCA down to 20 components per row.
# `random_state` is pinned because scikit-learn may pick a randomized SVD solver
# for large inputs, which would otherwise make the features differ between runs.
pca = PCA(n_components=20, random_state=0)
features = pca.fit_transform(expression_matrix)
features = torch.DoubleTensor(features)
print(features.shape)

In [3]:
# Every row of `features` receives the same placeholder label, 0.
labels = [0] * features.shape[0]

In [None]:
# Pairwise distances in the original data space (original author's note).
# Every option is passed by keyword, so the order below is purely cosmetic.
rfa = compute_rfa(
    features,
    mode="features",
    distfn="MFIsym",
    distlocal="minkowski",
    k_neighbours=30,
    sigma=1.0,
    connected=True,
)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so this
# cell does not fail on machines without CUDA.
# NOTE(review): other cells still pass `cuda=0` to the model and the training
# options — confirm that those code paths also work on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One integer index per entry of `rfa`, created directly on the target device.
indices = torch.arange(len(rfa), device=device)
rfa = rfa.to(device)

# Each dataset item is the pair (index, rfa[index]).
dataset = TensorDataset(indices, rfa)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

print(f"Dataset size: {len(dataset)}")

In [6]:
# Build the embedding model, then move it to the selected device.
# The two positional arguments are presumably (number of points, embedding
# dimension) — confirm against PoincareEmbedding's signature.
predictor = PoincareEmbedding(
    len(dataset),
    2,
    cuda=0,
    gamma=2.0,
    lossfn="klSym",
    Qdist="laplace",
    max_norm=1,
    dist=PoincareDistance,
)
predictor = predictor.to(device)

In [None]:
# Sanity check on a single mini-batch drawn from the dataloader.
batch = next(iter(dataloader))
inputs, targets = batch
outputs = predictor(inputs.to(device)) # [batch_size, len(dataset)]

# Each output row is expected to sum to 1. `ones_like` builds the reference
# vector with the same dtype and device as the row sums, which avoids a
# possible dtype mismatch in `allclose` and an extra host-to-device copy.
row_sums = outputs.sum(dim=-1)
assert torch.allclose(row_sums, torch.ones_like(row_sums)), "predictor output rows do not sum to 1"

# Try to match the distance distributions in the data space and the embedding space
predictor.lossfn(outputs, targets)

In [8]:
# RiemannianSGD optimizer over all of the predictor's parameters, learning rate 0.1.
# NOTE(review): PoincareOptions (training cell) also carries an `lr` that defaults
# to 0.1 — confirm which of the two values `train` actually applies.
optimizer = RiemannianSGD(predictor.parameters(), lr=0.1)

In [None]:
class PoincareOptions:
    """Plain container for training settings.

    Each constructor argument is stored, unchanged, as an instance attribute
    with the same name. The class performs no validation; the meaning of each
    field is defined by whatever code consumes the object.
    """

    def __init__(self, debugplot=False, epochs=500, batchsize=-1, lr=0.1, burnin=500, lrm=1.0, earlystop=0.0001, cuda=0):
        # Keywords follow the signature order, so attributes are created in
        # the same order as the parameters are declared.
        vars(self).update(
            debugplot=debugplot,
            epochs=epochs,
            batchsize=batchsize,
            lr=lr,
            burnin=burnin,
            lrm=lrm,
            earlystop=earlystop,
            cuda=cuda,
        )

# Override only the epoch count and batch size; every other option keeps its default.
opt = PoincareOptions(batchsize=16, epochs=1000)

# Run training. `train` returns three values, unpacked below.
# NOTE(review): `earlystop` is passed here as 1e-6 while `opt.earlystop` keeps its
# default of 0.0001 — confirm which value `train` actually uses.
embeddings, loss, epoch = train(
    predictor,
    dataset,
    optimizer,
    opt,
    color_dict=None,
    earlystop=1e-6,
    labels=labels,
    fout=f"/home/romainlhardy/code/hyperbolic-cancer/data/outputs/{dset}",
)

In [10]:
from PoincareMaps.visualize import plotPoincareDisc, plot_poincare_disc

# Plot the transposed embeddings with their labels; the return value is kept
# as `color_dict`.
color_dict = plotPoincareDisc(
    embeddings.T,
    labels,
    file_name=f"/home/romainlhardy/code/hyperbolic-cancer/data/outputs/{dset}_raw",
)

# Disabled: re-centred ("rotated") variant of the plot. It depends on a `root`
# variable that is not defined in the visible cells of this notebook.
# root_hat = poincare_root(root, labels, features)
# embeddings_rotated = poincare_translation(-embeddings[root_hat, :], embeddings)
# plot_poincare_disc(
#     embeddings_rotated,
#     labels=labels,
#     file_name=f"/home/romainlhardy/code/hyperbolic-cancer/data/outputs/{dset}_rot",
#     coldict=color_dict,
#     d1=9.5,
#     d2=9.0
# )