# Correlation using data averages

In [8]:
import os
import gc

os.environ["NUMBA_CACHE_DIR"] = "/scratch/st-jiaruid-1/yinian/tmp/"  # https://github.com/scverse/scanpy/issues/2113
from os.path import basename, join
from os import makedirs
from pathlib import Path
import yaml

import logging
import anndata as ad
import pickle
import numpy as np
import pandas as pd
import scanpy as sc
import scipy

import h5py
import hdf5plugin
import tables

import math

from sklearn.preprocessing import binarize
from sklearn.decomposition import TruncatedSVD

In [13]:
def correlation_score(y_true, y_pred):
    """Score predictions by the mean row-wise Pearson correlation.

    Each row of ``y_true`` is correlated with the matching row of ``y_pred``
    and the resulting coefficients are averaged over all rows.

    It is assumed that no individual row is constant (the Pearson
    correlation is undefined for a constant vector).

    Args:
        y_true: 2-D array-like of ground-truth values, one sample per row.
        y_pred: 2-D array-like of predictions with the same shape as
            ``y_true``.

    Returns:
        The average of each sample's Pearson correlation coefficient.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.

    Source: https://www.kaggle.com/code/xiafire/lb-t15-msci-multiome-catboostregressor#Predicting
    """
    # Coerce so nested lists (or other array-likes) work as well as ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes are different.")
    # corrcoef returns the 2x2 correlation matrix; [1, 0] is the
    # cross-correlation between the two rows.
    corrsum = sum(
        np.corrcoef(true_row, pred_row)[1, 0]
        for true_row, pred_row in zip(y_true, y_pred)
    )
    return corrsum / len(y_true)

## CITE

In [3]:
def load_data_as_anndata(filepaths, metadata_path):
    """
    Loads the files in <filepaths> as AnnData objects.

    Each HDF5 file is read from the group named after the file (its basename
    without the extension). Within that group, ``axis0`` supplies the feature
    names, ``axis1`` the cell ids and ``block0_values`` the value matrix with
    one row per cell. The matrix is read in row chunks and stored sparsely.

    Args:
        filepaths: Mapping from a dataset name to the path of its HDF5 file.
        metadata_path: Path to a CSV file with a ``cell_id`` column (used as
            the index) and a ``technology`` column.

    Returns:
        Dict mapping every key of ``filepaths`` to an AnnData whose ``X`` is
        the sparse matrix, whose ``obs`` holds the metadata rows of the
        file's cells and whose ``var`` is indexed by the feature names.

    Raises:
        ValueError: If the cells of one file do not share exactly one
            ``technology`` value in the metadata.

    Source: https://github.com/openproblems-bio/neurips_2022_saturn_notebooks/blob/main/notebooks/loading_and_visualizing_all_data.ipynb
    """
    metadata_df = pd.read_csv(metadata_path).set_index("cell_id")

    # Number of row chunks each matrix is read in, so that the whole dense
    # matrix is never held in memory at once.
    n_chunks = 100

    adatas = {}
    for name, filepath in filepaths.items():
        filename = Path(filepath).stem
        logging.info("Loading %s", filename)

        # The context manager guarantees the file handle is released, even
        # when reading fails part-way through.
        with h5py.File(filepath, "r") as h5_file:
            h5_data = h5_file[filename]

            features = h5_data["axis0"][:].astype(str)
            cell_ids = h5_data["axis1"][:].astype(str)

            # Sanity check kept from the original: .item() raises unless all
            # cells of this file share a single technology.
            metadata_df.loc[cell_ids, "technology"].unique().item()

            values = h5_data["block0_values"]
            n_cells = values.shape[0]

            # Contiguous slices cover the same rows, in the same order, as
            # index-array reads, but avoid slow HDF5 fancy indexing.
            bounds = [n_cells * i // n_chunks for i in range(n_chunks + 1)]
            sparse_chunks = [
                scipy.sparse.csr_matrix(values[start:stop])
                for start, stop in zip(bounds[:-1], bounds[1:])
            ]

        X = scipy.sparse.vstack(sparse_chunks)

        adatas[name] = ad.AnnData(
            X=X,
            obs=metadata_df.loc[cell_ids],
            var=pd.DataFrame(index=features),
        )

    return adatas

In [67]:
# Parse the CITE experiment configuration, then load every dataset it lists.
config = yaml.safe_load(
    Path(
        '/scratch/st-jiaruid-1/yinian/my_jupyter/scRNA-competition/experiments/basic-nn-cite.yaml'
    ).read_text()
)
adatas = load_data_as_anndata(
    filepaths=config['paths'],
    metadata_path=config['metadata'],
)

In [68]:
# Unpack the loaded datasets by their config keys
# (presumably training inputs, test inputs and training targets).
x_train, x_test, y_train = (adatas[key] for key in ('x', 'x_test', 'y'))
# combined_data = ad.concat([x_train, x_test])

In [69]:
# Observation names held out for validation. allow_pickle=True permits
# unpickling object arrays, so this file must come from a trusted source.
validation_ids = np.load(
    '/arc/project/st-jiaruid-1/yinian/multiome/cite_val_set_5000_0.75_day_7.npy',
    allow_pickle=True,
)

In [70]:
# Split inputs and targets into validation and training subsets.
# A boolean mask keeps the remaining training cells in their original order,
# so the x/y rows stay aligned and the split is reproducible across runs
# (iterating a set of strings yields an arbitrary, run-dependent order).
x_validation = x_train[validation_ids]
x_train = x_train[~x_train.obs_names.isin(validation_ids)]
y_validation = y_train[validation_ids]
y_train = y_train[~y_train.obs_names.isin(validation_ids)]

### Correlation on a uniform random vector from -1 to 1

In [71]:
# Fix NumPy's global RNG so the random baselines are repeatable.
np.random.seed(seed=42)

In [72]:
y_true = y_validation.X.toarray()
# Uniform noise in [-1, 1), shaped like the targets instead of a hard-coded size.
y_random = np.random.rand(*y_true.shape) * 2 - 1
correlation_score(y_true, y_random)

-0.0008090604130566115

### Correlation on a random vector sampled from a Gaussian

In [73]:
# Standard-normal noise shaped like the targets instead of a hard-coded size.
y_random = np.random.randn(*y_true.shape)
correlation_score(y_true, y_random)

-0.0009447022867286958

### Correlation on the average of the entire dataset

In [74]:
# Baseline: predict the per-feature mean of the training targets for every
# validation cell. broadcast_to repeats the mean row as a read-only view, so
# the row count follows y_true and no full copy of the matrix is built.
y_avg = np.average(y_train.X.toarray(), axis=0)
correlation_score(y_true, np.broadcast_to(y_avg, y_true.shape))

0.7919203883032416

### Correlation on the average of each cell type

In [75]:
# Distinct cell-type labels present in the training targets.
cell_types = {*y_train.obs['cell_type']}

In [76]:
# Baseline: predict, for every validation cell, the training-set mean of its
# cell type. Each per-type score is weighted by the number of validation cells
# of that type, and the weighted sum is divided by the validation-set size.
total_corr = 0
for cell_type in cell_types:
    y_cell_type = y_train[y_train.obs['cell_type'] == cell_type]
    y_validation_cell_type = y_validation[y_validation.obs['cell_type'] == cell_type]
    if len(y_validation_cell_type) == 0:
        # Nothing to score for this type.
        continue
    avg_cell_type = np.average(y_cell_type.X.toarray(), axis=0)
    # Repeat the mean row as a read-only view rather than copying it per cell.
    total_corr += correlation_score(
        y_validation_cell_type.X.toarray(),
        np.broadcast_to(avg_cell_type, y_validation_cell_type.shape),
    ) * len(y_validation_cell_type)
# Divide by the actual validation-set size rather than a hard-coded count.
total_corr / len(y_validation)

0.8496705689416485

## Multiome

In [51]:
# Parse the multiome experiment configuration, then load every dataset it lists.
config = yaml.safe_load(
    Path(
        '/scratch/st-jiaruid-1/yinian/my_jupyter/scRNA-competition/experiments/basic-nn-multiome.yaml'
    ).read_text()
)
adatas = load_data_as_anndata(
    filepaths=config['paths'],
    metadata_path=config['metadata'],
)

In [52]:
# Unpack the loaded datasets by their config keys
# (presumably training inputs, test inputs and training targets).
x_train, x_test, y_train = (adatas[key] for key in ('x', 'x_test', 'y'))

In [53]:
# Observation names held out for validation. allow_pickle=True permits
# unpickling object arrays, so this file must come from a trusted source.
validation_ids = np.load(
    '/arc/project/st-jiaruid-1/yinian/multiome/multi_val_set_10000_0.75_day_7.npy',
    allow_pickle=True,
)

In [54]:
# Split inputs and targets into validation and training subsets.
# A boolean mask keeps the remaining training cells in their original order,
# so the x/y rows stay aligned and the split is reproducible across runs
# (iterating a set of strings yields an arbitrary, run-dependent order).
x_validation = x_train[validation_ids]
x_train = x_train[~x_train.obs_names.isin(validation_ids)]
y_validation = y_train[validation_ids]
y_train = y_train[~y_train.obs_names.isin(validation_ids)]

In [55]:
# Dense copy of the validation targets, used as ground truth by the baselines below.
y_true = y_validation.X.toarray()

### Correlation on a uniform random vector from -1 to 1

In [62]:
y_true = y_validation.X.toarray()
# Uniform noise in [-1, 1), shaped like the targets instead of a hard-coded size.
y_random = np.random.rand(*y_true.shape) * 2 - 1
correlation_score(y_true, y_random)

5.460375985999174e-05

### Correlation on a random vector sampled from a Gaussian

In [63]:
# Standard-normal noise shaped like the targets instead of a hard-coded size.
y_random = np.random.randn(*y_true.shape)
correlation_score(y_true, y_random)

-2.9084550279502755e-06

### Correlation on the average of the entire dataset

In [64]:
# Baseline: predict the per-feature mean of the training targets for every
# validation cell. broadcast_to repeats the mean row as a read-only view, so
# the row count follows y_true and no full copy of the matrix is built.
y_avg = np.average(y_train.X.toarray(), axis=0)
correlation_score(y_true, np.broadcast_to(y_avg, y_true.shape))

0.5945994044835909

### Correlation on the average of each cell type

In [65]:
# Distinct cell-type labels present in the training targets.
cell_types = {*y_train.obs['cell_type']}

In [66]:
# Baseline: predict, for every validation cell, the training-set mean of its
# cell type. Each per-type score is weighted by the number of validation cells
# of that type, and the weighted sum is divided by the validation-set size.
# NOTE(review): validation cells of a skipped type (e.g. 'hidden') still count
# in the denominator, i.e. they contribute a correlation of 0 — confirm that
# this is intended.
total_corr = 0
for cell_type in cell_types:
    if cell_type == 'hidden':
        continue
    y_cell_type = y_train[y_train.obs['cell_type'] == cell_type]
    y_validation_cell_type = y_validation[y_validation.obs['cell_type'] == cell_type]
    if len(y_validation_cell_type) == 0:
        # Nothing to score for this type.
        continue
    avg_cell_type = np.average(y_cell_type.X.toarray(), axis=0)
    # Repeat the mean row as a read-only view rather than copying it per cell.
    total_corr += correlation_score(
        y_validation_cell_type.X.toarray(),
        np.broadcast_to(avg_cell_type, y_validation_cell_type.shape),
    ) * len(y_validation_cell_type)
# Divide by the actual validation-set size rather than a hard-coded count.
total_corr / len(y_validation)

0.6105872028499381