In [2]:
from pathlib import Path
from scipy.io import mmread
from scipy.sparse import coo_matrix
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

WORKSPACE_ROOT = Path("../../").resolve()

In [3]:
fadata = ad.read_h5ad(WORKSPACE_ROOT/"data/tosica/PHCA.h5ad")
fadata

AnnData object with n_obs × n_vars = 131608 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Cell_category', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'n_genes'
    obsm: 'X_umap'

In [4]:
ssdata = sc.pp.subsample(fadata, n_obs=int(fadata.shape[0]*0.001), copy=True)
print(ssdata)

AnnData object with n_obs × n_vars = 131 × 36601
    obs: 'Organ', 'Donor', 'Chemistry', 'Cell_category', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'n_genes'
    obsm: 'X_umap'


In [5]:
from sklearn.preprocessing import LabelEncoder
import torch
def todense(adata):
    import scipy
    if isinstance(adata.X, scipy.sparse.csr_matrix) or isinstance(adata.X, scipy.sparse.csc_matrix):
        return adata.X.todense()
    else:
        return adata.X

def balance_populations_modified(data):
    ct_names = np.unique(data[:,-1])
    ct_counts = pd.value_counts(data[:,-1])
    max_val = min(ct_counts.max(),np.int32(2000000/len(ct_counts)))
    print(data)
    balanced_data = np.empty(shape=(1,data.shape[1]),dtype=np.float32)
    for ct in ct_names:
        tmp = data[data[:,-1] == ct]
        idx = np.random.choice(range(len(tmp)), max_val-len(tmp))
        tmp_X = tmp[idx]
        balanced_data = np.r_[balanced_data,tmp,tmp_X]
    return np.delete(balanced_data,0,axis=0)

adata = ssdata
tr_ratio = 0.7
label_name='Manually_curated_celltype'

print(f"splitDataSet")
label_encoder = LabelEncoder()
el_data = pd.DataFrame(todense(adata),index=np.array(adata.obs_names).tolist(), columns=np.array(adata.var_names).tolist())
print(f"densed")
el_data[label_name] = adata.obs[label_name].astype('str')

#el_data = pd.read_table(data_path,sep=",",header=0,index_col=0)
genes = el_data.columns.values[:-1]
#el_data = np.array(el_data)
el_data = el_data.values
# el_data = np.delete(el_data,-1,axis=1)
el_data[:,-1] = label_encoder.fit_transform(el_data[:,-1])
print("label embedding fit_transform() finished")
inverse = label_encoder.inverse_transform(range(0,np.max(el_data[:,-1])+1))
print("label embedding finished")
el_data = el_data.astype(np.float32)

# el_data = balance_populations(data = el_data)
n_genes = len(el_data[1])-1
train_size = int(len(el_data) * tr_ratio)
train_dataset, valid_dataset = torch.utils.data.random_split(el_data, [train_size,len(el_data)-train_size])

# balance the training dataset
# balanced_train_data = balance_populations(data = np.array(train_dataset))
print('Split finished')
balanced_train_data = balance_populations_modified(data = np.array(train_dataset))
print('balance finished')
# n_genes = len(balanced_train_data[1])-1
# exp_train = torch.from_numpy(np.array(train_dataset)[:,:n_genes].astype(np.float32))
# label_train = torch.from_numpy(np.array(train_dataset)[:,-1].astype(np.int64))
exp_train =  torch.from_numpy(np.array(balanced_train_data)[:,:n_genes].astype(np.float32))
label_train = torch.from_numpy(np.array(balanced_train_data)[:,-1].astype(np.int64))
exp_valid = torch.from_numpy(np.array(valid_dataset)[:,:n_genes].astype(np.float32))
label_valid = torch.from_numpy(np.array(valid_dataset)[:,-1].astype(np.int64))

# return exp_train, label_train, exp_valid, label_valid, inverse, genes

splitDataSet
densed
label embedding fit_transform() finished
label embedding finished
Split finished
[[ 0.  0.  0. ...  0.  0. 30.]
 [ 0.  0.  0. ...  0.  0.  8.]
 [ 0.  0.  0. ...  0.  0. 13.]
 ...
 [ 0.  0.  0. ...  0.  0. 28.]
 [ 0.  0.  0. ...  0.  0. 14.]
 [ 0.  0.  0. ...  0.  0. 20.]]
balance finished


resample by index to avoid densing matrix

In [6]:
adata = fadata
tr_ratio = 0.7
seed = 42

# encode labels
labels = adata.obs[label_name]
label_encoder = LabelEncoder()
label_encoder.fit(labels)
enc_labels = label_encoder.transform(labels)
enc_labelsi = np.vstack((np.arange(len(labels)), enc_labels)).T # entry by (index, class)
unique_labels = np.unique(enc_labels)
inverse_map = np.vstack((label_encoder.inverse_transform(unique_labels), unique_labels)).T

# train/valid split
train_size = int(len(enc_labelsi) * tr_ratio)
rng = torch.Generator().manual_seed(seed)
train_indexes, valid_indexes = [x.dataset for x in torch.utils.data.random_split(enc_labelsi, [train_size, len(enc_labelsi)-train_size], rng)]

# calc resample target N
unique, counts = np.unique(train_indexes[:, 1], return_counts=True)
abs_max_val = np.int32(2000000/len(unique)) # scale absolute limit based on inverse of number of classes
re_sample_target = min(counts.max(), abs_max_val)

# resample each class for train
_accumulator = []
for cls in unique:
    indexes = enc_labelsi[enc_labelsi[:,1] == cls][:, 0]
    diff = re_sample_target - indexes.shape[0]
    to_add = np.random.choice(indexes, diff)
    _accumulator.append(np.hstack((indexes, to_add)))
super_sampled_train = np.hstack(_accumulator)

# prepare results for return
train_xi, train_y = super_sampled_train, enc_labels[super_sampled_train]
valid_xi, valid_y = valid_indexes[:, 0], enc_labels[valid_indexes[:, 0]]
train_y = torch.from_numpy(np.array(train_y).astype(np.int64))
valid_y = torch.from_numpy(np.array(valid_y).astype(np.int64))
genes = np.array(adata.var_names)

In [7]:
train_xi.shape

(225000,)

In [8]:
np.array(adata.var_names)

array(['MIR1302-2HG', 'FAM138A', 'OR4F5', ..., 'AC007325.1', 'AC007325.4',
       'AC007325.2'], dtype=object)

In [9]:
adata.X[train_xi[:5]].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

check gene names match

In [10]:
df = pd.read_csv("../scanpy/cache/genes.csv")
print(df.shape)
df.head(2)

(36601, 2)


Unnamed: 0,locus,gene
0,ENSG00000000003,TSPAN6
1,ENSG00000000005,TNMD


In [11]:
a = set(df["gene"])
b = set(adata.var_names)
a-b

set()