In [1]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight



In [2]:
# Location of the AnnData (.h5ad) file that is loaded with sc.read_h5ad below.
data = '../Data/scGNN_matrix.h5ad'

In [3]:
# Read the AnnData object from disk, then report whether its X matrix is stored
# in a scipy sparse format and what its dimensions are.
adata = sc.read_h5ad(data)
is_sparse = scipy.sparse.issparse(adata.X)
matrix_shape = adata.X.shape
print('X matrix is sparse:', is_sparse)
print('X size =', matrix_shape)

X matrix is sparse: False
X size = (27603, 23693)


In [4]:
# Show the AnnData summary: number of observations x variables and the
# annotation columns available in .obs.
adata

AnnData object with n_obs × n_vars = 27603 × 23693
    obs: 'celltype'

In [5]:
# Inspect the expression matrix: its (truncated) values, shape, container type
# and element dtype.
expr_matrix = adata.X
print(expr_matrix)
print(expr_matrix.shape)
print(type(expr_matrix))
print(expr_matrix.dtype)

[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  3.1264480e-02 4.9962458e-01]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  0.0000000e+00 1.3223546e-02]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  3.9397422e-02 6.4014697e-01]
 ...
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  2.9884007e-02 7.8507602e-01]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
  3.6549248e-02 1.0351356e+00]
 [0.0000000e+00 0.0000000e+00 6.6088885e-04 ... 0.0000000e+00
  3.1275883e-02 3.8957477e-01]]
(27603, 23693)
<class 'numpy.ndarray'>
float32


In [6]:
# Largest value anywhere in the expression matrix (quick check of the value range).
adata.X.max()

8.515175

In [7]:
# Per-cell 'celltype' label column from the observation annotations.
adata.obs.loc[:, 'celltype']

AAACCTGAGCGTAGTG_sc69_1     0
AAACCTGAGCTGATAA_sc69_1     1
AAACCTGCAAATCCGT_sc69_1     2
AAACCTGCAAGGGTCA_sc69_1     3
AAACCTGCACATCCGG_sc69_1     2
                           ..
TTTGTCAGTTCAACCA_sc72_1     2
TTTGTCAGTTCAGTAC_sc72_1    21
TTTGTCATCACAAACC_sc72_1     2
TTTGTCATCAGCACAT_sc72_1     2
TTTGTCATCTCGAGTA_sc72_1    20
Name: celltype, Length: 27603, dtype: int64

In [8]:
# Split the expression matrix and the cell-type labels into train / validation /
# test sets, compute balanced per-sample weights for the training labels, and
# save every array under ../Arrays/.
X = adata.X
y = adata.obs['celltype'].values

# Hold out 20% of the cells for testing, then take 25% of the remaining 80%
# (i.e. 20% of the total) for validation, which leaves 60% for training.
# NOTE(review): neither split is stratified by label. If some cell types are
# rare, consider passing stratify=...; the index split used for the UMAP display
# would then need the same change to stay aligned with the test set.
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=14
)
train_features, val_features, train_labels, val_labels = train_test_split(
    train_features, train_labels, test_size=0.25, random_state=17
)

# np.asarray (rather than np.array) returns its input unchanged when it is
# already an ndarray, so the large feature matrices are not copied a second time.
train_features = np.asarray(train_features)
test_features = np.asarray(test_features)
val_features = np.asarray(val_features)

train_labels = np.asarray(train_labels)
test_labels = np.asarray(test_labels)
val_labels = np.asarray(val_labels)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

# One weight per training sample, using sklearn's 'balanced' class weighting.
sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=train_labels
)

print('Sample weights shape:', sample_weights.shape)

# Persist each array as ../Arrays/<name>_scgnn.npy (same files and order as before).
arrays_to_save = {
    'train_features': train_features,
    'test_features': test_features,
    'val_features': val_features,
    'train_labels': train_labels,
    'test_labels': test_labels,
    'val_labels': val_labels,
    'sample_weights': sample_weights,
}
for array_name, array in arrays_to_save.items():
    np.save(f'../Arrays/{array_name}_scgnn.npy', array)

Training features shape: (16561, 23693)
Validation features shape: (5521, 23693)
Test features shape: (5521, 23693)
Training labels shape: (16561,)
Validation labels shape: (5521,)
Test labels shape: (5521,)
Sample weights shape: (16561,)


In [9]:
# for umap display
# Recover the observation names of the test split by repeating the first split
# of the feature matrix (same test_size and random_state) on adata.obs.index.
# This relies on train_test_split producing the same shuffle for the same number
# of samples and random_state, so keep both calls in sync.
test_index = train_test_split(adata.obs.index, test_size=0.2, random_state=14)[1]

# Store the names as a fixed-width unicode array instead of an object array so
# the saved file does not depend on pickle and can be read back with np.load's
# default allow_pickle=False.
indices = np.asarray(test_index, dtype=str)
np.save('../Arrays/indices_scgnn.npy', indices)