In [1]:
# %%
import copy
import gc
import json
import os
from pathlib import Path
import shutil
import sys
import time
import traceback
from typing import List, Tuple, Dict, Union, Optional
import warnings
import pandas as pd
# from . import asyn
import pickle
import torch
from anndata import AnnData
import scanpy as sc
# import scvi
import seaborn as sns
import numpy as np
import wandb
from scipy.sparse import issparse
import matplotlib.pyplot as plt
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from torchtext.vocab import Vocab
from torchtext._torchtext import (
    Vocab as VocabPybind,
)
from sklearn.metrics import confusion_matrix

sys.path.insert(0, "../")
import scgpt as scg
from scgpt.model import TransformerModel, AdversarialDiscriminator
from scgpt.tokenizer import tokenize_and_pad_batch, random_mask_value
from scgpt.loss import (
    masked_mse_loss,
    masked_relative_error,
    criterion_neg_log_bernoulli,
)
from scgpt.tokenizer.gene_tokenizer import GeneVocab
from scgpt.preprocess import Preprocessor
from scgpt import SubsetsBatchSampler
from scgpt.utils import set_seed, category_str2int, eval_scib_metrics

# Notebook-wide configuration, applied once after all imports.
# Default scanpy figure size is set to 6x6 via its figure-parameter helper.
sc.set_figure_params(figsize=(6, 6))
# NOTE(review): presumably silences OpenMP (KMP) runtime warnings — confirm it is still needed.
os.environ["KMP_WARNINGS"] = "off"
# Suppresses every Python warning for the remainder of the session; this also hides
# deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')



In [2]:
# Folders that hold the .h5ad files of each study, relative to the notebook.
msDataPath, covidDataPath = Path("../data/ms"), Path("../data/covid")

# Read the three datasets consumed by the following cells.
msDataTest = sc.read(msDataPath.joinpath("filtered_ms_adata.h5ad"))  # MS test data
msData = sc.read(msDataPath.joinpath("c_data.h5ad"))  # MS data
covidData = sc.read(covidDataPath.joinpath("covidObj.h5ad"))  # Covid data


In [3]:
# Report the size of every expression matrix: axis 0 of .X is printed as the
# cell count and axis 1 as the gene count. A generator is used so that no loop
# variables leak into the notebook namespace.
print(
    *(
        f'{label} dataset \n Cells: {shape[0]} \n Genes: {shape[1]}\n'
        for label, shape in (
            ('COVID', covidData.X.shape),
            ('MS', msData.X.shape),
            ('MS Test', msDataTest.X.shape),
        )
    ),
    sep='\n',
)

COVID dataset 
 Cells: 375438 
 Genes: 14063

MS dataset 
 Cells: 7844 
 Genes: 3000

MS Test dataset 
 Cells: 13468 
 Genes: 3000



In [68]:
# Gene names are stored under a different .var column in each dataset.
msGenes = msData.var['gene_name']  # 3000 unique genes, same for the others
covidGenes = covidData.var['features']  # 14063 unique genes

# Intersection: the MS genes that also occur in the COVID gene list.
commonGenes = msGenes.loc[msGenes.isin(covidGenes)].unique()
# Union: every distinct gene name found in either dataset.
uniqueGenes = pd.concat((msGenes, covidGenes)).unique()

print(
    f'Common genes between MS dataset and COVID dataset: {commonGenes}',
    f'Unique genes between MS dataset and COVID dataset: {uniqueGenes}',
    sep='\n',
)

Common genes between MS dataset and COVID dataset: ['CFH', 'BAD', 'LAP3', 'WNT16', 'KLHL13', ..., 'DACH1', 'MARCKS', 'NEFL', 'IGHG1', 'TRAC']
Length: 1953
Categories (3000, object): ['A2M', 'AAMDC', 'ABCA8', 'ABCB1', ..., 'ZNF804A', 'ZNF880', 'ZWINT', 'yR211F11.2']
Unique genes between MS dataset and COVID dataset: ['CFH' 'BAD' 'LAP3' ... 'ZZEF1' 'GNG10' 'SMIM2-AS1']
