In [19]:
import sys
import os

import pandas as pd
import scanpy as sc
import numpy as np
import warnings

import torch
import torch.nn.functional as F
import torch.nn as nn

from torch.utils.data import DataLoader, TensorDataset
from torch.nn import DataParallel
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import anndata
import seaborn as sns
import matplotlib.font_manager
from matplotlib import rcParams

# Gather the family name of every system font file that matplotlib can open.
fpaths = matplotlib.font_manager.findSystemFonts()
font_list = set()
for font_path in fpaths:
    try:
        font_list.add(matplotlib.font_manager.get_font(font_path).family_name)
    except RuntimeError:
        # Font file could not be loaded; skip it and keep scanning.
        continue

# Use Helvetica when it is installed, otherwise fall back to FreeSans.
if 'Helvetica' in font_list:
    plot_font = 'Helvetica'
else:
    plot_font = 'FreeSans'

# Notebook-wide plotting defaults: small square figures, high-resolution output.
rcParams.update({
    'font.family': plot_font,
    'font.size': 10,
    'figure.dpi': 300,
    'figure.figsize': (3, 3),
    'savefig.dpi': 500,
})

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2
# NOTE: `os` is already imported in the import list above; this repeat is harmless.
import os
# CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to launch kernels synchronously,
# which makes GPU errors surface at the call that caused them (at a speed cost).
# NOTE(review): presumably only effective if set before CUDA is first
# initialised in this process — confirm no earlier cell touches the GPU.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# Directory holding the raw train / validation / test AnnData files.
data_dir = 'datasets'

# Read the three splits in a fixed order: train, validation, test.
raw_files = ('diff_train_raw.h5ad', 'diff_val_raw.h5ad', 'diff_test_raw.h5ad')
train_adata, val_adata, test_adata = [
    sc.read_h5ad(os.path.join(data_dir, file_name)) for file_name in raw_files
]


In [21]:
# Show the AnnData summary of the raw training split (dimensions and the
# names of its annotation fields) as the cell output.
train_adata

AnnData object with n_obs × n_vars = 4800 × 203
    obs: 'assigned', 'auxDir', 'cell_filter', 'cell_name', 'compatible_fragment_ratio', 'day', 'donor', 'expected_format', 'experiment', 'frag_dist_length', 'gc_bias_correct', 'is_cell_control', 'is_cell_control_bulk', 'is_cell_control_control', 'library_types', 'libType', 'log10_total_counts', 'log10_total_counts_endogenous', 'log10_total_counts_ERCC', 'log10_total_counts_feature_control', 'log10_total_counts_MT', 'log10_total_features', 'log10_total_features_endogenous', 'log10_total_features_ERCC', 'log10_total_features_feature_control', 'log10_total_features_MT', 'mapping_type', 'mates1', 'mates2', 'n_alt_reads', 'n_total_reads', 'num_assigned_fragments', 'num_bias_bins', 'num_bootstraps', 'num_compatible_fragments', 'num_consistent_mappings', 'num_inconsistent_mappings', 'num_libraries', 'num_mapped', 'num_processed', 'num_targets', 'nvars_used', 'pct_counts_endogenous', 'pct_counts_ERCC', 'pct_counts_feature_control', 'pct_counts_MT

In [22]:
# Tag each cell with its split of origin so the two groups remain
# distinguishable inside the combined object built below.
train_adata.obs['train_data'] = 1
test_adata.obs['train_data'] = 0

# Keep only the day0 and day3 cells for training. The explicit .copy() turns
# the slice into a standalone AnnData; without it the slice is a view, and the
# `.obs` assignments further down would force an implicit copy of that view.
train_adata = train_adata[train_adata.obs['day'].isin(['day0', 'day3'])].copy()
if train_adata.n_obs == 0:
    # Happens when this cell is re-run after 'day' was already converted to
    # integers: the string filter above then matches nothing.
    raise ValueError(
        "No training cells with day in ('day0', 'day3'); "
        "reload the raw data before re-running this cell."
    )

# Joint PCA -> neighbour graph -> UMAP over the filtered train cells plus
# the test cells.
combined_adata = sc.concat([train_adata, test_adata])
sc.tl.pca(combined_adata, svd_solver='arpack')
sc.pp.neighbors(combined_adata, n_neighbors=30, n_pcs=50)
sc.tl.umap(combined_adata, min_dist=0.6)

# Single source of truth for converting day labels to integer codes.
DAY_TO_INDEX = {'day0': 0, 'day1': 1, 'day2': 2, 'day3': 3}


def _encode_day(adata):
    """Reduce ``adata.obs`` to the 'day' column and map its labels to integers.

    The object is modified in place: every obs column except 'day' is
    dropped, and 'day' is replaced by its code from ``DAY_TO_INDEX``.

    Raises:
        ValueError: if 'day' contains a non-missing label that is not a key
            of ``DAY_TO_INDEX``. ``Series.map`` would otherwise turn such
            labels into NaN without any notice.
    """
    unknown_labels = set(adata.obs['day'].dropna().unique()) - set(DAY_TO_INDEX)
    if unknown_labels:
        raise ValueError(
            f"Unexpected 'day' labels {sorted(map(str, unknown_labels))}; "
            f"expected a subset of {list(DAY_TO_INDEX)}."
        )
    adata.obs = adata.obs[['day']]
    adata.obs['day'] = adata.obs['day'].map(DAY_TO_INDEX)


# Same order as before: train, test, then validation.
for _adata in (train_adata, test_adata, val_adata):
    _encode_day(_adata)



In [23]:
# Display train_adata again to check the result of the preprocessing cell
# (fewer observations, obs reduced to the 'day' column).
train_adata

AnnData object with n_obs × n_vars = 2400 × 203
    obs: 'day'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'
    layers: 'counts'

In [24]:
# Export a stripped-down copy of the training split: only the expression
# matrix, the gene/cell names, and the numeric day code stored as obs['Group'].
train_adata_ = anndata.AnnData(train_adata.X)
train_adata_.var_names = train_adata.var_names
train_adata_.obs_names = train_adata.obs_names
# obs_names is set first so this column assignment aligns on the cell index.
train_adata_.obs['Group'] = train_adata.obs['day']
# Build the path from data_dir (as the loading cell does) instead of
# repeating the directory name as a literal.
train_adata_.write(os.path.join(data_dir, 'diff_train.h5ad'))

In [25]:
# Export a stripped-down copy of the test split: only the expression matrix,
# the gene/cell names, and the numeric day code stored as obs['Group'].
test_adata_ = anndata.AnnData(test_adata.X)
test_adata_.var_names = test_adata.var_names
test_adata_.obs_names = test_adata.obs_names
# obs_names is set first so this column assignment aligns on the cell index.
test_adata_.obs['Group'] = test_adata.obs['day']
# Build the path from data_dir (as the loading cell does) instead of
# repeating the directory name as a literal.
test_adata_.write(os.path.join(data_dir, 'diff_test.h5ad'))