In [3]:
# test reading in 10x data with AnnData
import pandas as pd
import numpy as np
import scanpy as sc
from glob import glob
import os
import pickle
from scipy import sparse
import anndata as ad
import scrublet as scr

from matplotlib import pyplot as plt
import seaborn as sns
sns.set_palette(sns.color_palette("Spectral"))
sns.set_style("whitegrid")

from matplotlib.ticker import StrMethodFormatter
# ax.xaxis.set_major_formatter(StrMethodFormatter("{x:.2f}"))

import requests
from tqdm import tqdm

# Root folder that holds one sub-folder per GSE series, each containing GSM sample folders.
data_dir = "C:/Users/randymi/Desktop/seuratwork/data"

# Alternative location on the shared Linux host:
# data_dir = "/share/studies/Dermatology_Data/HS_Data_Portal/scanpy_seq/data/"

# Map "GSM<number>" (folder name up to the first underscore) -> full path of that folder.
# Every folder below data_dir whose name starts with "GSM" is registered; if the same
# accession appears more than once, the folder visited last wins.
gsm_dict = {
    folder.split("_")[0]: os.path.join(root, folder)
    for root, folders, _ in os.walk(data_dir)
    for folder in folders
    if folder.startswith("GSM")
}

# Linux and Windows use different path delimiters:
def detect_delimiter(path):
    """Return the path delimiter found in ``path``.

    A forward slash takes precedence: if ``path`` contains both ``/`` and
    ``\\`` the result is ``/``. Returns ``None`` when neither is present.
    """
    for candidate in ('/', '\\'):
        if candidate in path:
            return candidate
    # No delimiter found
    return None


# Map each GSM accession to the GSE series it belongs to. The GSE is the name of
# the folder that directly contains the GSM folder (<data_dir>/<GSE>/<GSM>_<label>).
# os.path copes with the platform's separators (including the mixed "/" and "\\"
# paths os.path.join produces on Windows). Indexing a fixed position of the split
# path instead returns an unrelated component (e.g. "Users") for absolute paths.
gse_dict = {
    key: os.path.basename(os.path.dirname(value))
    for key, value in gsm_dict.items()
}

# Resolve a GSM number to the full path of its sample folder
def gsm_path(gsm):
    """Return the folder path registered in ``gsm_dict`` for a GSM accession.

    ``gsm`` is the accession *without* the "GSM" prefix (string or number);
    the prefix is added before the lookup. Raises ``KeyError`` if the
    accession is not in ``gsm_dict``.
    """
    accession = "GSM" + str(gsm)
    return gsm_dict[accession]

In [28]:
# Sample sheet with one row per sample.
df = pd.read_excel("../sample_list_healthy.xlsx")

# Keep the accession columns of the rows selected by the "HC Data" column
# (used as a boolean row mask — presumably True for healthy-control samples).
hc_gsm = df.loc[df['HC Data'], ["GSE", "GSM"]]

# List of (GSE, GSM) tuples, one per selected sample, in sheet order.
hc_gs_zip = list(zip(hc_gsm["GSE"].tolist(), hc_gsm["GSM"].tolist()))

In [30]:
# Load one GSM sample folder into an AnnData object
def read_dir(gse_code, gsm_code):
    """Read matrix.mtx / barcodes.tsv / features.tsv from a sample folder.

    Parameters
    ----------
    gse_code : str
        Series accession with its 3-character prefix (e.g. "GSE173706");
        the part after the prefix is stored in ``obs["batch"]``.
    gsm_code : str
        Sample accession with its 3-character prefix (e.g. "GSM5277170");
        the part after the prefix locates the folder via ``gsm_path`` and is
        stored in ``obs["GSM"]``.

    Returns
    -------
    AnnData with cells as observations. ``var_names`` come from the first
    column of features.tsv and ``var["symbol"]`` from the second. Barcodes are
    kept in ``obs["cell_id"]``. ``obs_names`` are ``cell_0 .. cell_{n-1}``, so
    they are unique within this sample only.
    """
    gsm_number = gsm_code[3:]
    sample_dir = gsm_path(gsm_number)

    # The matrix is transposed after loading so that rows are cells.
    adata = sc.read_mtx(os.path.join(sample_dir, 'matrix.mtx')).T
    barcodes = pd.read_csv(os.path.join(sample_dir, 'barcodes.tsv'), header=None, sep="\t")
    features = pd.read_csv(os.path.join(sample_dir, 'features.tsv'), header=None, sep="\t")

    # Gene annotation: second column as symbol, first column (ensembl id) as the
    # index because it is unique.
    adata.var['symbol'] = features[1].to_numpy()
    adata.var_names = features[0].to_numpy()

    # Cell annotation: original barcode, then a positional name per cell.
    adata.obs['cell_id'] = barcodes[0].to_numpy()
    adata.obs_names = ["cell_{:d}".format(idx) for idx in range(adata.n_obs)]

    adata.obs["GSM"] = gsm_number

    # GSE is the batch key
    adata.obs["batch"] = gse_code[3:]

    return adata


def before_scrublet_step(gsm, gse=None):
    """Load one healthy-control sample and apply the QC filters that precede doublet detection.

    Parameters
    ----------
    gsm : str or int
        Sample accession, with or without the "GSM" prefix.
    gse : str, optional
        Series accession including the "GSE" prefix. When omitted it is taken
        from the name of the folder that contains the sample folder
        (<data_dir>/<GSE>/<GSM>_<label>).

    Returns
    -------
    AnnData (a copy, not a view) with ``mt`` / ``ribo`` / ``hb`` gene flags,
    QC metrics, and the cell and gene filters below applied.

    Raises
    ------
    KeyError
        If the sample is not registered in ``gsm_dict``.
    """
    gsm = str(gsm)
    if not gsm.startswith("GSM"):
        gsm = "GSM" + gsm

    if gse is None:
        # read_dir requires both accessions; derive the series from the folder layout.
        sample_dir = gsm_path(gsm[3:])
        gse = os.path.basename(os.path.dirname(sample_dir))

    adata = read_dir(gse, gsm)

    # mitochondrial genes
    adata.var["mt"] = adata.var["symbol"].str.startswith("MT-")

    # ribosomal genes
    adata.var["ribo"] = adata.var["symbol"].str.startswith(("RPS","RPL"))

    # hemoglobin genes: "HB" followed by any character other than "(", "P" or ")"
    adata.var["hb"] = adata.var["symbol"].str.contains("^HB[^(P)]")

    # calculate qc metrics (adds pct_counts_mt / pct_counts_ribo / total_counts to obs)
    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars = ["mt","ribo","hb"],
        inplace = True,
    )

    # drop cells with fewer than 300 detected genes
    sc.pp.filter_cells(adata, min_genes = 300)

    # keep cells with < 20% mitochondrial and < 55% ribosomal counts.
    # Copy so the in-place gene filter below works on a real object, not a view.
    keep_cells = (adata.obs["pct_counts_mt"] < 20) & (adata.obs["pct_counts_ribo"] < 55)
    adata = adata[keep_cells].copy()

    # remove genes detected in fewer than 3 cells
    sc.pp.filter_genes(adata, min_cells = 3)

    return adata


def post_scrublet_step(adata):
    """Apply the upper-bound cell filters that were moved to after scrublet.

    These steps run after doublet detection because they might otherwise
    remove doublets before scrublet sees them.

    Parameters
    ----------
    adata : AnnData
        Must carry ``obs["total_counts"]``. It is modified in place by the
        ``max_genes`` filter.

    Returns
    -------
    AnnData
        A copy restricted to cells with ``total_counts`` < 40000. Use the
        return value: the total-count filter is not applied to the object
        passed in.
    """
    # drop cells with more than 6000 detected genes (in place)
    sc.pp.filter_cells(adata, max_genes = 6000)

    # remove cells with rna counts >= 40000; previously this result was discarded
    # because the function returned nothing.
    adata = adata[adata.obs["total_counts"] < 40000].copy()

    return adata



In [35]:
# Samples from GSE173706 are CSV files, so they must be read in differently
# (read_dir looks for matrix.mtx / barcodes.tsv / features.tsv).
# Show where one of those samples lives on disk:
gsm_path("5277170")

'C:/Users/randymi/Desktop/seuratwork/data\\GSE173706\\GSM5277170_NS-AR001'

In [34]:
# Peek at the first (GSE, GSM) pair built from the sample sheet.
hc_gs_zip[0]

('GSE173706', 'GSM5277170')

In [None]:


# Detect doublets using scanpy.pp.scrublet.
# Set expected_doublet_rate according to the number of cells recovered (see the 10x site).
# NOTE(review): `adata` is not assigned in any cell visible above — presumably the
# result of before_scrublet_step for one sample; confirm this runs on a fresh kernel.
# NOTE(review): scanpy documents this call as annotating cells with doublet
# scores/predictions rather than dropping them — confirm a filtering step follows.
sc.pp.scrublet(adata, verbose=True, expected_doublet_rate = 0.05)



# run scrublet 

In [None]:
def chunk_list(lst, n):
    """Split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice holds the remainder and may be shorter. An empty ``lst``
    gives an empty list.
    """
    chunks = []
    for start in range(0, len(lst), n):
        chunks.append(lst[start:start + n])
    return chunks

# takes list of gsm codes, loads their pickles and merges them a few at a time
def merge_anndata(codes, chunk_size=3, pickle_dir='preprocessing/pickles'):
    """Concatenate the pickled AnnData objects of several samples along the cell axis.

    Parameters
    ----------
    codes : iterable
        Sample codes; each one is loaded from ``<pickle_dir>/<code>.pkl``.
    chunk_size : int, default 3
        Number of pickles loaded per concatenation round.
    pickle_dir : str, default 'preprocessing/pickles'
        Folder that holds the per-sample pickles.

    Returns
    -------
    AnnData
        Outer join (``join='outer'``) of all samples. In each round the newly
        loaded samples are placed before the result accumulated so far, so
        later chunks end up ahead of earlier ones in the output.

    Raises
    ------
    ValueError
        If ``codes`` is empty or ``chunk_size`` is smaller than 1.
    """
    codes = list(codes)
    if not codes:
        # Previously this fell through to an UnboundLocalError at the return.
        raise ValueError("merge_anndata needs at least one sample code")
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    combined_adata = None
    for chunk in tqdm(chunk_list(codes, chunk_size)):
        adatas = []
        for code in chunk:
            pickle_path = os.path.join(pickle_dir, '{}.pkl'.format(str(code)))
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # pickles produced by this project.
            with open(pickle_path, 'rb') as handle:
                adatas.append(pickle.load(handle))

        if combined_adata is not None:
            adatas.append(combined_adata)

        combined_adata = ad.concat(
            adatas,
            axis=0,
            join='outer'
        )

    return combined_adata