# Herring 2022

- [HCA](https://data.humancellatlas.org/explore/projects/77dedd59-1376-4887-9bca-dc42b56d5b7a)
- [Google Bucket for annotation](https://console.cloud.google.com/storage/browser/neuro-dev/Processed_data;tab=objects?prefix=&forceOnObjectsSortingFiltering=false)


## Metadata

- batch
- RL# (sampleID로 rename)
- age
- chem: v2 or v3
- stage_id: Fetal to Adult (다른 연구들 보면서 다시 수정할 필요가 있음)
- Race:
- PMI:
- Brain Region (BA9)
- cell_type
- major_clust
- sub_clust

In [1]:
import scanpy as sc
import os,sys,glob
import pandas as pd
import numpy as np
import anndata as ad

In [10]:
cols = [i for i in range(96) if i != 13]
meta = pd.read_csv('/home/sonic/scData/HCA_Herring/Processed_data/Processed_data_RNA-all_BCs-meta-data.csv', sep=',', index_col=0, usecols=cols)
meta.rename(columns={'RL#':'sampleID', 'Brain Regions*':'Brain_Region', 
                     'chem':'Assay', 'stage_id':'Stage', 'major_clust':'cluster_original', 'cell_type':'cluster_main'}, inplace=True)
samples = list(meta.sampleID.unique())
cols_to_use = ['batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library',
               'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'cluster_original', 'cluster_main']

In [17]:
def calculate_age(age):
    if 'd' in age:
        # Extract the numeric part of the age string and convert it to an integer
        age_days = int(age.split('d')[0])
        # Add 365 to the age in days
        age_days += 365
        return age_days
    elif 'y' in age:
        # Extract the numeric part of the age string and convert it to an integer
        age_years = int(age.split('y')[0])
        # Multiply the age in years by 365 and add 365 to get the age in days
        age_days = age_years * 365 + 365
        return age_days
    elif 'ga' in age:
        # Extract the numeric part of the age string and convert it to an integer
        age_weeks = int(age.split('ga')[1])
        # Multiply the age in years by 365 and add 365 to get the age in days
        age_days = (age_weeks + 2) * 7
        return age_days
    elif 'pcw' in age:
        # Extract the numeric part of the age string and convert it to an integer
        age_weeks = int(age.split('pcw')[1])
        # Multiply the age in years by 365 and add 365 to get the age in days
        age_days = age_weeks * 7
        return age_days
    else:
        # If the age string doesn't contain 'd' or 'y', return None
        return None

In [20]:
# Update the cell type
meta.loc[meta['cluster_main'] == 'PN', 'cluster_main'] = 'Ext'
meta.loc[meta['cluster_original'] == 'Astro', 'cluster_main'] = 'Astro'
meta.loc[meta['cluster_original'] == 'OPC', 'cluster_main'] = 'OPC'
meta.loc[meta['cluster_original'] == 'Oligo', 'cluster_main'] = 'OD'
meta.loc[meta['cluster_original'] == 'Micro', 'cluster_main'] = 'MG'
meta.loc[meta['cluster_original'] == 'Vas', 'cluster_main'] = 'Others'

In [21]:
# Filtere cells with poor quality
meta = meta.loc[meta.cluster_original!='Poor-Quality']

# Update age
meta['Age'] = meta['age'].apply(calculate_age)

# Update assay
meta.loc[meta['Assay'] == 'v2', 'Assay'] = "10x Chromium 3' v2"
meta.loc[meta['Assay'] == 'v3', 'Assay'] = "10x Chromium 3' v3"

# Add the dataset name
meta['Dataset'] = 'Herring'
meta['Diagnosis'] = 'normal'
meta['Hemisphere'] = 'A'
meta['Library'] = 'snRNA'

# Filter by columns
meta = meta[cols_to_use]

In [22]:
meta[['sampleID', 'Age', 'Stage']].drop_duplicates().sort_values(by='Age')

Unnamed: 0,sampleID,Age,Stage
AAACCCAAGAGTCTTC-RL2103_ga22_v3,RL2103,168,Fetal
AAACCCAAGATACAGT-RL2107_ga24_v3,RL2107,182,Fetal
AAACCCAAGGTAGACC-RL2121_ga34_v3,RL2121,252,Fetal
AAACCCAAGTACAACA-RL1777_2d_v3,RL1777,367,Neonatal
AAACCTGAGAGTCGGT-RL1612_34d_v2,RL1612,399,Neonatal
AAACCCAAGAGGCCAT-RL2100_86d_v3,RL2100,451,Infancy
AAACCCAAGGCCACCT-RL2104_118d_v3,RL2104,483,Infancy
AAACCCAAGCAGGCAT-RL2108_179d_v3,RL2108,544,Infancy
AAACCCAAGTCACTGT-RL2122_301d_v3,RL2122,666,Infancy
AAACCCAAGCCTGAGA-RL2125_422d_v3,RL2125,787,Childhood


In [23]:
fs = sorted(glob.glob('/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/*filtered_feature_bc_matrix.h5'))

In [24]:
fs = [f for f in fs if f.split('_')[4] in samples]

In [25]:
fs

['/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138509_RL2103_ga22_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138511_RL2107_ga24_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138513_RL2121_ga34_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138514_RL1777_1m_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138516_RL1612_2m_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138517_RL2100_3m_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138519_RL2104_4m_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scData/HCA_Herring/GSE168408_RAW/RNA_filtered/GSM5138520_RL2108_6m_snRNAseq_filtered_feature_bc_matrix.h5',
 '/home/sonic/scDa

In [26]:
list_data = []

for f in fs:
    adata1 = sc.read_10x_h5(f)
    adata1.var_names_make_unique()

    # Get sample information
    RL = f.split('_')[4]
    age = f.split('_')[5]
    sid = '_'.join(['Herring' , RL, age])

    # Subset meta information and update barcodes
    meta1 = meta.loc[meta.sampleID==RL]
    new_barcodes = {a: a.split('-')[0] + '-1' for a in list(meta1.index)}
    meta1 = meta1.rename(index=new_barcodes)

    # Use only intersected barcodes for concat
    intersected_index = meta1.index.intersection(adata1.obs.index)

    # Concat meta information
    meta1 = meta1.loc[intersected_index]
    adata1 = adata1[intersected_index]
    adata1.obs = pd.concat([adata1.obs, meta1], axis=1)

    # Update the barcodes
    adata1.name = RL
    adata1.obs_names = [f'{adata1.name}_{i}' for i in adata1.obs_names]


    list_data.append(adata1)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [27]:
adata = ad.concat(list_data, join="outer")

In [28]:
adata.obs.columns

Index(['batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI',
       'Hemisphere', 'Library', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis',
       'cluster_original', 'cluster_main'],
      dtype='object')

In [29]:
adata

AnnData object with n_obs × n_vars = 149035 × 32738
    obs: 'batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 'PMI', 'Hemisphere', 'Library', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'cluster_original', 'cluster_main'

In [30]:
adata.obs

Unnamed: 0,batch,sampleID,Age,Assay,Stage,Race,PMI,Hemisphere,Library,Brain_Region,Dataset,Sex,Diagnosis,cluster_original,cluster_main
RL2103_AAACCCAAGAGTCTTC-1,RL2103_ga22_v3,RL2103,168,10x Chromium 3' v3,Fetal,AA,5,A,snRNA,BA9,Herring,M,normal,PN_dev,Ext
RL2103_AAACCCAAGCTTCGTA-1,RL2103_ga22_v3,RL2103,168,10x Chromium 3' v3,Fetal,AA,5,A,snRNA,BA9,Herring,M,normal,CGE_dev,IN
RL2103_AAACCCACAAAGGATT-1,RL2103_ga22_v3,RL2103,168,10x Chromium 3' v3,Fetal,AA,5,A,snRNA,BA9,Herring,M,normal,L2-3_CUX2,Ext
RL2103_AAACCCACAGAACATA-1,RL2103_ga22_v3,RL2103,168,10x Chromium 3' v3,Fetal,AA,5,A,snRNA,BA9,Herring,M,normal,Astro,Astro
RL2103_AAACCCACAGTGACCC-1,RL2103_ga22_v3,RL2103,168,10x Chromium 3' v3,Fetal,AA,5,A,snRNA,BA9,Herring,M,normal,L5-6_THEMIS,Ext
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RL2124_TTTGTTGAGACGACGT-1,RL2124_40yr_v3,RL2124,14965,10x Chromium 3' v3,Adult,White,9,A,snRNA,BA9,Herring,M,normal,L2-3_CUX2,Ext
RL2124_TTTGTTGAGCCTCTGG-1,RL2124_40yr_v3,RL2124,14965,10x Chromium 3' v3,Adult,White,9,A,snRNA,BA9,Herring,M,normal,ID2,IN
RL2124_TTTGTTGAGTCTCCTC-1,RL2124_40yr_v3,RL2124,14965,10x Chromium 3' v3,Adult,White,9,A,snRNA,BA9,Herring,M,normal,L2-3_CUX2,Ext
RL2124_TTTGTTGCATGCGTGC-1,RL2124_40yr_v3,RL2124,14965,10x Chromium 3' v3,Adult,White,9,A,snRNA,BA9,Herring,M,normal,L2-3_CUX2,Ext


In [31]:
adata.write('anndata/Herring2022.h5ad')

FileNotFoundError: [Errno 2] Unable to synchronously create file (unable to open file: name = 'anndata/Herring2022.h5ad', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)