# Cameron 2023

- https://figshare.com/articles/dataset/Single_nuclei_gene_expression_matrix_from_5_regions_of_the_human_prenatal_brain/11629311
- https://www.sciencedirect.com/science/article/pii/S0006322322014044?via%3Dihub
- https://ega-archive.org/studies/EGAS00001006537


In [1]:
import scanpy as sc
import os,sys,glob
import pandas as pd
import numpy as np
import anndata as ad
import matplotlib.pyplot as plt

In [2]:
list_data = []
# Frontal cortex
# read count matrix into pandas dataframe
counts_df = pd.read_csv('/home/sonic/scData/Cameron2023/cameron_2022_snRNAseq_FC_raw_count_gEX_matrix.txt', 
                 sep=' ', index_col=0, header=0)

# read metadata into pandas dataframe
metadata_df = pd.read_csv("/home/sonic/scData/Cameron2023/cameron_2022_snRNAseq_FC_metadata.txt.gz", 
                          index_col=0, sep='\t')

# create AnnData object from count matrix dataframe
adata1 = sc.AnnData(counts_df)
adata1 = adata1.transpose()

# add metadata to AnnData object
adata1.obs = metadata_df
adata1.obs['Brain_Region'] = "Frontal Cortex"

list_data.append(adata1)

  adata1 = sc.AnnData(counts_df)


In [3]:
nan_mask = np.isnan(adata1.X).sum(axis=0) > 0
print (np.sum(nan_mask))

0


In [4]:
# Ganglionic Eminence
# read count matrix into pandas dataframe
counts_df = pd.read_csv('/home/sonic/scData/Cameron2023/cameron_2022_snRNAseq_GE_raw_count_gEX_matrix.txt', 
                 sep=' ', index_col=0, header=0)

# read metadata into pandas dataframe
metadata_df = pd.read_csv("/home/sonic/scData/Cameron2023/cameron_2022_snRNAseq_GE_metadata.txt.gz", 
                          index_col=0, sep='\t')

# create AnnData object from count matrix dataframe
adata1 = sc.AnnData(counts_df)
adata1 = adata1.transpose()

# add metadata to AnnData object
adata1.obs = metadata_df
adata1.obs['Brain_Region'] = "Ganglionic Eminence"

list_data.append(adata1)

  adata1 = sc.AnnData(counts_df)


In [5]:
nan_mask = np.isnan(adata1.X).sum(axis=0) > 0
print (np.sum(nan_mask))

0


In [6]:
adata = ad.concat(list_data, join="outer")

In [7]:
nan_mask = np.isnan(adata.X).sum(axis=0) > 0
print (np.sum(nan_mask))

2007


In [8]:
adata = adata[:, ~nan_mask]

In [9]:
adata

View of AnnData object with n_obs × n_vars = 18091 × 28830
    obs: 'cellIDs', 'sample', 'Brain_Region'

In [10]:
# Update the cell type
adata.obs['cluster_original'] = adata.obs['cellIDs']

adata.obs.loc[adata.obs['cluster_original'].str.contains('ExN-'), 'cluster_main'] = 'Ext'
adata.obs.loc[adata.obs['cluster_original'].str.contains('InN-'), 'cluster_main'] = 'IN'

adata.obs.loc[adata.obs['cluster_original'].str.contains('RG-'), 'cluster_main'] = 'RG'
adata.obs.loc[adata.obs['cluster_original'].str.contains('CycPro'), 'cluster_main'] = 'RG'
adata.obs.loc[adata.obs['cluster_original'].str.contains('-IP'), 'cluster_main'] = 'RG'

adata.obs.loc[adata.obs['cluster_original'].str.contains('-OPC'), 'cluster_main'] = 'OPC'
adata.obs.loc[adata.obs['cluster_original'].str.contains('-MG'), 'cluster_main'] = 'MG'

adata.obs.loc[adata.obs['cluster_original'].str.contains('-N-undef'), 'cluster_main'] = 'Others'
adata.obs.loc[adata.obs['cluster_original'].str.contains('-Endo'), 'cluster_main'] = 'Others'
adata.obs['cluster_original'] = adata.obs['cluster_original'].str.replace('FC-', '').str.replace('GE-', '')

# Update the meta information
adata.obs['sampleID'] = adata.obs['sample']
adata.obs['Age'] = 14.5 * 7
adata.obs['Assay'] = "10x Chromium 3' v3"
adata.obs['Library'] = "snRNA"
adata.obs['batch'] = adata.obs['sample'].str.split('_').str[2]
adata.obs['Stage'] = "Fetal"
adata.obs['Race'] = "Unknown"
adata.obs['Hemisphere'] = "A"
adata.obs['PMI'] = "Unknown"
adata.obs['Dataset'] = "Cameron"
adata.obs['Sex'] = "F"
adata.obs['Diagnosis'] = "normal"


  adata.obs['cluster_original'] = adata.obs['cellIDs']


In [11]:
cols_to_use = ['batch', 'sampleID', 'Age', 'Assay', 'Stage', 'Race', 
               'Library','Hemisphere', 'cluster_original', 'cluster_main', 
               'PMI', 'Brain_Region', 'Dataset', 'Sex', 'Diagnosis', 'Dataset']

adata.obs[cols_to_use]

Unnamed: 0_level_0,batch,sampleID,Age,Assay,Stage,Race,Library,Hemisphere,cluster_original,cluster_main,PMI,Brain_Region,Dataset,Sex,Diagnosis,Dataset
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
510_PFC_AAACCCAAGAGTGGCT-1,B1,510_PFC_B1,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,InN-2,IN,Unknown,Frontal Cortex,Cameron,F,normal,Cameron
510_PFC_AAACCCAAGTTGCGCC-1,B1,510_PFC_B1,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,IP,RG,Unknown,Frontal Cortex,Cameron,F,normal,Cameron
510_PFC_AAACCCATCAAGCTGT-1,B1,510_PFC_B1,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,RG-1,RG,Unknown,Frontal Cortex,Cameron,F,normal,Cameron
510_PFC_AAACCCATCGAACCTA-1,B1,510_PFC_B1,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,ExN-1,Ext,Unknown,Frontal Cortex,Cameron,F,normal,Cameron
510_PFC_AAACCCATCGACGCGT-1,B1,510_PFC_B1,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,ExN-1,Ext,Unknown,Frontal Cortex,Cameron,F,normal,Cameron
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993_WGE_TTTGGTTTCACTGGGC-1,B2,993_WGE_B2,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,InN-3,IN,Unknown,Ganglionic Eminence,Cameron,F,normal,Cameron
993_WGE_TTTGGTTTCGACGCTG-1,B2,993_WGE_B2,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,InN-5,IN,Unknown,Ganglionic Eminence,Cameron,F,normal,Cameron
993_WGE_TTTGTTGCAAGGTACG-1,B2,993_WGE_B2,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,InN-3,IN,Unknown,Ganglionic Eminence,Cameron,F,normal,Cameron
993_WGE_TTTGTTGTCCTAACAG-1,B2,993_WGE_B2,101.5,10x Chromium 3' v3,Fetal,Unknown,snRNA,A,RG-3,RG,Unknown,Ganglionic Eminence,Cameron,F,normal,Cameron


In [None]:
adata.write('/home/sonic/scData/anndata/Cameron2023.h5ad')