This document details the merging of the five replicated datasets. While it could theoretically be used to recreate our process exactly, doing so would require recreating our system of file directories and configuring the appropriate paths to match. Rather, this document is intended to give the reader an idea of our (very uninteresting) merging process.

In [None]:
# (Re)load the rpy2 IPython extension; the cells below use its %%R cell magic.
%reload_ext rpy2.ipython
# Mount Google Drive at /content/drive, forcing a fresh mount if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
%%R
# Directory on the mounted Drive that is handed to library() as the package library location.
lib_loc <- "/content/drive/MyDrive/r-lib2"

In [None]:
%%R
# Attach Seurat from the Drive-hosted package library. The argument name is
# spelled out in full (`lib.loc`) instead of relying on partial matching of `lib`.
library(Seurat, lib.loc = lib_loc)

In [None]:
%%R
# Input: the file saved by the Liao et al. replication notebook. Loading it is
# expected to restore a Seurat object named `sobj` into the workspace.
load(file = 'liaocolab')

# Rebind the object under a study-specific name, then store its current
# identities as the study's original cell-type labels.
liao <- sobj
rm(sobj)
liao[['Original_Cell_Types']] <- Idents(object = liao)

In [None]:
%%R
# Input: the file saved by the Lake et al. replication notebook. Loading it is
# expected to restore a Seurat object named `ank.3` into the workspace.
load(file = 'lakecolab')

# Rebind the object under a study-specific name, then store its current
# identities as the study's original cell-type labels.
lake <- ank.3
rm(ank.3)
lake[['Original_Cell_Types']] <- Idents(object = lake)

In [None]:
%%R
# Keep Lake's own per-cell `percent_mito` values before the merge; they are used
# at the end of the notebook to fill percent.mt for the Lake cells.
lakemito <- lake$percent_mito

In [None]:
%%R
# Input: the file saved by the Young et al. replication notebook. Loading it is
# expected to restore a Seurat object named `sobj` into the workspace.
load(file = 'youngcolab')

# Rebind the object under a study-specific name, then store its current
# identities as the study's original cell-type labels.
young <- sobj
rm(sobj)
young[['Original_Cell_Types']] <- Idents(object = young)

In [None]:
%%R
# Input: the file saved by the Menon et al. replication notebook. Loading it is
# expected to restore a Seurat object named `sobj` into the workspace.
load(file = 'menoncolab')

# Rebind the object under a study-specific name, then store its current
# identities as the study's original cell-type labels.
menon <- sobj
rm(sobj)
menon[['Original_Cell_Types']] <- Idents(object = menon)

In [None]:
%%R
# Input: the file saved by the Wu et al. replication notebook. Loading it is
# expected to restore a Seurat object named `sobj` into the workspace.
load(file = 'wucolab')

# Rebind the object under a study-specific name, then store its current
# identities as the study's original cell-type labels.
wu <- sobj
rm(sobj)
wu[['Original_Cell_Types']] <- Idents(object = wu)

In [None]:
%%R
# Tag every cell with its study of origin. The label is repeated once per cell
# using ncol() (the number of cells in the object), so the tagging does not
# depend on a particular metadata column such as Leveled_Names being present.
lake[['Study']] <- rep('Lake', ncol(lake))
liao[['Study']] <- rep('Liao', ncol(liao))
menon[['Study']] <- rep('Menon', ncol(menon))
wu[['Study']] <- rep('Wu', ncol(wu))
young[['Study']] <- rep('Young', ncol(young))

In [None]:
%%R
# Combine the five study objects into a single Seurat object; `lake` is passed
# as `x` and the remaining four as `y`.
# NOTE(review): no add.cell.ids are supplied, so this presumably relies on cell
# names already being unique across studies — confirm, because the percent.mt
# step at the end of the notebook matches Lake cells by name.
sobjm <- merge(x = lake, y = c(liao, menon, wu, young))
# The per-study objects are no longer needed once merged.
rm(lake, liao, menon, wu, young)

The important cell is the one immediately above: it performs the actual merge.

Now we label some metadata and remove junky cells

In [None]:
%%R
# Create the sample-level metadata columns, all NA to begin with (one entry per
# cell). The columns are added in this fixed order; the following cells fill them.
n_cells <- length(sobjm$nCount_RNA)
for (field in c('Sex', 'Sampling_Location', 'Participant', 'Sample', 'Age')) {
    sobjm[[field]] <- rep(NA, n_cells)
}
rm(n_cells, field)

In [None]:
%%R
# Derive a per-cell Sample label.
#
# Step 1: cells whose orig.ident is something other than 'SeuratProject' take
# their orig.ident as the sample id. which() turns the mask into positions so an
# NA comparison cannot break the assignment, and as.character() guarantees that
# labels (not factor codes) are copied if the column is a factor.
locations <- which(sobjm$orig.ident != 'SeuratProject')
sobjm$Sample[locations] <- as.character(sobjm$orig.ident)[locations]

# Step 2: cells that carry a SangerID use it as the sample id, overriding step 1.
locations <- which(!is.na(sobjm$SangerID))
sobjm$Sample[locations] <- as.character(sobjm$SangerID)[locations]

In [None]:
%%R
# m ("metadata inputter"; the name is kept short because it is called once per
# sample) fills in the participant-level metadata for one sample.
#
# Arguments:
#   sample             value of the Sample column identifying the cells to annotate
#   sex                written to the Sex column of those cells
#   sampling_location  written to the Sampling_Location column
#   participant        written to the Participant column
#   age                written to the Age column
#
# Side effects: prints the distinct Sample values that matched (a quick visual
# check) and modifies the global `sobjm` in place through `<<-`.
m <- function(sample, sex, sampling_location, participant, age) {

    hits <- sobjm[['Sample']] == sample

    print(unique(sobjm[['Sample']][hits]))

    new_values <- list(
        Sex = sex,
        Sampling_Location = sampling_location,
        Participant = participant,
        Age = age
    )
    # Same write order as the column order above: Sex, Sampling_Location,
    # Participant, Age.
    for (field in names(new_values)) {
        sobjm[[field]][hits] <<- new_values[[field]]
    }

    invisible(age)
}

In [None]:
%%R
# Young: per-sample metadata, one m() call per sample.
# Arguments are: sample id, sex, sampling location, participant label, age.
# Ages are entered as numbers here; a later cell bins some of them into decade
# labels, and several of these samples are removed again further down.
m('4602STDY6949192', 'Male', 'Cortex', 'Wilms1', 4.75)
m('4602STDY6949195', 'Male', 'Cortex', 'Wilms1', 4.75)
m('4602STDY7090432', 'Female', 'ArteryVein', 'Wilms2', 1.58)
m('4602STDY7090428', 'Female', 'Cortex', 'Wilms2', 1.58)
m('4602STDY7090429', 'Female', 'Medulla', 'Wilms2', 1.58)
m('4602STDY7090430', 'Female', 'Pelvis', 'Wilms2', 1.58)
m('4602STDY7018926', 'Female', 'ArteryVein', 'Wilms3', 2.25)
m('4602STDY7018923', 'Female', 'Cortex', 'Wilms3', 2.25)
m('4602STDY7018924', 'Female', 'Medulla', 'Wilms3', 2.25)
m('4602STDY7018925', 'Female', 'Pelvis', 'Wilms3', 2.25)
m('4602STDY6949178', 'Female', 'Cortex', 'VHL', 49)
m('4602STDY6949179', 'Female', 'Cortex', 'VHL', 49)
m('4602STDY6949180', 'Female', 'Cortex', 'VHL', 49)
m('4602STDY6949181', 'Female', 'Cortex', 'VHL', 49)
m('4602STDY7018628', 'Male', 'Cortex', 'RCC3', 72)
m('4602STDY7018629', 'Male', 'Cortex', 'RCC3', 72)
m('4602STDY7018630', 'Male', 'Cortex', 'RCC3', 72)
m('4602STDY6976426', 'Female', 'Cortex', 'RCC2', 63)
m('4602STDY6976427', 'Female', 'Cortex', 'RCC2', 63)
m('4602STDY6976428', 'Female', 'Cortex', 'RCC2', 63)
m('4602STDY6930852', 'Male', 'Cortex', 'RCC1', 67)
m('4602STDY6930853', 'Male', 'Cortex', 'RCC1', 67)
m('4602STDY6930856', 'Male', 'Corticomedulla', 'RCC1', 67)
m('4602STDY6795897', 'Male', 'Cortex', 'PapRCC', 70)
m('4602STDY6795898', 'Male', 'Cortex', 'PapRCC', 70)
m('4602STDY6949184', 'Female', 'Cortex', 'TeenTx', 12.75)
m('4602STDY6949185', 'Female', 'Cortex', 'TeenTx', 12.75)
m('4602STDY6949187', 'Female', 'Medulla', 'TeenTx', 12.75)
m('4602STDY6949188', 'Female', 'Medulla', 'TeenTx', 12.75)
m('4602STDY6949186', 'Female', 'Cortex', 'TeenTx', 12.75)
m('4602STDY6949189', 'Female', 'Medulla', 'TeenTx', 12.75)
m('CZIKidney7587405', 'Female', 'Medulla', 'TXK1', 53)
m('CZIKidney7587406', 'Female', 'Pelvis', 'TXK1', 53)
m('CZIKidney7587407', 'Female', 'Cortex', 'TXK1', 53)
m('CZIKidney7587408', 'Female', 'Cortex', 'TXK1', 53)
m('CZIKidney7587409', 'Female', 'Cortex', 'TXK2', 44)
m('CZIKidney7587411', 'Female', 'Medulla', 'TXK2', 44)
m('CZIKidney7587412', 'Female', 'Medulla', 'TXK2', 44)
m('CZIKidney7587422', 'Male', 'Cortex', 'TXK3', 64)
m('CZIKidney7587423', 'Male', 'Cortex', 'TXK3', 64)
m('CZIKidney7587424', 'Male', 'Medulla', 'TXK3', 64)
m('CZIKidney7587425', 'Male', 'Medulla', 'TXK3', 64)
m('CZIKidney7632802', 'Male', 'Medulla', 'TXK4', 72)
m('CZIKidney7632803', 'Male', 'Cortex', 'TXK4', 72)
m('CZIKidney7632804', 'Male', 'Cortex', 'TXK4', 72)
# Ureter samples of participants already listed above.
m('4602STDY6930857', 'Male', 'Ureter', 'RCC1', 67)
m('4602STDY7018631', 'Male', 'Ureter', 'RCC3', 72)
m('4602STDY7018927', 'Female', 'Ureter', 'Wilms3', 2.25)

In [None]:
%%R
# Menon: per-sample metadata, one m() call per sample.
# Arguments are: sample id, sex, sampling location (not recorded, hence NA),
# participant label, age bracket. Ages are entered as decade-bracket strings
# written as 'NN - NN'.
m('TN117.17.116.Lib.4b', 'Male', NA, 'TN117-17-116-Lib-4b', '70 - 79')
# The bracket below was previously typed as '50 -59'; it now uses the same
# spelling as every other bracket, so no downstream repair is needed for it.
m('X17.1606.2.1', 'Female', NA, '17-1606', '50 - 59')
m('X18142.5.1', 'Female', NA, '18-142', '60 - 69')
m('X18142.5.2', 'Female', NA, '18-142', '60 - 69')
m('X18.139', 'Male', NA, '18-139', '60 - 69')
m('TN109.18.242.Lib', 'Male', NA, 'TN109-18-242-Lib', '50 - 59')
# NOTE(review): age is NA here although participant 17-1606 has a bracket on
# sample X17.1606.2.1 above — confirm whether this is intentional.
m('X17.1606.2.0', 'Female', NA, '17-1606', NA)
m('TN121.18.696.10', 'Male', NA, 'TN121-18-696-10', '50 - 59')
m('Trans004.1', 'Female', NA, 'Trans-SV-004', '30 - 39')
m('X18.342', 'Male', NA, '18-342', '60 - 69')
m('Trans006.1', NA, NA, 'Trans-SV-006', '20 - 29')
m('TN120.18.696.4', 'Male', NA, 'TN120-18-696-4', '50 - 59')
m('Trans005.1', 'Female', NA, 'Trans-SV-005', '50 - 59')
m('TN116.17.116.Lib.4a', 'Male', NA, 'TN116-17-116-Lib-4a', '70 - 79')
m('X18.162', 'Female', NA, '18-162', '60 - 69')
m('Trans008.2', 'Female', NA, 'Trans-SV-008', '50 - 59')

In [None]:
%%R
# Wu: the four tubes are annotated identically (male, participant 'Wu', age 70);
# no sampling location is recorded.
for (tube in c('Tube1', 'Tube2', 'Tube3', 'Tube4')) {
    m(tube, 'Male', NA, 'Wu', 70)
}
rm(tube)

In [None]:
%%R
# Liao: one sample per donor, so the sample id is also used as the participant
# label; no sampling location is recorded.
liao_info <- data.frame(
    id = c('kidney1', 'kidney2', 'kidney3'),
    sex = c('Male', 'Female', 'Male'),
    age = c(57, 59, 65),
    stringsAsFactors = FALSE
)
for (row in seq_len(nrow(liao_info))) {
    m(liao_info$id[row], liao_info$sex[row], NA, liao_info$id[row], liao_info$age[row])
}
rm(liao_info, row)

In [None]:
%%R
# Per-library metadata for the Lake samples. The file has no header row, so the
# columns are auto-named V1, V2, ... . stringsAsFactors = FALSE keeps the text
# columns as character on every R version (the default was TRUE before R 4.0),
# so later assignments copy the labels themselves rather than factor codes.
mapping <- read.csv('LakeMetadata.csv', header = FALSE, stringsAsFactors = FALSE)

In [None]:
%%R
# Preview the first rows of the mapping table as a visual check.
head(mapping)

In [None]:
%%R
# Annotate the Lake cells from the per-library mapping table. Columns as used
# here: V1 = sample/library id, V2 = sampling location, V3 = sex code ('M' or
# 'F'), V4 = participant id.
# seq_len() makes the loop a no-op for an empty table, and as.character()
# ensures labels (not factor codes) are written if a column was read as a factor.
for (library_index in seq_len(nrow(mapping))) {
    locations <- sobjm$Sample == as.character(mapping$V1[library_index])
    sobjm$Sampling_Location[locations] <- as.character(mapping$V2[library_index])

    sex_code <- as.character(mapping$V3[library_index])
    if (!is.na(sex_code)) {
        if (sex_code == 'M') {
            sobjm$Sex[locations] <- 'Male'
        } else if (sex_code == 'F') {
            sobjm$Sex[locations] <- 'Female'
        } else {
            # Any other non-missing code is unexpected; flag it in the output.
            print('sanity check')
        }
    }

    sobjm$Participant[locations] <- as.character(mapping$V4[library_index])

    print(library_index)
}

# Coarse age groups for the Lake participants. These do not depend on the loop
# variable, so they are applied once, after all participants have been assigned.
# NOTE(review): 'K1700543' and 'K1700574' are listed here, while the renaming
# step later in this notebook uses 'K1600543', 'K1700573' and 'K1800016', none
# of which appear here — confirm which ids are correct.
over_50 <- c('K1700233', 'K1700234', 'K1700578', 'K1700572', 'K1700574',
             'K1800017', 'K1800010', 'K1800105', 'K1700543')
under_50 <- c('K1700442', 'K1700443', 'K1700432', 'K1700435')
sobjm$Age[sobjm$Participant %in% over_50] <- '> 50'
sobjm$Age[sobjm$Participant %in% under_50] <- '< 50'

In [None]:
%%R
# Suffix every original cell-type label with its study name ('<label>_<Study>')
# so that identically named labels from different studies stay distinct.
# Running this cell a second time would append the suffix again.
sobjm$Original_Cell_Types <- with(
    sobjm@meta.data,
    paste(Original_Cell_Types, Study, sep = '_')
)

In [None]:
%%R 
# Remove unwanted cells in a single pass. The exclusions are collected first and
# subset() is called once, instead of re-subsetting the whole object for every
# individual value.

# Lake samples that duplicate Menon cells (they were originally from Menon),
# plus the 'kidney1' sample.
excluded_samples <- c(
    'NK71', 'NK72', 'NK79', 'NK80', 'NK81', 'NK86',
    'NK87', 'NK88', 'NK89', 'NK90', 'NK91', 'NK92',
    'kidney1'
)

# Uncooperative cell populations, each of which is discussed more specifically
# in the paper accompanying this repository.
excluded_cell_types <- c(
    'Nephron_epithelium_Young',
    'Private_Young',
    'Epithelial Cells (unassigned)_Lake',
    'Proximal Tubule Epithelial Cells - Stress/Inflam_Lake',
    'Cycling Cells_Wu',
    'Collecting system - PCs - Stressed Dissoc Subset_Lake'
)

# A cell is kept only if it passes every test. A missing value in a tested
# column excludes the cell, because a `!=` comparison against NA is never TRUE.
sample_ok <- !is.na(sobjm$Sample) & !(sobjm$Sample %in% excluded_samples)
cell_type_ok <- !is.na(sobjm$Original_Cell_Types) &
    !(sobjm$Original_Cell_Types %in% excluded_cell_types)
not_junk <- !is.na(sobjm$Leveled_Names) & sobjm$Leveled_Names != 'Junk'

sobjm <- subset(sobjm, cells = colnames(sobjm)[sample_ok & cell_type_ok & not_junk])
rm(excluded_samples, excluded_cell_types, sample_ok, cell_type_ok, not_junk)

In [None]:
%%R
# Remove the cells of non-adult (fetal and child) samples. The sample ids are
# collected in one vector and subset() is called once, instead of re-subsetting
# the whole object for every id.
# NOTE(review): '4602STDY7090431' has no metadata entry in the Young cell above,
# and the child samples '4602STDY7090432', '4602STDY7018926' and
# '4602STDY7018927' are not listed here — presumably they are absent from the
# merged data; confirm.
non_adult_samples <- c(
    '4602STDY7018924', '4602STDY6949184', '4602STDY7090429',
    '4602STDY6949192', '4602STDY6949187', '4602STDY6949189',
    '4602STDY7018925', '4602STDY7090430', '4602STDY6949195',
    '4602STDY7090428', '4602STDY7018923',
    '4602STDY6949185', '4602STDY6949186', '4602STDY6949188',
    '4602STDY7090431'
)

# Cells with a missing Sample are excluded too, because a `!=` comparison
# against NA is never TRUE.
is_adult <- !is.na(sobjm$Sample) & !(sobjm$Sample %in% non_adult_samples)
sobjm <- subset(sobjm, cells = colnames(sobjm)[is_adult])
rm(non_adult_samples, is_adult)

In [None]:
%%R
# Rename Lake sample-level ids to participant ids where appropriate.
# Each source id appears exactly once in this lookup (names = current value of
# Participant, values = participant id to write); several source ids can map to
# the same participant.
participant_ids <- c(
    K1600543 = 'PPID3351',
    K1700233 = 'PPID3395',
    K1700234 = 'PPID3395',
    K1700432 = 'PPID3411',
    K1700435 = 'PPID3412',
    K1700442 = 'PPID3414',
    K1700443 = 'PPID3414',
    K1700572 = 'PPID3431',
    K1700573 = 'PPID3431',
    K1700578 = 'PPID3432',
    K1800010 = 'PPID3434',
    K1800016 = 'PPID3435',
    K1800017 = 'PPID3435',
    K1800105 = 'PPID3444'
)

current_ids <- as.character(sobjm$Participant)
to_rename <- current_ids %in% names(participant_ids)
if (any(to_rename)) {
    # unname() so the lookup's names are not carried into the metadata column.
    sobjm$Participant[to_rename] <- unname(participant_ids[current_ids[to_rename]])
}
rm(participant_ids, current_ids, to_rename)

In [None]:
%%R
# Where no participant is recorded, the sample id is assumed to be the
# participant id.
unassigned <- is.na(sobjm$Participant)
sobjm$Participant[unassigned] <- sobjm$Sample[unassigned]
rm(unassigned)

In [None]:
%%R
# Show the distinct Age values and the metadata column names before binning.
print(unique(sobjm$Age))
print(colnames(sobjm@meta.data))

# Collapse exact ages into decade brackets, and repair the mistyped bracket
# "50 -59".
# NOTE(review): other exact ages entered earlier (e.g. 44, 53, 64 and the child
# ages) are not binned here — presumably those samples are not present at this
# point; confirm against the printed values.
sobjm$Age[sobjm$Age %in% c(59, "50 -59")] <- '50 - 59'
sobjm$Age[sobjm$Age %in% c(65, 63, 67)] <- '60 - 69'
sobjm$Age[sobjm$Age %in% 49] <- '40 - 49'
sobjm$Age[sobjm$Age %in% c(70, 72)] <- '70 - 79'

# Snapshot of the metadata table, used by the next cell.
meta <- sobjm@meta.data

In [None]:
%%R
# Drop metadata that cannot be compared across samples.
# Columns are chosen by position (2, 3, 16-19 and 31-35), so this depends on the
# exact column order of `meta` at this point.
metarevised <- meta[c(2, 3, 16:19, 31:35)]
sobjm@meta.data <- metarevised

In [None]:
%%R
# Make the mitochondrial percentage consistent across studies.
# Default for every cell: the percentage of counts from features matching '^MT-'.
sobjm[['percent.mt']] <- PercentageFeatureSet(sobjm, pattern = '^MT-')

# Lake cells instead keep the percent_mito values saved before the merge.
# The merged object `sobjm` is used throughout (the object named `sobj` no
# longer exists at this point), and values are matched by cell name rather than
# by position, so a mismatch cannot shift values onto the wrong cells.
lake_cells <- colnames(sobjm)[which(sobjm$Study == 'Lake')]
matched_cells <- intersect(lake_cells, names(lakemito))
if (length(matched_cells) < length(lake_cells)) {
    warning(
        length(lake_cells) - length(matched_cells),
        ' Lake cell(s) have no saved percent_mito value and keep the recomputed one',
        call. = FALSE
    )
}

lakemito2 <- lakemito[matched_cells]
percent_mt <- sobjm$percent.mt
percent_mt[matched_cells] <- lakemito2
sobjm$percent.mt <- percent_mt
rm(lake_cells, matched_cells, percent_mt)

In [None]:
%%R
# Write the merged, annotated and filtered object to disk.
saveRDS(sobjm, file = 'MergedObject.RDS')