### Setting up the google colab (optional)

In [None]:
# Colab only: mount Google Drive so that its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install numpy
!pip install pandas
!pip install scanpy
!pip install scanpy.external
!pip install harmonypy
!pip install seaborn
!pip install mudata
!pip install muon

Collecting scanpy
  Downloading scanpy-1.9.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anndata>=0.7.4 (from scanpy)
  Downloading anndata-0.9.2-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting umap-learn>=0.3.10 (from scanpy)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting session-info (from scanpy)
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn>=0.3.10->scanpy)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.

### Importing modules and settings

I ran this notebook for both dataset modalities, with and without multimappers

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import harmonypy as hm
import seaborn as sns
import mudata as md
import muon as mu
import matplotlib.pyplot as plt

In [None]:
#!python -m pip install matplotlib==3.5
from matplotlib.pyplot import rc_context

General settings of Scanpy

In [None]:
# Verbosity 4 is scanpy's most detailed logging level.
sc.settings.verbosity = 4
# Print the versions of scanpy and its main dependencies for the record.
sc.logging.print_header()
# Figure defaults: 300 dpi, transparent background, figures saved as PDF;
# vector_friendly rasterises scatter points inside otherwise vector output.
sc.settings.set_figure_params(dpi=300, transparent = True, format = 'pdf', vector_friendly = True)

scanpy==1.9.3 anndata==0.9.2 umap==0.5.3 numpy==1.23.5 scipy==1.10.1 pandas==1.5.3 scikit-learn==1.2.2 statsmodels==0.14.0 pynndescent==0.5.10


### Declaring the input and output files

This part was run twice: once with multimappers and once without.

In [None]:
# Select ONE analysis variant per run. Both the analysis name and the output
# path are derived from this single switch so they can never disagree.
# False selects the no-multimappers variant; set to True for the
# with-multimappers variant.
WITH_MULTIMAPPERS = False

_variant = 'with_multimappers' if WITH_MULTIMAPPERS else 'no_multimappers'

# Base name of the analysis; also used as the stem of the output file.
name_of_analysis = 'hydractinia_atlas_genome_2.1_' + _variant

# Full path of the .h5ad file the annotated object is written to.
results_file = '/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_' + _variant + '/' + name_of_analysis + '.h5ad'

Here we load each library independently, directly from the output of splitseq_toolbox; this skips the R step for now.

In [None]:
from scipy.sparse import csr_matrix

In [None]:
# Load data from txt.gz file library 20
import gzip
import anndata as ad
with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib201_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_201 = ad.AnnData(matrix)
lib_201.obs_names = cells
lib_201.var_names = genes
experiments=['lib_20']
libraries=['lib_20_1']
lib_201.obs["Experiment"]= 'lib_20'
lib_201.obs["Library"]= 'lib_20_1'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib202_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_202 = ad.AnnData(matrix)
lib_202.obs_names = cells
lib_202.var_names = genes
experiments=['lib_20']
libraries=['lib_20_2']
lib_202.obs["Experiment"]= 'lib_20'
lib_202.obs["Library"]= 'lib_20_2'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib203_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_203 = ad.AnnData(matrix)
lib_203.obs_names = cells
lib_203.var_names = genes
experiments=['lib_20']
libraries=['lib_20_3']
lib_203.obs["Experiment"]= 'lib_20'
lib_203.obs["Library"]= 'lib_20_3'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib204_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_204 = ad.AnnData(matrix)
lib_204.obs_names = cells
lib_204.var_names = genes
experiments=['lib_20']
libraries=['lib_20_4']
lib_204.obs["Experiment"]= 'lib_20'
lib_204.obs["Library"]= 'lib_20_4'


# Load data from txt.gz from library 9
with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib901_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_091 = ad.AnnData(matrix)
lib_091.obs_names = cells
lib_091.var_names = genes
experiments=['lib_09']
libraries=['lib_09_1']
lib_091.obs["Experiment"]= 'lib_09'
lib_091.obs["Library"]= 'lib_09_1'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib902_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_092 = ad.AnnData(matrix)
lib_092.obs_names = cells
lib_092.var_names = genes
experiments=['lib_09']
libraries=['lib_09_2']
lib_092.obs["Experiment"]= 'lib_09'
lib_092.obs["Library"]= 'lib_09_2'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib903_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_093 = ad.AnnData(matrix)
lib_093.obs_names = cells
lib_093.var_names = genes
experiments=['lib_09']
libraries=['lib_09_3']
lib_093.obs["Experiment"]= 'lib_09'
lib_093.obs["Library"]= 'lib_09_3'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib904_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_094 = ad.AnnData(matrix)
lib_094.obs_names = cells
lib_094.var_names = genes
experiments=['lib_09']
libraries=['lib_09_4']
lib_094.obs["Experiment"]= 'lib_09'
lib_094.obs["Library"]= 'lib_09_4'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib905_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_095 = ad.AnnData(matrix)
lib_095.obs_names = cells
lib_095.var_names = genes
experiments=['lib_09']
libraries=['lib_09_5']
lib_095.obs["Experiment"]= 'lib_09'
lib_095.obs["Library"]= 'lib_09_5'

In [None]:
#Uploading library 29
with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib291_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_291 = ad.AnnData(matrix)
lib_291.obs_names = cells
lib_291.var_names = genes
experiments=['lib_29']
libraries=['lib_29_1']
lib_291.obs["Experiment"]= 'lib_29'
lib_291.obs["Library"]= 'lib_29_1'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib292_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_292= ad.AnnData(matrix)
lib_292.obs_names = cells
lib_292.var_names = genes
experiments=['lib_29']
libraries=['lib_29_2']
lib_292.obs["Experiment"]= 'lib_29'
lib_292.obs["Library"]= 'lib_29_2'


#Uploading library 27
with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib271_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_271 = ad.AnnData(matrix)
lib_271.obs_names = cells
lib_271.var_names = genes
experiments=['lib_27']
libraries=['lib_27_1']
lib_271.obs["Experiment"]= 'lib_27'
lib_271.obs["Library"]= 'lib_27_1'

with gzip.open('/mnt/sda/david/hydractinia/matrices_and_seurat/20230703_genome_2.1_no_multimappers/lib272_merge.bam_50GENESPERCELL_MQ0.txt.gz', 'rt') as f:
    data = pd.read_table(f, sep="\t")
counts=data.iloc[:,1:].values.T
genes=data.iloc[:,0].tolist()
cells=data.iloc[:,1:].columns.tolist()
matrix=csr_matrix(counts, dtype=np.float32)
lib_272 = ad.AnnData(matrix)
lib_272.obs_names = cells
lib_272.var_names = genes
experiments=['lib_27']
libraries=['lib_27_2']
lib_272.obs["Experiment"]= 'lib_27'
lib_272.obs["Library"]= 'lib_27_2'

In [None]:
lib_201

AnnData object with n_obs × n_vars = 20997 × 20485
    obs: 'Experiment', 'Library'

In [None]:
lib_091

AnnData object with n_obs × n_vars = 12539 × 19856
    obs: 'Experiment', 'Library'

In [None]:
lib_094

AnnData object with n_obs × n_vars = 23638 × 20428
    obs: 'Experiment', 'Library'

In [None]:
lib_292

AnnData object with n_obs × n_vars = 21573 × 20656
    obs: 'Experiment', 'Library'

In [None]:
lib_291

AnnData object with n_obs × n_vars = 18896 × 20235
    obs: 'Experiment', 'Library'

In [None]:
lib_271

AnnData object with n_obs × n_vars = 15368 × 20586
    obs: 'Experiment', 'Library'

In [None]:
# Stack the four library 20 sub-libraries cell-wise, keeping the union of genes
# (join="outer"). ad.concat is used instead of the deprecated
# AnnData.concatenate; label="batch" and index_unique="-" give the same
# 'batch' column and the same "-<position>" suffix on the cell names.
adata_lib20 = ad.concat(
    [lib_201, lib_202, lib_203, lib_204],
    join="outer",
    label="batch",
    index_unique="-",
)

In [None]:
# Stack the library 29, 27 and 9 objects cell-wise, keeping the union of genes
# (join="outer"). ad.concat is used instead of the deprecated
# AnnData.concatenate; label="batch" and index_unique="-" give the same
# 'batch' column and the same "-<position>" suffix on the cell names.
adata_rest = ad.concat(
    [lib_291, lib_292, lib_271, lib_272, lib_091, lib_092, lib_093, lib_094, lib_095],
    join="outer",
    label="batch",
    index_unique="-",
)

In [None]:
# Remove the positional 'batch' column created by the concatenation; the
# 'Library' column already identifies where each cell came from.
# pop() deletes the column in place and returns it, so the cell output is the
# removed column of adata_lib20 (the last expression).
adata_rest.obs.pop('batch')
adata_lib20.obs.pop('batch')

CAACCACAAACAACCACGAACTTA-0    0
AGTCACTACTGTAGCCAGCCATGC-0    0
CACTTCGAGTCGTAGAAACGCTTA-0    0
ATGCCTAAGAGTTAGCATTGGCTC-0    0
CAACCACAGTCGTAGACAGCGTTA-0    0
                             ..
AAGGTACACACCTTACCAGCGTTA-3    3
GAATCTGAACAAGCTATGAAGAGA-3    3
CTGGCATATGGCTTCACATACCAA-3    3
CTGGCATAAACTCACCCATACCAA-3    3
ACACAGAAGGAGAACAATAGCGAC-3    3
Name: batch, Length: 93901, dtype: category
Categories (4, object): ['0', '1', '2', '3']

In [None]:
adata_lib20.obs

Unnamed: 0,Experiment,Library
CAACCACAAACAACCACGAACTTA-0,lib_20,lib_20_1
AGTCACTACTGTAGCCAGCCATGC-0,lib_20,lib_20_1
CACTTCGAGTCGTAGAAACGCTTA-0,lib_20,lib_20_1
ATGCCTAAGAGTTAGCATTGGCTC-0,lib_20,lib_20_1
CAACCACAGTCGTAGACAGCGTTA-0,lib_20,lib_20_1
...,...,...
AAGGTACACACCTTACCAGCGTTA-3,lib_20,lib_20_4
GAATCTGAACAAGCTATGAAGAGA-3,lib_20,lib_20_4
CTGGCATATGGCTTCACATACCAA-3,lib_20,lib_20_4
CTGGCATAAACTCACCCATACCAA-3,lib_20,lib_20_4


In [None]:
# Make gene names unique in place (duplicated names get a numeric suffix).
adata_lib20.var_names_make_unique()
adata_rest.var_names_make_unique()

In [None]:
# NOTE(review): only the last expression of a cell is rendered, so the
# adata_lib20 line is evaluated but not shown; the output is adata_rest's.
adata_lib20.var_names
adata_rest.var_names

Index(['LOC130612030', 'LOC130612032', 'LOC130612033', 'LOC130612034',
       'LOC130612035', 'LOC130612036', 'LOC130612037', 'LOC130612038',
       'LOC130612039', 'LOC130612040',
       ...
       'Trnaw-cca_144', 'Trnaw-cca_145', 'Trnaw-cca_146', 'Trnaw-cca_167',
       'Trnaw-cca_204', 'Trnaw-cca_284', 'Trnay-gua_152', 'Trnay-gua_17',
       'Trnay-gua_247', 'Trnay-gua_466'],
      dtype='object', length=22550)

In [None]:
# NOTE(review): only the last expression of a cell is rendered, so the
# adata_lib20 line is evaluated but not shown; the output is adata_rest's.
adata_lib20.var
adata_rest.var

LOC130612030
LOC130612032
LOC130612033
LOC130612034
LOC130612035
...
Trnaw-cca_284
Trnay-gua_152
Trnay-gua_17
Trnay-gua_247
Trnay-gua_466


In [None]:
adata_rest.obs

Unnamed: 0,Experiment,Library
AGAGTCAAAATGTTGCAAACATCG-0,lib_29,lib_29_1
CTGTAGCCAAACATCGGACTAGTA-0,lib_29,lib_29_1
CGAACTTACAAGACTAACAGATTC-0,lib_29,lib_29_1
AGAGTCAACGACACACGATAGACA-0,lib_29,lib_29_1
CGCTGATCATGCCTAACCGTGAGA-0,lib_29,lib_29_1
...,...,...
AGCAGGAAGAGTTAGCAGTCACTA-8,lib_09,lib_09_5
AGAGTCAAACAGATTCTGAAGAGA-8,lib_09,lib_09_5
CACTTCGACTGAGCCAACACGACC-8,lib_09,lib_09_5
AAACATCGGGAGAACAATTGGCTC-8,lib_09,lib_09_5


In [None]:
adata_lib20

AnnData object with n_obs × n_vars = 93901 × 22557
    obs: 'Experiment', 'Library'

In [None]:
adata_rest

AnnData object with n_obs × n_vars = 183628 × 22550
    obs: 'Experiment', 'Library'

### Reading the barcodes for library 20 only and adding labels for the rest

I advise doing this part first, since the labels are added to different subsets of the data.

In [None]:
# Barcode sheet without a header row (header=None); the first column (well id,
# e.g. A1) becomes the index, leaving column 1 = barcode and column 2 = sample.
barcodes = pd.read_excel('/mnt/sda/david/hydractinia/barcodes hydractinia.xlsx', header = None, index_col = 0)

In [None]:
barcodes

Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
A1,AACGTGAT,Stolons
A2,AAACATCG,Stolons
A3,ATGCCTAA,Stolons
A4,AGTGGTCA,Stolons
A5,ACCACTGT,Stolons
A6,ACATTGGC,Stolons
A7,CAGATCTG,Stolons
A8,CATCAAGT,Stolons
A9,CGCTGATC,Stolons
A10,ACAAGCTA,Stolons


The barcodes dataframe has to have 48 rows, with the list of barcodes in one column and the names of the different samples. We also need a samples_name that describes the type of samples

In [None]:
# Name of the column that holds the sample label, both in the barcode sheet
# and in the obs table of the annotated cells.
samples_name = 'Body_part'

In [None]:
# Give the two positional columns readable names: barcode sequence and sample label.
barcodes = barcodes.rename(columns={1: 'bc', 2: samples_name})

In [None]:
barcodes

Unnamed: 0_level_0,bc,Body_part
0,Unnamed: 1_level_1,Unnamed: 2_level_1
A1,AACGTGAT,Stolons
A2,AAACATCG,Stolons
A3,ATGCCTAA,Stolons
A4,AGTGGTCA,Stolons
A5,ACCACTGT,Stolons
A6,ACATTGGC,Stolons
A7,CAGATCTG,Stolons
A8,CATCAAGT,Stolons
A9,CGCTGATC,Stolons
A10,ACAAGCTA,Stolons


There are the following **samples** to be annotated:

In [None]:
barcodes[samples_name].value_counts()

Stolons    24
Polyps     24
Name: Body_part, dtype: int64

In [None]:
# Distinct sample labels, in the order value_counts() returns them.
samples = barcodes[samples_name].value_counts().index.to_list()

In [None]:
samples

['Stolons', 'Polyps']

In [None]:
# Record the list of sample labels in the unstructured metadata of adata_rest.
adata_rest.uns['Body_part'] = samples

In [None]:
# Record the list of sample labels in the unstructured metadata of adata_lib20.
adata_lib20.uns['Body_part'] = samples

In [None]:
adata_rest.uns

OverloadedDict, wrapping:
	OrderedDict([('Body_part', ['Stolons', 'Polyps'])])
With overloaded keys:
	['neighbors'].

In [None]:
adata_rest

AnnData object with n_obs × n_vars = 183628 × 22550
    obs: 'Experiment', 'Library'
    uns: 'Body_part'

In [None]:
adata_lib20.uns

OverloadedDict, wrapping:
	OrderedDict([('Body_part', ['Stolons', 'Polyps'])])
With overloaded keys:
	['neighbors'].

The following nested loop does this automatically.  

It loops through each of the samples

For each sample, it establishes a filter in the barcodes dataframe

Then obtains the different barcodes for each into a list

and then loops through the different barcodes of each sample.

In the inner loop, the cells are filtered to those whose name is the barcode followed by 16 characters and a "-0", "-1", "-2" or "-3" suffix

and then it creates the new column and inserts the name of the sample



In [None]:
# Label every library 20 cell with the sample of the well it was barcoded in.
#
# The match is anchored: the sample barcode must be the START of the cell name,
# followed by exactly 16 more characters and the "-0".."-3" sub-library suffix.
# An unanchored substring search is not safe here because the same barcode
# sequences also occur at the other two positions of the 24-base cell name, so
# a cell could match barcodes of both samples and keep whichever came last.
# NOTE(review): assumes the sample barcode is the first 8 bases of the cell
# name -- TODO confirm against the barcode order written by splitseq_toolbox.
for sample_i in samples:
    sample_barcodes = barcodes.loc[barcodes[samples_name] == sample_i, 'bc'].to_list()
    for barcode in sample_barcodes:
        # str.match anchors at the start of the string; '$' anchors the end.
        cellfilt = adata_lib20.obs.index.str.match(barcode + r'.{16}-[0-3]$')
        adata_lib20.obs.loc[cellfilt, samples_name] = sample_i

In [None]:
adata_lib20.obs

Unnamed: 0,Experiment,Library,Body_part
CAACCACAAACAACCACGAACTTA-0,lib_20,lib_20_1,Polyps
AGTCACTACTGTAGCCAGCCATGC-0,lib_20,lib_20_1,Polyps
CACTTCGAGTCGTAGAAACGCTTA-0,lib_20,lib_20_1,Polyps
ATGCCTAAGAGTTAGCATTGGCTC-0,lib_20,lib_20_1,Stolons
CAACCACAGTCGTAGACAGCGTTA-0,lib_20,lib_20_1,Polyps
...,...,...,...
AAGGTACACACCTTACCAGCGTTA-3,lib_20,lib_20_4,Polyps
GAATCTGAACAAGCTATGAAGAGA-3,lib_20,lib_20_4,Polyps
CTGGCATATGGCTTCACATACCAA-3,lib_20,lib_20_4,Polyps
CTGGCATAAACTCACCCATACCAA-3,lib_20,lib_20_4,Polyps


In [None]:
adata_lib20

AnnData object with n_obs × n_vars = 93901 × 22557
    obs: 'Experiment', 'Library', 'Body_part'
    uns: 'Body_part'

Now we concatenate both parts of the dataset

In [None]:
# Combine both halves of the dataset cell-wise, keeping the union of genes and
# of obs columns (join="outer"), so cells of adata_rest get a missing
# 'Body_part' for now. ad.concat is used instead of the deprecated
# AnnData.concatenate; label="batch" and index_unique="-" give the same
# 'batch' column ('0' = adata_rest, '1' = adata_lib20) and name suffixes.
adata = ad.concat(
    [adata_rest, adata_lib20],
    join="outer",
    label="batch",
    index_unique="-",
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [None]:
adata.obs

Unnamed: 0,Experiment,Library,Body_part,batch
AGAGTCAAAATGTTGCAAACATCG-0-0,lib_29,lib_29_1,,0
CTGTAGCCAAACATCGGACTAGTA-0-0,lib_29,lib_29_1,,0
CGAACTTACAAGACTAACAGATTC-0-0,lib_29,lib_29_1,,0
AGAGTCAACGACACACGATAGACA-0-0,lib_29,lib_29_1,,0
CGCTGATCATGCCTAACCGTGAGA-0-0,lib_29,lib_29_1,,0
...,...,...,...,...
AAGGTACACACCTTACCAGCGTTA-3-1,lib_20,lib_20_4,Polyps,1
GAATCTGAACAAGCTATGAAGAGA-3-1,lib_20,lib_20_4,Polyps,1
CTGGCATATGGCTTCACATACCAA-3-1,lib_20,lib_20_4,Polyps,1
CTGGCATAAACTCACCCATACCAA-3-1,lib_20,lib_20_4,Polyps,1


In [None]:
adata

AnnData object with n_obs × n_vars = 277529 × 23126
    obs: 'Experiment', 'Library', 'Body_part', 'batch'

And now for the rest of the libraries i.e. 27, 29 and 9

In [None]:
# Boolean mask of the cells from libraries 9, 27 and 29 (case-insensitive
# regex on 'Library'; na=False counts a missing label as no match).
libpolyp= adata.obs['Library'].str.contains('lib_09_1|lib_09_2|lib_09_3|lib_09_4|lib_09_5|lib_27_1|lib_27_2|lib_29_1|lib_29_2', regex=True, case= False, na=False)

In [None]:
# Every cell of those libraries is labelled 'Polyps' as a whole.
adata.obs.loc[libpolyp,'Body_part']= 'Polyps'

The same goes for the libraries with either feeding or sexual polyps

In [None]:
# Boolean masks used to fill 'Colony_part':
# library 9 -> mixed polyps, library 27 -> feeding polyps, library 29 -> sexual polyps.
libpolypmix= adata.obs['Library'].str.contains('lib_09_1|lib_09_2|lib_09_3|lib_09_4|lib_09_5', regex=True, case= False)
libpolypfeeding= adata.obs['Library'].str.contains('lib_27_1|lib_27_2', regex=True, case= False)
libpolypsexual= adata.obs['Library'].str.contains('lib_29_1|lib_29_2', regex=True, case= False)
# Stolon cells are found through their 'Body_part' label instead of their library.
libstolons= adata.obs['Body_part'].str.contains('Stolons', regex=True, case= False, na=False)

In [None]:
libpolypsexual

AGAGTCAAAATGTTGCAAACATCG-0-0     True
CTGTAGCCAAACATCGGACTAGTA-0-0     True
CGAACTTACAAGACTAACAGATTC-0-0     True
AGAGTCAACGACACACGATAGACA-0-0     True
CGCTGATCATGCCTAACCGTGAGA-0-0     True
                                ...  
AAGGTACACACCTTACCAGCGTTA-3-1    False
GAATCTGAACAAGCTATGAAGAGA-3-1    False
CTGGCATATGGCTTCACATACCAA-3-1    False
CTGGCATAAACTCACCCATACCAA-3-1    False
ACACAGAAGGAGAACAATAGCGAC-3-1    False
Name: Library, Length: 277529, dtype: bool

In [None]:
# Write the polyp type of each library group into 'Colony_part'.
for polyp_mask, polyp_kind in (
    (libpolypmix, 'Polyp_Mix'),
    (libpolypfeeding, 'Polyp_Feeding'),
    (libpolypsexual, 'Polyp_Sexual'),
):
    adata.obs.loc[polyp_mask, 'Colony_part'] = polyp_kind

# Cells covered by none of the three masks fall back to 'Polyp_Mix'.
adata.obs['Colony_part'] = adata.obs['Colony_part'].fillna('Polyp_Mix')

In [None]:
# Stolon cells override whatever polyp label they were given before.
adata.obs.loc[libstolons,'Colony_part']= 'Stolon'

### Annotating the libraries and samples

Annotating the different libraries in adata.obs

In [None]:
adata.obs

Making PEG labels for each library:

In [None]:
# Boolean mask of the cells from the libraries labelled as PEG
# (lib_20_3, lib_20_4, library 27 and library 29).
libpeg= adata.obs['Library'].str.contains('lib_20_3|lib_20_4|lib_27_1|lib_27_2|lib_29_1|lib_29_2', regex=True, case= False)

In [None]:
# Boolean mask of the cells from the libraries labelled as noPEG
# (lib_20_1, lib_20_2 and library 9).
libnopeg= adata.obs['Library'].str.contains('lib_20_1|lib_20_2|lib_09_1|lib_09_2|lib_09_3|lib_09_4|lib_09_5', regex=True, case=False)

In [None]:
# Write the PEG status of each library group into 'Presence_of_PEG'.
for peg_mask, peg_label in ((libpeg, 'PEG'), (libnopeg, 'noPEG')):
    adata.obs.loc[peg_mask, 'Presence_of_PEG'] = peg_label

In [None]:
# One boolean mask per experiment, used to assign the batch labels:
# a = library 20, b = library 27, c = library 29, d = library 9.
batcha = adata.obs['Library'].str.contains('lib_20_1|lib_20_2|lib_20_3|lib_20_4',regex=True, case=False)
batchb = adata.obs['Library'].str.contains('lib_27_1|lib_27_2', regex=True, case=False)
batchc = adata.obs['Library'].str.contains('lib_29_1|lib_29_2', regex=True, case=False)
batchd = adata.obs['Library'].str.contains('lib_09_1|lib_09_2|lib_09_3|lib_09_4|lib_09_5', regex=True, case=False)

In [None]:
# Remove the positional 'batch' column left by the last concatenation so that
# it can be rebuilt per experiment; pop() returns the removed column, which is
# what the cell displays.
adata.obs.pop('batch')

AGAGTCAAAATGTTGCAAACATCG-0-0    0
CTGTAGCCAAACATCGGACTAGTA-0-0    0
CGAACTTACAAGACTAACAGATTC-0-0    0
AGAGTCAACGACACACGATAGACA-0-0    0
CGCTGATCATGCCTAACCGTGAGA-0-0    0
                               ..
AAGGTACACACCTTACCAGCGTTA-3-1    1
GAATCTGAACAAGCTATGAAGAGA-3-1    1
CTGGCATATGGCTTCACATACCAA-3-1    1
CTGGCATAAACTCACCCATACCAA-3-1    1
ACACAGAAGGAGAACAATAGCGAC-3-1    1
Name: batch, Length: 277529, dtype: category
Categories (2, object): ['0', '1']

In [None]:
# Assign one batch label per experiment: '1' = library 20, '2' = library 27,
# '3' = library 29, '4' = library 9.
for batch_label, batch_mask in (
    ('1', batcha),
    ('2', batchb),
    ('3', batchc),
    ('4', batchd),
):
    adata.obs.loc[batch_mask, 'batch'] = batch_label

In [None]:
adata.obs

### Creating a unique id for sample plus library

This cell annotates each cell in a unique sample ID (sample and library)

In [None]:
# Build the per-cell sample id "<Library>_<Body_part>_<Colony_part>_<Presence_of_PEG>".
library_txt = adata.obs['Library'].astype(str)
body_part_txt = adata.obs['Body_part'].astype(str)
colony_part_txt = adata.obs['Colony_part'].astype(str)
peg_txt = adata.obs['Presence_of_PEG'].astype(str)
adata.obs['Unique'] = library_txt + "_" + body_part_txt + "_" + colony_part_txt + "_" + peg_txt

In [None]:
# Show the sample ids as plain text (print applies str() to its argument).
print(adata.obs['Unique'])

In [None]:
adata.obs

In [None]:
adata.obs['Unique'].value_counts()

In [None]:
adata.obs['Unique'].value_counts(normalize=True)

In [None]:
adata.obs['Body_part'].value_counts()

Polyps     251639
Stolons     25890
Name: Body_part, dtype: int64

In [None]:
adata.obs['Body_part'].value_counts(normalize=True)

Polyps     0.906712
Stolons    0.093288
Name: Body_part, dtype: float64

In [None]:
adata

### Checking if the libraries are labelled correctly

In [None]:
check = 'lib_27_2'

In [None]:
# Boolean mask of the cells whose 'Library' label contains the value of `check`.
cfilt = adata.obs['Library'].str.contains(check)

In [None]:
cfilt

AGAGTCAAAATGTTGCAAACATCG-0-0    False
CTGTAGCCAAACATCGGACTAGTA-0-0    False
CGAACTTACAAGACTAACAGATTC-0-0    False
AGAGTCAACGACACACGATAGACA-0-0    False
CGCTGATCATGCCTAACCGTGAGA-0-0    False
                                ...  
AAGGTACACACCTTACCAGCGTTA-3-1    False
GAATCTGAACAAGCTATGAAGAGA-3-1    False
CTGGCATATGGCTTCACATACCAA-3-1    False
CTGGCATAAACTCACCCATACCAA-3-1    False
ACACAGAAGGAGAACAATAGCGAC-3-1    False
Name: Library, Length: 277529, dtype: bool

In [None]:
adata.obs[cfilt]

In [None]:
# Save the annotated object to results_file.
# NOTE(review): results_file holds whichever declaration was executed last --
# check that it matches the data that was actually loaded.
adata.write(results_file)