## Setting up Google Colab (optional)

In [None]:
# Mount Google Drive only when the notebook runs inside Google Colab.
# On any other kernel the `google.colab` package is missing, so this cell
# becomes a no-op instead of aborting "Run All" with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: nothing to mount
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install numpy
!pip install pandas
!pip install scanpy
!pip install scanpy.external
!pip install harmonypy
!pip install seaborn
!pip install mudata
!pip install muon
!pip install mudatasets

Collecting scanpy
  Downloading scanpy-1.9.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anndata>=0.7.4 (from scanpy)
  Downloading anndata-0.9.2-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting umap-learn>=0.3.10 (from scanpy)
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting session-info (from scanpy)
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn>=0.3.10->scanpy)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.

## Importing modules and settings

In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import mudata as md
import muon as mu
import mudatasets as mds
import seaborn as sns
import scanpy.external as sce

In [None]:
import matplotlib.pyplot as plt

General settings of Scanpy

In [None]:
# Logging: set Scanpy's verbosity to 3 and print the versions header.
sc.settings.verbosity = 3
sc.logging.print_header()

# Figure defaults applied to every plot produced below.
sc.settings.set_figure_params(
    dpi=300,
    transparent=True,
    format='pdf',
    vector_friendly=True,
)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.2 numpy==1.22.4 scipy==1.10.1 pandas==1.4.3 scikit-learn==1.0.2 statsmodels==0.13.2 python-igraph==0.10.4 louvain==0.7.1 pynndescent==0.5.6


In [None]:
# Continuous colormap blended from light grey to xkcd sapphire; passed as
# `color_map` to the per-gene UMAP panels drawn by get_plots.
umap_cmap = sns.blend_palette(['lightgrey', 'xkcd:sapphire'], as_cmap = True)

## Functions for diamond blast querying

This set of functions explores the diamond blast annotation to return the most informative hit. It works like this:
If there is a hit from the 'preferred species' in the diamond results, it returns that hit. The preferred species is set below to "Hydra" by default.
Otherwise, it looks for an informative hit, i.e. one that avoids descriptions like "hypothetical protein", etc. The terms to avoid are given in the list of words (words to avoid). If there is no such informative hit, it returns "no informative hits: " followed by the first hit listed for the transcript (the best hit by p-value, assuming the table is ordered that way), which will then contain "hypothetical" or a similar term. If there are no hits in the diamond, it returns "not in Diamond".

In [None]:
def check_for_species(df, species):
    """Return a diamond hit that mentions ``species``, if there is one.

    Parameters
    ----------
    df : pandas.DataFrame
        Diamond rows of one transcript; must contain a ``'match'`` column.
    species : str
        Literal, case-sensitive text searched for inside ``'match'``.
        This is a substring test, so ``'Hydra'`` also matches ``'Hydractinia'``.

    Returns
    -------
    str or None
        The alphabetically first ``'match'`` value containing ``species``,
        or ``None`` when no row contains it.
    """
    # regex=False: a species name is plain text, so characters such as
    # '(' or '[' must not be interpreted as a regular-expression pattern.
    mask = df['match'].str.contains(species, na=False, regex=False)
    if not mask.any():
        return None
    # Select the column by name rather than by position so the result does
    # not depend on the column order of the table.
    return df.loc[mask, 'match'].sort_values().iloc[0]

In [None]:
def contains_words (hit_string, li):
    """Tell whether any word of ``li`` occurs in ``hit_string``, ignoring case.

    Parameters
    ----------
    hit_string : str
        Text to inspect (e.g. a diamond hit description).
    li : iterable of str
        Words to look for.

    Returns
    -------
    bool
        ``True`` as soon as one word is found as a substring, else ``False``
        (also ``False`` for an empty ``li``).
    """
    # Lower-case both sides so the test is case-insensitive even when the
    # caller has not lower-cased the hit beforehand.
    haystack = hit_string.lower()
    return any(word.lower() in haystack for word in li)

In [None]:
def get_informative(df, li):
    """Pick the first diamond hit whose description avoids the words in ``li``.

    Parameters
    ----------
    df : pandas.DataFrame
        Diamond rows of one transcript; must contain a ``'match'`` column.
    li : list of str
        Words that mark a hit as uninformative (checked via ``contains_words``).

    Returns
    -------
    str
        * ``"not in Diamond"`` when there is no row, or every ``'match'`` is missing;
        * the first ``'match'`` that contains none of the words in ``li``;
        * otherwise ``"no informative hits: "`` followed by the first available hit.
    """
    # Rows whose 'match' is NaN carry no diamond hit at all; dropping them
    # avoids reporting the meaningless "no informative hits: nan".
    hits = df['match'].dropna()
    if hits.empty:
        return "not in Diamond"
    # Iterate the column by name: positional `row[1]` on a label-indexed row
    # is deprecated in recent pandas and depends on column order.
    for hit in hits:
        if isinstance(hit, str) and not contains_words(hit.lower(), li):
            return hit
    return "no informative hits: " + str(hits.iloc[0])

In [None]:
def get_best_hit (transcript, species, li):
    """Return the most informative diamond hit for one transcript.

    Parameters
    ----------
    transcript : str
        Value compared against the ``'gene_id'`` column of the global ``diamond`` table.
    species : str
        Preferred species text, forwarded to ``check_for_species``.
    li : list of str
        Words to avoid, forwarded to ``get_informative``.

    Returns
    -------
    str
        The preferred-species hit when one exists, otherwise the result of
        ``get_informative`` on the transcript's rows.
    """
    search = diamond[diamond['gene_id'] == transcript]
    # Evaluate the species lookup once and reuse the result.
    species_hit = check_for_species(search, species)
    if species_hit is not None:
        return species_hit
    return get_informative (search, li)
###########################################################

In [None]:
def check_for_species_2(df, species):
    """Return an NCBI name that mentions ``species``, if there is one.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation rows of one transcript; must contain an ``'ncbi_name'`` column.
    species : str
        Literal, case-sensitive text searched for inside ``'ncbi_name'``.

    Returns
    -------
    str or None
        The alphabetically first ``'ncbi_name'`` value containing ``species``,
        or ``None`` when no row contains it.
    """
    # regex=False: the species is plain text, not a regular expression.
    mask = df['ncbi_name'].str.contains(species, na=False, regex=False)
    if not mask.any():
        return None
    # Column selected by name, so the result does not depend on column order.
    return df.loc[mask, 'ncbi_name'].sort_values().iloc[0]

In [None]:
def get_informative_2(df, li):
    """Pick the first NCBI name that avoids the words in ``li``.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation rows of one transcript; must contain an ``'ncbi_name'`` column.
    li : list of str
        Words that mark a name as uninformative (checked via ``contains_words``).

    Returns
    -------
    str
        * ``"other RNA"`` when ``df`` has no rows;
        * the first ``'ncbi_name'`` that contains none of the words in ``li``;
        * otherwise the first row's ``'ncbi_name'`` converted with ``str``.
    """
    if len(df.index) == 0:
        return "other RNA"
    # Scan the 'ncbi_name' column, the same column the fallback below returns.
    # The previous positional `row[1]` read the diamond 'match' column instead.
    for name in df['ncbi_name']:
        if isinstance(name, str) and not contains_words(name.lower(), li):
            return name
    return str(df['ncbi_name'].iloc[0])

In [None]:
def get_best_hit_2 (transcript, species, li):
    """Return the preferred NCBI name for one transcript.

    Parameters
    ----------
    transcript : str
        Value compared against the ``'gene_id'`` column of the global ``diamond`` table.
    species : str
        Preferred species text, forwarded to ``check_for_species_2``.
    li : list of str
        Words to avoid, forwarded to ``get_informative_2``.

    Returns
    -------
    str
        The preferred-species name when one exists, otherwise the result of
        ``get_informative_2`` on the transcript's rows.
    """
    search = diamond[diamond['gene_id'] == transcript]
    # Evaluate the species lookup once and reuse the result.
    species_name = check_for_species_2(search, species)
    if species_name is not None:
        return species_name
    return get_informative_2 (search, li)

In [None]:
# Species text passed to get_best_hit, which searches for it in the diamond 'match' column.
preferred_species = 'Hydra'

In [None]:
# Species text passed to get_best_hit_2, which searches for it in the 'ncbi_name' column.
preferred_species_2 = 'Hydractinia'

In [None]:
# Substrings that mark a hit description as uninformative ("words to avoid").
list_of_words = ['hypothetical', 'uncharacterized', 'unnamed']

In [None]:
# Gene / diamond-hit / NCBI-name equivalence table. Only the file's columns at
# positions 1, 6 and 7 are loaded.
diamond = pd.read_csv(
    '/mnt/sda/david/interpro_diamond_blast/hydractinia_only/recip_blasts/20230711_equivalence_gene_protein_diamond_ncbi.tsv',
    sep='\t',
    usecols=[1, 6, 7],
)

In [None]:
diamond

Unnamed: 0,gene_id,match,ncbi_name
0,LOC130612030,,melatonin receptor type 1C-like [Hydractinia s...
1,LOC130612031,,circumsporozoite protein-like [Hydractinia sym...
2,LOC130612032,XP_012562270.1 PREDICTED: thymidylate synthase...,thymidylate synthase-like [Hydractinia symbiol...
3,LOC130612032,ACS44779.1 thymidylate synthase [Penaeus vanna...,thymidylate synthase-like [Hydractinia symbiol...
4,LOC130612032,6K7Q_A Crystal structure of thymidylate syntha...,thymidylate synthase-like [Hydractinia symbiol...
...,...,...,...
559870,LOC130662872,XP_004209014.1 PREDICTED: uncharacterized prot...,52 kDa repressor of the inhibitor of the prote...
559871,LOC130662872,XP_004205780.1 PREDICTED: uncharacterized prot...,52 kDa repressor of the inhibitor of the prote...
559872,LOC130662872,XP_002162793.1 PREDICTED: 52 kDa repressor of ...,52 kDa repressor of the inhibitor of the prote...
559873,LOC130662872,XP_004207246.1 PREDICTED: uncharacterized prot...,52 kDa repressor of the inhibitor of the prote...


## Declaring the input and output files

In [None]:
# Load the atlas from its .h5mu file; the modality used below is taken from `mdata.mod`.
mdata = mu.read('/mnt/sda/david/hydractinia/hydractinia_atlas_20230812.h5mu')



In [None]:
# Work on the modality stored under the key 'no'.
adata = mdata.mod['no']

In [None]:
# Prefix of the output folder names created under Supplementary_Files.
name_of_analysis = 'hysim_atlas'

In [None]:
# Every obs column whose name contains 'leiden', in column order.
leiden_names = [column for column in adata.obs.columns if 'leiden' in column]

In [None]:
leiden_names

['leiden_1', 'leiden_1.5', 'leiden_2']

In [None]:
adata

AnnData object with n_obs × n_vars = 199113 × 18061
    obs: 'Experiment', 'Library', 'Body_part', 'Presence_of_PEG', 'batch', 'n_counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden_1', 'leiden_1.5', 'leiden_2', 'Colony_part', 'Unique'
    var: 'ratio_with_no', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Library_colors', 'dendrogram_leiden_1.5', 'hvg', 'leiden', 'leiden_1.5_colors', 'leiden_1_colors', 'leiden_2_colors', 'log1p', 'neighbors', 'pca', 'rank_genes_groups_logreg_leiden_1', 'rank_genes_groups_logreg_leiden_1.5', 'rank_genes_groups_logreg_leiden_2', 'rank_genes_groups_wilcox_leiden_1', 'rank_genes_groups_wilcox_leiden_1.5', 'rank_genes_groups_wilcox_leiden_2', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [None]:
# Selected clustering columns; currently all of them (same list object as leiden_names).
leiden_names_selected = leiden_names

In [None]:
leiden_names_selected

['leiden_1', 'leiden_1.5', 'leiden_2']

In [None]:
# obs column whose clusters are exported and plotted below; it also selects the
# matching 'rank_genes_groups_*_<clusteringlayer>' entries in adata.uns.
clusteringlayer = 'leiden_1.5'

In [None]:
# makedirs(..., exist_ok=True) keeps this cell re-runnable (no FileExistsError on a
# second run) and also creates missing parent folders.
os.makedirs('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_pdfs_and_excels_supp', exist_ok=True)

In [None]:
# Export, for every cluster of `clusteringlayer`, the leading Wilcoxon markers with
# their statistics and diamond / NCBI annotations; one Excel sheet per cluster.
# NOTE(review): the file name says "overlap" but no logreg-overlap filter is applied
# here — confirm this is intended.
wilcox = adata.uns['rank_genes_groups_wilcox_'+clusteringlayer]
with pd.ExcelWriter('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_pdfs_and_excels_supp/'+clusteringlayer+'_markers_overlap_supp.xlsx') as writer:
    for i in adata.obs[clusteringlayer].cat.categories:
        lfc_s = pd.Series(wilcox['logfoldchanges'][i])
        pval_s = pd.Series(wilcox['pvals'][i])

        # Positions of genes that have p < 0.05 and a positive log fold change.
        selected = pval_s.index[(pval_s < 0.05) & (lfc_s > 0)]
        # Keep every row up to and including the last such gene. head(n) returns
        # rows 0..n-1, hence the +1. With no qualifying gene the sheet is written
        # empty instead of max() failing on an empty sequence.
        he_m = int(selected.max()) + 1 if len(selected) else 0

        # Each per-cluster column is built once and reused.
        names = pd.Series(wilcox['names'][i]).head(he_m)
        df = pd.concat([
            names.rename('names'),
            lfc_s.head(he_m).rename('logfoldchanges'),
            pval_s.head(he_m).rename('pvals'),
            pd.Series(wilcox['pvals_adj'][i]).head(he_m).rename('pvals_adj'),
            pd.Series(wilcox['scores'][i]).head(he_m).rename('scores'),
            names.apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond'),
            names.apply(get_best_hit_2, args=(preferred_species_2, list_of_words)).rename('ncbi_name'),
        ], axis = 1)
        df.to_excel(writer, sheet_name='Cluster '+i)

Exception ignored in: <function ZipFile.__del__ at 0x7f871c950b80>
Traceback (most recent call last):
  File "/usr/lib/python3.8/zipfile.py", line 1821, in __del__
    self.close()
  File "/usr/lib/python3.8/zipfile.py", line 1838, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


In [None]:
# makedirs(..., exist_ok=True) keeps this cell re-runnable (no FileExistsError on a
# second run) and also creates missing parent folders.
os.makedirs('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_wilcox_pdfs_and_excels/', exist_ok=True)

In [None]:
# Export, for every cluster of `clusteringlayer`, the leading Wilcoxon markers with
# their statistics and diamond / NCBI annotations; one Excel sheet per cluster.
wilcox = adata.uns['rank_genes_groups_wilcox_'+clusteringlayer]
with pd.ExcelWriter('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_wilcox_pdfs_and_excels/'+clusteringlayer+'_markers_wilcoxon.xlsx') as writer:
    for i in adata.obs[clusteringlayer].cat.categories:
        lfc_s = pd.Series(wilcox['logfoldchanges'][i])
        pval_s = pd.Series(wilcox['pvals'][i])

        # Positions of genes that have p < 0.05 and a positive log fold change.
        selected = pval_s.index[(pval_s < 0.05) & (lfc_s > 0)]
        # Keep every row up to and including the last such gene. head(n) returns
        # rows 0..n-1, hence the +1. With no qualifying gene the sheet is written
        # empty instead of max() failing on an empty sequence.
        he_m = int(selected.max()) + 1 if len(selected) else 0

        # Each per-cluster column is built once and reused.
        names = pd.Series(wilcox['names'][i]).head(he_m)
        df = pd.concat([
            names.rename('names'),
            lfc_s.head(he_m).rename('logfoldchanges'),
            pval_s.head(he_m).rename('pvals'),
            pd.Series(wilcox['pvals_adj'][i]).head(he_m).rename('pvals_adj'),
            pd.Series(wilcox['scores'][i]).head(he_m).rename('scores'),
            names.apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond'),
            names.apply(get_best_hit_2, args=(preferred_species_2, list_of_words)).rename('ncbi_name'),
        ], axis = 1)
        df.to_excel(writer, sheet_name='Cluster '+i)

In [None]:
# makedirs(..., exist_ok=True) keeps this cell re-runnable (no FileExistsError on a
# second run) and also creates missing parent folders.
os.makedirs('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_logreg_pdfs_and_excels/', exist_ok=True)

In [None]:
# Logistic-regression markers: the top 30 genes of every cluster with their scores
# and diamond / NCBI annotations, one Excel sheet per cluster.
logreg_results = adata.uns['rank_genes_groups_logreg_'+clusteringlayer]
top_names = pd.DataFrame(logreg_results['names']).head(30)
top_scores = pd.DataFrame(logreg_results['scores']).head(30)

with pd.ExcelWriter('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_logreg_pdfs_and_excels/'+clusteringlayer+'_markers.xlsx') as writer:
    for i in adata.obs[clusteringlayer].cat.categories:
        cluster_names = top_names[i]
        columns = [
            cluster_names.rename('names'),
            top_scores[i].rename('scores'),
            cluster_names.apply(get_best_hit, args=(preferred_species, list_of_words)).rename('diamond'),
            cluster_names.apply(get_best_hit_2, args=(preferred_species_2, list_of_words)).rename('ncbi_name'),
        ]
        df = pd.concat(columns, axis = 1)
        df.to_excel(writer, sheet_name='Cluster '+i)

In [None]:
def get_plots (ad,clusteringlayer, cluster, li_markers):
    """Draw a 4x4 UMAP overview for one cluster and its marker genes.

    The top-left panel highlights ``cluster`` within ``clusteringlayer``; the
    remaining 15 panels, filled row by row, show the expression of the first
    15 entries of ``li_markers``.

    Parameters
    ----------
    ad : AnnData
        Object to plot; passed to ``sc.pl.umap`` for every panel.
    clusteringlayer : str
        Name of the obs column holding the clustering.
    cluster : str
        Cluster label to highlight in the first panel.
    li_markers : list
        Gene names to plot. Entries beyond the 15th are ignored; the list is
        not modified.

    Returns
    -------
    matplotlib.figure.Figure
        The populated figure. It is left open, so the caller should close it
        after saving.
    """
    fig, axs = plt.subplots(4, 4, figsize = (15, 15))

    # Use the `ad` argument (not the notebook-global `adata`) for the cluster panel too.
    sc.pl.umap(ad, color= clusteringlayer, legend_loc = 'on data', groups = cluster, na_in_legend = False, size = 5, legend_fontsize = 7, title = clusteringlayer+' cluster '+cluster, show = False, ax = axs[0, 0])

    # Work on a copy so the caller's list is never padded or otherwise changed.
    markers = list(li_markers)[:15]

    # axs.ravel() is row-major, so panel k+1 receives marker k and every gene is
    # drawn under its own title.
    for position, ax in enumerate(axs.ravel()[1:]):
        gene = markers[position] if position < len(markers) else None
        if gene is None:
            # No marker for this slot: hide the empty panel.
            ax.set_axis_off()
            continue
        sc.pl.umap(ad, color= gene, title = gene, color_map = umap_cmap, show = False, ax = ax)

    return fig

In [None]:
# For every cluster, plot the markers shared by the Wilcoxon selection and the
# top 80 logistic-regression genes, and save the figure as a PDF.
wilcox = adata.uns['rank_genes_groups_wilcox_'+clusteringlayer]
logreg_top = pd.DataFrame(adata.uns['rank_genes_groups_logreg_'+clusteringlayer]['names']).head(80)
for i in adata.obs[clusteringlayer].cat.categories:
    lfc_s = pd.Series(wilcox['logfoldchanges'][i])
    pval_s = pd.Series(wilcox['pvals'][i])

    # Positions of genes that have p < 0.05 and a positive log fold change.
    selected = pval_s.index[(pval_s < 0.05) & (lfc_s > 0)]
    # Keep every row up to and including the last such gene. head(n) returns
    # rows 0..n-1, hence the +1. With no qualifying gene nothing is selected
    # instead of max() failing on an empty sequence.
    he_m = int(selected.max()) + 1 if len(selected) else 0

    wl = pd.Series(wilcox['names'][i]).head(he_m)
    lr = logreg_top[i]
    li = wl[wl.isin(lr)].to_list()
    figure = get_plots(adata, clusteringlayer, i, li)
    figure.savefig('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_pdfs_and_excels_supp/umap_'+clusteringlayer+'_cluster_'+i+'.pdf',format = 'pdf')
    # Release the figure so memory does not grow across clusters.
    figure.clf()
    plt.close(figure)

  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), 

  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), 

In [None]:
# Top 15 Wilcoxon markers of every cluster, drawn with get_plots and saved as a PDF
# in the Wilcoxon output folder.
wilcox_top15 = pd.DataFrame(adata.uns['rank_genes_groups_wilcox_'+clusteringlayer]['names']).head(15)
for i in adata.obs[clusteringlayer].cat.categories:
    li = wilcox_top15[i].to_list()
    figure = get_plots(adata, clusteringlayer, i, li)
    figure.savefig('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_wilcox_pdfs_and_excels/umap_'+clusteringlayer+'_cluster_'+i+'.pdf',format = 'pdf')
    # Release the figure before moving on to the next cluster.
    figure.clf()
    plt.close(figure)

  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), 

  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), 

In [None]:
# Top 15 logistic-regression markers of every cluster, drawn with get_plots and saved
# as a PDF in the logreg output folder.
logreg_top15 = pd.DataFrame(adata.uns['rank_genes_groups_logreg_'+clusteringlayer]['names']).head(15)
for i in adata.obs[clusteringlayer].cat.categories:
    li = logreg_top15[i].to_list()
    figure = get_plots(adata, clusteringlayer, i, li)
    figure.savefig('/mnt/sda/david/hydractinia/Supplementary_Files/'+name_of_analysis+'_marker_logreg_pdfs_and_excels/umap_'+clusteringlayer+'_cluster_'+i+'.pdf',format = 'pdf')
    # Release the figure before moving on to the next cluster.
    figure.clf()
    plt.close(figure)

  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), 

  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), np.nan)
  values = values.replace(values.categories.difference(groups), 