In [None]:
# Enable the autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to the project packages are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Use jedi for tab completion.
%config Completer.use_jedi = True

In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt
mpl.rc("figure", dpi=200)
import numpy as np
import jack
from lmz import *

# Data preparation
import cellsaw
import cellsaw.io_utils
import notebookhelper


In [None]:
# Load the data as a nested list, one inner list per sampling seed
# (original note: -> [[anndata]]).
filenames = notebookhelper.filenames44
path = '/home/ubuntu/data/scdata/'
datasets = [
    [
        cellsaw.io_utils.nuread(
            sample_size=1000,
            sampleseed=seed,
            dir=path,
            remove_cells={'celltype': ["no pangalo", "Unknown"]},
            dataset=fname,
        )
        for fname in filenames
    ]
    # TODO: for seed in [42,1337,31337]
    for seed in [42]
]

In [None]:
%%time 
from matplotlib import pyplot as plt
from cellsaw import similarity 
from cellsaw.similarity import measures
import logging


def datasets_to_similarity(datasets):
    """Rank one list of datasets by similarity.

    Thin wrapper around ``similarity.rank_by_sim_splitbyname`` that passes the
    notebook-level ``filenames`` and fixed settings: "cell_ranger" as method,
    2450 genes and "jaccard" as similarity.  The helper's result is returned
    unchanged.
    """
    return similarity.rank_by_sim_splitbyname(
        datasets,
        filenames,
        method="cell_ranger",
        numgenes=2450,
        similarity="jaccard",
    )

# Apply datasets_to_similarity to every per-seed dataset list.
# `Map` is not defined in this notebook; presumably it comes from
# `from lmz import *` -- TODO confirm.
# NOTE: this rebinds `datasets`, so re-running the cell feeds the already
# ranked output back into datasets_to_similarity; re-run the loading cell first.
datasets = Map(datasets_to_similarity, datasets)




In [None]:
from cellsaw import annotate
from ubergauss import tools
import cellsaw.preprocess as prep

def evaluate(methods):
    """Run every method on every entry of every ranked list in ``datasets``.

    Parameters
    ----------
    methods : list of callables
        Each callable receives one entry of a ranked list and must return a
        ``(score, name)`` tuple.  ``methods`` is iterated once per entry, so it
        has to be re-iterable (a list or tuple, not a generator).

    Returns
    -------
    The result of ``tools.xmap`` over all (method, entry) tasks; each task
    yields a dict with the keys 'reapeat', 'dataset', 'score' and 'method'.

    Notes
    -----
    Reads the notebook-level ``datasets`` variable instead of taking it as an
    argument, so the data-loading and ranking cells must have run first.
    """

    def mkscore(method_i_j_data):
        # One task: unpack, run the method, and tag the score with its origin.
        method, i, j, data = method_i_j_data
        score, name = method(data)
        # The key 'reapeat' (sic) is kept as is: the hard-coded `asd` records
        # in this notebook use the same spelling.
        return {'reapeat': i, 'dataset': j, 'score': score, 'method': name}

    # Lazily enumerate all tasks: i indexes the outer list of `datasets`,
    # j indexes the entry within that ranked list.
    warp = (
        [method, i, j, pair]
        for i, rankedlist in enumerate(datasets)
        for j, pair in enumerate(rankedlist)
        for method in methods
    )
    return tools.xmap(mkscore, warp)


# Keyword arguments forwarded to annotate.predict_celltype, both directly
# (mymethod) and as `annotatorargs` of the multiannotate* functions.
best = dict(
    linear_assignment_factor=5.62261789365175,
    n_genes=1050,
    n_inter_neighbors=4,
    n_intra_neighbors=5,
    pca_dim=37,
    pp="cell_ranger",
    sigmafac=64.4539639069615,
    umap_dim=8,
)

def mymethod(pair):
    """Score ``annotate.predict_celltype`` with the ``best`` parameters.

    ``pair[0]`` is the target and ``pair[1]`` the source.  Predictions are
    requested under 'predicted_celltype' and compared with the target's
    'celltype' column.

    Returns ``(score, 'Diffusion')``.
    """
    target = pair[0]
    source = pair[1]

    # The return value is discarded; presumably predict_celltype writes the
    # labels into `target` in place -- TODO confirm.
    annotate.predict_celltype(
        target,
        source,
        source_label='celltype',
        target_label='predicted_celltype',
        make_even=True,
        **best
    )

    return (
        annotate.accuracy_evaluation(target, true='celltype', predicted='predicted_celltype'),
        'Diffusion',
    )


def linsum(pair):
    """Score ``annotate.linsum_copylabel`` on target ``pair[0]`` / source ``pair[1]``.

    Both datasets first get gene scores via ``prep.annotate_genescore`` with
    the 'cell_ranger' selector.  Predictions go to 'linsum_copy' and are
    compared with the target's 'celltype' column.

    Returns ``(score, 'linsum_copy')``.
    """
    label = 'linsum_copy'
    target = pair[0]
    source = pair[1]

    prep.annotate_genescore([target, source], selector='cell_ranger')
    target = annotate.linsum_copylabel(
        target,
        source,
        source_label='celltype',
        target_label=label,
        n_genes=1150,
        pca_dim=32,
        umap_dim=10,
    )
    return annotate.accuracy_evaluation(target, true='celltype', predicted=label), label


def knn(pair):
    """Score ``annotate.label_knn`` on target ``pair[0]`` / source ``pair[1]``.

    Both datasets first get gene scores via ``prep.annotate_genescore`` with
    the 'cell_ranger' selector.  Predictions go to 'knn' and are compared with
    the target's 'celltype' column.

    Returns ``(score, 'knn')``.
    """
    label = 'knn'
    target = pair[0]
    source = pair[1]

    prep.annotate_genescore([target, source], selector='cell_ranger')
    target = annotate.label_knn(
        target,
        source,
        source_label='celltype',
        target_label=label,
        k=14,
        n_genes=1050,
        pca_dim=43,
        umap_dim=9,
    )
    return annotate.accuracy_evaluation(target, true='celltype', predicted=label), label


def rawdiff(pair):
    """Score ``annotate.raw_diffusion`` on target ``pair[0]`` / source ``pair[1]``.

    Both datasets first get gene scores via ``prep.annotate_genescore`` with
    the 'cell_ranger' selector.  Predictions go to 'raw_diffusion' and are
    compared with the target's 'celltype' column.

    Returns ``(score, 'raw_diffusion')``.
    """
    tlabel = 'raw_diffusion'
    target = pair[0]
    source = pair[1]

    prep.annotate_genescore([target, source], selector='cell_ranger')
    target = annotate.raw_diffusion(
        target,
        source,
        source_label='celltype',
        target_label=tlabel,
        # parameters noted in the original as belonging to the rbf kernel
        gamma=0.745859419828891,
        n_genes=1300,
        pca_dim=36,
        umap_dim=12,
    )
    return annotate.accuracy_evaluation(target, true='celltype', predicted=tlabel), tlabel

def rawdiff_combat(pair):
    """Score ``annotate.raw_diffusion_combat`` on target ``pair[0]`` / source ``pair[1]``.

    Both datasets first get gene scores via ``prep.annotate_genescore`` with
    the 'cell_ranger' selector.  Predictions go to 'raw_diffusion_combat' and
    are compared with the target's 'celltype' column.

    Returns ``(score, 'raw_diffusion_combat')``.
    """
    tlabel = 'raw_diffusion_combat'
    target = pair[0]
    source = pair[1]

    prep.annotate_genescore([target, source], selector='cell_ranger')
    target = annotate.raw_diffusion_combat(
        target,
        source,
        source_label='celltype',
        target_label=tlabel,
        gamma=0.9111000495395297,
        n_genes=1300,
        pca_dim=51,
        umap_dim=13,
    )
    return annotate.accuracy_evaluation(target, true='celltype', predicted=tlabel), tlabel


def tunnelclust(pair):
    """Score ``annotate.tunnelclust`` on target ``pair[0]`` / source ``pair[1]``.

    Both datasets first get gene scores via ``prep.annotate_genescore`` with
    the 'cell_ranger' selector.  Predictions go to 'linear assignment EM' and
    are compared with the target's 'celltype' column.

    Returns ``(score, 'linear assignment EM')``.
    """
    tlabel = 'linear assignment EM'
    target = pair[0]
    source = pair[1]

    prep.annotate_genescore([target, source], selector='cell_ranger')
    target = annotate.tunnelclust(
        target,
        source,
        source_label='celltype',
        target_label=tlabel,
        n_genes=1000,
        pca_dim=31,
        umap_dim=14,
    )
    return annotate.accuracy_evaluation(target, true='celltype', predicted=tlabel), tlabel



def _multiannotate(pair, k):
    """Annotate ``pair[0]`` from the ``k`` entries that follow it and score the result.

    Shared implementation of multiannotate2/3/4, which previously were three
    copies differing only in ``k``.

    Parameters
    ----------
    pair : sequence
        Target at index 0, followed by at least ``k`` source datasets.
    k : int
        Number of sources handed to ``annotate.multi_annotate``.

    Returns
    -------
    ``(score, label)`` where ``label`` is 'Ensemble k=<k>'.
    """
    tlabel = 'Ensemble k={}'.format(k)
    target = pair[0]
    source = pair[1:k + 1]

    # Gene scores are annotated for the target together with the k sources.
    prep.annotate_genescore(pair[:k + 1], selector='cell_ranger')
    target = annotate.multi_annotate(target, source,
                                     annotator=annotate.predict_celltype,
                                     source_label='celltype',
                                     target_label=tlabel,
                                     annotatorargs=best)
    score = annotate.accuracy_evaluation(target, true='celltype', predicted=tlabel)
    return score, tlabel


def multiannotate3(pair):
    """Ensemble annotation using sources ``pair[1:4]``; returns ``(score, 'Ensemble k=3')``."""
    return _multiannotate(pair, 3)


def multiannotate2(pair):
    """Ensemble annotation using sources ``pair[1:3]``; returns ``(score, 'Ensemble k=2')``."""
    return _multiannotate(pair, 2)


def multiannotate4(pair):
    """Ensemble annotation using sources ``pair[1:5]``; returns ``(score, 'Ensemble k=4')``."""
    return _multiannotate(pair, 4)
    

def markercount(pair):
    """Score ``annotate.markercount`` on target ``pair[0]`` / source ``pair[1]``.

    Predictions go to 'markercount' and are compared with the target's
    'celltype' column.  Unlike the other single-source methods here, no
    ``prep.annotate_genescore`` call is made first.

    Returns ``(score, 'markercount')``.
    """
    tlabel = 'markercount'
    target, source = pair[0], pair[1]
    target = annotate.markercount(
        target,
        source,
        source_label='celltype',
        target_label=tlabel,
    )
    return annotate.accuracy_evaluation(target, true='celltype', predicted=tlabel), tlabel


# Evaluate the methods and show a bar plot of score per method.
# Only the ensemble variants are run at the moment.  TODO: full list:
#bla = evaluate([mymethod, linsum, knn, rawdiff,markercount, rawdiff_combat, tunnelclust, multiannotate3, multiannotate2, multiannotate4])
import pandas as pd
import seaborn as sns

bla = evaluate([multiannotate3, multiannotate2, multiannotate4])
df = pd.DataFrame(bla)
# Trailing ';' keeps the cell from echoing the Axes repr.
sns.barplot(data=df, y='score', x='method');
    
#evaluate([multiannotate])

In [None]:
# Hard-coded score records with the same keys that evaluate() emits
# ('reapeat', 'dataset', 'score', 'method'): repeats 0-2, datasets 0-43,
# all labelled 'multi_annotate'.  Presumably pasted from an earlier run --
# TODO confirm the origin.
asd = [{'reapeat': 0, 'dataset': 0, 'score': 0.806, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 1, 'score': 0.876, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 2, 'score': 0.944, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 3, 'score': 0.925, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 4, 'score': 0.862, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 5, 'score': 0.8, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 6, 'score': 0.441, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 7, 'score': 0.826, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 8, 'score': 0.841, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 9, 'score': 0.953, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 10, 'score': 0.776, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 11, 'score': 0.647, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 12, 'score': 0.81, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 13, 'score': 0.873, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 14, 'score': 0.787, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 15, 'score': 0.808, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 16, 'score': 0.959, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 17, 'score': 0.741, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 18, 'score': 0.838, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 19, 'score': 0.648, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 20, 'score': 0.952, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 21, 'score': 0.836, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 22, 'score': 0.961, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 23, 'score': 0.831, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 24, 'score': 0.882, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 25, 'score': 0.758, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 26, 'score': 0.673, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 27, 'score': 0.502, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 28, 'score': 0.988, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 29, 'score': 0.953, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 30, 'score': 0.826, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 31, 'score': 0.793, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 32, 'score': 0.788, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 33, 'score': 0.82, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 34, 'score': 0.928, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 35, 'score': 0.906, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 36, 'score': 0.924, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 37, 'score': 0.994, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 38, 'score': 0.963, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 39, 'score': 0.956, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 40, 'score': 0.972, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 41, 'score': 0.993, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 42, 'score': 0.989, 'method': 'multi_annotate'},
 {'reapeat': 0, 'dataset': 43, 'score': 0.999, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 0, 'score': 0.76, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 1, 'score': 0.892, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 2, 'score': 0.936, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 3, 'score': 0.92, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 4, 'score': 0.877, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 5, 'score': 0.747, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 6, 'score': 0.671, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 7, 'score': 0.83, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 8, 'score': 0.487, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 9, 'score': 0.351, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 10, 'score': 0.699, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 11, 'score': 0.564, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 12, 'score': 0.823, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 13, 'score': 0.876, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 14, 'score': 0.789, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 15, 'score': 0.81, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 16, 'score': 0.945, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 17, 'score': 0.64, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 18, 'score': 0.848, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 19, 'score': 0.85, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 20, 'score': 0.942, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 21, 'score': 0.856, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 22, 'score': 0.508, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 23, 'score': 0.859, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 24, 'score': 0.891, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 25, 'score': 0.632, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 26, 'score': 0.709, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 27, 'score': 0.514, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 28, 'score': 0.974, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 29, 'score': 0.955, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 30, 'score': 0.806, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 31, 'score': 0.79, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 32, 'score': 0.768, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 33, 'score': 0.852, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 34, 'score': 0.927, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 35, 'score': 0.89, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 36, 'score': 0.918, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 37, 'score': 0.994, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 38, 'score': 0.946, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 39, 'score': 0.961, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 40, 'score': 0.967, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 41, 'score': 0.998, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 42, 'score': 0.989, 'method': 'multi_annotate'},
 {'reapeat': 1, 'dataset': 43, 'score': 1.0, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 0, 'score': 0.774, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 1, 'score': 0.868, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 2, 'score': 0.96, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 3, 'score': 0.932, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 4, 'score': 0.859, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 5, 'score': 0.808, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 6, 'score': 0.479, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 7, 'score': 0.797, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 8, 'score': 0.623, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 9, 'score': 0.877, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 10, 'score': 0.724, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 11, 'score': 0.594, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 12, 'score': 0.808, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 13, 'score': 0.874, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 14, 'score': 0.786, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 15, 'score': 0.796, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 16, 'score': 0.952, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 17, 'score': 0.589, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 18, 'score': 0.822, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 19, 'score': 0.445, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 20, 'score': 0.963, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 21, 'score': 0.835, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 22, 'score': 0.837, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 23, 'score': 0.89, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 24, 'score': 0.87, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 25, 'score': 0.601, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 26, 'score': 0.7, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 27, 'score': 0.521, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 28, 'score': 0.875, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 29, 'score': 0.948, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 30, 'score': 0.816, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 31, 'score': 0.82, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 32, 'score': 0.743, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 33, 'score': 0.836, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 34, 'score': 0.894, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 35, 'score': 0.889, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 36, 'score': 0.921, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 37, 'score': 0.991, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 38, 'score': 0.934, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 39, 'score': 0.953, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 40, 'score': 0.977, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 41, 'score': 0.997, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 42, 'score': 0.978, 'method': 'multi_annotate'},
 {'reapeat': 2, 'dataset': 43, 'score': 1.0, 'method': 'multi_annotate'}]

import pandas as pd
import seaborn as sns
# Tabulate the hard-coded records and display their overall mean score.
dff = pd.DataFrame(data=asd)
dff['score'].mean()

In [None]:
# Bar plot of score per method.
sns.barplot(x='method', y='score', data=df)

In [None]:
# Heatmap of scores with one row per method and one column per dataset;
# the small font scale keeps the tick labels readable.
sns.set(font_scale=0.5)
sns.heatmap(
    df.pivot_table(values='score', index='method', columns='dataset'),
    square=True,
)

In [None]:
# method x dataset table of scores (pivot_table aggregates duplicates with its default, the mean).
df2 = df.pivot_table(values='score', index='method', columns='dataset')

In [None]:
# Per-dataset scores of two methods plotted against each other
# (x: 'Diffusion', y: 'raw_diffusion'); the diagonal marks equal scores.
compared = ['Diffusion', 'raw_diffusion']
absent = [name for name in compared if name not in df2.index]
if absent:
    # df2 only has rows for the methods that were actually evaluated; .loc on
    # a missing method raises KeyError and would abort a Restart & Run All.
    print('scatter plot skipped, methods not in df2: {}'.format(absent))
else:
    val1 = df2.loc['Diffusion'].tolist()
    val = df2.loc['raw_diffusion'].tolist()
    ax = sns.scatterplot(x=val1, y=val)
    sns.lineplot(x=(0, 1), y=(0, 1), ax=ax)
    ax.set(xlabel='Diffusion score', ylabel='raw_diffusion score')

# here
Make two violin plots:
- first, for the individual methods: turn the bar plot into a violin plot
- then, for the multi-annotator with the best method: show the results for 1, 2, 3 and 4 :D


In [None]:

# Boolean row masks: ensemble methods (name starts with 'Ensemble') vs. the rest.
multi = np.array([name.startswith('Ensemble') for name in df['method']])
inver = np.logical_not(multi)

# First figure: score distribution of the ensemble variants.
sns.violinplot(x="method", y="score", inner="points", data=df.iloc[multi])
plt.show()
# Second figure: score distribution of all other methods.
sns.violinplot(x="method", y="score", inner="points", data=df.iloc[inver])