In [None]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Use Jedi for tab completion in this kernel.
%config Completer.use_jedi = True

In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt
mpl.rc("figure", dpi=200)
import numpy as np
import jack
from lmz import *

# Data preparation
import cellsaw
import cellsaw.io_utils
import notebookhelper


In [None]:
# Load the data once per sampling seed -> [[anndata]]
# (one inner collection per seed, as returned by cellsaw.io_utils.read).
filenames = notebookhelper.filenames
path = '/home/ubuntu/data/scdata/'
datasets = [
    cellsaw.io_utils.read(
        datasets=filenames,
        dir=path,
        suffix='.h5',
        sample_size=1000,
        sampleseed=seed,
        # cells carrying either of these 'celltype' values are passed as remove_cells
        remove_cells={'celltype': ["no pangalo", "Unknown"]},
    )
    for seed in [42, 1337, 31337, 501, 404]
]
                                            


In [None]:
%%time 
from matplotlib import pyplot as plt
from cellsaw import similarity 
from cellsaw.similarity import measures
import logging
logging
# 
# TODO: set the preferred algorithm
# TODO: add a mode that forces the category to be
# correct -> wrapper that groups by filename

def datasets_to_similarity(datasets):
    """Rank a collection of datasets against itself by similarity.

    The same collection is handed to ``similarity.rank_by_similarity`` as both
    ``target`` and ``source`` (method ``"seurat_v3"``, ``numgenes=200``,
    ``"jaccard"`` similarity, ``return_similarity=True``).

    Returns
    -------
    The first element of the ``(ranking, similarity_table)`` pair returned by
    ``rank_by_similarity``; the similarity table is discarded.
    """
    ranking_options = dict(
        method="seurat_v3",
        numgenes=200,
        similarity="jaccard",
        return_similarity=True,
    )
    ranked, _similarity_table = similarity.rank_by_similarity(
        target=datasets,
        source=datasets,
        **ranking_options,
    )
    return ranked

# Apply datasets_to_similarity to every entry of `datasets` (one entry per
# sampling seed) and store the results back under the same name.
# NOTE(review): `Map` is presumably provided by the `from lmz import *` star
# import — verify.
# NOTE: this overwrites its own input, so the cell is not idempotent —
# re-running it would feed already-ranked output back into
# datasets_to_similarity. Re-run the data-loading cell first.
datasets = Map(datasets_to_similarity, datasets)




In [None]:
from cellsaw import annotate
def evaluate(methods, ranked_datasets=None):
    """Score every annotation method on every ranked (target, source) pair.

    Parameters
    ----------
    methods : iterable of callables
        Each is invoked as ``method(premerged, target)`` and must return a
        ``(score, name)`` pair.  The iterable is materialised once, so a
        generator can be passed safely.
    ranked_datasets : iterable of iterables of pairs, optional
        One ranked list per repeat; within each pair, element 0 is used as the
        target and element 1 as the source.  When omitted, the notebook-level
        ``datasets`` variable is used, which is what this function always read
        before the parameter existed.

    Returns
    -------
    list of dict
        One record per (repeat, pair, method) with the keys ``'reapeat'``,
        ``'dataset'``, ``'score'`` and ``'method'``.
    """
    if ranked_datasets is None:
        # Backward-compatible default: fall back to the notebook global.
        ranked_datasets = datasets
    # Materialise so a one-shot iterator is not exhausted after the first pair.
    methods = list(methods)

    records = []
    for repeat_idx, ranked in enumerate(ranked_datasets):
        for pair_idx, pair in enumerate(ranked):
            # Work on copies so the ranked input objects are never the ones
            # handed to the annotation methods.
            target, source = pair[0].copy(), pair[1].copy()
            # The merge is computed once per pair and shared by all methods.
            premerged = annotate.mergewrap(
                target, source,
                umap_dim=5, pca=20, make_even=True, sortfield=2,
            )
            for method in methods:
                score, name = method(premerged, target)
                # The key 'reapeat' (sic) is kept as-is: it is part of the
                # returned record schema that callers may rely on.
                records.append({
                    'reapeat': repeat_idx,
                    'dataset': pair_idx,
                    'score': score,
                    'method': name,
                })
    return records


def mymethod(premerged, target):
    """Evaluate ``annotate.predict_celltype`` and report it as ``'Diffusion'``.

    ``predict_celltype`` is called with ``target``, no separate source
    (``None``) and the pre-merged data; its return value is ignored.  Accuracy
    is then computed on ``target`` itself, comparing the ``'celltype'`` label
    with ``'predicted_celltype'``.
    NOTE(review): this presumably relies on ``predict_celltype`` writing the
    prediction onto ``target`` in place — verify.

    Returns
    -------
    tuple
        ``(score, 'Diffusion')``.
    """
    settings = {
        'source_label': 'celltype',
        'target_label': 'predicted_celltype',
        'premerged': premerged,
        'pca_dim': 20,
        'umap_dim': 5,
        'n_intra_neighbors': 5,
        'n_inter_neighbors': 1,
        'make_even': False,
        'sigmafac': 1,
        'linear_assignment_factor': 1,
        'similarity_scale_factor': 1.0,
    }
    annotate.predict_celltype(target, None, **settings)
    accuracy = annotate.accuracy_evaluation(
        target, true='celltype', predicted='predicted_celltype'
    )
    return accuracy, 'Diffusion'


def linsum(premerged, target):
    """Evaluate ``annotate.linsum_copylabel`` on a pre-merged pair.

    The labels are written under ``'linsum_copy'`` (``pca_dim=20``,
    ``umap_dim=0``) and accuracy is measured against ``'celltype'`` on the
    object returned by ``linsum_copylabel``.

    Returns
    -------
    tuple
        ``(score, 'linsum_copy')``.
    """
    label_column = 'linsum_copy'
    annotated = annotate.linsum_copylabel(
        target, None,
        premerged=premerged,
        source_label='celltype',
        target_label=label_column,
        pca_dim=20,
        umap_dim=0,
    )
    accuracy = annotate.accuracy_evaluation(
        annotated, true='celltype', predicted=label_column
    )
    return accuracy, label_column



def knn(premerged, target):
    """Evaluate ``annotate.label_knn`` (``k=5``) on a pre-merged pair.

    The labels are written under ``'knn'`` (``pca_dim=20``, ``umap_dim=0``)
    and accuracy is measured against ``'celltype'`` on the object returned by
    ``label_knn``.

    Returns
    -------
    tuple
        ``(score, 'knn')``.
    """
    label_column = 'knn'
    annotated = annotate.label_knn(
        target, None,
        premerged=premerged,
        source_label='celltype',
        target_label=label_column,
        pca_dim=20,
        umap_dim=0,
        k=5,
    )
    accuracy = annotate.accuracy_evaluation(
        annotated, true='celltype', predicted=label_column
    )
    return accuracy, label_column

def rawdiff(premerged, target):
    """Evaluate ``annotate.raw_diffusion`` on a pre-merged pair.

    Uses ``n_neighbors=5``, ``gamma=.1``, ``pca_dim=40`` and ``umap_dim=10``.
    The labels are written under ``'raw_diffusion'`` and accuracy is measured
    against ``'celltype'`` on the object returned by ``raw_diffusion``.

    Returns
    -------
    tuple
        ``(score, 'raw_diffusion')``.
    """
    label_column = 'raw_diffusion'
    diffusion_options = dict(
        source_label='celltype',
        target_label=label_column,
        premerged=premerged,
        n_neighbors=5,
        gamma=.1,
        pca_dim=40,
        umap_dim=10,
    )
    annotated = annotate.raw_diffusion(target, None, **diffusion_options)
    accuracy = annotate.accuracy_evaluation(
        annotated, true='celltype', predicted=label_column
    )
    return accuracy, label_column



def markercount(premerged, target):
    """Evaluate ``annotate.markercount`` on a pre-merged pair.

    The labels are written under ``'markercount'`` (``pca_dim=20``,
    ``umap_dim=0``) and accuracy is measured against ``'celltype'`` on the
    object returned by ``annotate.markercount``.

    Returns
    -------
    tuple
        ``(score, 'markercount')``.
    """
    label_column = 'markercount'

    # Caveat from the original author: unknowns are excluded here.
    # NOTE(review): where that exclusion happens is not visible in this
    # function — confirm before comparing against the other methods.
    annotated = annotate.markercount(
        target, None,
        premerged=premerged,
        source_label='celltype',
        target_label=label_column,
        pca_dim=20,
        umap_dim=0,
    )
    accuracy = annotate.accuracy_evaluation(
        annotated, true='celltype', predicted=label_column
    )
    return accuracy, label_column


# Run all five annotation methods over every ranked dataset pair, collect the
# per-run records into a DataFrame and plot the score per method.
import pandas as pd
import seaborn as sns

bla = evaluate([mymethod, linsum, knn, rawdiff, markercount])
df = pd.DataFrame(bla)
sns.barplot(data=df, x='method', y='score')
            
    

In [None]:
# Score per method; redraws the same bar plot that ends the evaluation cell.
sns.barplot(data=df, x='method', y='score')

In [None]:
# Method x dataset grid of scores; pivot_table aggregates the repeats of each
# (method, dataset) cell with its default aggfunc.
# TODO: use a smaller font size.
sns.heatmap(
    df.pivot_table(values='score', index='method', columns='dataset'),
    square=True,
)