In [None]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = True

In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt
mpl.rc("figure", dpi=100)
import numpy as np
import jack
from lmz import *

# Data preparation
import cellsaw
import cellsaw.io_utils
import notebookhelper


In [None]:
# Load the data as a nested list: one inner list per sampling seed, each holding
# one `nuread` result per entry of `filenames` (original note: [[anndata]]).
filenames = notebookhelper.filenames44
path = '/home/ubuntu/data/scdata/'
datasets = [
    [
        cellsaw.io_utils.nuread(
            sample_size=1000,
            sampleseed=sample_seed,
            dir=path,
            remove_cells={'celltype': ["no pangalo", "Unknown"]},
            dataset=name,
        )
        for name in filenames
    ]
    for sample_seed in [42, 1336, 31338]  # use [42] alone for a quick single-seed run
]

In [None]:
%%time 
from matplotlib import pyplot as plt
from cellsaw import similarity 
from cellsaw.similarity import measures
import logging


def datasets_to_similarity(datasets):
    """Rank one list of datasets with ``similarity.rank_by_sim_splitbyname``.

    The call is made with the notebook-level ``filenames``, the
    ``"cell_ranger"`` method, 2450 genes and the ``"jaccard"`` similarity.
    The result of that call is returned unchanged.
    """
    ranking_options = dict(method="cell_ranger", numgenes=2450, similarity="jaccard")
    return similarity.rank_by_sim_splitbyname(datasets, filenames, **ranking_options)

# Apply the ranking to every per-seed list of datasets.
# NOTE: this rebinds `datasets` to the ranked output, so the cell is not
# idempotent -- re-running it without re-running the loading cell feeds the
# already-ranked structure back into `datasets_to_similarity`.
datasets = Map(datasets_to_similarity, datasets)




In [None]:
from cellsaw import annotate
from ubergauss import tools
import cellsaw.preprocess as prep
import copy

def evaluate(methods, n_jobs=16):
    """Score every method on every entry of the notebook-level ``datasets``.

    Tasks are dispatched through ``tools.xmap``.

    Args:
        methods: list of callables. Each one is called with a list holding
            ``.copy()`` of every element of ``datasets[i][j]`` and must return
            a ``(score, name)`` tuple.
        n_jobs: forwarded to ``tools.xmap`` as ``n_jobs``. Defaults to 16, the
            value that was previously hard-coded.

    Returns:
        Whatever ``tools.xmap`` returns for the per-task records. Each record
        is a dict with the keys ``'reapeat'`` (index into ``datasets``),
        ``'dataset'`` (index inside that repeat), ``'score'`` and ``'method'``.

    Note:
        A second ``evaluate`` further down in this cell rebinds the name to a
        serial variant once the whole cell has run.
    """
    def mkscore(method_i_j):
        method, i, j = method_i_j
        # Hand the method copies, not the shared notebook-level objects.
        d = [z.copy() for z in datasets[i][j]]
        score, name = method(d)
        # The key 'reapeat' (sic) is kept: it is a column name of the result table.
        return {'reapeat': i, 'dataset': j, 'score': score, 'method': name}

    # One task per (repeat, dataset, method); methods vary fastest.
    warp = ([method, i, j]
            for i, rankedlist in enumerate(datasets)
            for j, _ in enumerate(rankedlist)
            for method in methods)
    return tools.xmap(mkscore, warp, n_jobs=n_jobs)

# Keyword arguments for annotate.predict_celltype: unpacked directly in
# `mymethod` and passed as `annotatorargs` by the ensemble methods.
best = dict(
    linear_assignment_factor=5.62261789365175,
    n_genes=1050,
    n_inter_neighbors=4,
    n_intra_neighbors=5,
    pca_dim=37,
    pp="cell_ranger",
    sigmafac=64.4539639069615,
    umap_dim=8,
)

def mymethod(pair):
    """Score ``annotate.predict_celltype`` on one target/source pair.

    ``pair[0]`` is used as the target and ``pair[1]`` as the source. The
    prediction call receives ``source_label='celltype'``,
    ``target_label='predicted_celltype'``, ``make_even=True`` and the
    notebook-level ``best`` parameters. Its return value is discarded, so it
    presumably labels the target in place (confirm against ``cellsaw``).

    Returns:
        ``(score, 'Diffusion')`` where ``score`` comes from
        ``annotate.accuracy_evaluation``.
    """
    target = pair[0]
    source = pair[1]
    predicted_label = 'predicted_celltype'

    annotate.predict_celltype(
        target, source,
        source_label='celltype',
        target_label=predicted_label,
        make_even=True,
        **best,
    )

    accuracy = annotate.accuracy_evaluation(target, true='celltype', predicted=predicted_label)
    return accuracy, 'Diffusion'


def linsum(pair):
    """Score ``annotate.linsum_copylabel`` on one target/source pair.

    ``pair[0]`` is the target and ``pair[1]`` the source. Both are first passed
    to ``prep.annotate_genescore`` with the ``'cell_ranger'`` selector, then to
    ``linsum_copylabel`` with ``source_label='celltype'`` and
    ``target_label='linsum_copy'``.

    Returns:
        ``(score, 'linsum_copy')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by
        ``linsum_copylabel``.
    """
    label = 'linsum_copy'
    target = pair[0]
    source = pair[1]
    prep.annotate_genescore([target, source], selector='cell_ranger')
    annotated = annotate.linsum_copylabel(
        target, source,
        source_label='celltype',
        target_label=label,
        n_genes=1150, pca_dim=32, umap_dim=10,
    )
    accuracy = annotate.accuracy_evaluation(annotated, true='celltype', predicted=label)
    return accuracy, label


def knn(pair):
    """Score ``annotate.label_knn`` on one target/source pair.

    ``pair[0]`` is the target and ``pair[1]`` the source. Both are first passed
    to ``prep.annotate_genescore`` with the ``'cell_ranger'`` selector, then to
    ``label_knn`` with ``source_label='celltype'`` and ``target_label='knn'``.

    Returns:
        ``(score, 'knn')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by ``label_knn``.
    """
    label = 'knn'
    target = pair[0]
    source = pair[1]
    prep.annotate_genescore([target, source], selector='cell_ranger')
    annotated = annotate.label_knn(
        target, source,
        source_label='celltype',
        target_label=label,
        k=14, n_genes=1050, pca_dim=43, umap_dim=9,
    )
    accuracy = annotate.accuracy_evaluation(annotated, true='celltype', predicted=label)
    return accuracy, label


def rawdiff(pair):
    """Score ``annotate.raw_diffusion`` on one target/source pair.

    ``pair[0]`` is the target and ``pair[1]`` the source. Both are first passed
    to ``prep.annotate_genescore`` with the ``'cell_ranger'`` selector, then to
    ``raw_diffusion`` with ``source_label='celltype'`` and
    ``target_label='raw_diffusion'``.

    Returns:
        ``(score, 'raw_diffusion')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by
        ``raw_diffusion``.
    """
    label = 'raw_diffusion'
    target = pair[0]
    source = pair[1]
    prep.annotate_genescore([target, source], selector='cell_ranger')
    annotated = annotate.raw_diffusion(
        target, source,
        source_label='celltype',
        target_label=label,
        # original note on these parameters: rbf kernel
        gamma=0.745859419828891, n_genes=1300, pca_dim=36, umap_dim=12,
    )
    accuracy = annotate.accuracy_evaluation(annotated, true='celltype', predicted=label)
    return accuracy, label

def rawdiff_combat(pair):
    """Score ``annotate.raw_diffusion_combat`` on one target/source pair.

    ``pair[0]`` is the target and ``pair[1]`` the source. Both are first passed
    to ``prep.annotate_genescore`` with the ``'cell_ranger'`` selector, then to
    ``raw_diffusion_combat`` with ``source_label='celltype'`` and
    ``target_label='raw_diffusion_combat'``.

    Returns:
        ``(score, 'raw_diffusion_combat')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by
        ``raw_diffusion_combat``.
    """
    label = 'raw_diffusion_combat'
    target = pair[0]
    source = pair[1]
    prep.annotate_genescore([target, source], selector='cell_ranger')
    annotated = annotate.raw_diffusion_combat(
        target, source,
        source_label='celltype',
        target_label=label,
        gamma=0.9111000495395297, n_genes=1300, pca_dim=51, umap_dim=13,
    )
    accuracy = annotate.accuracy_evaluation(annotated, true='celltype', predicted=label)
    return accuracy, label


def tunnelclust(pair):
    """Score ``annotate.tunnelclust`` on one target/source pair.

    ``pair[0]`` is the target and ``pair[1]`` the source. Both are first passed
    to ``prep.annotate_genescore`` with the ``'cell_ranger'`` selector, then to
    ``annotate.tunnelclust`` with ``source_label='celltype'`` and
    ``target_label='linear assignment EM'``.

    Returns:
        ``(score, 'linear assignment EM')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by
        ``annotate.tunnelclust``.
    """
    label = 'linear assignment EM'
    target = pair[0]
    source = pair[1]
    prep.annotate_genescore([target, source], selector='cell_ranger')
    annotated = annotate.tunnelclust(
        target, source,
        source_label='celltype',
        target_label=label,
        n_genes=1000, pca_dim=31, umap_dim=14,
    )
    accuracy = annotate.accuracy_evaluation(annotated, true='celltype', predicted=label)
    return accuracy, label

def _multiannotate(pair, k):
    """Shared body of the ``multiannotate<k>`` ensemble scorers.

    Args:
        pair: sequence whose first element is the target; the following
            elements are candidate sources.
        k: number of sources to use; the slice ``pair[1:k + 1]`` is taken, so
            fewer are used when ``pair`` is shorter.

    Returns:
        ``(score, 'Ensemble k=<k>')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by
        ``annotate.multi_annotate``.
    """
    tlabel = 'Ensemble k={}'.format(k)
    target = pair[0]
    source = pair[1:k + 1]

    # Gene scores are annotated on the target together with the sources in use.
    prep.annotate_genescore(pair[:k + 1], selector='cell_ranger')
    target = annotate.multi_annotate(target, source,
                                     annotator=annotate.predict_celltype,
                                     source_label='celltype',
                                     target_label=tlabel,
                                     annotatorargs=best)
    score = annotate.accuracy_evaluation(target, true='celltype', predicted=tlabel)
    return score, tlabel


def multiannotate2(pair):
    """Ensemble annotation of ``pair[0]`` from ``pair[1:3]``; returns ``(score, 'Ensemble k=2')``."""
    return _multiannotate(pair, 2)


def multiannotate3(pair):
    """Ensemble annotation of ``pair[0]`` from ``pair[1:4]``; returns ``(score, 'Ensemble k=3')``."""
    return _multiannotate(pair, 3)


def multiannotate4(pair):
    """Ensemble annotation of ``pair[0]`` from ``pair[1:5]``; returns ``(score, 'Ensemble k=4')``."""
    return _multiannotate(pair, 4)
    

def markercount(pair):
    """Score ``annotate.markercount`` on one target/source pair.

    ``pair[0]`` is the target and ``pair[1]`` the source. They are passed to
    ``annotate.markercount`` with ``source_label='celltype'`` and
    ``target_label='markercount'``; no gene-score preprocessing is done here.

    Returns:
        ``(score, 'markercount')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by
        ``annotate.markercount``.
    """
    label = 'markercount'
    annotated = annotate.markercount(
        pair[0], pair[1],
        source_label='celltype',
        target_label=label,
    )
    accuracy = annotate.accuracy_evaluation(annotated, true='celltype', predicted=label)
    return accuracy, label


# Imported outside the guard: later cells use `pd` and `sns` whether or not the
# full benchmark below is run.
import pandas as pd
import seaborn as sns

# Set to True to (re)run the full benchmark over all methods. It is off by
# default, exactly like the `if False:` guard it replaces.
RUN_FULL_BENCHMARK = False

if RUN_FULL_BENCHMARK:
    bla = evaluate([mymethod, linsum, knn, rawdiff,markercount, rawdiff_combat, tunnelclust, multiannotate3, multiannotate2, multiannotate4]) 
    #bla = evaluate([mymethod])
    df = pd.DataFrame(bla)
    sns.barplot(data = df, y= 'score', x = 'method' )
elif 'bla' not in globals():
    # A later cell concatenates `bla` with the scanorama records, so the name
    # must exist on a fresh kernel. Results already held in the kernel from an
    # earlier benchmark run are left untouched.
    # NOTE: with this empty fallback, later cells that select specific methods
    # (e.g. 'Diffusion') still need the benchmark to have been run.
    bla = []
    
    
    
def evaluate(methods):
    """Score every method on every entry of ``datasets``, one task at a time.

    This definition rebinds ``evaluate`` and so shadows the ``tools.xmap``-based
    version defined earlier in the same cell. Every call made after this point
    runs serially.

    Args:
        methods: list of callables. Each one is called with a list holding
            ``.copy()`` of every element of ``datasets[i][j]`` and must return
            a ``(score, name)`` tuple.

    Returns:
        A list of dicts with the keys ``'reapeat'`` (index into ``datasets``),
        ``'dataset'`` (index inside that repeat), ``'score'`` and ``'method'``.
        Records are ordered by repeat, then dataset, then method.
    """
    records = []
    for i, rankedlist in enumerate(datasets):
        for j, _ in enumerate(rankedlist):
            for method in methods:
                # Hand the method copies, not the shared notebook-level objects.
                d = [z.copy() for z in datasets[i][j]]
                score, name = method(d)
                # The key 'reapeat' (sic) is kept: it is a column name of the result table.
                records.append({'reapeat': i, 'dataset': j, 'score': score, 'method': name})
    return records


def scanorama(pair):
    """Score ``annotate.scanorama_integrate_diffusion`` on one target/source pair.

    ``pair[0]`` is the target and ``pair[1]`` the source. The call receives
    ``source_label='celltype'``, ``target_label='scanorama'``, ``pca_dim=40``,
    ``umap_dim=10`` and ``gamma=.75``.

    Returns:
        ``(score, 'scanorama')`` where ``score`` comes from
        ``annotate.accuracy_evaluation`` on the object returned by the
        integration call.
    """
    label = 'scanorama'
    annotated = annotate.scanorama_integrate_diffusion(
        pair[0], pair[1],
        source_label='celltype',
        pca_dim=40,
        umap_dim=10,
        gamma=.75,
        target_label=label,
    )
    accuracy = annotate.accuracy_evaluation(annotated, true='celltype', predicted=label)
    return accuracy, label

# Score the scanorama baseline. At this point `evaluate` is the serial version
# defined just above in this cell, so `scanores` is a list of record dicts.
scanores = evaluate([scanorama]) 
# Leftover experiment (no function named `multiannotate` is defined in this cell):
#evaluate([multiannotate])

In [None]:
# Combine all score records into one table: `bla` comes from the guarded
# full-benchmark run in the previous cell, `scanores` from the scanorama run.
df = pd.DataFrame(bla+scanores)

In [None]:
sns.set(font_scale=0.5)

# Mean score per (method, dataset) cell; the mean is pivot_table's default aggregation.
mean_scores = df.pivot_table(values='score', index='method', columns='dataset')
ax = sns.heatmap(mean_scores, square=True)
ax.set_title('Mean score per method and dataset')
plt.show()

# Variance of the score per (method, dataset) cell. The string 'var' selects
# pandas' own variance, which is what pandas substituted when handed `np.var`.
# Passing the NumPy callable is deprecated and raises a FutureWarning in
# recent pandas versions.
score_variance = df.pivot_table(values='score', index='method', columns='dataset', aggfunc='var')
ax = sns.heatmap(score_variance, square=True)
ax.set_title('Score variance per method and dataset')
plt.show()

In [None]:
from cellsaw import merge as merg
from cellsaw import preprocess as pp 
from cellsaw import draw 
from cellsaw import util

def sad(adatas):
    """Merge the first two datasets and plot them twice.

    Works on ``.copy()`` of ``adatas[0]`` and ``adatas[1]``. The copies are
    passed to ``pp.annotate_genescore`` (selector ``'cell_ranger'``) and then to
    ``merg.Merge`` with ``umaps=[2, 10]``. The merge is plotted with the
    ``obs['celltype']`` labels once before and once after
    ``util.setd2_kernel_mds`` is applied to it. Nothing is returned.
    """
    first_two = [adata.copy() for adata in adatas[:2]]
    pp.annotate_genescore(first_two, selector='cell_ranger')
    merged = merg.Merge(first_two, umaps=[2, 10])

    # Labels are read after the merge, as in the original statement order.
    celltypes = [adata.obs['celltype'] for adata in first_two]

    merged.plot(celltypes, mkmix=True)
    util.setd2_kernel_mds(merged)
    merged.plot(celltypes, mkmix=True)
    
# For every non-ensemble method, print its name followed by the column
# positions of its ten lowest mean scores (ascending argsort of its pivot row).
pv = df.pivot_table(values='score', index='method', columns='dataset')
score_rows = pv.to_numpy()
for row_idx, method_name in enumerate(pv.index):
    if not method_name.startswith('Ens'):
        print(method_name)
        print(np.argsort(score_rows[row_idx])[:10])

# Plot the merge of one hand-picked entry (repeat 0, dataset 16).
sad(datasets[0][16])

In [None]:
# Per-dataset mean scores of 'Diffusion' (x axis) against 'raw_diffusion'
# (y axis), with the line from (0, 0) to (1, 1) drawn for reference.
df2 = df.pivot_table(values='score', index='method', columns='dataset')
val1, val = (df2.loc[method_name].tolist() for method_name in ('Diffusion', 'raw_diffusion'))
sns.scatterplot(x=val1, y=val)
sns.lineplot(x=(0, 1), y=(0, 1))

# here
TODO: make two violin plots:
- first, the individual methods (turn the current bar plot into a violin plot)
- then, the multi-annotator built on the best method, showing 1, 2, 3, 4 :D


In [None]:
# Display the combined results table (one row per score record).
df

In [None]:
sns.set_context('notebook')
inner = 'points'  # value passed as `inner=` to the violin plots
# Boolean row masks: `multi` selects the ensemble methods plus "Diffusion",
# `inver` selects every other method.
multi = np.array([x.startswith('Ensemble') or x == "Diffusion" for x in df['method']])
inver = np.logical_not(multi)


def _plot_score_distribution(plotter, subset, **plot_kwargs):
    """Draw one score-per-method plot for `subset` with `plotter` and show it.

    `plotter` is called as `plotter(data=subset, x="method", y="score", ...)`
    and must return the Axes it drew on (here `sns.violinplot` or `sns.boxplot`).
    """
    ax = plotter(data=subset, x="method", y="score", **plot_kwargs)
    # Rotate the method names so they do not overlap.
    ax.tick_params(axis='x', labelrotation=30)
    plt.show()


# Same order as before: violin then box for the ensemble group, then violin
# then box for the remaining methods.
for mask in (multi, inver):
    subset = df.iloc[mask]
    _plot_score_distribution(sns.violinplot, subset, inner=inner)
    _plot_score_distribution(sns.boxplot, subset)

In [None]:
import anndata as ad
