# Template for classifying data with BLC

### From Forlin, Tajvar et al.

# Imports:

In [1]:
import anndata as ad
import pandas as pd
import scanpy as sc
import numpy as np
import pickle
import pandas as pd
from sklearn.preprocessing import quantile_transform
import pickle
import psutil
import os
import time
from tqdm import tqdm
from sklearn.metrics import f1_score
from BLC_classification_lib import MultiClassWrapper, calc_emd, OneVsOneWrapper

# Classifying with BLC classifier

In [4]:
#Function to classify new data
def classify_data(clf, data_to_classify, copy=False):
    """Predict cell types for new data with a trained BLC classifier.

    The data are target-sum normalised (1e4 per cell), restricted to the
    marker genes used by the classifier, quantile-transformed per gene, and
    passed to ``clf.predict``. The predictions are then appended to ``.obs``.

    Parameters
    ----------
    clf
        Trained classifier. It must expose ``classifiers`` (a dict whose
        values carry a ``features_`` collection of marker gene names) and a
        ``predict`` method that accepts a DataFrame of cells x genes.
    data_to_classify
        AnnData object holding the cells to classify.
        NOTE(review): presumably ``.X`` should contain un-normalised counts,
        since normalisation is applied here - confirm against the training setup.
    copy : bool, default False
        If False (original behaviour), ``data_to_classify`` itself is
        normalised and annotated in place. If True, a copy is processed and
        the caller's object is left untouched.

    Returns
    -------
    The (normalised) AnnData object with the prediction columns added to
    ``.obs``. This is the input object itself unless ``copy=True``.

    Raises
    ------
    ValueError
        If none of the classifier's marker genes are present in the data.
    """
    import warnings

    if copy:
        data_to_classify = data_to_classify.copy()

    #Target sum normalisation for data (applied in place to the AnnData object)
    sc.pp.normalize_total(data_to_classify, target_sum = 1e4)
    df_data_to_classify = data_to_classify.to_df()

    #Union of the marker genes used by any of the underlying classifiers
    all_markergenes = set()
    for sub_clf in clf.classifiers.values():
        all_markergenes.update(sub_clf.features_)

    #Select marker-gene columns with a boolean mask; unlike label-based selection
    #this cannot multiply columns when gene names are not unique
    marker_mask = df_data_to_classify.columns.isin(all_markergenes)
    if not marker_mask.any():
        raise ValueError(
            "None of the classifier's marker genes were found among the gene names of data_to_classify."
        )
    missing_genes = all_markergenes.difference(df_data_to_classify.columns)
    if missing_genes:
        warnings.warn(
            f"{len(missing_genes)} of {len(all_markergenes)} marker genes used by the classifier "
            "are missing from data_to_classify; prediction may fail or be less reliable."
        )
    df_data_to_classify = df_data_to_classify.loc[:, marker_mask]

    #scikit-learn clips n_quantiles to the number of samples (with a warning);
    #clipping here gives the same result without the warning for small data sets
    n_cells = df_data_to_classify.shape[0]
    df_samples_scaled = quantile_transform(
        df_data_to_classify, n_quantiles=max(1, min(500, n_cells)), random_state = 0
    )
    df_samples_scaled = pd.DataFrame(
        df_samples_scaled, index = df_data_to_classify.index, columns = df_data_to_classify.columns
    )

    y_pred = clf.predict(df_samples_scaled)

    #Add meta data
    meta_data = data_to_classify.obs
    #Drop prediction columns left over from an earlier run so that classifying the
    #same object twice does not create duplicate column names in .obs
    stale_columns = [col for col in getattr(y_pred, "columns", []) if col in meta_data.columns]
    meta_data = pd.concat([meta_data.drop(columns=stale_columns), y_pred], axis = 1)

    data_to_classify.obs = meta_data

    return data_to_classify

In [2]:
# Path to the trained BLC classifier (pickle file). The default below is the
# classifier used in the original publication, which can be downloaded from:
# https://drive.google.com/drive/u/0/folders/1--eGmL9U-LCvs_G_EdhPEQDKIjZbvWuI
# NOTE: unpickling can execute arbitrary code - only load classifier files from a source you trust.
path = 'BLC_classifier_maxdepth30_maxgenes70n_estimators200_thres0.4_minsampleleaf20.pickle'
with open(path, mode='rb') as classifier_file:
    clf = pickle.load(classifier_file)

In [9]:
# Set data_path to the h5ad file containing the new data to be classified
data_path = 'insert data path here'

# Read the data and run the BLC classifier on it
data_to_classify = sc.read(data_path)
object_classified = classify_data(clf=clf, data_to_classify=data_to_classify)


## When looking at the classified object, the new columns in the metadata are:
- ct_pred: The cell type predicted by BLC
- second_most_likely: The cell type that was the second most likely cell type overall
- third_most_likely: The cell type that was the third most likely cell type overall
- likelihood_vs_second_ct: The estimated likelihood that the cell is of the ct_pred cell type rather than the second_most_likely cell type
- likelihood_vs_third_ct: The estimated likelihood that the cell is of the ct_pred cell type rather than the third_most_likely cell type
- overall_likelihood: overall_likelihood is the mean of the three most likely cell types

In [None]:
# Show the metadata (.obs) of the classified object, including the prediction columns added by classify_data
object_classified.obs

## If you want to explore marker genes found by BLC to distinguish two cell types, you can do this by:

In [20]:
# clf.specific_marker_dic is a dictionary keyed by pairs of cell types.
# Index it with the two cell types of interest to see the marker genes listed for each of them.
clf.specific_marker_dic['Classical Monocyte', 'CD8T'] 

# NOTE: If you get a KeyError for the two cell types you want to look up, try reversing their order
# (for example, clf.specific_marker_dic['CD8T', 'Classical Monocyte'] raises a KeyError, while the order used above works).
# You can also list all cell type pairs with clf.specific_marker_dic.keys() to find the right key.

{'Classical Monocyte': Index(['PGD', 'CSF3R', 'C1orf162', 'CTSS', 'S100A11', 'S100A9', 'S100A12',
        'S100A8', 'MNDA', 'FCER1G', 'NCF2', 'RGS2', 'CAPG', 'SLC11A1', 'TKT',
        'CSTA', 'VCAN', 'CD14', 'RNF130', 'LY86', 'LST1', 'AIF1', 'HLA-DRA',
        'HLA-DMA', 'CPVL', 'NCF1', 'FGL2', 'CD36', 'BRI3', 'NAMPT', 'AP1S2',
        'SAT1', 'CYBB', 'CFP', 'CTSB', 'ASAH1', 'CEBPD', 'KLF4', 'NUP214',
        'FCN1', 'TALDO1', 'SPI1', 'MPEG1', 'MS4A6A', 'FTH1', 'APLP2', 'PSAP',
        'CLEC12A', 'CLEC7A', 'PLBD1', 'LYZ', 'DUSP6', 'TNFSF13B', 'LGALS3',
        'NPC2', 'SERPINA1', 'TNFAIP2', 'CTSH', 'IGSF6', 'PYCARD', 'COTL1',
        'CD68', 'GRN', 'CST3', 'HCK', 'CFD', 'STXBP2', 'IFI30', 'TYROBP',
        'BLVRB', 'PLAUR', 'FTL', 'FCGRT', 'FPR1', 'LILRB2', 'TSPO', 'TYMP'],
       dtype='object'),
 'CD8T': Index(['LCK', 'CD2', 'CD8A', 'CD8B', 'ZAP70', 'IL7R', 'GZMA', 'IFITM1', 'CTSW',
        'PTPRCAP', 'CD3E', 'CD3D', 'CD3G', 'PRF1', 'GZMH', 'EVL', 'B2M', 'IL32',
        'CD7', 'CST7'