# Importing libraries

In [1]:
import sys
sys.path.append('/home/chayan/UMINTFS/')

import numpy as np
import pandas as pd
from utills import datasets as ds
from utills import DownstreamAnalysis as da

# Loading data and embeddings

In [2]:
dataname = 'pbmc10k_atac'
x1, x2, y = ds.LoadData(dataname)

In [3]:
umint_embedding = pd.read_csv('UMINT_'+dataname+'.csv',header=0,index_col=0)
umintfs_unsupervised_embedding = pd.read_csv('UMINTFS_'+dataname+'_rna.csv',header=0,index_col=0)
umintfs_supervised_embedding = pd.read_csv('UMINTFS_Supervised_'+dataname+'_rna.csv',header=0,index_col=0)
muon_supervised_embedding = pd.read_csv('MUON_Supervised_'+dataname+'_rna.csv',header=0,index_col=0)
Seurat_supervised_embedding = pd.read_csv('Seurat_Supervised_'+dataname+'_rna.csv',header=0,index_col=0)

In [4]:
print(umint_embedding.shape, umintfs_unsupervised_embedding.shape,
      umintfs_supervised_embedding.shape, muon_supervised_embedding.shape, 
      Seurat_supervised_embedding.shape)

(7563, 64) (7563, 64) (7563, 87) (7563, 109) (7563, 117)


In [5]:
methodlist = [umint_embedding, umintfs_unsupervised_embedding,
              umintfs_supervised_embedding, muon_supervised_embedding,
              Seurat_supervised_embedding]
labellist = np.array(y['celltype'])

# Classification performance

### Knn classifier

In [6]:
neighbour = [5, 15, 25, 35, 45]

Results_KNN = []
for methods in methodlist:
    Results_knn = []
    for n in neighbour:
        sc = da.Knn(methods, labellist, n)
        Results_knn.append(sc)
    Results_KNN.append(pd.DataFrame(Results_knn, index=['neighbour '+ str(s) for s in neighbour],
                           columns=['Accuracy', 'Precision','Recall','F1-Score']))

In [7]:
Results_KNN[0]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.656973,0.639416,0.656973,0.637069
neighbour 15,0.673496,0.665778,0.673496,0.659823
neighbour 25,0.672835,0.65898,0.672835,0.655833
neighbour 35,0.675479,0.666063,0.685906,0.664705
neighbour 45,0.675479,0.666768,0.685906,0.664448


In [8]:
Results_KNN[1]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.813615,0.825613,0.813615,0.806207
neighbour 15,0.827495,0.837993,0.827495,0.820295
neighbour 25,0.836087,0.849698,0.836087,0.828246
neighbour 35,0.840053,0.851468,0.840053,0.83232
neighbour 45,0.837409,0.849074,0.837409,0.828571


In [9]:
Results_KNN[2]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.924653,0.92607,0.924653,0.924771
neighbour 15,0.926636,0.928659,0.926636,0.926573
neighbour 25,0.927958,0.929997,0.927958,0.927824
neighbour 35,0.927297,0.929249,0.927297,0.927123
neighbour 45,0.926636,0.92854,0.926636,0.926413


In [10]:
Results_KNN[3]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.894911,0.898405,0.894911,0.893449
neighbour 15,0.898876,0.902403,0.898876,0.897412
neighbour 25,0.894911,0.899045,0.894911,0.893155
neighbour 35,0.896233,0.900221,0.896233,0.894373
neighbour 45,0.895572,0.899647,0.895572,0.893647


In [11]:
Results_KNN[4]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.916722,0.92049,0.916722,0.917389
neighbour 15,0.916061,0.920837,0.916061,0.916742
neighbour 25,0.917383,0.921458,0.917383,0.917797
neighbour 35,0.910773,0.915969,0.910773,0.911416
neighbour 45,0.907469,0.913286,0.907469,0.908116


### Random Forest classifier

In [12]:
n_trees = [20, 40, 60, 80, 100]

Results_RFC = []
for methods in methodlist:
    Results_rfc = []
    for n in n_trees:
        sc = da.RFC(methods, labellist, n)
        Results_rfc.append(sc)
    Results_RFC.append(pd.DataFrame(Results_rfc, index=['neighbour '+ str(s) for s in neighbour],
                           columns=['Accuracy', 'Precision','Recall','F1-Score']))

In [13]:
Results_RFC[0]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.615995,0.610731,0.615995,0.604119
neighbour 15,0.614673,0.611422,0.614673,0.603529
neighbour 25,0.61269,0.609403,0.61269,0.601504
neighbour 35,0.612029,0.608775,0.612029,0.600842
neighbour 45,0.612029,0.608775,0.612029,0.600842


In [14]:
Results_RFC[1]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.847323,0.850526,0.847323,0.842785
neighbour 15,0.85922,0.860329,0.85922,0.855791
neighbour 25,0.861203,0.865269,0.861203,0.856294
neighbour 35,0.861203,0.864925,0.861203,0.856995
neighbour 45,0.865169,0.869007,0.865169,0.860219


In [15]:
Results_RFC[2]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.930601,0.93124,0.930601,0.930059
neighbour 15,0.932584,0.932912,0.932584,0.932364
neighbour 25,0.939194,0.939589,0.939194,0.939061
neighbour 35,0.945142,0.945519,0.945142,0.94493
neighbour 45,0.942498,0.942996,0.942498,0.942203


In [16]:
Results_RFC[3]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.90813,0.910318,0.90813,0.905964
neighbour 15,0.917383,0.9195,0.917383,0.914567
neighbour 25,0.917383,0.918748,0.917383,0.914445
neighbour 35,0.913417,0.916104,0.913417,0.910932
neighbour 45,0.921348,0.92355,0.921348,0.919391


In [17]:
Results_RFC[4]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.917383,0.917976,0.917383,0.916789
neighbour 15,0.931923,0.932559,0.931923,0.931573
neighbour 25,0.933245,0.933879,0.933245,0.933192
neighbour 35,0.928619,0.929483,0.928619,0.928402
neighbour 45,0.933906,0.934889,0.933906,0.93369


# Clustering performance

### KMeans clustering

In [18]:
iteration = 10

Results_KMS = []
for methods in methodlist:
    Results_kms = []
    for n in range(iteration):
        sc = da.kmeans(methods, labellist)
        Results_kms.append(sc)
    Results_KMS.append(pd.DataFrame(Results_kms, index=['Run '+ str(s) for s in range(iteration)],
                           columns=['ARI','NMI','FMI']))

In [19]:
Results_KMS[0]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.419,0.615505,0.510195
Run 1,0.41881,0.615078,0.509692
Run 2,0.418601,0.614908,0.509601
Run 3,0.418965,0.615872,0.510408
Run 4,0.419074,0.614898,0.509978
Run 5,0.418957,0.615642,0.510417
Run 6,0.418848,0.615315,0.509709
Run 7,0.418839,0.615084,0.509718
Run 8,0.418839,0.615084,0.509718
Run 9,0.418839,0.615084,0.509718


In [20]:
Results_KMS[1]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.547704,0.662661,0.614813
Run 1,0.529075,0.666285,0.597937
Run 2,0.536326,0.670208,0.604265
Run 3,0.542527,0.674258,0.609621
Run 4,0.536558,0.671946,0.604487
Run 5,0.53218,0.669914,0.600674
Run 6,0.543264,0.673063,0.610253
Run 7,0.523726,0.662606,0.593293
Run 8,0.531387,0.668963,0.599971
Run 9,0.541819,0.673189,0.608999


In [21]:
Results_KMS[2]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.782648,0.817982,0.815852
Run 1,0.837923,0.844487,0.863101
Run 2,0.783014,0.818803,0.816175
Run 3,0.838116,0.844967,0.863267
Run 4,0.784729,0.815839,0.817531
Run 5,0.780952,0.8171,0.814422
Run 6,0.781012,0.81712,0.814474
Run 7,0.789257,0.818011,0.821395
Run 8,0.837541,0.844325,0.862763
Run 9,0.782634,0.818012,0.815841


In [22]:
Results_KMS[3]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.625908,0.768368,0.696166
Run 1,0.744183,0.79957,0.782969
Run 2,0.735237,0.795866,0.775332
Run 3,0.703003,0.782624,0.74782
Run 4,0.735505,0.796044,0.775561
Run 5,0.739089,0.797211,0.778623
Run 6,0.658863,0.751819,0.710312
Run 7,0.689057,0.770604,0.736152
Run 8,0.741249,0.798781,0.780462
Run 9,0.683324,0.770436,0.731427


In [23]:
Results_KMS[4]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.816768,0.833684,0.84532
Run 1,0.816999,0.83376,0.845516
Run 2,0.82203,0.844821,0.850113
Run 3,0.818405,0.834259,0.846723
Run 4,0.817664,0.834124,0.846086
Run 5,0.729107,0.807107,0.770122
Run 6,0.816894,0.833791,0.845407
Run 7,0.821092,0.844255,0.849325
Run 8,0.817292,0.833967,0.845764
Run 9,0.818544,0.834487,0.846846


### Agglomerative clustering

In [24]:
index = ['umint_embedding', 'umintfs_unsupervised_embedding',
         'umintfs_supervised_embedding','muon_supervised_embedding',
        'seurat_supervised_embedding']

Results_AGL = []
for methods in methodlist:
    sc = da.Agglomerative(methods, labellist)
    Results_AGL.append(sc)

Results_AGL = pd.DataFrame(Results_AGL, index=index, columns=['ARI','NMI','FMI'])

In [25]:
Results_AGL

Unnamed: 0,ARI,NMI,FMI
umint_embedding,0.451654,0.625065,0.550209
umintfs_unsupervised_embedding,0.555552,0.657944,0.624891
umintfs_supervised_embedding,0.77198,0.81197,0.808504
muon_supervised_embedding,0.733369,0.774867,0.775316
seurat_supervised_embedding,0.752161,0.790608,0.790355
