# Importing libraries

In [1]:
import sys
sys.path.append('/home/chayan/UMINTFS/')

import numpy as np
import pandas as pd
from utills import datasets as ds
from utills import DownstreamAnalysis as da

# Loading data and embeddings

In [2]:
dataname = 'cbmc8k'
x1, x2, y = ds.LoadData(dataname)

In [3]:
umint_embedding = pd.read_csv('UMINT_'+dataname+'.csv',header=0,index_col=0)
umintfs_unsupervised_embedding = pd.read_csv('UMINTFS_'+dataname+'_rna.csv',header=0,index_col=0)
umintfs_supervised_embedding = pd.read_csv('UMINTFS_Supervised_'+dataname+'_rna.csv',header=0,index_col=0)
muon_supervised_embedding = pd.read_csv('MUON_Supervised_'+dataname+'_rna.csv',header=0,index_col=0)
Seurat_supervised_embedding = pd.read_csv('Seurat_Supervised_'+dataname+'_rna.csv',header=0,index_col=0)

In [4]:
print(umint_embedding.shape, umintfs_unsupervised_embedding.shape,
      umintfs_supervised_embedding.shape, muon_supervised_embedding.shape, 
      Seurat_supervised_embedding.shape)

(8617, 64) (8617, 64) (8617, 132) (8617, 98) (8617, 157)


In [5]:
methodlist = [umint_embedding, umintfs_unsupervised_embedding,
              umintfs_supervised_embedding, muon_supervised_embedding,
              Seurat_supervised_embedding]
labellist = np.array(y['rna_annotations'])

# Classification performance

### Knn classifier

In [6]:
neighbour = [5, 15, 25, 35, 45]

Results_KNN = []
for methods in methodlist:
    Results_knn = []
    for n in neighbour:
        sc = da.Knn(methods, labellist, n)
        Results_knn.append(sc)
    Results_KNN.append(pd.DataFrame(Results_knn, index=['neighbour '+ str(s) for s in neighbour],
                           columns=['Accuracy', 'Precision','Recall','F1-Score']))

In [7]:
Results_KNN[0]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.824826,0.824042,0.824826,0.818687
neighbour 15,0.830626,0.833091,0.830626,0.821594
neighbour 25,0.835267,0.838063,0.835267,0.824331
neighbour 35,0.830626,0.835932,0.840376,0.826377
neighbour 45,0.825986,0.829794,0.848124,0.832367


In [8]:
Results_KNN[1]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.758121,0.749749,0.758121,0.743663
neighbour 15,0.783063,0.777065,0.794585,0.769456
neighbour 25,0.774942,0.78109,0.815629,0.7853
neighbour 35,0.771462,0.776383,0.811966,0.778467
neighbour 45,0.772622,0.780788,0.813187,0.779583


In [9]:
Results_KNN[2]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.854988,0.864371,0.854988,0.847092
neighbour 15,0.864269,0.874783,0.864269,0.855053
neighbour 25,0.862529,0.871825,0.862529,0.852032
neighbour 35,0.854988,0.867814,0.854988,0.842873
neighbour 45,0.856148,0.870003,0.856148,0.841466


In [10]:
Results_KNN[3]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.816705,0.824457,0.816705,0.807328
neighbour 15,0.827726,0.8391,0.827726,0.817403
neighbour 25,0.833527,0.847274,0.833527,0.822336
neighbour 35,0.832947,0.845688,0.832947,0.820212
neighbour 45,0.834687,0.846109,0.844484,0.82935


In [11]:
Results_KNN[4]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.817285,0.823186,0.817285,0.814494
neighbour 15,0.832367,0.844834,0.832367,0.829195
neighbour 25,0.822506,0.83811,0.822506,0.81721
neighbour 35,0.820766,0.83651,0.820766,0.813087
neighbour 45,0.817285,0.833238,0.817285,0.809512


### Random Forest classifier

In [12]:
n_trees = [20, 40, 60, 80, 100]

Results_RFC = []
for methods in methodlist:
    Results_rfc = []
    for n in n_trees:
        sc = da.RFC(methods, labellist, n)
        Results_rfc.append(sc)
    Results_RFC.append(pd.DataFrame(Results_rfc, index=['neighbour '+ str(s) for s in neighbour],
                           columns=['Accuracy', 'Precision','Recall','F1-Score']))

In [13]:
Results_RFC[0]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.828306,0.828594,0.828306,0.821829
neighbour 15,0.836427,0.836429,0.836427,0.830828
neighbour 25,0.831206,0.831583,0.831206,0.823925
neighbour 35,0.834107,0.834098,0.834107,0.828829
neighbour 45,0.831206,0.832562,0.831206,0.825291


In [14]:
Results_RFC[1]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.791763,0.792742,0.791763,0.776249
neighbour 15,0.806845,0.800633,0.806845,0.788791
neighbour 25,0.802784,0.796739,0.802784,0.783501
neighbour 35,0.806265,0.803975,0.806265,0.789667
neighbour 45,0.796984,0.789104,0.796984,0.779203


In [15]:
Results_RFC[2]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.87181,0.87605,0.87181,0.864837
neighbour 15,0.87703,0.880872,0.87703,0.871832
neighbour 25,0.87993,0.884535,0.87993,0.873341
neighbour 35,0.87935,0.883426,0.87935,0.872623
neighbour 45,0.87993,0.884539,0.87993,0.873846


In [16]:
Results_RFC[3]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.861369,0.862539,0.861369,0.853463
neighbour 15,0.867749,0.873307,0.867749,0.858897
neighbour 25,0.87007,0.875871,0.87007,0.860693
neighbour 35,0.861369,0.866608,0.861369,0.852359
neighbour 45,0.86949,0.875817,0.86949,0.859755


In [17]:
Results_RFC[4]

Unnamed: 0,Accuracy,Precision,Recall,F1-Score
neighbour 5,0.843387,0.846967,0.843387,0.837541
neighbour 15,0.858469,0.859724,0.858469,0.852896
neighbour 25,0.858469,0.861447,0.858469,0.852469
neighbour 35,0.859629,0.862582,0.859629,0.853315
neighbour 45,0.861949,0.863788,0.861949,0.856118


# Clustering performance

### KMeans clustering

In [18]:
iteration = 10

Results_KMS = []
for methods in methodlist:
    Results_kms = []
    for n in range(iteration):
        sc = da.kmeans(methods, labellist)
        Results_kms.append(sc)
    Results_KMS.append(pd.DataFrame(Results_kms, index=['Run '+ str(s) for s in range(iteration)],
                           columns=['ARI','NMI','FMI']))

In [19]:
Results_KMS[0]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.526235,0.693608,0.597408
Run 1,0.52904,0.694489,0.599882
Run 2,0.529329,0.694296,0.600137
Run 3,0.527817,0.693561,0.59882
Run 4,0.529336,0.693963,0.60014
Run 5,0.527809,0.693556,0.598813
Run 6,0.527877,0.69401,0.598874
Run 7,0.529804,0.69468,0.600542
Run 8,0.529367,0.6943,0.600168
Run 9,0.526791,0.693685,0.59789


In [20]:
Results_KMS[1]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.343021,0.604193,0.436116
Run 1,0.357442,0.607725,0.4491
Run 2,0.358546,0.608195,0.450128
Run 3,0.35851,0.59901,0.451222
Run 4,0.334183,0.595154,0.428129
Run 5,0.337783,0.596579,0.431401
Run 6,0.379126,0.607545,0.470154
Run 7,0.357496,0.607748,0.449137
Run 8,0.356,0.608702,0.447527
Run 9,0.37631,0.604934,0.467459


In [21]:
Results_KMS[2]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.446941,0.658849,0.529381
Run 1,0.464001,0.687523,0.544915
Run 2,0.518014,0.707779,0.590656
Run 3,0.501317,0.708679,0.577301
Run 4,0.400201,0.662324,0.487574
Run 5,0.4167,0.652163,0.503339
Run 6,0.419591,0.685002,0.504959
Run 7,0.469463,0.700556,0.550423
Run 8,0.425032,0.669791,0.509336
Run 9,0.424898,0.676591,0.509669


In [22]:
Results_KMS[3]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.410431,0.630291,0.499336
Run 1,0.404596,0.62008,0.494143
Run 2,0.40976,0.624161,0.499183
Run 3,0.414928,0.626058,0.503614
Run 4,0.44443,0.64222,0.531381
Run 5,0.437382,0.661134,0.521897
Run 6,0.402621,0.640551,0.492095
Run 7,0.367207,0.622261,0.460121
Run 8,0.400765,0.628222,0.489311
Run 9,0.39146,0.636832,0.481716


In [23]:
Results_KMS[4]

Unnamed: 0,ARI,NMI,FMI
Run 0,0.404424,0.65567,0.490963
Run 1,0.413978,0.665574,0.49931
Run 2,0.409351,0.653612,0.495183
Run 3,0.405948,0.649924,0.492088
Run 4,0.431789,0.691314,0.516131
Run 5,0.415109,0.67516,0.501783
Run 6,0.410571,0.662761,0.496183
Run 7,0.449278,0.687959,0.533623
Run 8,0.408725,0.649647,0.495324
Run 9,0.466002,0.684753,0.545109


### Agglomerative clustering

In [24]:
index = ['umint_embedding', 'umintfs_unsupervised_embedding',
         'umintfs_supervised_embedding','muon_supervised_embedding',
        'seurat_supervised_embedding']

Results_AGL = []
for methods in methodlist:
    sc = da.Agglomerative(methods, labellist)
    Results_AGL.append(sc)

Results_AGL = pd.DataFrame(Results_AGL, index=index, columns=['ARI','NMI','FMI'])

In [25]:
Results_AGL

Unnamed: 0,ARI,NMI,FMI
umint_embedding,0.566984,0.703664,0.632645
umintfs_unsupervised_embedding,0.406586,0.612474,0.493736
umintfs_supervised_embedding,0.553767,0.735055,0.622125
muon_supervised_embedding,0.519182,0.686651,0.591296
seurat_supervised_embedding,0.593463,0.725066,0.658105
