In [1]:
import numpy as np
import os
import pandas as pd
import phenograph
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

from src.segmented_svc.segmented_svc import SegmentedSVC

# Draw a fixed-size random subsample of the flow cytometry events.
# random_state pins the draw so that Restart & Run All reproduces the same rows
# (and therefore the same clustering input and classification reports).
test_df = pd.read_csv(os.path.join("test", "flow_cytometry_test_data.csv")).sample(
    500000,
    random_state=42,
)

In [2]:
# Standardise the sampled events: learn the scaling statistics first, then apply them.
scaler = StandardScaler()
scaler.fit(test_df)

scaled_df = scaler.transform(test_df)

In [3]:
# Clustering the whole dataset with PhenoGraph (Leiden community detection).
# Leiden optimisation is randomised, so the seed is pinned to keep the
# community labels stable across re-runs of the notebook.

communities, graph, Q = phenograph.cluster(
    scaled_df,
    clustering_algo="leiden",
    seed=42,
)

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 58.51398801803589 seconds
Jaccard graph constructed in 27.602198362350464 seconds
Running Leiden optimization
Leiden completed in 630.0765178203583 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 725.9330031871796 seconds


In [4]:
# Largest community label assigned by the clustering step
np.max(communities)

33

In [5]:
# We can use the SegmentedSVC to do the same computation in less time

# We need to use unscaled data because the SegmentedSVC object
# will train and use its own scaler, so the raw values are passed here
# instead of scaled_df.

train_data, test_data, train_labels, test_labels = train_test_split(
    test_df.values,
    communities,
    train_size=0.8,
    random_state=42,  # fixed seed so the split is reproducible
)

celestia_object = SegmentedSVC(
    data=train_data,
    labels=train_labels,
)

In [6]:
# Predict a label for every held-out sample with the SegmentedSVC built above.
predicted_labels = celestia_object.predict(test_data)
# Trailing expression: displays the predicted labels as the cell output.
predicted_labels

array([ 3., 12.,  4., ...,  0.,  8.,  2.])

In [7]:
# Per-class precision / recall / F1 of the predictions against the held-out labels
report = classification_report(y_true=test_labels, y_pred=predicted_labels)

print(report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97     11100
           1       0.95      0.96      0.96      9816
           2       0.95      0.95      0.95      9708
           3       0.93      0.95      0.94      7661
           4       0.94      0.94      0.94      7707
           5       0.98      0.98      0.98      7389
           6       0.94      0.93      0.94      7051
           7       0.95      0.95      0.95      7001
           8       0.96      0.97      0.96      3919
           9       0.95      0.93      0.94      3785
          10       0.96      0.95      0.95      3318
          11       0.97      0.97      0.97      2381
          12       0.96      0.95      0.95      2353
          13       0.95      0.94      0.95      2180
          14       0.98      0.98      0.98      1963
          15       0.94      0.92      0.93      1752
          16       0.98      0.98      0.98      1535
          17       0.95    

In [9]:
# This default performance has been repeatable for me with a large variety of data set types
# and complexities, and blows other classifiers out of the water.

# (Some of these take a LONG time, way longer than the initial labelling.)

# scaled_df is the NumPy array returned by StandardScaler.fit_transform, so it is
# passed directly; it has no `.values` attribute.
train_data, test_data, train_labels, test_labels = train_test_split(
    scaled_df,
    communities,
    train_size=0.8,
    random_state=42,  # fixed seed so the split is reproducible
)

# Pure SVC
from sklearn.svm import SVC

classifier = SVC()
classifier.fit(train_data, train_labels)
predicted_labels = classifier.predict(test_data)

report = classification_report(test_labels, predicted_labels)
print(report)

In [10]:
# KNN baseline with default hyper-parameters, reusing the current train/test split
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier().fit(train_data, train_labels)
predicted_labels = classifier.predict(test_data)

report = classification_report(y_true=test_labels, y_pred=predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.92      0.84     10944
           1       0.67      0.88      0.76      9726
           2       0.69      0.80      0.74      9668
           3       0.41      0.54      0.47      7803
           4       0.49      0.64      0.56      7738
           5       0.63      0.72      0.67      7428
           6       0.61      0.59      0.60      7138
           7       0.34      0.31      0.32      6827
           8       0.12      0.04      0.06      3908
           9       0.75      0.76      0.75      3874
          10       0.41      0.28      0.33      3409
          11       0.77      0.68      0.72      2376
          12       0.67      0.53      0.59      2354
          13       0.10      0.02      0.04      2218
          14       0.43      0.22      0.29      1928
          15       0.71      0.57      0.63      1748
          16       0.94      0.94      0.94      1573
          17       0.22    

In [11]:
# Random Forest baseline, reusing the current train/test split.
from sklearn.ensemble import RandomForestClassifier

# Tree construction is randomised; pin the seed so the report is reproducible.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(train_data, train_labels)
predicted_labels = classifier.predict(test_data)

report = classification_report(test_labels, predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96     10944
           1       0.95      0.96      0.95      9726
           2       0.95      0.95      0.95      9668
           3       0.94      0.95      0.94      7803
           4       0.93      0.94      0.94      7738
           5       0.98      0.98      0.98      7428
           6       0.94      0.93      0.93      7138
           7       0.95      0.95      0.95      6827
           8       0.96      0.96      0.96      3908
           9       0.95      0.94      0.95      3874
          10       0.96      0.96      0.96      3409
          11       0.97      0.96      0.97      2376
          12       0.94      0.94      0.94      2354
          13       0.96      0.95      0.95      2218
          14       0.97      0.98      0.98      1928
          15       0.93      0.91      0.92      1748
          16       0.99      0.97      0.98      1573
          17       0.94    

In [12]:
# PS: SegmentedSVC also works just fine with a small train set
# (only 20% of the samples are used for training here).

train_data, test_data, train_labels, test_labels = train_test_split(
    test_df.values,
    communities,
    train_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

celestia_object = SegmentedSVC(
    data=train_data,
    labels=train_labels,
)

predicted_labels = celestia_object.predict(test_data)

report = classification_report(test_labels, predicted_labels)

print(report)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95     44430
           1       0.93      0.95      0.94     38731
           2       0.94      0.94      0.94     38502
           3       0.91      0.94      0.92     30952
           4       0.93      0.92      0.92     30786
           5       0.97      0.98      0.97     29879
           6       0.93      0.92      0.92     28335
           7       0.94      0.94      0.94     27786
           8       0.95      0.96      0.96     15680
           9       0.95      0.91      0.93     15334
          10       0.95      0.94      0.94     13556
          11       0.96      0.95      0.96      9431
          12       0.93      0.92      0.92      9296
          13       0.93      0.94      0.94      8733
          14       0.98      0.96      0.97      7692
          15       0.93      0.88      0.90      6940
          16       0.98      0.98      0.98      6325
          17       0.85    

In [14]:
# Random Forest, by contrast, is NOT as comfortable with a small train set:
# same 20% train fraction, default Random Forest.
from sklearn.ensemble import RandomForestClassifier

train_data, test_data, train_labels, test_labels = train_test_split(
    scaled_df,
    communities,
    train_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)


# Tree construction is randomised; pin the seed so the report is reproducible.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(train_data, train_labels)
predicted_labels = classifier.predict(test_data)

report = classification_report(test_labels, predicted_labels)
print(report)