In [1]:
import os

import numpy as np
import pandas as pd
import phenograph
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from src.segmented_svc.segmented_svc import SegmentedSVC

# Load the flow-cytometry test set and keep a random subsample of 500,000 rows.
# The sample is seeded so that a fresh "Restart & Run All" draws the same rows;
# without a seed the clustering and every report further down change on each run.
test_df = pd.read_csv(
    os.path.join("test", "flow_cytometry_test_data.csv")
).sample(n=500000, random_state=42)

In [2]:
# Standardise every column of the sampled data before clustering.
# Fitting and transforming are done as two explicit steps on the same frame.
scaler = StandardScaler()
scaler.fit(test_df)

scaled_df = scaler.transform(test_df)

In [3]:
# Cluster the whole (scaled) dataset with PhenoGraph, using Leiden for the
# community-detection step. The call returns the per-row community labels,
# the neighbour graph and the modularity score Q.
communities, graph, Q = phenograph.cluster(scaled_df, clustering_algo="leiden")

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 58.51398801803589 seconds
Jaccard graph constructed in 27.602198362350464 seconds
Running Leiden optimization
Leiden completed in 630.0765178203583 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 725.9330031871796 seconds


In [4]:
# Highest community label assigned by PhenoGraph.
np.max(communities)

33

In [5]:
# We can use the SegmentedSVC to do the same computation in less time.
#
# SegmentedSVC is given the *unscaled* measurements: per the original note it
# trains and uses its own scaler (TODO confirm against the SegmentedSVC source).
# `scaled_df` must not be used here anyway: it is the direct result of
# StandardScaler.fit_transform, which by default is a plain NumPy array and
# therefore has no `.values` attribute.
# Row order of `test_df` matches `communities`, because the labels were computed
# from `scaled_df`, which was derived row-for-row from `test_df`.

train_data, test_data, train_labels, test_labels = train_test_split(
    test_df.values,
    communities,
    train_size=0.8,
    random_state=42,  # fixed seed so the split is repeatable across runs
)

celestia_object = SegmentedSVC(
    data=train_data,
    labels=train_labels,
)

In [6]:
# Predict a community label for every held-out row with the SegmentedSVC
# built in the previous cell.
predicted_labels = celestia_object.predict(test_data)
# Bare last expression so the notebook displays the predicted label array.
predicted_labels

array([ 3., 12.,  4., ...,  0.,  8.,  2.])

In [7]:
# Per-community precision / recall / F1 of the SegmentedSVC predictions,
# using the PhenoGraph labels of the held-out rows as ground truth.
report = classification_report(y_true=test_labels, y_pred=predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97     11100
           1       0.95      0.96      0.96      9816
           2       0.95      0.95      0.95      9708
           3       0.93      0.95      0.94      7661
           4       0.94      0.94      0.94      7707
           5       0.98      0.98      0.98      7389
           6       0.94      0.93      0.94      7051
           7       0.95      0.95      0.95      7001
           8       0.96      0.97      0.96      3919
           9       0.95      0.93      0.94      3785
          10       0.96      0.95      0.95      3318
          11       0.97      0.97      0.97      2381
          12       0.96      0.95      0.95      2353
          13       0.95      0.94      0.95      2180
          14       0.98      0.98      0.98      1963
          15       0.94      0.92      0.93      1752
          16       0.98      0.98      0.98      1535
          17       0.95    

In [None]:
# Baseline comparison (work in progress).
# In the author's experience the default SegmentedSVC performance shown above
# has been repeatable across a large variety of data set types and complexities
# and clearly beats other classifiers; the cells below are meant to back that
# claim with numbers on this data set.
#
# The split uses the same data, train_size and seed as the SegmentedSVC cell,
# so every baseline is evaluated on an identical train/test partition.

train_data, test_data, train_labels, test_labels = train_test_split(
    test_df.values,
    communities,
    train_size=0.8,
    random_state=42,
)

# Pure SVC baseline (SVC is imported with the other imports at the top).
# TODO: fit on (train_data, train_labels) and print a classification report
# for the held-out rows.
classifier = SVC()

In [None]:
# TODO: KNN baseline (not implemented yet)

In [None]:
# TODO: Random Forest baseline (not implemented yet)

In [8]:
# PS: It also works just fine with a small train set.
# Here only 20% of the rows are used for training and the remaining 80% are
# held out for evaluation.

train_data, test_data, train_labels, test_labels = train_test_split(
    test_df.values,
    communities,
    train_size=0.2,
    random_state=42,  # fixed seed so the split is repeatable across runs
)

celestia_object = SegmentedSVC(
    data=train_data,
    labels=train_labels,
)

# No bare `predicted_labels` expression here: it was not the last statement of
# the cell, so it displayed nothing.
predicted_labels = celestia_object.predict(test_data)

report = classification_report(test_labels, predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95     44386
           1       0.94      0.95      0.94     38962
           2       0.94      0.94      0.94     38208
           3       0.90      0.95      0.92     30926
           4       0.93      0.92      0.92     30763
           5       0.97      0.98      0.97     29823
           6       0.93      0.92      0.93     28344
           7       0.94      0.94      0.94     27951
           8       0.95      0.95      0.95     15688
           9       0.95      0.91      0.93     15283
          10       0.95      0.94      0.95     13540
          11       0.96      0.96      0.96      9331
          12       0.94      0.92      0.93      9345
          13       0.94      0.94      0.94      8758
          14       0.97      0.97      0.97      7638
          15       0.93      0.88      0.90      6967
          16       0.97      0.97      0.97      6343
          17       0.88    