In [1]:
import numpy as np
import os
import pandas as pd
import phenograph
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from src.segmented_svc.segmented_svc import SegmentedSVC

test_df = pd.read_csv(os.path.join("test", "flow_cytometry_test_data.csv")).fillna(0)

In [2]:
scaler = StandardScaler()

scaled_df = scaler.fit_transform(test_df)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [3]:
# Clustering the whole dataset

communities, graph, Q  = phenograph.cluster(
    scaled_df,
    clustering_algo = "leiden",
    )

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


ValueError: Input X contains NaN.
NearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
communities.max()

25

In [None]:
# We can use the SegmentedSVC to do the same computation in less time

#   We need to use unscaled data because the SegmentedSVC object 
# will train and use its own

train_data, test_data, train_labels, test_labels = train_test_split(test_df.values, communities, train_size=0.8)

celestia_object = SegmentedSVC(
    data = train_data,
    labels = train_labels
)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
predicted_labels = celestia_object.predict(test_data)
predicted_labels

array([ 9., 16.,  9., ...,  0., 12.,  5.])

In [None]:
report = classification_report(test_labels, predicted_labels)

print(report)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93     10145
           1       0.94      0.94      0.94      9988
           2       0.95      0.96      0.96      9105
           3       0.99      0.99      0.99      7227
           4       0.92      0.93      0.93      7047
           5       0.93      0.92      0.93      7063
           6       0.96      0.96      0.96      5897
           7       0.97      0.97      0.97      5660
           8       0.93      0.92      0.92      5049
           9       0.97      0.98      0.97      4313
          10       0.99      0.99      0.99      4189
          11       0.93      0.92      0.93      3691
          12       0.96      0.96      0.96      3401
          13       0.95      0.94      0.94      3030
          14       0.97      0.96      0.96      3020
          15       0.97      0.96      0.97      2558
          16       0.94      0.92      0.93      2206
          17       0.96    

In [None]:
# It also works just fine with a small train set

train_data, test_data, train_labels, test_labels = train_test_split(
    test_df.values, 
    communities, 
    train_size=0.2
    )

celestia_object = SegmentedSVC(
    data = train_data,
    labels = train_labels
)

predicted_labels = celestia_object.predict(test_data)
predicted_labels

report = classification_report(test_labels, predicted_labels)

print(report)

  super()._check_params_vs_input(X, default_n_init=10)


              precision    recall  f1-score   support

           0       0.92      0.92      0.92     40219
           1       0.93      0.93      0.93     39817
           2       0.94      0.96      0.95     36572
           3       0.99      0.99      0.99     28546
           4       0.92      0.91      0.92     28558
           5       0.91      0.92      0.92     28457
           6       0.96      0.94      0.95     23292
           7       0.96      0.97      0.96     22370
           8       0.92      0.91      0.91     20379
           9       0.96      0.97      0.97     17039
          10       0.99      0.99      0.99     16497
          11       0.92      0.91      0.91     15094
          12       0.94      0.94      0.94     13596
          13       0.93      0.91      0.92     12528
          14       0.96      0.94      0.95     11867
          15       0.97      0.94      0.95     10141
          16       0.92      0.91      0.92      8931
          17       0.94    

In [None]:
# Why save a couple minutes to go from 100% accuracy to 94?

from pprint import pprint
from datetime import datetime as dt 

x = 0.30

run_times = []

while x < 1:
    run_dict = {}
    sub_df = test_df.sample(frac = x, replace=False)
    run_dict['index'] = x * 100
    scaler = StandardScaler()
    scaled_sub_df = scaler.fit_transform(sub_df)
    
    
    t1 = dt.now()
    communities, graph, Q  = phenograph.cluster(
    scaled_sub_df,
    clustering_algo = "leiden",
    )
    t2 = dt.now()
    time_elapsed = t2-t1
    run_dict['Leiden'] = time_elapsed.seconds
    
    
    t1 = dt.now()
    celestia_object = SegmentedSVC(
    data = sub_df.values,
    labels = communities
    )
    t2 = dt.now()
    time_elapsed = t2-t1
    run_dict['SegmentedSVC Training'] = time_elapsed.seconds
    
    
    t1 = dt.now()
    _ = celestia_object.predict(sub_df.values)
    t2 = dt.now()
    time_elapsed = t2-t1
    run_dict['SegmentedSVC Predicting'] = time_elapsed.seconds
    
    
    run_times.append(run_dict)
    
    pprint(run_dict)
    
    x += 0.10
    
run_df = pd.DataFrame(run_times)
    

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


Neighbors computed in 8.734695434570312 seconds
Jaccard graph constructed in 10.755605459213257 seconds
Running Leiden optimization
Leiden completed in 60.13450860977173 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 83.61015629768372 seconds


  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0}]
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 15.898804426193237 seconds
Jaccard graph constructed in 13.000689029693604 seconds
Running Leiden optimization
Leiden completed in 167.3783209323883 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 201.8471999168396 seconds


  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0},
 {'Leiden': 201,
  'SegmentedSVC Predicting': 73,
  'SegmentedSVC Training': 39,
  'index': 40.0}]
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 17.1208758354187 seconds
Jaccard graph constructed in 15.539642572402954 seconds
Running Leiden optimization
Leiden completed in 127.33467245101929 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 165.6311604976654 seconds


  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0},
 {'Leiden': 201,
  'SegmentedSVC Predicting': 73,
  'SegmentedSVC Training': 39,
  'index': 40.0},
 {'Leiden': 165,
  'SegmentedSVC Predicting': 110,
  'SegmentedSVC Training': 58,
  'index': 50.0}]
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 25.43706512451172 seconds
Jaccard graph constructed in 18.055782318115234 seconds
Running Leiden optimization
Leiden completed in 213.0628867149353 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 263.00885128974915 seconds


  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0},
 {'Leiden': 201,
  'SegmentedSVC Predicting': 73,
  'SegmentedSVC Training': 39,
  'index': 40.0},
 {'Leiden': 165,
  'SegmentedSVC Predicting': 110,
  'SegmentedSVC Training': 58,
  'index': 50.0},
 {'Leiden': 263,
  'SegmentedSVC Predicting': 174,
  'SegmentedSVC Training': 84,
  'index': 60.0}]
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 32.56760239601135 seconds
Jaccard graph constructed in 20.228790998458862 seconds
Running Leiden optimization
Leiden completed in 424.70195031166077 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 484.70691657066345 seconds


  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0},
 {'Leiden': 201,
  'SegmentedSVC Predicting': 73,
  'SegmentedSVC Training': 39,
  'index': 40.0},
 {'Leiden': 165,
  'SegmentedSVC Predicting': 110,
  'SegmentedSVC Training': 58,
  'index': 50.0},
 {'Leiden': 263,
  'SegmentedSVC Predicting': 174,
  'SegmentedSVC Training': 84,
  'index': 60.0},
 {'Leiden': 484,
  'SegmentedSVC Predicting': 281,
  'SegmentedSVC Training': 142,
  'index': 70.0}]
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 40.35634803771973 seconds
Jaccard graph constructed in 23.361350536346436 seconds
Running Leiden optimization
Leiden completed in 330.8852596282959 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 402.62000584602356 seconds


  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0},
 {'Leiden': 201,
  'SegmentedSVC Predicting': 73,
  'SegmentedSVC Training': 39,
  'index': 40.0},
 {'Leiden': 165,
  'SegmentedSVC Predicting': 110,
  'SegmentedSVC Training': 58,
  'index': 50.0},
 {'Leiden': 263,
  'SegmentedSVC Predicting': 174,
  'SegmentedSVC Training': 84,
  'index': 60.0},
 {'Leiden': 484,
  'SegmentedSVC Predicting': 281,
  'SegmentedSVC Training': 142,
  'index': 70.0},
 {'Leiden': 402,
  'SegmentedSVC Predicting': 294,
  'SegmentedSVC Training': 160,
  'index': 80.0}]
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 39.111478328704834 seconds
Jaccard graph constructed in 25.85306167602539 seconds
Running Leiden optimization
Leiden completed in 485.3403422832489 seconds
Sorting communities by size, please wait ...
PhenoGraph completed in 559.6573264598846 seconds


  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0},
 {'Leiden': 201,
  'SegmentedSVC Predicting': 73,
  'SegmentedSVC Training': 39,
  'index': 40.0},
 {'Leiden': 165,
  'SegmentedSVC Predicting': 110,
  'SegmentedSVC Training': 58,
  'index': 50.0},
 {'Leiden': 263,
  'SegmentedSVC Predicting': 174,
  'SegmentedSVC Training': 84,
  'index': 60.0},
 {'Leiden': 484,
  'SegmentedSVC Predicting': 281,
  'SegmentedSVC Training': 142,
  'index': 70.0},
 {'Leiden': 402,
  'SegmentedSVC Predicting': 294,
  'SegmentedSVC Training': 160,
  'index': 80.0},
 {'Leiden': 559,
  'SegmentedSVC Predicting': 421,
  'SegmentedSVC Training': 248,
  'index': 89.99999999999999}]
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 42.67114853858948 seconds
Jaccard graph constructed in 28.19801640510559 seconds
Running Leiden optimization
Leiden completed in 833.9518971443176 seconds
Sorting communities by size, please w

  super()._check_params_vs_input(X, default_n_init=10)


[{'Leiden': 83,
  'SegmentedSVC Predicting': 43,
  'SegmentedSVC Training': 21,
  'index': 30.0},
 {'Leiden': 201,
  'SegmentedSVC Predicting': 73,
  'SegmentedSVC Training': 39,
  'index': 40.0},
 {'Leiden': 165,
  'SegmentedSVC Predicting': 110,
  'SegmentedSVC Training': 58,
  'index': 50.0},
 {'Leiden': 263,
  'SegmentedSVC Predicting': 174,
  'SegmentedSVC Training': 84,
  'index': 60.0},
 {'Leiden': 484,
  'SegmentedSVC Predicting': 281,
  'SegmentedSVC Training': 142,
  'index': 70.0},
 {'Leiden': 402,
  'SegmentedSVC Predicting': 294,
  'SegmentedSVC Training': 160,
  'index': 80.0},
 {'Leiden': 559,
  'SegmentedSVC Predicting': 421,
  'SegmentedSVC Training': 248,
  'index': 89.99999999999999},
 {'Leiden': 914,
  'SegmentedSVC Predicting': 514,
  'SegmentedSVC Training': 256,
  'index': 99.99999999999999}]


In [None]:
run_df