# Miscellaneous Sanity Checking and Profiling of the Accuracy Sets

`accuracy_set.csv` was originally built by Dan Bryce; `new_accuracy_table.csv` was built after gating, as described in these notebooks.

In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd
from typing import List

In [2]:
# Load the original accuracy table (the file the notebook introduction
# attributes to Dan Bryce). The path is relative to the working directory.
old_df: pd.DataFrame = pd.read_csv('accuracy_set.csv')

In [3]:
# Load the newer accuracy table (built after gating, per the notebook
# introduction). The path is relative to the working directory.
new_df: pd.DataFrame = pd.read_csv('new_accuracy_table.csv')

In [4]:
# Report how many rows each table holds; both lines are emitted by a single
# print call, separated by a newline.
print(
    f"New table has {new_df.shape[0]} rows.",
    f"Old table has {old_df.shape[0]} rows.",
    sep="\n",
)

New table has 8726 rows.
Old table has 8686 rows.


In [5]:
# Column names present in the new table but absent from the old one, sorted
# so the listing (and any positional access such as added_columns[0]) is
# deterministic.
added_columns: List[str] = sorted(set(new_df.columns).difference(old_df.columns))
# Plain string literal: the header has no placeholders, so no f-prefix is needed.
print("Added columns are:")
c: str
for c in added_columns:
    # One tab-indented column name per line.
    print(f"\t{c}")

Added columns are:
	gated_count
	gated_live_count
	mean_log_gfp_gated
	mean_log_gfp_gated_live
	std_log_gfp_gated
	std_log_gfp_gated_live


In [6]:
# Show the first entry of the sorted added-column list; it is the column
# inspected by the cells that follow.
print(added_columns[0])

gated_count


In [7]:
# Number of rows whose `gated_count` is missing: count the True entries of the
# null mask rather than materialising the filtered frame.
int(new_df['gated_count'].isna().sum())

1375

In [8]:
# Number of rows whose `gated_count` is present (non-null).
int(new_df['gated_count'].notna().sum())

7351

In [9]:
# Number of rows whose `gated_live_count` is present (non-null).
int(new_df['gated_live_count'].notna().sum())

5344

In [10]:
# Rows that have a `gated_count` value but are missing `gated_live_count`.
no_live_count = new_df.loc[
    new_df['gated_count'].notna() & new_df['gated_live_count'].isna()
]
# Display how many such rows there are.
len(no_live_count)

2007

In [11]:
# Rows that are missing `gated_count` altogether.
no_gated_count = new_df.loc[new_df['gated_count'].isna()]
# Display how many such rows there are.
len(no_gated_count)

1375

In [12]:
# Distinct strain names among the rows that have `gated_count` but lack
# `gated_live_count`.
no_live_count['strain_name'].unique()

array(['NOR-00-Control', 'WT-Live-Control',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_16968/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_16969/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_16970/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_5783/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_6388/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_6389/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_6390/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_6391/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7299/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7300/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7373/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7374/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7375/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7376/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_8225/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_82

In [13]:
# Distinct strain names among the rows that are missing `gated_count`.
no_gated_count['strain_name'].unique()

array(['https://hub.sd2e.org/user/sd2e/design/UWBF_6390/1',
       'WT-Dead-Control',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7375/1',
       'WT-Live-Control',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7300/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7374/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_8542/1',
       'NOR-00-Control',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_16969/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_16970/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_6388/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7299/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_7377/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_8231/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_8544/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_6389/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_5992/1',
       'https://hub.sd2e.org/user/sd2e/design/UWBF_8543/1',
       'https://hub.s

In [14]:
# Boolean Series (aligned with new_df's index) that is True for every row
# missing a value in at least one of the added columns. A single vectorised
# `isna().any(axis=1)` replaces the column-by-column `np.logical_or`
# accumulation, and also yields an all-False mask instead of raising when
# `added_columns` is empty.
mask = new_df[added_columns].isna().any(axis=1)
print(f"There are {new_df[mask].shape[0]} rows with missing values")

There are 3384 rows with missing values


In [15]:
# First row (as a Series) among those flagged by `mask` as having a missing
# value in one of the added columns.
new_df.loc[mask].iloc[0]

lab_id                    r1c5va879uaex_r1c639xp952g4
strain_name                            NOR-00-Control
plate_id                                            0
well                                              C12
count                                             NaN
                                     ...             
std_log_gfp_gated                            0.286968
std_log_gfp_gated_live                            NaN
std_log_gfp_live                                  NaN
threshold                                         NaN
threshold_live                                    NaN
Name: 0, Length: 61, dtype: object

In [16]:
# Same first flagged row, restricted to the added columns. The row is taken
# first and the columns selected afterwards, so the result keeps the row's
# dtype.
new_df.loc[mask].iloc[0].loc[added_columns]

gated_count                 25743.0
gated_live_count                NaN
mean_log_gfp_gated         2.872624
mean_log_gfp_gated_live         NaN
std_log_gfp_gated          0.286968
std_log_gfp_gated_live          NaN
Name: 0, dtype: object

In [17]:
# Alphabetical list of every column in the new table. A row's index is just
# the frame's column labels, so read them directly instead of filtering by
# `mask` and extracting row 0 (which would raise if no row were flagged).
sorted(new_df.columns)

['count',
 'count_live',
 'experiment_id',
 'filename',
 'gate',
 'gated_count',
 'gated_live_count',
 'growth_media_1',
 'growth_media_2',
 'id',
 'inc_temp',
 'inc_time_1',
 'inc_time_2',
 'index',
 'input',
 'lab',
 'lab_id',
 'mean_correct_classifier',
 'mean_correct_classifier_live',
 'mean_correct_high_classifier',
 'mean_correct_high_classifier_live',
 'mean_correct_high_threshold',
 'mean_correct_high_threshold_live',
 'mean_correct_low_classifier',
 'mean_correct_low_classifier_live',
 'mean_correct_low_threshold',
 'mean_correct_low_threshold_live',
 'mean_correct_threshold',
 'mean_correct_threshold_live',
 'mean_log_gfp',
 'mean_log_gfp_gated',
 'mean_log_gfp_gated_live',
 'mean_log_gfp_live',
 'media',
 'od',
 'od_cutoff',
 'output',
 'plan',
 'plate_id',
 'replicate',
 'source_container',
 'std_correct_classifier',
 'std_correct_classifier_live',
 'std_correct_high_classifier',
 'std_correct_high_classifier_live',
 'std_correct_high_threshold',
 'std_correct_high_threshol