# Assemble CSV file of Positive Control Flow Cytometry Events

This notebook was run on the Texas Advanced Computing Center (TACC) systems to generate the file `positive_controls.csv`. It has not been updated since then, but it should be runnable using the `flow_cytometry`*xx* files in the Zenodo data set with DOI 10.5281/zenodo.6562250.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Files holding NOR-00-Control strain data.  The identifiers below are listed
# without an extension; ".csv" is appended to each one as the list is built.
positive_control_files = [
    f"{stem}.csv"
    for stem in (
        "r1c84xwx57y95_r1c8ejz8jfg9z",
        "r1cft9hhnhjdb_r1cfwgjmz247y",
        "r1cyeyjwbefaf2_r1cys7qtr6j42x",
        "r1c7cpvfzqprk_r1c7fbvba55db",
        "r1cdcetsvxgc2_r1cdfa6jrtssc",
        "r1c8yyg9gxbme_r1c96wn43pq45",
        "r1ce3pjymvm3u_r1ceb3rgpe8gm",
        "r1ce3pqt45vfe_r1ce7qf5e483u",
        "r1ceuhvgf7b4y_r1cf2yr353x5j",
        "r1cwvvxz5d7wua_r1cx7ucsnkjfsb",
        "r1c84y3r9yqxb_r1c8auwvzbgt3",
        "r1c8yydkumrkr_r1c96xsxw79c9",
        "r1cgbw3xk7pau_r1cgpa54x8ff3",
        "r1cckyh56hjqj_r1ccpyjef2vca",
        "r1cfkpp2p3kjz_r1cftpr8wxqc7",
        "r1c8xx7we38vw_r1c93cukfcxqp",
        "r1cbut53kpf3n_r1cc3bn2s2aj8",
        "r1cdcesknarkf_r1cdfg4x5uuwp",
        "r1d6r8mvznxxme_r1d8jxmssfag5w",
        "r1c7cprv7fe49_r1c7jmje3ebhc",
        "r1cdfr4bjcnrg_r1cdnuqex83dm",
        "r1cbut3veu2z4_r1cc6efukqtxx",
        "r1cfkpuz2kr6c_r1cfpvk5hjkzb",
        "r1cgbw8qkp6z4_r1cgjpjx5ve76",
        "r1cgbw7hjvxgx_r1cgjxrexhd4j",
        "r1c5va9wyf3mx_r1c63bjzu5ruy",
        "r1c5vad8u7ve2_r1c66q7cru28u",
        "r1c7cpu9nj2bu_r1c7fg23qkzph",
        "r1cyey8rzxq269_r1cysrdt3pvt5n",
        "r1ccqfbz332u8_r1cctaf8r32rc",
        "r1c8yx25rrtag_r1c96v2w8x5pq",
        "r1cctw7qh3777_r1ccwxr4t3wb6",
        "r1ce3p3p7j77n_r1ceevbnsqku2",
        "r1cwvvwryt3q49_r1cx4ux3zcdt4w",
        "r1ceuhrtvgyvd_r1cf6kx2a233e",
        "r1cgbw6bqmemz_r1cgnt4s2c5ty",
        "r1cbut2pdjtwv_r1cc6qzwcd4hx",
        "r1cfkptta6kuc_r1cfq4k72rwba",
        "r1cckyeq9r358_r1ccqgmy5ffcd",
        "r1ceuhu9enre5_r1cf33y4j6dyg",
        "r1cfkpq8n84aj_r1cftg78pbqd7",
        "r1cft9f659y9s_r1cfx3nrnc2uw",
        "r1ce3p4vgfu3p_r1ceebve8d6tc",
        "r1cgbw9ww9g9h_r1cgjg8e8xeav",
        "r1c5vac658fxn_r1c66qw595ydy",
        "r1c8xx94j94u7_r1c93ajvdzxz5",
        "r1d6r8ztq8zfa7_r1d952nn9xk753",
        "r1cdfsf4ezew3_r1cdjsyza79sr",
        "r1ce3phrwnkqt_r1ceb8fj2hj9a",
        "r1c9tpmcezpk2_r1c9wjnedp92f",
        "r1d6r8p5ddgsut_r1d8fgr2efzrhj",
        "r1ce3p63vnpzq_r1cee5f7knupg",
        "r1ch7zwzznes3_r1cheagmxcqvt",
        "r1c8xx6qe9eea_r1c93enbf48t7",
        "r1cbut6a79n9t_r1cc34mkuvzeh",
        "r1cfkprehr2fj_r1cft8f7cbsp7",
        "r1d6r8ykf868ep_r1d98fh76wrqp7",
        "r1d6r932unhafk_r1d93nuytthhp4",
        "r1cbajqru7kbe_r1cbe9p9zu6dh",
        "r1cwvvvj2bh6hc_r1cx4pbeyv2q29",
        "r1c84xvquwxth_r1c8ek7vcmdxh",
        "r1cyeye2du6ep8_r1cysfjkjjphqj",
        "r1ce3pndp353t_r1ce7xkybfnqd",
        "r1c7cpqp2k6cw_r1c7jn9syrr8s",
        "r1cctw6h5fsfg_r1ccx3axsj5ua",
        "r1d6r59sste9m2_r1d8gf8cczkrz9",
        "r1d6r58jnct63g_r1d8kxz5ndkxyh",
        "r1d6r5b24f9a9z_r1d8cyvjthnvvt",
        "r1cf3p48cauwv_r1cf6hech88mh",
        "r1cdcerdbpda2_r1cdfphfehzq2",
        "r1c84xzc5dbuv_r1c8axrywctwb",
        "r1cbuszfskzjt_r1cc6ws9c3dh4",
        "r1cft9gbs55yn_r1cfwthkscwsk",
        "r1ce3pm76spw4_r1ceas4zgsdt5",
        "r1ceuhqmu6c7w_r1cf6snbv99p8",
    )
]


In [4]:
# Read every positive-control file, keep only the NOR-00-Control rows, and
# combine them into one frame.
#
# The file list is left untouched (no `.pop()`), so this cell can be re-run
# without silently losing a file.  The last file is still loaded first, which
# keeps the row order -- and therefore the row numbers obtained after
# `reset_index` further down -- the same as before.
load_order = positive_control_files[-1:] + positive_control_files[:-1]

# Collect the filtered pieces and concatenate once; calling `pd.concat` inside
# the loop would re-copy the accumulated frame on every iteration.
control_frames = []
for csv_path in load_order:
    run_df = pd.read_csv(csv_path, index_col=0)
    control_frames.append(run_df[run_df['strain_name'] == 'NOR-00-Control'])

df = pd.concat(control_frames)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  mask |= (ar1 == a)


In [5]:
# Sanity check: list the distinct strain names left after filtering.
df.loc[:, 'strain_name'].unique()

array(['NOR-00-Control'], dtype=object)

In [6]:
# Replace the concatenated per-file index with a fresh 0..n-1 index,
# discarding the old labels.
df = df.reset_index(drop=True)

In [7]:
# Remove a 'level_0' column if one is present; do nothing otherwise.
df = df.drop(columns='level_0', errors='ignore')

The two incubation times (`inc_time_1` and `inc_time_2`) are not always the same — the row below is one example — so we can't drop either of them.

In [8]:
# Show both incubation-time fields for row 1801704 in a single lookup.
df.loc[1801704, ['inc_time_1', 'inc_time_2']]

inc_time_1    18:hour
inc_time_2    15:hour
Name: 1801704, dtype: object

In [9]:
# Element-wise comparison of the two incubation-time columns after dropping
# missing values from each.  `==` between two Series requires both sides to
# carry identical index labels.  The displayed result is truncated to the
# first and last rows.
df['inc_time_1'].dropna() == df['inc_time_2'].dropna()

0          True
1          True
2          True
3          True
4          True
           ... 
1831704    True
1831705    True
1831706    True
1831707    True
1831708    True
Length: 1831709, dtype: bool

In [10]:
# True only when growth_media_1 and growth_media_2 agree on every row.
bool((df['growth_media_1'] == df['growth_media_2']).all())

True

In [11]:
# Create a categorical 'media' column from growth_media_1, using its sorted
# unique values as the categories, then remove both source columns.
df['media'] = df['growth_media_1'].astype(
    pd.CategoricalDtype(sorted(df['growth_media_1'].unique()))
)
df = df.drop(columns=['growth_media_1', 'growth_media_2'])

In [13]:
# Declare 'media' as a categorical whose categories are its sorted unique values.
growth_media = sorted(set(df['media'].unique()))
categories = pd.CategoricalDtype(growth_media)
# `.at` is pandas' single-value accessor, so it is not meant to take a full
# slice; replace the whole column with plain column assignment instead.
df['media'] = df['media'].astype(categories)

In [15]:
# Parse the integer that follows the first "_" in each 'inc_temp' string into a
# new 'inc_temp_degrees' column, then drop the original text column.
# The `.str` accessor replaces np.vectorize over a Python lambda, and plain
# column assignment replaces `.at[:, ...]` (`.at` is the single-value accessor).
df['inc_temp_degrees'] = df['inc_temp'].str.split("_").str[1].astype(int)
df = df.drop(columns='inc_temp')

In [16]:
# Parse the integer before the first ":" in each 'inc_time_1' string
# (e.g. "18:hour" -> 18) into 'inc_time_1_hrs'.  Plain column assignment is
# used because `.at` is the single-value accessor, not a column setter.
df['inc_time_1_hrs'] = df['inc_time_1'].str.split(":").str[0].astype(int)

In [17]:
# Parse the integer before the first ":" in each 'inc_time_2' string
# (e.g. "15:hour" -> 15) into 'inc_time_2_hrs'.  Plain column assignment is
# used because `.at` is the single-value accessor, not a column setter.
df['inc_time_2_hrs'] = df['inc_time_2'].str.split(":").str[0].astype(int)

In [18]:
# Remove the two text-valued incubation-time columns.
df = df.drop(columns=['inc_time_1', 'inc_time_2'])

In [19]:
# Take the text after the last "_" of each 'id' as that row's well label.
# The `.str` accessor replaces np.vectorize over a Python lambda; `wells` is
# kept as a NumPy array so it is assigned position-by-position.
wells = df['id'].str.split("_").str[-1].to_numpy()
# Plain column assignment instead of `.at[:, ...]`: `.at` is the single-value
# accessor and is not meant for whole-column writes.
df['well'] = wells

In [20]:
# Convert 'well' to a categorical whose categories are its sorted unique labels.
wells = sorted(df['well'].unique())
categories = pd.CategoricalDtype(wells)
# Replace the column outright with plain column assignment; `.at` is the
# single-value accessor and is not meant to take a full slice.
df['well'] = df['well'].astype(categories)

In [21]:
# Give every (lab_id, plate_id, well) combination its own group number and
# store it as 'replicate'.
df['replicate'] = df.groupby(['lab_id', 'plate_id', 'well']).ngroup()

In [22]:
# Number the rows inside each (lab_id, plate_id, well) group 0, 1, 2, ... in
# their current order and store the counter as 'event'.
df['event'] = df.groupby(['lab_id', 'plate_id', 'well']).cumcount()

In [23]:
# Remove the 'lab' and 'plan' columns.
df = df.drop(columns=['lab', 'plan'])

In [25]:
# Move the identifying columns into a MultiIndex (they are removed from the
# regular columns).
df = df.set_index(
    [
        'strain_name',
        'inc_temp_degrees',
        'inc_time_2_hrs',
        'media',
        'od',
        'plate_id',
        'replicate',
        'event',
    ],
    drop=True,
)

In [26]:
# Write the frame, index included, to positive_controls.csv.
df.to_csv(path_or_buf='positive_controls.csv')