# Screen 4 Supernatant Pre-processing

    Media = TP

# Imports

In [1]:
import numpy as np
import pandas as pd
import os


import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib.gridspec as gridspec

import seaborn as sns
sns.set(style='whitegrid')

from os.path import join as pjoin 
import itertools as it

%config InteractiveShell.ast_node_interactivity='all'
%config InlineBackend.figure_format = 'svg'

In [2]:
# Limit DataFrame reprs to 10 rows. The fully-qualified option name is used
# because the short pattern 'max_rows' matches more than one option on newer
# pandas releases (e.g. 'styler.render.max_rows') and then raises OptionError.
pd.set_option('display.max_rows', 10)

# Data

In [3]:
# Location of the parsed screen-4 supernatant results:
# the containing directory and the tab-separated file name.
path, sc4 = (
    '../data_o/screens/sc4/supernatant',
    'screen4_parsed_results_supernatant.tsv',
)

# Pre-processing

In [4]:
# Load the tab-separated results file; the sampling timestamp column is parsed
# as datetime while reading.
dfscr = pd.read_csv(
    pjoin(path, sc4),
    sep='\t',
    parse_dates=['Sampling_datetime'],
)

# Normalise every column label to lowercase.
dfscr.columns = [str.lower(label) for label in dfscr.columns]

# Keep only the columns used below, then remove exact duplicate rows.
keep_cols = [
    'sampling_datetime', 't (h)', 'condition', 'chlamy_bact_ratio', 'replicate',
    'well', 'sample', 'measurement_type', 'wavelength', 'measurement',
    'measurement_blank_average', 'measurement_fixed',
]
dfscr = dfscr[keep_cols].drop_duplicates()

# Shorter column names used throughout the rest of the notebook.
rename_cols = {
    'sampling_datetime': 'samp_day',
    'sample': 'content',
    't (h)': 't(h)',
    'measurement': 'measure',
    'chlamy_bact_ratio': 'c_b_ratio',
    'measurement_blank_average': 'blank_mean',
    'measurement_fixed': 'corr_measure',
}
dfscr = dfscr.rename(columns=rename_cols)

# Collapse the measurement-type labels to short lowercase names.
# Series.map yields NaN for any label that is not a key of the mapping.
type_rename = {
    'Fluorescence Bottom Reading': 'fluorescence',
    'Absorbance': 'absorbance',
}
dfscr['measurement_type'] = dfscr['measurement_type'].map(type_rename)

# Re-valuing of the ratio column is currently disabled:
#ratio_revalue = {'-':'1', '1.6':'1.6', '16':'16', '160':'160'}
#dfscr['c_b_ratio'] = dfscr['c_b_ratio'].map(ratio_revalue)

# Replicate labels ('rep-1', 'rep-2') become integers.
replicate_revalue = {'rep-1': 1, 'rep-2': 2}
dfscr['replicate'] = dfscr['replicate'].map(replicate_revalue)
# (converting the replicate ids to strings is left disabled)
# dfscr['replicate'] = dfscr['replicate'].map(str)

# Elapsed time in hours as integers.
dfscr['t(h)'] = dfscr['t(h)'].map(int)

# Unify wavelength labels; the fluorescence excitation/emission pair is
# labelled by its 680 nm emission wavelength.
wave_rename = {
    'Ex:440_nm/Em:680_nm': '680_nm',
    '750_nm': '750_nm',
    '680_nm': '680_nm',
}
dfscr['wavelength'] = dfscr['wavelength'].map(wave_rename)

# Order rows by sampling time, then condition, replicate and ratio, and
# renumber the index from zero.
dfscr = (
    dfscr
    .sort_values(by=['samp_day', 'condition', 'replicate', 'c_b_ratio'])
    .reset_index(drop=True)
)
dfscr.head()

Unnamed: 0,samp_day,t(h),condition,c_b_ratio,replicate,well,content,measurement_type,wavelength,measure,blank_mean,corr_measure
0,2019-04-16 19:23:35,0,b+sup,low,1,A1,ICL_186B,absorbance,680_nm,0.1068,0.108756,-0.001956
1,2019-04-16 19:23:35,0,b+sup,low,1,B1,ICL_129B,absorbance,680_nm,0.1024,0.108756,-0.006356
2,2019-04-16 19:23:35,0,b+sup,low,1,C1,ICL_100,absorbance,680_nm,0.1,0.108756,-0.008756
3,2019-04-16 19:23:35,0,b+sup,low,1,D1,ICL_101,absorbance,680_nm,0.1007,0.108756,-0.008056
4,2019-04-16 19:23:35,0,b+sup,low,1,E1,ICL_103,absorbance,680_nm,0.1018,0.108756,-0.006956


## Correct Values Using 'BLANK'

In [5]:
# Blank correction: take the mean 'BLANK' reading among rows sampled up to the
# 2019-04-17 cutoff, separately for every (measurement_type, wavelength) pair,
# and subtract it from all measurements of that pair (later days included).
first_day_blanks = dfscr[(dfscr['samp_day'] <= '2019-04-17') &
                         (dfscr['content'] == 'BLANK')]
gb = first_day_blanks.groupby(by=['measurement_type', 'wavelength'])

# (measurement_type, wavelength) -> mean blank reading
correcting_dict = {name: group['measure'].mean() for name, group in gb}

# Re-group the full table with the same keys and overwrite the blank mean and
# the corrected measurement of each group.
gb = dfscr.groupby(['measurement_type', 'wavelength'])
groups = [
    group.assign(
        blank_mean=correcting_dict[name],
        corr_measure=lambda frame: frame['measure'] - frame['blank_mean'],
    )
    for name, group in gb
]

# Stitch the groups back together and restore the time-based ordering.
dfscr = pd.concat(groups).sort_values(
    by=['samp_day', 'condition', 'replicate', 'c_b_ratio']
)

# Save to Disk

In [4]:
# Destination directory and file name for the blank-corrected table.
path  = '../data_o/screens/sc4/supernatant'
# The file name previously ended with a stray space, which would have produced
# a file literally called '....csv ' on disk.
fname = 'screen_4_supernatant_parsed_data.csv'
# The write itself is currently commented out.
# dfscr.to_csv(pjoin(path, fname))

In [7]:
# round datetimes
dfscr['samp_day'] = dfscr['samp_day'].dt.round('D')
dfscr

Unnamed: 0,samp_day,t(h),condition,c_b_ratio,replicate,well,content,measurement_type,wavelength,measure,blank_mean,corr_measure
0,2019-04-17,0,b+sup,low,1,A1,ICL_186B,absorbance,680_nm,0.1068,0.108756,-0.001956
1,2019-04-17,0,b+sup,low,1,B1,ICL_129B,absorbance,680_nm,0.1024,0.108756,-0.006356
2,2019-04-17,0,b+sup,low,1,C1,ICL_100,absorbance,680_nm,0.1000,0.108756,-0.008756
3,2019-04-17,0,b+sup,low,1,D1,ICL_101,absorbance,680_nm,0.1007,0.108756,-0.008056
4,2019-04-17,0,b+sup,low,1,E1,ICL_103,absorbance,680_nm,0.1018,0.108756,-0.006956
...,...,...,...,...,...,...,...,...,...,...,...,...
4603,2019-04-24,161,b+sup,high,1,D12,ICL_129A,absorbance,750_nm,0.1191,0.113044,0.006056
4604,2019-04-24,161,b+sup,high,1,E12,ICL_184BA,absorbance,750_nm,0.1116,0.113044,-0.001444
4605,2019-04-24,161,b+sup,high,1,F12,ICL_20BA,absorbance,750_nm,0.1202,0.113044,0.007156
4606,2019-04-24,161,b+sup,high,1,G12,ICL_162,absorbance,750_nm,0.1294,0.113044,0.016356


# Segregate by Condition & Join Respective Wavelengths

    # Recall that our models only require A680 and A750.
    # Because of this, we are going to ignore F680; there
    # are several other reasons for ignoring it as well.

In [8]:
# Distinct values of the two columns used to partition the data below.
pd.unique(dfscr['condition'])
pd.unique(dfscr['c_b_ratio'])


array(['b+sup', 'sc+sup'], dtype=object)

array(['low', 'low-high', 'high'], dtype=object)

# Conditions @ 680


In [9]:
# Absorbance readings taken at 680 nm.
A680 = dfscr[(dfscr['measurement_type']=='absorbance')&
             (dfscr['wavelength']=='680_nm')]
A680.shape

A680['samp_day'].unique() # sampling days present in the A680 subset

# Partition the A680 data by condition ('b+sup' / 'sc+sup') and c_b_ratio level.
df_b_high = A680[(A680['c_b_ratio']=='high')&(A680['condition'] == 'b+sup')]
df_b_low = A680[(A680['c_b_ratio']=='low')&(A680['condition'] == 'b+sup')]
df_sc = A680[(A680['c_b_ratio']=='low-high')&(A680['condition'] == 'sc+sup')]


df_b_high.shape
df_b_low.shape
df_sc.shape

# Columns carried into the per-day frames, and the wavelength-specific names
# given to the raw and blank-corrected measurements.
keep_cols = ['t(h)', 'well', 'replicate', 'content', 'measure', 'corr_measure']
rename_680 = {'measure':'A680', 'corr_measure':'A680c'}

# One frame per sampling day for each partition.
daily_b_high_reps_dfs_680 = []
daily_b_low_reps_dfs_680 = []
daily_sc_reps_dfs_680 = []
for day in A680['samp_day'].unique():
    b_high = df_b_high.loc[df_b_high['samp_day']==day, keep_cols].reset_index(drop=True)
    b_low = df_b_low.loc[df_b_low['samp_day']==day, keep_cols].reset_index(drop=True)
    sc = df_sc.loc[df_sc['samp_day']==day, keep_cols].reset_index(drop=True)
    day_frames = [b_high, b_low, sc]

    # A day with no rows for any of the partitions has no mean 't(h)'
    # (int(NaN) would raise), so the whole day is skipped for all three.
    # This replaces a bare `except:` that also hid unrelated errors.
    if any(frame.empty for frame in day_frames):
        continue

    # Give all rows of a day the same (integer mean) 't(h)' value, which
    # makes plotting the data against time much simpler.
    for frame in day_frames:
        frame['t(h)'] = int(frame['t(h)'].mean())

    b_high = b_high.rename(columns=rename_680)
    b_low = b_low.rename(columns=rename_680)
    sc = sc.rename(columns=rename_680)

    daily_b_high_reps_dfs_680.append(b_high)
    daily_b_low_reps_dfs_680.append(b_low)
    daily_sc_reps_dfs_680.append(sc)
    

(2304, 12)

array(['2019-04-17T00:00:00.000000000', '2019-04-18T00:00:00.000000000',
       '2019-04-19T00:00:00.000000000', '2019-04-20T00:00:00.000000000',
       '2019-04-21T00:00:00.000000000', '2019-04-22T00:00:00.000000000',
       '2019-04-23T00:00:00.000000000', '2019-04-24T00:00:00.000000000'],
      dtype='datetime64[ns]')

(768, 12)

(768, 12)

(768, 12)

# Conditions @ 750


In [10]:
# Absorbance readings taken at 750 nm.
A750 = dfscr[(dfscr['measurement_type']=='absorbance')&
             (dfscr['wavelength']=='750_nm')]
A750.shape

A750['samp_day'].unique() # sampling days present in the A750 subset

# Partition the A750 data by condition ('b+sup' / 'sc+sup') and c_b_ratio level.
df_b_high = A750[(A750['c_b_ratio']=='high')&(A750['condition'] == 'b+sup')]
df_b_low = A750[(A750['c_b_ratio']=='low')&(A750['condition'] == 'b+sup')]
df_sc = A750[(A750['c_b_ratio']=='low-high')&(A750['condition'] == 'sc+sup')]


df_b_high.shape
df_b_low.shape
df_sc.shape

# Columns carried into the per-day frames, and the wavelength-specific names
# given to the raw and blank-corrected measurements.
keep_cols = ['t(h)', 'well', 'replicate', 'content', 'measure', 'corr_measure']
rename_750 = {'measure':'A750', 'corr_measure':'A750c'}

# One frame per sampling day for each partition.
daily_b_high_reps_dfs_750 = []
daily_b_low_reps_dfs_750 = []
daily_sc_reps_dfs_750 = []
for day in A750['samp_day'].unique():
    b_high = df_b_high.loc[df_b_high['samp_day']==day, keep_cols].reset_index(drop=True)
    b_low = df_b_low.loc[df_b_low['samp_day']==day, keep_cols].reset_index(drop=True)
    sc = df_sc.loc[df_sc['samp_day']==day, keep_cols].reset_index(drop=True)
    day_frames = [b_high, b_low, sc]

    # A day with no rows for any of the partitions has no mean 't(h)'
    # (int(NaN) would raise), so the whole day is skipped for all three.
    # This replaces a bare `except:` that also hid unrelated errors.
    if any(frame.empty for frame in day_frames):
        continue

    # Give all rows of a day the same (integer mean) 't(h)' value, which
    # makes plotting the data against time much simpler.
    for frame in day_frames:
        frame['t(h)'] = int(frame['t(h)'].mean())

    b_high = b_high.rename(columns=rename_750)
    b_low = b_low.rename(columns=rename_750)
    sc = sc.rename(columns=rename_750)

    daily_b_high_reps_dfs_750.append(b_high)
    daily_b_low_reps_dfs_750.append(b_low)
    daily_sc_reps_dfs_750.append(sc)
    

(2304, 12)

array(['2019-04-17T00:00:00.000000000', '2019-04-18T00:00:00.000000000',
       '2019-04-19T00:00:00.000000000', '2019-04-20T00:00:00.000000000',
       '2019-04-21T00:00:00.000000000', '2019-04-22T00:00:00.000000000',
       '2019-04-23T00:00:00.000000000', '2019-04-24T00:00:00.000000000'],
      dtype='datetime64[ns]')

(768, 12)

(768, 12)

(768, 12)

In [11]:
# Join the per-day A680 and A750 frames of each partition side by side.
#
# zip() pairs the two lists by position and stops at the shorter one, so only
# days available for both wavelengths are combined; it makes no sense to apply
# the models to days without A680 data.
# NOTE(review): positional pairing assumes both lists hold the same days in the
# same order -- a day skipped for only one wavelength would shift the pairing.


def _join_daily_wavelengths(dfs_680, dfs_750):
    """Return one joined frame per (A680, A750) pair of daily frames.

    Each pair is indexed by the shared sample keys ('t(h)', 'well',
    'replicate', 'content'), concatenated column-wise with an outer join,
    and the keys are then restored as ordinary columns.
    """
    keys = ['t(h)', 'well', 'replicate', 'content']
    return [
        pd.concat([a6.set_index(keys=keys), a7.set_index(keys=keys)],
                  join='outer', axis=1).reset_index()
        for a6, a7 in zip(dfs_680, dfs_750)
    ]


############
### b_high ###
############
b_high_final_daily_BC = _join_daily_wavelengths(daily_b_high_reps_dfs_680,
                                                daily_b_high_reps_dfs_750)
b_high_bc_df = pd.concat(b_high_final_daily_BC)

############
### b_low ###
############
b_low_final_daily_BC = _join_daily_wavelengths(daily_b_low_reps_dfs_680,
                                               daily_b_low_reps_dfs_750)
b_low_bc_df = pd.concat(b_low_final_daily_BC)

############
### sc ###
############
sc_final_daily_BC = _join_daily_wavelengths(daily_sc_reps_dfs_680,
                                            daily_sc_reps_dfs_750)
sc_bc_df = pd.concat(sc_final_daily_BC)

In [13]:
# Tag each joined frame with a label naming its condition / ratio group.
for label, frame in (('b_high', b_high_bc_df),
                     ('b_low', b_low_bc_df),
                     ('sc_low-high', sc_bc_df)):
    frame['b_c_ratio'] = label

# Stack the three labelled frames into one table with a fresh 0..n-1 index.
bc_dfs = [b_high_bc_df, b_low_bc_df, sc_bc_df]
bcdfs = pd.concat(bc_dfs).reset_index(drop=True)

bcdfs

# save to disk
path = '../data_o/screens/screens_preprocessed_data'
fname = 'sc4_supernatant_test_ratio_control.csv'
bcdfs.to_csv(pjoin(path, fname))

Unnamed: 0,t(h),well,replicate,content,A680,A680c,A750,A750c,b_c_ratio
0,0,A1,1,ICL_186B,0.1114,0.002644,0.1152,0.002156,b_high
1,0,B1,1,ICL_129B,0.1112,0.002444,0.1142,0.001156,b_high
2,0,C1,1,ICL_100,0.1036,-0.005156,0.1072,-0.005844,b_high
3,0,D1,1,ICL_101,0.1007,-0.008056,0.1036,-0.009444,b_high
4,0,E1,1,ICL_103,0.1054,-0.003356,0.1098,-0.003244,b_high
...,...,...,...,...,...,...,...,...,...
2299,161,D12,1,ICL_129A,0.1261,0.017344,0.1264,0.013356,sc_low-high
2300,161,E12,1,ICL_184BA,0.1206,0.011844,0.1188,0.005756,sc_low-high
2301,161,F12,1,ICL_20BA,0.1186,0.009844,0.1168,0.003756,sc_low-high
2302,161,G12,1,ICL_162,0.1193,0.010544,0.1179,0.004856,sc_low-high
