# For now we will ignore Screen 2 Data as No A680 or A600 data was collected....
# therefore we cannot implement the models directly
# We do however have F680 data, which can be regressed back to A680, but this
# would be prone to lots of errors, therefore for now we ignore it.

# Imports

In [2]:
import numpy as np
import pandas as pd
import os
import scipy

import xlrd
import argparse

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib.gridspec as gridspec
from os.path import join as pjoin

import seaborn as sns
sns.set(style='whitegrid')

%config InteractiveShell.ast_node_interactivity='all'
%config InlineBackend.figure_format = 'svg'


# Data Pre-processing

In [3]:
path = '../data_o/screens/sc2'
fname =  'screen2_tecan_measurements.tsv'


In [4]:
dfscr = pd.read_csv(pjoin(path,fname), sep='\t')
dfscr.head()

Unnamed: 0,Sampling_date,Sampling_time,Sampling_datetime,Day,Media,Condition,replicate,Measurement_type,Wavelength,Row,Column,Measurement,Filename,t (h),Content,Measurement_blank_average,Measurement_fixed
0,16.02.2019,14:19:58,2019-02-16 14:19:58,day1,tp,b+c,5,Absorbance,750_nm,A,1,0.3114,day1_b+c_5.xlsx,22.0,ICL_162,0.120254,0.191146
1,16.02.2019,14:19:58,2019-02-16 14:19:58,day1,tp,b+c,5,Absorbance,750_nm,B,1,0.1076,day1_b+c_5.xlsx,22.0,ICL_115,0.120254,-0.012654
2,16.02.2019,14:19:58,2019-02-16 14:19:58,day1,tp,b+c,5,Absorbance,750_nm,C,1,0.105,day1_b+c_5.xlsx,22.0,ICL_144,0.120254,-0.015254
3,16.02.2019,14:19:58,2019-02-16 14:19:58,day1,tp,b+c,5,Absorbance,750_nm,D,1,0.3406,day1_b+c_5.xlsx,22.0,ICL_186A,0.120254,0.220346
4,16.02.2019,14:19:58,2019-02-16 14:19:58,day1,tp,b+c,5,Absorbance,750_nm,E,1,0.3707,day1_b+c_5.xlsx,22.0,ICL_162,0.120254,0.250446


In [4]:
# select the information to work with
keep_cols = ['Sampling_date', 'Media', 'Condition', 'replicate', 'Content',
             'Measurement_type', 'Wavelength', 'Row', 'Column', 't (h)', 
             'Measurement', 'Measurement_blank_average', 'Measurement_fixed']
dfscr = dfscr[keep_cols]
# lowercase all columns 
dfscr.columns = map(str.lower, dfscr.columns)
# rename columns
rename_cols = {'measurement_blank_average':'blank_measure_mean', 
               'measurement_fixed':'corrected_measure',
               'measurement':'measure','t (h)': 't(h)',
              'sampling_date':'samp_day'}
dfscr = dfscr.rename(rename_cols, axis=1)

# # rename day values
# day_rename = {'day0':'0', 'day1':'1', 'day2':'2', 'day3':'3',
#               'day4':'4', 'day5':'5', 'day6':'6', 'day7':'7'}
# dfscr['day'] = dfscr['day'].map(day_rename)

#rename and lowercase meas. type values
type_rename = {'Fluorescence Bottom Reading': 'fluorescence', 'Absorbance':'absorbance'}
dfscr['measurement_type'] = dfscr['measurement_type'].map(type_rename)

# rename and lowercase wavelength values
wave_rename = {'Ex:440_nm/Em:680_nm':'680_nm', '750_nm':'750_nm'}
dfscr['wavelength'] = dfscr['wavelength'].map(wave_rename)

# convert to datetime
dfscr['samp_day'] = pd.to_datetime(dfscr['samp_day'], dayfirst=True)

# convert categorical int values to string types
dfscr['replicate'] = dfscr['replicate'].map(str)
dfscr['column'] = dfscr['column'].map(str)
dfscr['t(h)'] = dfscr['t(h)'].map(int)

# create well column to replace row & column 
dfscr['well'] = dfscr['row']+dfscr['column']


# sort values by using the measurement times
dfscr = dfscr.sort_values(by=['samp_day', 'media', 'condition', 'replicate',
                              'measurement_type'])
dfscr = dfscr.reset_index(drop=True)

dfscr.head()

Unnamed: 0,samp_day,media,condition,replicate,content,measurement_type,wavelength,row,column,t(h),measure,blank_measure_mean,corrected_measure,well
0,2019-02-15,tp,b,1,ICL_162,absorbance,750_nm,A,1,0,0.1154,0.114179,0.001221,A1
1,2019-02-15,tp,b,1,ICL_115,absorbance,750_nm,B,1,0,0.1121,0.114179,-0.002079,B1
2,2019-02-15,tp,b,1,ICL_144,absorbance,750_nm,C,1,0,0.1244,0.114179,0.010221,C1
3,2019-02-15,tp,b,1,ICL_186A,absorbance,750_nm,D,1,0,0.1142,0.114179,2.1e-05,D1
4,2019-02-15,tp,b,1,ICL_162,absorbance,750_nm,E,1,0,0.1136,0.114179,-0.000579,E1


In [5]:
# TP is only media used

In [6]:
dfscr['condition'].unique()
dfscr['media'].unique()

# NOTE that while there are absorbance measurements, there is only 1
# Which means we cannot apply our models to it. Unfortunately the
# Flurescence measurements cannot be used to glean the wanted information
# because Fluorescence measurements depend both on the amount of Chlamy & Bacteria
# in the well, at 750 you're capturing everything that is in the system, not only
# bacterium

array(['b', 'b+c', 'sc+c'], dtype=object)

array(['tp'], dtype=object)

# Correct BLANK

In [7]:
# use the first days averages of blanks as the value by which to correct all subsequent days
gb = dfscr[(dfscr['samp_day']<='2019-02-15')&(dfscr['content']=='BLANK')].groupby(['condition', 'wavelength'])

# only blank values exist in absorbance measurements at 750_nm,

correcting_mean = None
for name,group in gb:
    group
    correcting_mean = group['measure'].mean()
    

gb = dfscr.groupby(['condition', 'wavelength'])

groups = []
for name,group in gb:

    group = group.copy()

    if name=='b':

        group['blank_measure_mean'] = correcting_mean
        group['corrected_measure'] = group['measure'] - group['blank_measure_mean']
        groups.append(group)

    else:
        groups.append(group)
        
    dfscr=pd.concat(groups)

Unnamed: 0,samp_day,media,condition,replicate,content,measurement_type,wavelength,row,column,t(h),measure,blank_measure_mean,corrected_measure,well
43,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,D,6,0,0.1177,0.114179,0.003521,D6
47,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,H,6,0,0.1116,0.114179,-0.002579,H6
51,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,D,7,0,0.1091,0.114179,-0.005079,D7
55,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,H,7,0,0.1125,0.114179,-0.001679,H7
59,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,D,8,0,0.1159,0.114179,0.001721,D8
63,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,H,8,0,0.1232,0.114179,0.009021,H8
67,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,D,9,0,0.1119,0.114179,-0.002279,D9
71,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,H,9,0,0.117,0.114179,0.002821,H9
75,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,D,10,0,0.1129,0.114179,-0.001279,D10
79,2019-02-15,tp,b,1,BLANK,absorbance,750_nm,H,10,0,0.113,0.114179,-0.001179,H10


In [8]:
dfscr[dfscr['measurement_type']=='fluorescence'].head() # mean is kept at zero

Unnamed: 0,samp_day,media,condition,replicate,content,measurement_type,wavelength,row,column,t(h),measure,blank_measure_mean,corrected_measure,well
288,2019-02-15,tp,b+c,1,ICL_162,fluorescence,680_nm,A,1,0,26.0,0.0,26.0,A1
289,2019-02-15,tp,b+c,1,ICL_115,fluorescence,680_nm,B,1,0,19.0,0.0,19.0,B1
290,2019-02-15,tp,b+c,1,ICL_144,fluorescence,680_nm,C,1,0,23.0,0.0,23.0,C1
291,2019-02-15,tp,b+c,1,ICL_186A,fluorescence,680_nm,D,1,0,22.0,0.0,22.0,D1
292,2019-02-15,tp,b+c,1,ICL_162,fluorescence,680_nm,E,1,0,27.0,0.0,27.0,E1



# Pre-Process Indiv Replicates
    
    The experiement was conducted such that the biologists made 12 separate daily measurements
    encompassing different conditions and a variable number of replicates per condition. Here, 
    we first separate the dataframe into absorbance and fluorescence, then we separate them by
    the 3 conditions (sc+c, b+c, b) into the respective day the measurements were taken. 
    Once the daily measurements have been segregated by condition, we then concatenate the
    respective abs and fluorescence measurements ensuring that the measurements correspond to 
    'content' the main key.


## Absorbance DFs

In [9]:
abn = dfscr[dfscr['measurement_type']=='absorbance']
abn.shape

(9216, 14)

#### SC+C

In [10]:
# absorbance condition: sc+c
df_scc = abn[abn['condition'] == 'sc+c']

all_scc_abs_dfs = []
for day in df_scc['samp_day'].unique():
    
    daily_reps_dfs = []
    for rep in df_scc[df_scc['samp_day']==day]['replicate'].unique():        
        tdf = df_scc[(df_scc['samp_day']==day)&(df_scc['replicate']==rep)]

        tdf = tdf[['content', 'well', 't(h)', 'corrected_measure']]
        
        tdf =  tdf.set_index(keys=['well', 'content'])
        
        
        daily_reps_dfs.append(tdf)
        
    
    df = pd.concat(daily_reps_dfs, join='outer', axis=0)
    df = df.groupby(['well', 'content']).mean()[['t(h)','corrected_measure']]
    all_scc_abs_dfs.append(df.reset_index())

#### B+C

In [11]:
# absorbance condition: b+c
df_bc = abn[abn['condition'] == 'b+c']

all_bc_abs_dfs =  []
for day in df_bc['samp_day'].unique():
    
    daily_reps_dfs = []
    for rep in df_bc[df_bc['samp_day']==day]['replicate'].unique():
        
        tdf = df_bc[(df_bc['samp_day']==day)&(df_bc['replicate']==rep)]
        tdf = tdf[['content', 'well', 't(h)', 'corrected_measure']]
        
        tdf =  tdf.set_index(keys=['well', 'content'])
        
        
        daily_reps_dfs.append(tdf)
    
    
    
    df = pd.concat(daily_reps_dfs, join='outer', axis=0)
    df = df.groupby(['well', 'content']).mean()[['t(h)','corrected_measure']]
    all_bc_abs_dfs.append(df.reset_index())

#### B

In [12]:
# absorbance condition: b
df_b = abn[abn['condition'] == 'b']

all_b_abs_dfs = []
for day in df_b['samp_day'].unique():
    
    daily_reps_dfs = []
    for rep in df_b[df_b['samp_day']==day]['replicate'].unique():
        
        tdf = df_b[(df_b['samp_day']==day)&(df_b['replicate']==rep)]
        tdf = tdf[['content', 'well', 't(h)', 'corrected_measure']]
        
        tdf =  tdf.set_index(keys=['well', 'content'])
        
        
        daily_reps_dfs.append(tdf)
    
    
    
    df = pd.concat(daily_reps_dfs, join='outer', axis=0)
    df = df.groupby(['well', 'content']).mean()[['t(h)','corrected_measure']]
    all_b_abs_dfs.append(df.reset_index())

## Fluorescence DFs

In [13]:
fls = dfscr[dfscr['measurement_type']=='fluorescence']
fls.shape

(7680, 14)

#### SC+C

In [14]:
# fluorescence condition: sc+c
df_scc = fls[fls['condition'] == 'sc+c']

all_scc_fls_dfs = []
for day in df_scc['samp_day'].unique():
    
    daily_reps_dfs = []
    for rep in df_scc[df_scc['samp_day']==day]['replicate'].unique():
        
        tdf = df_scc[(df_scc['samp_day']==day)&(df_scc['replicate']==rep)]
        tdf = tdf[['content', 'well', 't(h)', 'corrected_measure']]
        
        tdf =  tdf.set_index(keys=['well', 'content'])
        
        
        daily_reps_dfs.append(tdf)
    
    
    
    df = pd.concat(daily_reps_dfs, join='outer', axis=0)
    df = df.groupby(['well', 'content']).mean()[['t(h)','corrected_measure']]
    all_scc_fls_dfs.append(df.reset_index())

#### B+C

In [15]:
# absorbance condition: b+c
df_bc = fls[fls['condition'] == 'b+c']

all_bc_fls_dfs =  []
for day in df_bc['samp_day'].unique():
    
    daily_reps_dfs = []
    for rep in df_bc[df_bc['samp_day']==day]['replicate'].unique():
        
        tdf = df_bc[(df_bc['samp_day']==day)&(df_bc['replicate']==rep)]
        tdf = tdf[['content', 'well', 't(h)', 'corrected_measure']]
        
        tdf =  tdf.set_index(keys=['well', 'content'])
        
        
        daily_reps_dfs.append(tdf)
    
    
    
    df = pd.concat(daily_reps_dfs, join='outer', axis=0)
    df = df.groupby(['well', 'content']).mean()[['t(h)','corrected_measure']]
    all_bc_fls_dfs.append(df.reset_index())

#### B

In [16]:
# absorbance condition: b

# NOTE: there are no Fluorescence measurements
# for this condition hence the empty dataframe

df_b = fls[fls['condition'] == 'b']

all_b_fls_dfs = []
for day in df_b['samp_day'].unique():
    
    daily_reps_dfs = []
    for rep in df_b[df_b['samp_day']==day]['replicate'].unique():
        
        tdf = df_b[(df_b['samp_day']==day)&(df_b['replicate']==rep)]
        tdf = tdf[['content', 'well', 't(h)', 'corrected_measure']]
        
        tdf =  tdf.set_index(keys=['well', 'content'])
        
        
        daily_reps_dfs.append(tdf)
    
    
    
    df = pd.concat(daily_reps_dfs, join='outer', axis=0)
    df
    df = df.groupby(['well', 'content']).mean()[['t(h)','corrected_measure']]
    all_b_fls_dfs.append(df.reset_index())
    

In [17]:
df_b

Unnamed: 0,samp_day,media,condition,replicate,content,measurement_type,wavelength,row,column,t(h),measure,blank_measure_mean,corrected_measure,well


# Concatenate Respective Abs-Fluo Measurements

#### SC+C

In [18]:
#all_scc_abs_dfs

conc_scc = []
for ab,fl in zip(all_scc_abs_dfs, all_scc_fls_dfs):
    ab = ab.rename({'corrected_measure':'corr_abs'}, axis=1)
    ab = ab.set_index(keys=['well', 'content', 't(h)'])
    
    fl = fl.rename({'corrected_measure':'corr_fls'}, axis=1)
    fl = fl.set_index(keys=['well', 'content', 't(h)'])
    df =  pd.concat([ab,fl], join='outer', axis=1).reset_index()
    
    conc_scc.append(df)

#### B+C

In [19]:
conc_bc = []
for ab,fl in zip(all_bc_abs_dfs, all_bc_fls_dfs):
    ab = ab.rename({'corrected_measure':'corr_abs'}, axis=1)
    ab = ab.set_index(keys=['well', 'content', 't(h)'])
    
    fl = fl.rename({'corrected_measure':'corr_fls'}, axis=1)
    fl = fl.set_index(keys=['well', 'content', 't(h)'])
    
    df =  pd.concat([ab,fl], join='outer', axis=1).reset_index()
    
    conc_bc.append(df)

#### B

In [20]:
conc_b = []
for ab in all_b_abs_dfs:
    
    # there are NO B reps in flourescence so this is the control
    ab = ab.rename({'corrected_measure':'corr_abs'}, axis=1)
    #ab = ab.set_index(keys=['well', 'content', 't(h)'])

    conc_b.append(ab)
    


In [21]:
scc_df = pd.concat(conc_scc).reset_index(drop=True)
bc_df = pd.concat(conc_bc).reset_index(drop=True)
b_df = pd.concat(conc_b).reset_index(drop=True)

In [22]:
ths = []
for df in [scc_df, bc_df, b_df]:
    ths.append(df['t(h)'].values)
    

ths = np.array(ths).T
ths = pd.DataFrame(ths)

ths[3] = (ths[0]+ths[1]+ths[2])/3
ths[3] = ths[3].apply(lambda x: int(x))

for df in [scc_df, bc_df, b_df]:
    df['t(h)'] = ths[3]
    
    df.head()
    df.tail()

Unnamed: 0,well,content,t(h),corr_abs,corr_fls
0,A1,ICL_162,0,0.005721,25.0
1,A10,ICL_118B,0,0.005646,19.0
2,A11,ICL_100,0,0.002046,18.25
3,A12,ICL_108,0,0.005871,19.5
4,A2,ICL_202A,0,0.002221,22.5


Unnamed: 0,well,content,t(h),corr_abs,corr_fls
763,H5,ICL_179B,167,0.168479,7934.75
764,H6,Chlamy,167,0.090679,8172.25
765,H7,Chlamy,167,0.093054,8602.25
766,H8,Chlamy,167,0.087879,8208.0
767,H9,Chlamy,167,0.086854,8313.25


Unnamed: 0,well,content,t(h),corr_abs,corr_fls
0,A1,ICL_162,0,0.003238,22.166667
1,A10,ICL_118B,0,0.004588,23.666667
2,A11,ICL_100,0,0.000171,22.333333
3,A12,ICL_108,0,0.005521,23.833333
4,A2,ICL_202A,0,0.003655,24.333333


Unnamed: 0,well,content,t(h),corr_abs,corr_fls
763,H5,ICL_179B,167,0.176729,11052.833333
764,H6,Chlamy,167,0.165012,11958.5
765,H7,Chlamy,167,0.171595,12165.666667
766,H8,Chlamy,167,0.139495,10978.333333
767,H9,Chlamy,167,0.161679,11548.666667


Unnamed: 0,well,content,t(h),corr_abs
0,A1,ICL_162,0,0.001921
1,A10,ICL_118B,0,-0.000879
2,A11,ICL_100,0,0.000521
3,A12,ICL_108,0,-0.002529
4,A2,ICL_202A,0,0.004371


Unnamed: 0,well,content,t(h),corr_abs
763,H5,ICL_179B,167,-0.009871
764,H6,BLANK,167,-0.016221
765,H7,BLANK,167,0.019879
766,H8,BLANK,167,-0.008771
767,H9,BLANK,167,0.026329


In [23]:
path = """.../data_o/screens/screens_preprocessed_data"""
#scc_df.to_csv(pjoin(path,'sc2_condition_sc+c.csv'))
#bc_df.to_csv(pjoin(path,'sc2_condition_b+c.csv'))
#b_df.to_csv(pjoin(path,'sc2_condition_b.csv'))

# Integrity Checking

    First we derive the mapping on the 96-well plate. Note that only the 'B'
    condition has BLANK measurements, and that these blank measurements are
    used to correct the values for all the other wells

In [None]:
mapp_scc = pd.DataFrame()
mapp_bc = pd.DataFrame()
mapp_b = pd.DataFrame()

for cc, bb, aa in zip(conc_scc, conc_bc, conc_b):
    for l in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']:
        
        #cc[cc['well'].str.contains(l)]
        c = cc[cc['well'].str.contains(l)]['content'].values
        mapp_scc[l] = c
        
        b = bb[bb['well'].str.contains(l)]['content'].values
        mapp_bc[l] = b
        
        a = aa[aa['well'].str.contains(l)]['content'].values
        mapp_b[l] = a
    
    cc['t(h)'].unique()
    mapp_scc
    mapp_bc
    mapp_b
    break