# Screen 3 Preprocessing
    
    The media for all replicates is cb_1p6
    Two things are going on in this experiment:
        
        How does the chlamy-to-bact ratio afect growth through time
        
        sc+c or b+c (syncom, indiv_strain) vs. sc or b as control
    
    

# Imports

In [1]:
import numpy as np
import pandas as pd
import os
import scipy

import xlrd
import argparse

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib.gridspec as gridspec

from os.path import join as pjoin

import seaborn as sns
sns.set(style='whitegrid')

%config InteractiveShell.ast_node_interactivity='all'
%config InlineBackend.figure_format = 'svg'
import warnings; warnings.simplefilter('ignore')

# Data Pre-processing

In [2]:
path = '../data_o/screens/sc3'
sc3 = 'screen_3_data.tsv'
tdf = pd.read_csv(pjoin(path,sc3), sep='\t')
tdf.head()


Unnamed: 0,Sampling_date,Sampling_time,Sampling_datetime,Day,Media,Condition,replicate,chlamy_bact_ratio,Measurement_type,Wavelength,...,Measurement_blank_average,Measurement_fixed,ICL_ID,alsphere_ID,Kingdom,Phylum,Class,Order,Family,Genus
0,25.03.2019,14:20:01,2019-03-25 14:20:01,day-3,tp,b+c,rep-2,1.6,Absorbance,600_nm,...,0.119817,0.031883,ICL_116,Chlamy25,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Burkholderiaceae,Cupriavidus
1,25.03.2019,14:20:01,2019-03-25 14:20:01,day-3,tp,b+c,rep-2,1.6,Absorbance,600_nm,...,0.119817,0.048383,ICL_160,Chlamy74,Bacteria,Proteobacteria,Alphaproteobacteria,Caulobacterales,Caulobacteraceae,Brevundimonas
2,25.03.2019,14:20:01,2019-03-25 14:20:01,day-3,tp,b+c,rep-2,1.6,Absorbance,600_nm,...,0.119817,0.040583,ICL_201,Chlamy125,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas
3,25.03.2019,14:20:01,2019-03-25 14:20:01,day-3,tp,b+c,rep-2,1.6,Absorbance,600_nm,...,0.119817,0.042083,ICL_116,Chlamy25,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Burkholderiaceae,Cupriavidus
4,25.03.2019,14:20:01,2019-03-25 14:20:01,day-3,tp,b+c,rep-2,1.6,Absorbance,600_nm,...,0.119817,0.036583,ICL_160,Chlamy74,Bacteria,Proteobacteria,Alphaproteobacteria,Caulobacterales,Caulobacteraceae,Brevundimonas


In [6]:
dfscr = pd.read_csv(pjoin(path,sc3), sep='\t', parse_dates=['Sampling_datetime'], )


# separate taxonomic info for later use
taxonomic_df = dfscr[['alsphere_ID', 'Kingdom', 'Phylum',
                      'Class', 'Order', 'Family', 'Genus']
                    ].drop_duplicates(subset='alsphere_ID').dropna(how='any')
#taxonomic_df.head(2)


# select the information to work with
keep_cols = ['Sampling_datetime', 'replicate', 'Well', 'ICL_ID', 't (h)',
             'Condition', 'chlamy_bact_ratio', 'Measurement_type', 'Wavelength',
             'Measurement', 'Measurement_blank_average', 'Measurement_fixed']

dfscr = dfscr[keep_cols]

#drop duplicates
dfscr=dfscr.drop_duplicates()

# rename columns and remap/rename select values
dfscr.columns = map(str.lower, dfscr.columns)

rename_cols =  {'sampling_datetime':'samp_day', 'icl_id':'content', 't (h)':'t(h)',
                'measurement':'measure', 'chlamy_bact_ratio':'c_b_ratio', 
                'measurement_blank_average':'blank_mean', 'measurement_fixed':'corr_measure'}

dfscr = dfscr.rename(rename_cols, axis=1)

dfscr['replicate']=dfscr['replicate'].map({'rep-1':1,'rep-2':2})
dfscr['measurement_type'] = dfscr['measurement_type'].map(str.lower)
dfscr['measurement_type'] = dfscr['measurement_type'].map({'absorbance':'absorbance',
                                                           'fluorescence bottom reading':'fluorescence'})
wave_rename = {'Ex:440_nm/Em:680_nm':'680_nm', '750_nm':'750_nm', '600_nm':'600_nm'}
dfscr['wavelength'] = dfscr['wavelength'].map(wave_rename)


# sort values by using the measurement times
dfscr = dfscr.sort_values(by=['samp_day', 'c_b_ratio', 'replicate'] )

# reset index
dfscr = dfscr.reset_index(drop=True)

dfscr.head()

Unnamed: 0,samp_day,replicate,well,content,t(h),condition,c_b_ratio,measurement_type,wavelength,measure,blank_mean,corr_measure
0,2019-03-22 14:43:00,1,A1,ICL_116,0.0,b+c,160,absorbance,600_nm,0.1442,0.114167,0.030033
1,2019-03-22 14:43:00,1,B1,ICL_160,0.0,b+c,160,absorbance,600_nm,0.1461,0.114167,0.031933
2,2019-03-22 14:43:00,1,C1,ICL_201,0.0,b+c,160,absorbance,600_nm,0.1468,0.114167,0.032633
3,2019-03-22 14:43:00,1,D1,ICL_116,0.0,b+c,160,absorbance,600_nm,0.14,0.114167,0.025833
4,2019-03-22 14:43:00,1,E1,ICL_160,0.0,b+c,160,absorbance,600_nm,0.1464,0.114167,0.032233


## Correct Values Using 'BLANK'

In [7]:
# use the first days averages of blanks by condition and wavelength and apply it to subsequent days
gb = dfscr[(dfscr['samp_day']<='2019-03-23') & (dfscr['content']=='BLANK')].groupby(['condition','wavelength'])

correcting_dict = {}
for name,group in gb:
    correcting_dict[name] = group['measure'].mean()

# b+c and sc+c at the measured wavelengths have no blank measurements, for these we will average the 
# existing blanks of the first day since these happen to be in the same range
gmean = np.array([0.11525833234190941, 0.11583333276212215, 0.11559999982515971, 0.11473333090543747]).mean()
gmean


##
correcting_dict[('b+c', '600_nm')] = gmean
correcting_dict[('b+c', '680_nm')] = gmean
correcting_dict[('b+c', '750_nm')] = gmean

correcting_dict[('sc+c', '600_nm')] = gmean
correcting_dict[('sc+c', '680_nm')] = gmean
correcting_dict[('sc+c', '750_nm')] = gmean


gb = dfscr.groupby(['condition', 'wavelength'])

groups = []
for name,group in gb:
    #correcting_dict[name]
    group = group.copy()
    group['blank_mean'] = correcting_dict[name]
    group['corr_measure'] = group['measure'] - group['blank_mean']
    groups.append(group)

dfscr = pd.concat(groups).sort_values(by=['samp_day',
                                          'c_b_ratio',
                                          'replicate'] ).reset_index(drop=True)

dfscr.head()
dfscr.tail()

0.11535624895865719

Unnamed: 0,samp_day,replicate,well,content,t(h),condition,c_b_ratio,measurement_type,wavelength,measure,blank_mean,corr_measure
0,2019-03-22 14:43:00,1,A1,ICL_116,0.0,b+c,160,absorbance,600_nm,0.1442,0.115356,0.028844
1,2019-03-22 14:43:00,1,B1,ICL_160,0.0,b+c,160,absorbance,600_nm,0.1461,0.115356,0.030744
2,2019-03-22 14:43:00,1,C1,ICL_201,0.0,b+c,160,absorbance,600_nm,0.1468,0.115356,0.031444
3,2019-03-22 14:43:00,1,D1,ICL_116,0.0,b+c,160,absorbance,600_nm,0.14,0.115356,0.024644
4,2019-03-22 14:43:00,1,E1,ICL_160,0.0,b+c,160,absorbance,600_nm,0.1464,0.115356,0.031044


Unnamed: 0,samp_day,replicate,well,content,t(h),condition,c_b_ratio,measurement_type,wavelength,measure,blank_mean,corr_measure
31387,2019-04-03 17:20:26,1,D12,ICL_103,290.0,b,-,absorbance,750_nm,0.1123,0.115833,-0.003533
31388,2019-04-03 17:20:26,1,E12,ICL_48,290.0,b,-,absorbance,750_nm,0.1103,0.115833,-0.005533
31389,2019-04-03 17:20:26,1,F12,ICL_103,290.0,b,-,absorbance,750_nm,0.1162,0.115833,0.000367
31390,2019-04-03 17:20:26,1,G12,ICL_48,290.0,b,-,absorbance,750_nm,0.1226,0.115833,0.006767
31391,2019-04-03 17:20:26,1,H12,BLANK,290.0,b,-,absorbance,750_nm,0.1183,0.115833,0.002467


# Save to Disk

In [8]:
# # once the ambiguity has been removed save to disk
# fname =  'screen_3_parsed_data.tsv'
# save_to = os.path.join(path, fname)

# dfscr.to_csv(save_to, sep='\t', index=False)

In [9]:
# round datetimes
dfscr['samp_day'] = dfscr['samp_day'].dt.round('D')


# Segregate by Condition & Join Respective Wavelengths

# Condition B+C, B @ 600

In [10]:
A600 = dfscr[(dfscr['measurement_type']=='absorbance')&
             (dfscr['wavelength']=='600_nm')]
A600.shape

A600['samp_day'].unique() # note that its collected for only six days

# media-partioned A600 data: condition = 'bc'
df_cb160 = A600[(A600['c_b_ratio']=='160')&(A600['condition'] == 'b+c')]
df_cb16 = A600[(A600['c_b_ratio']=='16')&(A600['condition'] == 'b+c')]
df_cb_1p6 = A600[(A600['c_b_ratio']=='1.6')&(A600['condition'] == 'b+c')]
df_cb_0 = A600[(A600['c_b_ratio']=='-')&(A600['condition'] == 'b')]


df_cb160.shape
df_cb16.shape
df_cb_1p6.shape
df_cb_0.shape

daily_cb160_reps_dfs_600 = []
daily_cb16_reps_dfs_600 = []
daily_cb_1p6_reps_dfs_600 = []
daily_cb_0_reps_dfs_600 = []
for day in A600['samp_day'].unique():
    #day
    keep_cols = ['t(h)', 'well', 'replicate', 'content', 'measure', 'corr_measure']
    cb160 = df_cb160[(df_cb160['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb16 = df_cb16[(df_cb16['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_1p6 = df_cb_1p6[(df_cb_1p6['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_0 = df_cb_0[(df_cb_0['samp_day']==day)][keep_cols].reset_index(drop=True)
    
    
    # ensure that the daily replicates have a mean 't(h)' column this helps
    # plot the information on a timewise manner much simpler
    try:
        for df in  [cb160, cb16, cb_1p6, cb_0]:
            df['t(h)'] = int(df['t(h)'].mean())
    except:
        day
        #df
        continue
    
    cb160 = cb160.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)
    cb16 = cb16.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)
    cb_1p6 = cb_1p6.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)
    cb_0 = cb_0.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)

    daily_cb160_reps_dfs_600.append(cb160)
    daily_cb16_reps_dfs_600.append(cb16)
    daily_cb_1p6_reps_dfs_600.append(cb_1p6)
    daily_cb_0_reps_dfs_600.append(cb_0)
    

(11424, 12)

array(['2019-03-23T00:00:00.000000000', '2019-03-24T00:00:00.000000000',
       '2019-03-25T00:00:00.000000000', '2019-03-26T00:00:00.000000000',
       '2019-03-27T00:00:00.000000000', '2019-03-28T00:00:00.000000000',
       '2019-03-29T00:00:00.000000000', '2019-03-30T00:00:00.000000000',
       '2019-04-02T00:00:00.000000000', '2019-04-04T00:00:00.000000000'],
      dtype='datetime64[ns]')

(1920, 12)

(1728, 12)

(1920, 12)

(1920, 12)

numpy.datetime64('2019-03-30T00:00:00.000000000')

# Condition B+C, B @ 750

In [11]:
A750 = dfscr[(dfscr['measurement_type']=='absorbance')&
             (dfscr['wavelength']=='750_nm')]
A750.shape

A750['samp_day'].unique() # note that its collected for only six days


# media-partioned A600 data: condition = 'bc'
df_cb160 = A750[(A750['c_b_ratio']=='160')&(A750['condition'] == 'b+c')]
df_cb16 = A750[(A750['c_b_ratio']=='16')&(A750['condition'] == 'b+c')]
df_cb_1p6 = A750[(A750['c_b_ratio']=='1.6')&(A750['condition'] == 'b+c')]
df_cb_0 = A750[(A750['c_b_ratio']=='-')&(A750['condition'] == 'b')]



# there are 6 DFs in the lists below, each houses 3 different sets of strains ('replicates')
daily_cb160_reps_dfs_750 = []
daily_cb16_reps_dfs_750 = []
daily_cb_1p6_reps_dfs_750 = []
daily_cb_0_reps_dfs_750 = []
for day in A750['samp_day'].unique():
    keep_cols = ['t(h)', 'well', 'replicate', 'content', 'measure', 'corr_measure']
    cb160 = df_cb160[(df_cb160['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb16 = df_cb16[(df_cb16['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_1p6 = df_cb_1p6[(df_cb_1p6['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_0 = df_cb_0[(df_cb_0['samp_day']==day)][keep_cols].reset_index(drop=True)
    
    
    # ensure that the daily replicates have a mean 't(h)' column
    try:

        for df in  [cb160, cb16, cb_1p6, cb_0]:
            df['t(h)'] = int(df['t(h)'].mean())
    except:
        day
        #df
        continue
    
    cb160 = cb160.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    cb16 = cb16.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    cb_1p6 = cb_1p6.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    cb_0 = cb_0.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    
    
    daily_cb160_reps_dfs_750.append(cb160)
    daily_cb16_reps_dfs_750.append(cb16)
    daily_cb_1p6_reps_dfs_750.append(cb_1p6)
    daily_cb_0_reps_dfs_750.append(cb_0)

(11424, 12)

array(['2019-03-23T00:00:00.000000000', '2019-03-24T00:00:00.000000000',
       '2019-03-25T00:00:00.000000000', '2019-03-26T00:00:00.000000000',
       '2019-03-27T00:00:00.000000000', '2019-03-28T00:00:00.000000000',
       '2019-03-29T00:00:00.000000000', '2019-03-30T00:00:00.000000000',
       '2019-04-02T00:00:00.000000000', '2019-04-04T00:00:00.000000000'],
      dtype='datetime64[ns]')

numpy.datetime64('2019-03-30T00:00:00.000000000')

# Concatenate B+C, B

In [12]:
# zip ensure that only lists of mathching size get put together, thus even though A750 DFs
# have 10 days worth of data they are limited by the length of A600 data, this matters
# because it makes no sense to apply the models to non-existing A600 data


############
### cb160 ###
############
cb160_final_daily_BC = []
for a6,a7 in zip(daily_cb160_reps_dfs_600, daily_cb160_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb160_final_daily_BC.append(df)
    
cb160_bc_df = pd.concat(cb160_final_daily_BC)

############
### cb16 ###
############
cb16_final_daily_BC = []
for a6,a7 in zip(daily_cb16_reps_dfs_600, daily_cb16_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb16_final_daily_BC.append(df)

cb16_bc_df = pd.concat(cb16_final_daily_BC)


############
### cb_1p6 ###
############
cb_1p6_final_daily_BC = []
for a6,a7 in zip(daily_cb_1p6_reps_dfs_600, daily_cb_1p6_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb_1p6_final_daily_BC.append(df)

cb_1p6_bc_df = pd.concat(cb_1p6_final_daily_BC)


############
### cb_0 ###
############
cb_0_final_daily_BC = []
for a6,a7 in zip(daily_cb_0_reps_dfs_600, daily_cb_0_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb_0_final_daily_BC.append(df)

    
cb_0_bc_df = pd.concat(cb_0_final_daily_BC)

In [13]:
cb160_bc_df['b_c_ratio'] = '160'
cb16_bc_df['b_c_ratio'] = '16'
cb_1p6_bc_df['b_c_ratio'] = '1.6'
cb_0_bc_df['b_c_ratio'] = '-'

bc_dfs = [cb160_bc_df, cb16_bc_df, cb_1p6_bc_df, cb_0_bc_df]

bcdfs = pd.concat(bc_dfs).reset_index(drop=True)

path = '/home/rdmtinez/Documents/B-IT MS Program/Masters Thesis/data_o/screens/screens_preprocessed_data'
fname = 'sc3_test_ratios_condition_b+c.csv'
bcdfs.to_csv(pjoin(path, fname))
# save to disk

# Condition SC+C, B @ 600

In [14]:
A600 = dfscr[(dfscr['measurement_type']=='absorbance')&
             (dfscr['wavelength']=='600_nm')]
A600.shape

A600['samp_day'].unique() # note that its collected for only six days

# media-partioned A600 data: condition = 'bc'
df_cb160 = A600[(A600['c_b_ratio']=='160')&(A600['condition'] == 'sc+c')]
df_cb16 = A600[(A600['c_b_ratio']=='16')&(A600['condition'] == 'sc+c')]
df_cb_1p6 = A600[(A600['c_b_ratio']=='1.6')&(A600['condition'] == 'sc+c')]
df_cb_0 = A600[(A600['c_b_ratio']=='-')&(A600['condition'] == 'sc')]


df_cb160.shape
df_cb16.shape
df_cb_1p6.shape
df_cb_0.shape

daily_cb160_reps_dfs_600 = []
daily_cb16_reps_dfs_600 = []
daily_cb_1p6_reps_dfs_600 = []
daily_cb_0_reps_dfs_600 = []
for day in A600['samp_day'].unique():
    #day
    keep_cols = ['t(h)', 'well', 'replicate', 'content', 'measure', 'corr_measure']
    cb160 = df_cb160[(df_cb160['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb16 = df_cb16[(df_cb16['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_1p6 = df_cb_1p6[(df_cb_1p6['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_0 = df_cb_0[(df_cb_0['samp_day']==day)][keep_cols].reset_index(drop=True)
    
    
    # ensure that the daily replicates have a mean 't(h)' column this helps
    # plot the information on a timewise manner much simpler
    try:
        for df in  [cb160, cb16, cb_1p6, cb_0]:
            df['t(h)'] = int(df['t(h)'].mean())
    except:
        day
        #df
        continue
    
    cb160 = cb160.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)
    cb16 = cb16.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)
    cb_1p6 = cb_1p6.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)
    cb_0 = cb_0.rename({'measure':'A600', 'corr_measure':'A600c'}, axis=1)

    daily_cb160_reps_dfs_600.append(cb160)
    daily_cb16_reps_dfs_600.append(cb16)
    daily_cb_1p6_reps_dfs_600.append(cb_1p6)
    daily_cb_0_reps_dfs_600.append(cb_0)
    

(11424, 12)

array(['2019-03-23T00:00:00.000000000', '2019-03-24T00:00:00.000000000',
       '2019-03-25T00:00:00.000000000', '2019-03-26T00:00:00.000000000',
       '2019-03-27T00:00:00.000000000', '2019-03-28T00:00:00.000000000',
       '2019-03-29T00:00:00.000000000', '2019-03-30T00:00:00.000000000',
       '2019-04-02T00:00:00.000000000', '2019-04-04T00:00:00.000000000'],
      dtype='datetime64[ns]')

(960, 12)

(1056, 12)

(960, 12)

(960, 12)

# Condition SC+C, B @ 750

In [15]:
A750 = dfscr[(dfscr['measurement_type']=='absorbance')&
             (dfscr['wavelength']=='750_nm')]
A750.shape

A750['samp_day'].unique() # note that its collected for only six days


# media-partioned A600 data: condition = 'bc'
df_cb160 = A750[(A750['c_b_ratio']=='160')&(A750['condition'] == 'sc+c')]
df_cb16 = A750[(A750['c_b_ratio']=='16')&(A750['condition'] == 'sc+c')]
df_cb_1p6 = A750[(A750['c_b_ratio']=='1.6')&(A750['condition'] == 'sc+c')]
df_cb_0 = A750[(A750['c_b_ratio']=='-')&(A750['condition'] == 'sc')]

df_cb160.shape
df_cb16.shape
df_cb_1p6.shape
df_cb_0.shape

# there are 6 DFs in the lists below, each houses 3 different sets of strains ('replicates')
daily_cb160_reps_dfs_750 = []
daily_cb16_reps_dfs_750 = []
daily_cb_1p6_reps_dfs_750 = []
daily_cb_0_reps_dfs_750 = []
for day in A750['samp_day'].unique():
    keep_cols = ['t(h)', 'well', 'replicate', 'content', 'measure', 'corr_measure']
    cb160 = df_cb160[(df_cb160['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb16 = df_cb16[(df_cb16['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_1p6 = df_cb_1p6[(df_cb_1p6['samp_day']==day)][keep_cols].reset_index(drop=True)
    cb_0 = df_cb_0[(df_cb_0['samp_day']==day)][keep_cols].reset_index(drop=True)
    
    
    # ensure that the daily replicates have a mean 't(h)' column
    try:

        for df in  [cb160, cb16, cb_1p6, cb_0]:
            df['t(h)'] = int(df['t(h)'].mean())
    except:
        day
        #df
        continue
    
    cb160 = cb160.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    cb16 = cb16.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    cb_1p6 = cb_1p6.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    cb_0 = cb_0.rename({'measure':'A750', 'corr_measure':'A750c'}, axis=1)
    
    
    daily_cb160_reps_dfs_750.append(cb160)
    daily_cb16_reps_dfs_750.append(cb16)
    daily_cb_1p6_reps_dfs_750.append(cb_1p6)
    daily_cb_0_reps_dfs_750.append(cb_0)
    

(11424, 12)

array(['2019-03-23T00:00:00.000000000', '2019-03-24T00:00:00.000000000',
       '2019-03-25T00:00:00.000000000', '2019-03-26T00:00:00.000000000',
       '2019-03-27T00:00:00.000000000', '2019-03-28T00:00:00.000000000',
       '2019-03-29T00:00:00.000000000', '2019-03-30T00:00:00.000000000',
       '2019-04-02T00:00:00.000000000', '2019-04-04T00:00:00.000000000'],
      dtype='datetime64[ns]')

(960, 12)

(1056, 12)

(960, 12)

(960, 12)

# Concatenate SC+C, SC

In [16]:
# zip ensure that only lists of mathching size get put together, thus even though A750 DFs
# have 10 days worth of data they are limited by the length of A600 data, this matters
# because it makes no sense to apply the models to non-existing A600 data


############
### cb160 ###
############
cb160_final_daily_SC = []
for a6,a7 in zip(daily_cb160_reps_dfs_600, daily_cb160_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb160_final_daily_SC.append(df)
    
cb160_sc_df = pd.concat(cb160_final_daily_SC)

############
### cb16 ###
############
cb16_final_daily_SC = []
for a6,a7 in zip(daily_cb16_reps_dfs_600, daily_cb16_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb16_final_daily_SC.append(df)

cb16_sc_df = pd.concat(cb16_final_daily_SC)


############
### cb_1p6 ###
############
cb_1p6_final_daily_SC = []
for a6,a7 in zip(daily_cb_1p6_reps_dfs_600, daily_cb_1p6_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb_1p6_final_daily_SC.append(df)

cb_1p6_sc_df = pd.concat(cb_1p6_final_daily_SC)


############
### cb_0 ###
############
cb_0_final_daily_SC = []
for a6,a7 in zip(daily_cb_0_reps_dfs_600, daily_cb_0_reps_dfs_750):

    a6 = a6.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    a7 = a7.set_index(keys=['t(h)', 'well','replicate', 'content'])#, drop=True)
    
    df = pd.concat([a6, a7], join='outer', axis=1).reset_index()
    
    cb_0_final_daily_SC.append(df)

    
cb_0_sc_df = pd.concat(cb_0_final_daily_SC)

In [17]:
cb160_sc_df['b_c_ratio'] = '160'
cb16_sc_df['b_c_ratio'] = '16'
cb_1p6_sc_df['b_c_ratio'] = '1.6'
cb_0_sc_df['b_c_ratio'] = '-'

sc_dfs = [cb160_sc_df, cb16_sc_df, cb_1p6_sc_df, cb_0_sc_df]

scdfs = pd.concat(bc_dfs).reset_index(drop=True)

path = '../data_o/screens/screens_preprocessed_data'
fname = 'sc3_test_ratios_condition_sc+c.csv'


#scdfs.to_csv(pjoin(path, fname))
# save to disk