# Screen 1 Preprocessing
    
    The media is the main point of change in this experiment
    
    There are 4 types of media, these dictate what should grow and
    what shouldn't grow


# Imports

In [2]:
import numpy as np
import pandas as pd
import os
import scipy

import pandas_profiling

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib.gridspec as gridspec

from os.path import join as pjoin 

import seaborn as sns
sns.set(style='whitegrid')

%config InteractiveShell.ast_node_interactivity='all'
%config InlineBackend.figure_format = 'svg'
import warnings; warnings.simplefilter('ignore')

In [3]:
# Show up to 300 rows when a DataFrame is displayed.
# The fully qualified key is required: the short pattern 'max_rows' matches
# more than one option on recent pandas releases and raises an OptionError.
pd.set_option('display.max_rows', 300)

# Data Pre-Processing

In [4]:
# Folder (relative to the notebook) and file name of the raw screen-1 table.
path, fname = '../data_o/screens/sc1', 'screen_1_w_gene_info.tsv'

In [5]:
# Load the tab-separated raw table into memory.
filepath = pjoin(path, fname)
data = pd.read_csv(filepath, sep='\t')

In [33]:
# The raw table (1,063,792 rows) is about 20x longer than expected from the
# raw files, which were originally parsed by a colleague, because it holds
# many duplicated samples. Duplicates are therefore dropped below.

# Only the columns needed for this specific work are carried forward.
keep_cols = [
    'Sampling_date', 'Media', 'Condition', 'Plate', 'Strain_id',
    'Measurement_type', 'Wavelength', 'Row', 'Column', 't (h)',
    'Measurement', 'Measurement_blank_average', 'Measurement_fixed',
]

# De-duplicating AFTER the column selection removes many more rows, because
# rows that differed only in the discarded columns now collapse into one.
dfscr = data[keep_cols].drop_duplicates()
dfscr.head()
dfscr.shape  # 51840

# Lowercase every column label ...
dfscr.columns = [label.lower() for label in dfscr.columns]

# ... then shorten the verbose ones.
rename_cols = {
    'strain_id': 'content',
    't (h)': 't(h)',
    'measurement': 'measure',
    'measurement_blank_average': 'blank_measure_mean',
    'sampling_date': 'samp_day',
    'measurement_fixed': 'corrected_measure',
}
dfscr = dfscr.rename(columns=rename_cols)

# Replace the measurement-type labels with short lowercase names.
# Series.map leaves NaN for any label that is not a key of the mapping.
type_rename = {'Fluorescence Bottom Reading': 'fluorescence', 'Absorbance': 'absorbance'}
dfscr['measurement_type'] = dfscr['measurement_type'].map(type_rename)

# Same for the wavelength labels (the fluorescence Ex/Em pair becomes '680_nm').
wave_rename = {'Ex:440_nm/Em:680_nm': '680_nm', '750_nm': '750_nm', '600_nm': '600_nm'}
dfscr['wavelength'] = dfscr['wavelength'].map(wave_rename)

# Plate and column numbers are categorical labels, so store them as strings;
# the elapsed time in hours is cast to a whole integer instead.
for id_col in ('plate', 'column'):
    dfscr[id_col] = dfscr[id_col].map(str)
dfscr['t(h)'] = dfscr['t(h)'].map(int)

# A single 'well' label (e.g. 'A1') replaces the separate row/column fields.
dfscr['well'] = dfscr['row'] + dfscr['column']
dfscr = dfscr.drop(columns=['row', 'column'])

# Sampling dates are parsed day-first.
dfscr['samp_day'] = pd.to_datetime(dfscr['samp_day'], dayfirst=True)

# Order the rows by sampling day first, then media, condition, plate and
# measurement type, and renumber the index from zero.
sort_keys = ['samp_day', 'media', 'condition', 'plate', 'measurement_type']
dfscr = dfscr.sort_values(by=sort_keys).reset_index(drop=True)

dfscr.head()
dfscr.shape

Unnamed: 0,Sampling_date,Media,Condition,Plate,Strain_id,Measurement_type,Wavelength,Row,Column,t (h),Measurement,Measurement_blank_average,Measurement_fixed
0,14.12.2018,tpn,b,1,204,Absorbance,750_nm,A,1,210.0,0.0949,0.097194,-0.002294
1,14.12.2018,tpn,b,1,161B,Absorbance,750_nm,B,1,210.0,0.0944,0.097194,-0.002794
2,14.12.2018,tpn,b,1,202B,Absorbance,750_nm,C,1,210.0,0.1016,0.097194,0.004406
3,14.12.2018,tpn,b,1,BLANK,Absorbance,750_nm,D,1,210.0,0.0938,0.097194,-0.003394
34,14.12.2018,tpn,b,1,129A,Absorbance,750_nm,E,1,210.0,0.1004,0.097194,0.003206


(51840, 13)

Unnamed: 0,samp_day,media,condition,plate,content,measurement_type,wavelength,t(h),measure,blank_measure_mean,corrected_measure,well
0,2018-12-05,tap,b,1,204,absorbance,600_nm,1,0.1177,0.120141,-0.002441,A1
1,2018-12-05,tap,b,1,161B,absorbance,600_nm,1,0.1104,0.120141,-0.009741,B1
2,2018-12-05,tap,b,1,202B,absorbance,600_nm,1,0.1114,0.120141,-0.008741,C1
3,2018-12-05,tap,b,1,BLANK,absorbance,600_nm,1,0.1129,0.120141,-0.007241,D1
4,2018-12-05,tap,b,1,129A,absorbance,600_nm,1,0.1165,0.120141,-0.003641,E1


(51840, 12)

## Correct Values Using 'BLANK'

In [6]:
# It was noticed that the data for screen_1 did not have BLANK measurements for the condition 'bc' in any
# of the media. Blank measurements were taken only when the condition was 'b'. To correct for this,
# the daily average of the 3 replicates per media of condition 'b' was used to correct the signal of
# condition 'bc'. It is also worth mentioning that the variance of the blank measurements per media has some
# strong outliers, as noted by the 'max' values below. After talking to my colleague it was determined
# that evaporation and condensation were probably the main culprits and that perhaps only the initial BLANK
# measurements should be considered....

# I'll consider both cases and see if any significant difference arise in the analysis
    
# To complicate matters more, blank fluorescence measurements were taken only before 08.12.2018 and only
# when the condition was 'b', as previously mentioned. Recall that these values were then averaged and entered
# into the 'blank_measure_mean' column for when the condition is 'bc'. To correct the data after 08.12, the
# 'blank' average for a given media will be applied to the days that have no blank measurement.



# In this particular case we will get the average blank measurement of condition 'b'
# (as it is the only one that has blank measurements) over each day, media, wavelength
# and plate, and apply it to all conditions. To ensure that all values are changed
# accordingly, we will remove all the 'measurement_blank_mean' values and check the
# resulting dataframe.


####### BLANK means ##########
#blank_means

# After samp_day == 2018-12-08 there are no more 'b'-only replicates for fluorescence values,
# and therefore no more blank measurements exist, as only the plates with 'b'
# conditions were fitted with blanks. Therefore the last blank measurement mean will be used to
# fill in the blanks of those replicates. Furthermore, those blank measurements tend to be
# whole numbers, so the mean value will be rounded to the nearest whole number, or to the median.

        
        # changes the NaN values of the blank mean to the last known blank mean,
        # irrespective of date and on a per-media and per-plate basis; although
        # these last two conditions should not matter much, it is set this
        # way for thoroughness


In [7]:
# Reference blanks: the BLANK wells measured on the first day (2018-12-05),
# averaged per (media, wavelength). These values correct ALL days.
first_day_blanks = dfscr[(dfscr['content'] == 'BLANK') & (dfscr['samp_day'] == '2018-12-05')]
gb = first_day_blanks.groupby(['media', 'wavelength'])
correcting_dict = {key: frame['measure'].mean() for key, frame in gb}

# Overwrite the blank column with the reference value of each
# (media, wavelength) group and recompute the corrected signal from it.
# A group with no first-day BLANK raises KeyError on the dictionary lookup.
gb = dfscr.groupby(['media', 'wavelength'])
groups = []
for key, frame in gb:
    reference_blank = correcting_dict[key]
    groups.append(frame.assign(
        blank_measure_mean=reference_blank,
        corrected_measure=frame['measure'] - reference_blank,
    ))

# Rows come back grouped by (media, wavelength); original index labels are kept.
dfscr = pd.concat(groups)


('tap', '600_nm')

0.11966893178166695

('tap', '680_nm')

2.6990291262135924

('tap', '750_nm')

0.11733495096847849

('tapn', '600_nm')

0.11803009715473768

('tapn', '680_nm')

3.262135922330097

('tapn', '750_nm')

0.11649611613993506

('tp', '600_nm')

0.14495922342955486

('tp', '680_nm')

3.0388349514563107

('tp', '750_nm')

0.1374310676767988

('tpn', '600_nm')

0.11881262136315837

('tpn', '680_nm')

3.0388349514563107

('tpn', '750_nm')

0.11742135911311918

In [8]:
# Display every row that still holds a missing value in any column.
# The check is shown twice, once per spelling (isnull / isna).
dfscr[dfscr.isnull().any(axis='columns')]
dfscr[dfscr.isna().any(axis='columns')]

Unnamed: 0,samp_day,media,condition,plate,content,measurement_type,wavelength,t(h),measure,blank_measure_mean,corrected_measure,well


Unnamed: 0,samp_day,media,condition,plate,content,measurement_type,wavelength,t(h),measure,blank_measure_mean,corrected_measure,well


# Save to Disk

In [9]:
# Once the ambiguity has been removed, save the parsed table to disk.
# (The assignment below used to carry a stray leading space, which is an
# "unexpected indent" error after a column-0 line.)
fname = 'screen_1_parsed_data.tsv'

# The export itself is disabled; uncomment the two lines below to write it.
# save_to = os.path.join(path, fname)
# dfscr.to_csv(save_to, sep='\t', index=False)

# Segregate by Condition and Media

    A600 data was collected for only six days, after this the data became too noisy and was no longer
    measured. A750 was measured over 10 days along with Fluorescence.
    
    # Note:

    Since the Models DO NOT include Fluorescence as a parameter,
    it is not merged onto later dataframes for visualization, nor
    does the concatenation need to include the latter 4 days, given
    that no models using A750 and F680 are made. Those models
    do not have very good predictive power, so they are ignored.

In [10]:
# Peek at the first five rows of the blank-corrected table.
dfscr.head(n=5)

Unnamed: 0,samp_day,media,condition,plate,content,measurement_type,wavelength,t(h),measure,blank_measure_mean,corrected_measure,well
0,2018-12-05,tap,b,1,204,absorbance,600_nm,1,0.1177,0.119669,-0.001969,A1
1,2018-12-05,tap,b,1,161B,absorbance,600_nm,1,0.1104,0.119669,-0.009269,B1
2,2018-12-05,tap,b,1,202B,absorbance,600_nm,1,0.1114,0.119669,-0.008269,C1
3,2018-12-05,tap,b,1,BLANK,absorbance,600_nm,1,0.1129,0.119669,-0.006769,D1
4,2018-12-05,tap,b,1,129A,absorbance,600_nm,1,0.1165,0.119669,-0.003169,E1


# CONDITION: B+C 
    
    Bacteria + Chlamy

### A600 Partition

In [11]:
# Absorbance readings taken at 600 nm, across all media and conditions.
is_a600 = dfscr['measurement_type'].eq('absorbance') & dfscr['wavelength'].eq('600_nm')
A600 = dfscr[is_a600]
A600.shape

(13824, 12)

In [12]:
# Sampling days present in the A600 subset (only six days were collected).
A600.samp_day.unique()

array(['2018-12-05T00:00:00.000000000', '2018-12-06T00:00:00.000000000',
       '2018-12-07T00:00:00.000000000', '2018-12-08T00:00:00.000000000',
       '2018-12-09T00:00:00.000000000', '2018-12-10T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [13]:
# Split the A600 readings of condition 'bc' into one frame per medium.
is_bc = A600['condition'] == 'bc'
df_tap, df_tapn, df_tp, df_tpn = (
    A600[(A600['media'] == medium) & is_bc]
    for medium in ('tap', 'tapn', 'tp', 'tpn')
)

In [14]:
# Build, per medium, a list with one frame per A600 sampling day (6 frames);
# each frame holds that day's 3 sets of strains ('plates').
keep_cols = ['t(h)', 'well', 'plate', 'content', 'measure', 'corrected_measure']
a600_names = {'measure': 'A600', 'corrected_measure': 'A600c'}

daily_tap_reps_dfs_600, daily_tapn_reps_dfs_600 = [], []
daily_tp_reps_dfs_600, daily_tpn_reps_dfs_600 = [], []

media_buckets = (
    (df_tap, daily_tap_reps_dfs_600),
    (df_tapn, daily_tapn_reps_dfs_600),
    (df_tp, daily_tp_reps_dfs_600),
    (df_tpn, daily_tpn_reps_dfs_600),
)

for day in A600['samp_day'].unique():
    for media_df, bucket in media_buckets:
        daily = media_df.loc[media_df['samp_day'] == day, keep_cols].reset_index(drop=True)

        # Give every replicate of the day the same time stamp: the mean
        # 't(h)' of the day's rows, truncated to an integer.
        daily['t(h)'] = int(daily['t(h)'].mean())

        bucket.append(daily.rename(columns=a600_names))

In [15]:
# Sanity check: number of daily frames, plus a peek at the first TAP frame.
len(daily_tapn_reps_dfs_600)
for df in daily_tap_reps_dfs_600[:1]:
    df.head()

6

Unnamed: 0,t(h),well,plate,content,A600,A600c
0,0,A1,1,204,0.1306,0.010931
1,0,B1,1,161B,0.1297,0.010031
2,0,C1,1,202B,0.1266,0.006931
3,0,D1,1,Chlamy,0.1289,0.009231
4,0,E1,1,129A,0.1241,0.004431


## A750 Partition

In [16]:
# Absorbance readings taken at 750 nm, across all media and conditions.
is_a750 = dfscr['measurement_type'].eq('absorbance') & dfscr['wavelength'].eq('750_nm')
A750 = dfscr[is_a750]
A750.shape

(23040, 12)

In [17]:
# Sampling days of the A750 data (TAP medium, condition 'bc'). Unlike A600,
# A750 was collected over all 10 days of the experiment; I believe it was
# decided at some point that the A600 signal was too noisy.
A750.loc[(A750['media'] == 'tap') & (A750['condition'] == 'bc'), 'samp_day'].unique()

array(['2018-12-05T00:00:00.000000000', '2018-12-06T00:00:00.000000000',
       '2018-12-07T00:00:00.000000000', '2018-12-08T00:00:00.000000000',
       '2018-12-09T00:00:00.000000000', '2018-12-10T00:00:00.000000000',
       '2018-12-11T00:00:00.000000000', '2018-12-12T00:00:00.000000000',
       '2018-12-13T00:00:00.000000000', '2018-12-14T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [18]:
# Split the A750 readings of condition 'bc' into one frame per medium.
is_bc = A750['condition'] == 'bc'
df_tap, df_tapn, df_tp, df_tpn = (
    A750[(A750['media'] == medium) & is_bc]
    for medium in ('tap', 'tapn', 'tp', 'tpn')
)

In [19]:
# Build, per medium, a list with one frame per A750 sampling day (10 frames);
# each frame holds that day's 3 sets of strains ('plates').
keep_cols = ['t(h)', 'well', 'plate', 'content', 'measure', 'corrected_measure']
a750_names = {'measure': 'A750', 'corrected_measure': 'A750c'}

daily_tap_reps_dfs_750, daily_tapn_reps_dfs_750 = [], []
daily_tp_reps_dfs_750, daily_tpn_reps_dfs_750 = [], []

media_buckets = (
    (df_tap, daily_tap_reps_dfs_750),
    (df_tapn, daily_tapn_reps_dfs_750),
    (df_tp, daily_tp_reps_dfs_750),
    (df_tpn, daily_tpn_reps_dfs_750),
)

for day in A750['samp_day'].unique():
    for media_df, bucket in media_buckets:
        daily = media_df.loc[media_df['samp_day'] == day, keep_cols].reset_index(drop=True)

        # Give every replicate of the day the same time stamp: the mean
        # 't(h)' of the day's rows, truncated to an integer.
        daily['t(h)'] = int(daily['t(h)'].mean())

        bucket.append(daily.rename(columns=a750_names))

In [20]:
# Sanity check: number of daily frames, plus a peek at the first TAP frame.
len(daily_tapn_reps_dfs_750)
for df in daily_tap_reps_dfs_750[:1]:
    df.head()

10

Unnamed: 0,t(h),well,plate,content,A750,A750c
0,0,A1,1,204,0.1275,0.010165
1,0,B1,1,161B,0.1264,0.009065
2,0,C1,1,202B,0.1223,0.004965
3,0,D1,1,Chlamy,0.1255,0.008165
4,0,E1,1,129A,0.1215,0.004165


# Concatenate B+C 6-day DFs 

In [21]:
# zip() stops at the shorter of its inputs, so although the A750 lists hold
# 10 days of data only the days that also exist for A600 are paired up. This
# matters because it makes no sense to apply the models to non-existing A600 data.
_bc_index_cols = ['t(h)', 'well', 'plate', 'content']


def _join_daily_bc(days_600, days_750):
    """Pair the daily A600/A750 frames and join each pair side by side.

    Both frames of a pair are indexed by (t(h), well, plate, content) and
    outer-joined column-wise, then the index is restored as columns.
    Returns one joined frame per paired day.
    """
    return [
        pd.concat(
            [day_600.set_index(keys=_bc_index_cols), day_750.set_index(keys=_bc_index_cols)],
            join='outer',
            axis=1,
        ).reset_index()
        for day_600, day_750 in zip(days_600, days_750)
    ]


# --- TAP ---
tap_final_daily_BC = _join_daily_bc(daily_tap_reps_dfs_600, daily_tap_reps_dfs_750)
tap_bc_df = pd.concat(tap_final_daily_BC)

# --- TAPN ---
tapn_final_daily_BC = _join_daily_bc(daily_tapn_reps_dfs_600, daily_tapn_reps_dfs_750)
tapn_bc_df = pd.concat(tapn_final_daily_BC)

# --- TP ---
tp_final_daily_BC = _join_daily_bc(daily_tp_reps_dfs_600, daily_tp_reps_dfs_750)
tp_bc_df = pd.concat(tp_final_daily_BC)

# --- TPN ---
tpn_final_daily_BC = _join_daily_bc(daily_tpn_reps_dfs_600, daily_tpn_reps_dfs_750)
tpn_bc_df = pd.concat(tpn_final_daily_BC)


In [22]:
# Label every per-medium B+C table with the medium it came from.
for medium, frame in (
    ('tap', tap_bc_df),
    ('tapn', tapn_bc_df),
    ('tp', tp_bc_df),
    ('tpn', tpn_bc_df),
):
    frame['media'] = medium

In [23]:
# Stack the four media tables into one B+C table with a fresh 0..n-1 index
# and a fixed column order.
bc_dfs = [tap_bc_df, tapn_bc_df, tp_bc_df, tpn_bc_df]
bc_column_order = ['media', 'plate', 'well', 't(h)', 'content', 'A600', 'A600c', 'A750', 'A750c']
bcdfs = pd.concat(bc_dfs).reset_index(drop=True)[bc_column_order]

In [24]:
# Output folder for the preprocessed screen tables (relative to the notebook).
path = '../data_o/screens/screens_preprocessed_data'


# Export of the B+C table is currently disabled; uncomment to write it.
# bcdfs.to_csv(pjoin(path, 'sc1_condition_bc.csv'))

# CONDITION: B

### A600 B

In [25]:
# Split the A600 readings of condition 'b' (bacteria only) into one frame per medium.
is_b = A600['condition'] == 'b'
df_tap, df_tapn, df_tp, df_tpn = (
    A600[(A600['media'] == medium) & is_b]
    for medium in ('tap', 'tapn', 'tp', 'tpn')
)

In [26]:
# Build, per medium, a list with one frame per A600 sampling day (6 frames);
# each frame holds that day's 3 sets of strains ('plates').
keep_cols = ['t(h)', 'well', 'plate', 'content', 'measure', 'corrected_measure']
a600_names = {'measure': 'A600', 'corrected_measure': 'A600c'}

daily_tap_reps_dfs_600, daily_tapn_reps_dfs_600 = [], []
daily_tp_reps_dfs_600, daily_tpn_reps_dfs_600 = [], []

media_buckets = (
    (df_tap, daily_tap_reps_dfs_600),
    (df_tapn, daily_tapn_reps_dfs_600),
    (df_tp, daily_tp_reps_dfs_600),
    (df_tpn, daily_tpn_reps_dfs_600),
)

for day in A600['samp_day'].unique():
    for media_df, bucket in media_buckets:
        daily = media_df.loc[media_df['samp_day'] == day, keep_cols].reset_index(drop=True)

        # Give every replicate of the day the same time stamp: the mean
        # 't(h)' of the day's rows, truncated to an integer.
        daily['t(h)'] = int(daily['t(h)'].mean())

        bucket.append(daily.rename(columns=a600_names))

In [27]:
# Sanity check: number of daily frames, plus a peek at the first TAP frame.
len(daily_tapn_reps_dfs_600)
for df in daily_tap_reps_dfs_600[:1]:
    df.head()

6

Unnamed: 0,t(h),well,plate,content,A600,A600c
0,0,A1,1,204,0.1177,-0.001969
1,0,B1,1,161B,0.1104,-0.009269
2,0,C1,1,202B,0.1114,-0.008269
3,0,D1,1,BLANK,0.1129,-0.006769
4,0,E1,1,129A,0.1165,-0.003169


### A750 B

In [28]:
# Split the A750 readings of condition 'b' (bacteria only) into one frame per medium.
is_b = A750['condition'] == 'b'
df_tap, df_tapn, df_tp, df_tpn = (
    A750[(A750['media'] == medium) & is_b]
    for medium in ('tap', 'tapn', 'tp', 'tpn')
)

In [29]:
# Build, per medium, a list with one frame per A750 sampling day (10 frames);
# each frame holds that day's 3 sets of strains ('plates').
keep_cols = ['t(h)', 'well', 'plate', 'content', 'measure', 'corrected_measure']
a750_names = {'measure': 'A750', 'corrected_measure': 'A750c'}

daily_tap_reps_dfs_750, daily_tapn_reps_dfs_750 = [], []
daily_tp_reps_dfs_750, daily_tpn_reps_dfs_750 = [], []

media_buckets = (
    (df_tap, daily_tap_reps_dfs_750),
    (df_tapn, daily_tapn_reps_dfs_750),
    (df_tp, daily_tp_reps_dfs_750),
    (df_tpn, daily_tpn_reps_dfs_750),
)

for day in A750['samp_day'].unique():
    for media_df, bucket in media_buckets:
        daily = media_df.loc[media_df['samp_day'] == day, keep_cols].reset_index(drop=True)

        # Give every replicate of the day the same time stamp: the mean
        # 't(h)' of the day's rows, truncated to an integer.
        daily['t(h)'] = int(daily['t(h)'].mean())

        bucket.append(daily.rename(columns=a750_names))

In [30]:
# Sanity check: number of daily frames, plus a peek at the first TAP frame.
len(daily_tapn_reps_dfs_750)
for df in daily_tap_reps_dfs_750[:1]:
    df.head()

10

Unnamed: 0,t(h),well,plate,content,A750,A750c
0,0,A1,1,204,0.1162,-0.001135
1,0,B1,1,161B,0.1105,-0.006835
2,0,C1,1,202B,0.1106,-0.006735
3,0,D1,1,BLANK,0.1114,-0.005935
4,0,E1,1,129A,0.1139,-0.003435


# Concatenate 'B' 6-day DFs 

In [31]:
# zip() stops at the shorter of its inputs, so although the A750 lists hold
# 10 days of data only the days that also exist for A600 are paired up. This
# matters because it makes no sense to apply the models to non-existing A600 data.
_b_index_cols = ['t(h)', 'well', 'plate', 'content']


def _join_daily_b(days_600, days_750):
    """Pair the daily A600/A750 frames and join each pair side by side.

    Both frames of a pair are indexed by (t(h), well, plate, content) and
    outer-joined column-wise, then the index is restored as columns.
    Returns one joined frame per paired day.
    """
    return [
        pd.concat(
            [day_600.set_index(keys=_b_index_cols), day_750.set_index(keys=_b_index_cols)],
            join='outer',
            axis=1,
        ).reset_index()
        for day_600, day_750 in zip(days_600, days_750)
    ]


# --- TAP ---
tap_final_daily_B = _join_daily_b(daily_tap_reps_dfs_600, daily_tap_reps_dfs_750)
tap_b_df = pd.concat(tap_final_daily_B)

# --- TAPN ---
tapn_final_daily_B = _join_daily_b(daily_tapn_reps_dfs_600, daily_tapn_reps_dfs_750)
tapn_b_df = pd.concat(tapn_final_daily_B)

# --- TP ---
tp_final_daily_B = _join_daily_b(daily_tp_reps_dfs_600, daily_tp_reps_dfs_750)
tp_b_df = pd.concat(tp_final_daily_B)

# --- TPN ---
tpn_final_daily_B = _join_daily_b(daily_tpn_reps_dfs_600, daily_tpn_reps_dfs_750)
tpn_b_df = pd.concat(tpn_final_daily_B)

In [32]:
# Label every per-medium 'b' table with the medium it came from.
for medium, frame in (
    ('tap', tap_b_df),
    ('tapn', tapn_b_df),
    ('tp', tp_b_df),
    ('tpn', tpn_b_df),
):
    frame['media'] = medium

# Stack the four media tables into one table with a fresh 0..n-1 index
# and a fixed column order.
b_dfs = [tap_b_df, tapn_b_df, tp_b_df, tpn_b_df]
b_column_order = ['media', 'plate', 'well', 't(h)', 'content', 'A600', 'A600c', 'A750', 'A750c']
bdfs = pd.concat(b_dfs).reset_index(drop=True)[b_column_order]

# Write the condition-'b' table to the preprocessed-data folder.
# to_csv is called with its defaults, so the row index is written as well.
path = '../data_o/screens/screens_preprocessed_data'
bdfs.to_csv(pjoin(path, 'sc1_condition_b.csv'))