In [30]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.optimize import curve_fit
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb

import random
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
def generate_metadata(well, plate_map):
    
    df = pd.merge(well, plate_map, on='Well', how='left')
    return df.dropna()

def transpose_data(df, col, timepoint=20):
    
    df.set_index(col, inplace=True)
    df = df.transpose().reset_index(drop=True)
    df.index = df.index * timepoint
    return df

def generate_data(df, name, datapoint=2, num_data=4, start_idx=3, col='name'):
    
    datas = []
    idxs = [start_idx]
    for i in range(1, num_data+1):
        idxs.append(idxs[i-1] + datapoint)
        data = (df.iloc[:, idxs[i-1]:idxs[i]]).astype(float)
        data = pd.concat([name, data], axis=1)
        data = transpose_data(data, col)
        datas.append(data)
    return datas

def get_blank_corrected(data):
    
    blank_corrected = data.sub(data[filter(lambda x: x.startswith('blank'), data.columns)].mean(axis=1), axis=0)
    return blank_corrected.drop(filter(lambda x: x.startswith('blank'), blank_corrected.columns), axis=1).reset_index(drop=True)

def get_neg_ctrl_corrected(fluo, od):
    
    blank_corr_fluo = get_blank_corrected(fluo)
    blank_corr_od = get_blank_corrected(od)
    data = blank_corr_fluo / blank_corr_od
    neg_corrected = data.sub(data[filter(lambda x: x.startswith('neg'), data.columns)].mean(axis=1), axis=0)
    return neg_corrected.drop(filter(lambda x: x.startswith('neg'), neg_corrected.columns), axis=1)

In [3]:
def generate_name(df):
    
    samples = df[df['Content'].str.startswith('Sample')]
    neg_ctrl = df[df['Content'].str.startswith('Negative')]
    blank = df[df['Content'].str.startswith('Blank')]

    samples['row'] = samples['Well'].str[0]
    samples['col'] = samples['Well'].str[1:].astype(int)
    samples.loc[samples['row'].isin(['A', 'E']), 'cuma'] = 0
    samples.loc[samples['row'].isin(['B', 'F']), 'cuma'] = 1 #6.3
    samples.loc[samples['row'].isin(['C', 'G']), 'cuma'] = 2 #25
    samples.loc[samples['row'].isin(['D', 'H']), 'cuma'] = 3 #100
    rhas = [0, 1, 2, 3] #[0, 0.98, 3.9, 15.3]
    samples['rha'] = samples['col'].apply(lambda x: rhas[(x-1) % 4])
    samples.loc[(samples['row'].isin(['A', 'B', 'C', 'D'])) & (samples['col']<5), 'ara'] = 0
    samples.loc[(samples['row'].isin(['A', 'B', 'C', 'D'])) & (samples['col']>=5) & (samples['col']<9), 'ara'] = 1 #3.2
    samples.loc[(samples['row'].isin(['A', 'B', 'C', 'D'])) & (samples['col']>=9) & (samples['col']<13), 'ara'] = 2 #13
    samples.loc[(samples['row'].isin(['E', 'F', 'G', 'H'])) & (samples['col']<5), 'ara'] = 3 #52
    samples.loc[(samples['row'].isin(['E', 'F', 'G', 'H'])) & (samples['col']>=5) & (samples['col']<9), 'ara'] = 4 #208
    samples['name'] = 'fa-' + samples['ara'].astype(int).astype(str) + samples['rha'].astype(int).astype(str) + samples['cuma'].astype(int).astype(str)
    
    neg_ctrl['name'] = list(range(len(neg_ctrl)))
    neg_ctrl['name'] = 'neg-' + neg_ctrl['name'].astype(str)
    
    blank['name'] = list(range(len(blank)))
    blank['name'] = 'blank-' + blank['name'].astype(str)
    
    return pd.concat([samples[['Well', 'name']], neg_ctrl[['Well', 'name']], blank[['Well', 'name']]], axis=0)

In [4]:
def read_data(folder, filename):
    
    raw_data = pd.read_csv('datasets/experiment/{}/{}.csv'.format(folder, filename), skiprows=[0])
    metadata = generate_metadata(raw_data['Well'], generate_name(raw_data.iloc[:, :2]))
    #data = raw_data[raw_data['Well'].isin(metadata['Well'])].reset_index(drop=True)
    #return metadata
    return generate_data(raw_data, metadata['name'], datapoint=int((raw_data.shape[1] - 3)/5), num_data=5)

def get_data_at(fluos, ods, h=6):
    
    samples = get_neg_ctrl_corrected(fluos, ods)
    #reporter = get_neg_ctrl_corrected(fluos[filter(lambda x: '-amp' in x, fluos.columns)],
    #                                  ods[filter(lambda x: '-amp' in x, ods.columns)])
    return samples.iloc[int(h * 60/20)]#.apply(lambda x:  x / reporter.iloc[int(h * 60/20)].mean())

In [5]:
folder = '201-JW-full-adder'
filename = 'JW-full-adder'
ods, fluos_gfp, fluos_1_gfp, fluos_mc, fluos_1_mc = read_data(folder, filename)
gfp = get_data_at(fluos_gfp, ods, h=12).reset_index()
mc = get_data_at(fluos_mc, ods, h=12).reset_index()

In [6]:
aras = [0, 3.2, 13, 52, 208]
rhas = [0, 0.98, 3.9, 15.3]
cumas = [0, 6.3, 25, 100]
gfp['ara'] = gfp['name'].apply(lambda x: aras[int(x[3])])
gfp['rha'] = gfp['name'].apply(lambda x: rhas[int(x[4])])
gfp['cuma'] = gfp['name'].apply(lambda x: cumas[int(x[5])])
mc['ara'] = mc['name'].apply(lambda x: aras[int(x[3])])
mc['rha'] = mc['name'].apply(lambda x: rhas[int(x[4])])
mc['cuma'] = mc['name'].apply(lambda x: cumas[int(x[5])])

In [7]:
gfp[gfp['name'].isin(['fa-000', 'fa-003', 'fa-030', 'fa-033', 'fa-300', 'fa-303', 'fa-330', 'fa-333'])].sort_values('name')

Unnamed: 0,name,36,ara,rha,cuma
0,fa-000,6358.977394,0.0,0.0,0.0
36,fa-003,806.802889,0.0,0.0,100.0
3,fa-030,4658.53772,0.0,15.3,0.0
39,fa-033,11741.796941,0.0,15.3,100.0
48,fa-300,4774.434969,52.0,0.0,0.0
72,fa-303,995.506204,52.0,0.0,100.0
51,fa-330,15598.20757,52.0,15.3,0.0
75,fa-333,18636.727233,52.0,15.3,100.0


In [8]:
mc[mc['name'].isin(['fa-000', 'fa-003', 'fa-030', 'fa-033', 'fa-300', 'fa-303', 'fa-330', 'fa-333'])]

Unnamed: 0,name,36,ara,rha,cuma
0,fa-000,325.57517,0.0,0.0,0.0
3,fa-030,605.61314,0.0,15.3,0.0
36,fa-003,167.591282,0.0,0.0,100.0
39,fa-033,1819.61643,0.0,15.3,100.0
48,fa-300,1771.529274,52.0,0.0,0.0
51,fa-330,14361.441192,52.0,15.3,0.0
72,fa-303,2050.265488,52.0,0.0,100.0
75,fa-333,19912.235169,52.0,15.3,100.0


In [9]:
data = gfp.copy()
X = data[['ara', 'rha', 'cuma']]
y = data[36]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
regr = RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)

r2_score(y_test, regr.predict(X_test))

0.905702746024565

In [10]:
regr = RandomForestRegressor()
cross_val_score(regr, X, y, scoring='r2', cv=KFold(shuffle=True))

array([0.93330841, 0.93365527, 0.9052763 , 0.90284225, 0.7573827 ])

In [11]:
data = mc.copy()
X = data[['ara', 'rha', 'cuma']]
y = data[36]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
regr = RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)

r2_score(y_test, regr.predict(X_test))

0.9693391583223072

In [12]:
regr = RandomForestRegressor(random_state=0)
cross_val_score(regr, X, y, scoring='r2', cv=KFold(shuffle=True))

array([0.98096663, 0.9293342 , 0.93628242, 0.91201761, 0.91993689])

In [13]:
regr = xgb.XGBRegressor(random_state=0)
cross_val_score(regr, X, y, scoring='r2', cv=KFold(shuffle=True))

array([0.94525248, 0.976216  , 0.9588079 , 0.95936427, 0.90344259])

In [16]:
data = gfp.copy()
X = data[['ara', 'rha', 'cuma']]
y = data[36]
gfp_regr = RandomForestRegressor()
gfp_regr.fit(X, y)

RandomForestRegressor()

In [17]:
data = mc.copy()
X = data[['ara', 'rha', 'cuma']]
y = data[36]
mc_regr = RandomForestRegressor()
mc_regr.fit(X, y)

RandomForestRegressor()

In [None]:
min_max = (gfp[36].min(), gfp[36].max())
min_max

In [None]:
on_states = [1, 2, 4, 7] #specific for full adder, numbers represent index where output is on
expected = [min_max[1] if i in on_states else min_max[0] for i in range(2**3)]
expected

In [None]:
0 0 0 = 0 0
0 0 1 = 0 1
0 1 0 = 0 1
0 1 1 = 1 0
1 0 0 = 0 1
1 0 1 = 1 0
1 1 0 = 1 0
1 1 1 = 1 1

In [31]:
class GeneticAlgorithm:

    def __init__(self, gfp_model, mc_model, induction_lim, min_max, population_size=100, generations=30, mutation_rate=0.02, elite_size=0.1):
        
        self.gfp_model = gfp_model
        self.mc_model = mc_model
        self.induction_lim = induction_lim
        self.num_inputs = len(induction_lim)
        self.num_selection = len(induction_lim) - 1
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.num_elite = int(elite_size * population_size)
        
        self.input_states = [format(x, '0{}b'.format(self.num_inputs)) for x in list(range(2**self.num_inputs))]
        self.on_states = [[1, 2, 4, 7], [3, 5, 6, 7]] #specific for full adder, numbers represent index where output is on
        self.expected = [min_max[0][1] if i in self.on_states else min_max[0][0] for i in range(2**self.num_inputs)] + [min_max[1][1] if i in self.on_states else min_max[1][0] for i in range(2**self.num_inputs)]
        
        self.best_populations = []
       
    def create_population(self):
        
        return [[np.random.uniform(ind[0], ind[1]) for ind in self.induction_lim]
                for _ in range(self.population_size)] #repeated arrangement is allowed

    def calculate_fitness(self, population):
        
        new_population = []
        for indiv in population:
            
            induction_lvl = []
            for input_state in self.input_states:
                induction_lvl.append([inducer*int(j) for inducer, j in zip(indiv, input_state)])  
            new_population.append(induction_lvl)
        
        fitness = []
        for pop in new_population:
            
            gfp_output = self.gfp_model.predict(pop)
            mc_output = self.mc_model.predict(pop)
            
            mse = mean_absolute_error(self.expected, np.append(gfp_output, mc_output))
            fitness.append(mse)
            
            #min_on = np.min([output[i] for i in list(range(len(self.input_states))) if i in self.on_states])
            #max_off = np.max([output[i] for i in list(range(len(self.input_states))) if i not in self.on_states])
            #dyn_range = min_on/max_off
            #fitness.append(dyn_range)

        return fitness

    def rank_population(self, population):

        fitness = self.calculate_fitness(population)
        return sorted([(a, b) for a, b in zip(population, fitness)], key=lambda x: x[1])#, reverse=True)
    
    def selection(self, sorted_pop):

        #might need to define another selection method later
        selected_pop = []
        df = pd.DataFrame(sorted_pop, columns=['index', 'fitness'])
        df['cum_sum'] = df['fitness'].cumsum()
        df['cum_perc'] = 100 * df['cum_sum'] / df['fitness'].sum()

        for i in range(self.num_elite): #ellitists
            selected_pop.append(sorted_pop[i][0])

        for i in range(len(sorted_pop) - self.num_elite): #remainings
            pick = 100 * random.random()
            for i in range(len(sorted_pop)):
                if pick <= df.iat[i, 3]:
                    selected_pop.append(sorted_pop[i][0])
                    break

        return selected_pop
    
    def mating_pool(self, population):
    
        return [selected for selected in population] #might need some fixing?
    
    def crossover(self, parent1, parent2):
        
        midpoint = random.randint(0, len(parent1))
        child_p1 = parent1[:midpoint]    
        child_p2 = parent1[midpoint:] 
        child = child_p1 + child_p2
        
        return child
    
    def original_crossover(self, parent1, parent2):
    
        child_p1 = []
        child_p2 = []
        gene_A = int(random.random() * len(parent1))
        gene_B = int(random.random() * len(parent1))
        
        start_gene = min(gene_A, gene_B)
        end_gene = max(gene_A, gene_B)
        child_p1 = [parent1[i] for i in range(start_gene, end_gene)]    
        child_p2 = [item for item in parent2 if item not in child_p1]
        child = child_p1 + child_p2
        
        return child

    def crossover_population(self, mating_pop):

        children = []
        non_elite = len(mating_pop) - self.num_elite
        pool = random.choices(mating_pop, k=len(mating_pop))
        
        for i in range(self.num_elite):
            children.append(mating_pop[i])

        for i in range(non_elite):
            child = self.crossover(pool[i], pool[len(mating_pop)-i-1])
            children.append(child[:self.num_inputs])
        
        return children
    
    def mutate(self, individual):

        for swapped in range(len(individual)):
            if(random.random() < self.mutation_rate):
                mutant = int(random.random() * len(individual))
                individual[mutant] = np.random.uniform(self.induction_lim[mutant][0], self.induction_lim[mutant][1])
        return individual

    def original_mutate(self, individual):

        for swapped in range(len(individual)):
            if(random.random() < self.mutation_rate):
                swapwith = int(random.random() * len(individual))
                temp = individual[swapped]
                individual[swapped] = individual[swapwith]
                individual[swapwith] = temp
        return individual

    def mutate_population(self, population):
    
        return [self.mutate(indiv) for indiv in population]
    
    def next_generation(self, current_pop):
    
        ranked_pop = self.rank_population(current_pop)
        selected_pop = self.selection(ranked_pop)
        mating_pop = self.mating_pool(selected_pop)
        children = self.crossover_population(mating_pop)
        next_pop = self.mutate_population(children)
        
        return next_pop
    
    def best_individuals(self, num_indiv=1):
        
        return self.best_populations[:num_indiv]

    def run(self):

        pop = self.create_population()
        #init_pop = pop.copy()
        for i in tqdm(range(self.generations)):
            #print('***', pop)
            pop = self.next_generation(pop)
        
        self.best_populations = self.rank_population(pop)

In [27]:
ga = GeneticAlgorithm(gfp_regr, mc_regr, [(0, 208), (0, 15.3), (0, 100)], [(gfp[36].min(), gfp[36].max()), (mc[36].min(), mc[36].max())],
                      population_size=1000, generations=100)
ga.run()
best3 = ga.best_individuals(3)
best3

100%|███████████████████████████████████████████| 20/20 [03:38<00:00, 10.95s/it]


[([138.91984622723757, 0.3673015384555395, 95.71137389921567],
  5507141.239402443),
 ([206.17975944810473, 0.364471048714964, 80.69881345207644],
  5507141.239402443),
 ([205.63581142901924, 0.42487241317288965, 88.2159739364093],
  5507141.239402443)]

In [32]:
ga = GeneticAlgorithm(gfp_regr, mc_regr, [(0, 208), (0, 15.3), (0, 100)], [(gfp[36].min(), gfp[36].max()), (mc[36].min(), mc[36].max())],
                      population_size=1000, generations=100)
ga.run()
best3 = ga.best_individuals(3)
best3

100%|█████████████████████████████████████████| 100/100 [49:20<00:00, 29.60s/it]


[([0.9434956838863044, 0.43555708822073586, 54.69169866668617],
  1437.0954843247005),
 ([0.9434956838863044, 0.43555708822073586, 54.69169866668617],
  1437.0954843247005),
 ([0.9434956838863044, 0.43555708822073586, 58.04229986749293],
  1437.0954843247005)]

In [None]:
inducer_concentration = [a*b for a, b in zip([0.0019222194245691604, 0.026859976759736837, 0.0008085142800648981], [208, 15.3, 100])]
inducer_concentration

In [None]:
pops = [50, 100, 200, 500]
gens = [10, 50, 100]
muts = [0.02, 0.1, 0.2]
elis = [0.1]

results = []
for p in tqdm(pops):
    for g in gens:
        for m in muts:
            for e in elis:
                ga = GeneticAlgorithm(regr, [(0, 208), (0, 15.3), (0, 100)],
                                  population_size=p,
                                  generations=g,
                                  mutation_rate=m,
                                  elite_size=e
                                 )
            ga.run()
            best_fit = ga.best_individuals(1)[0][1]
            config = p, g, m
            results.append((config, best_fit))

In [None]:
gfp[gfp['name'].isin(['fa-000', 'fa-003', 'fa-030', 'fa-033', 'fa-300', 'fa-303', 'fa-330', 'fa-333'])].sort_values('name')

In [None]:
input_states = [format(x, '0{}b'.format(3)) for x in list(range(2**3))]
on_states = [1, 2, 4, 7] #specific for full adder, numbers represent index where output is on
        
output = [ 5892.64718776,   812.40091677,  5216.42005874, 12565.07218765,
        4088.03037565,   588.16901041, 16361.27162125, 18766.8513641 ]
min_on = np.min([output[i] for i in list(range(len(input_states))) if i in on_states])
max_off = np.max([output[i] for i in list(range(len(input_states))) if i not in on_states])
dyn_range = min_on/max_off
dyn_range

In [28]:
input_states = [format(x, '0{}b'.format(3)) for x in list(range(2**3))]
indiv = [138.91984622723757, 0.3673015384555395, 95.71137389921567]
induction_lvl = []
for input_state in input_states:
    induction_lvl.append([inducer*int(j) for inducer, j in zip(indiv, input_state)]) 
gfp_regr.predict(induction_lvl)

array([5601.90491859,  819.13885027, 5601.90491859,  819.13885027,
       3964.61134277,  571.50910455, 3964.61134277,  571.50910455])

In [29]:
mc_regr.predict(induction_lvl)

array([ 284.27561859,  243.37048078,  284.27561859,  243.37048078,
       1581.00353272, 1625.88181388, 1581.00353272, 1625.88181388])

In [33]:
input_states = [format(x, '0{}b'.format(3)) for x in list(range(2**3))]
indiv = [0.9434956838863044, 0.43555708822073586, 54.69169866668617]
induction_lvl = []
for input_state in input_states:
    induction_lvl.append([inducer*int(j) for inducer, j in zip(indiv, input_state)]) 
gfp_regr.predict(induction_lvl)

array([5601.90491859,  711.64493165, 5601.90491859,  711.64493165,
       5601.90491859,  711.64493165, 5601.90491859,  711.64493165])

In [34]:
mc_regr.predict(induction_lvl)

array([284.27561859, 291.93462896, 284.27561859, 291.93462896,
       284.27561859, 291.93462896, 284.27561859, 291.93462896])

In [None]:
input_states = [format(x, '0{}b'.format(3)) for x in list(range(2**3))]
indiv = [208, 15.3, 100]
induction_lvl = []
for input_state in input_states:
    induction_lvl.append([inducer*int(j) for inducer, j in zip(indiv, input_state)]) 
regr.predict(induction_lvl)

In [None]:
regr.predict([[208, 15.3, 100]])

In [None]:
plt.barh(['A', 'R', 'C'], regr.feature_importances_)