In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import editdistance
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Great, can you write a script that takes a bunch of CRI dataframes and
# (1) takes out incomplete runs (e.g. something that stopped in the middle of batch 5), and
# (2) continuing my instructions above, also makes sure it removes "duplicates" of the same run
#     (averaging may be too tricky): if the same algo is run on the same landscape
#     with the same parameters and the same starting position, just take one run.

In [3]:
# Load one results CSV into a DataFrame for manual inspection in the cells below.
# NOTE(review): this is an absolute, machine-specific path — the cell will not run elsewhere as-is.
data = pd.read_csv("/Users/richard/Downloads/RichardRuns2/RNA/consistency_robustness_independence/DynaPPO_Agent_0.5_10_20_dens.csv")

In [5]:
# Show the column names of the loaded frame.
data.columns

Index(['id', 'batch', 'sequence', 'true_score', 'model_score', 'batch_size',
       'measurement_cost', 'virtual_evals', 'landscape_id', 'start_id',
       'model_type', 'virtual_screen', 'horizon', 'explorer_type'],
      dtype='object')

In [6]:
# Number of rows in the loaded frame.
len(data)

52857

In [21]:
# Peek at both ends of the frame.
# NOTE(review): a notebook cell only renders its last expression, so the head() result
# is discarded and only tail() appears in the output; wrap both calls in display(...)
# if both ends should be shown.
data.head()
data.tail()

Unnamed: 0,id,batch,sequence,true_score,model_score,batch_size,measurement_cost,virtual_evals,landscape_id,start_id,model_type,virtual_screen,horizon,explorer_type
52852,c46b3c3e-48e9-11ea-b7bd-b1bacd5ad49f,10,GCGCGGGGCCAUGG,0.465842,0.465842,100,1001,19708.0,B2L14RNA1+2,startRNAL14_2,ENS_3_NAMb_ss1,20,1,DynaPPO_Agent_0.5_10_20
52853,c46b3c3e-48e9-11ea-b7bd-b1bacd5ad49f,10,UCGUGGGGGGCUCG,0.464514,0.464514,100,1001,19708.0,B2L14RNA1+2,startRNAL14_2,ENS_3_NAMb_ss1,20,1,DynaPPO_Agent_0.5_10_20
52854,c46b3c3e-48e9-11ea-b7bd-b1bacd5ad49f,10,GGGUGCGGCGCGGA,0.464448,0.464448,100,1001,19708.0,B2L14RNA1+2,startRNAL14_2,ENS_3_NAMb_ss1,20,1,DynaPPO_Agent_0.5_10_20
52855,c46b3c3e-48e9-11ea-b7bd-b1bacd5ad49f,10,GCGCGGGGCCAUUG,0.461894,0.461894,100,1001,19708.0,B2L14RNA1+2,startRNAL14_2,ENS_3_NAMb_ss1,20,1,DynaPPO_Agent_0.5_10_20
52856,c46b3c3e-48e9-11ea-b7bd-b1bacd5ad49f,0,UCUUGGGGACUUUU,0.077723,0.077723,100,1,0.0,B2L14RNA1+2,startRNAL14_2,ENS_3_arch=CNNa_hd100_f50,20,10,DynaPPO_Agent_0.5_10_20


In [98]:
import os
from collections import defaultdict

class CRIData:
    """Collect complete, de-duplicated runs from result CSV files.

    ``parse_file`` reads a CSV and treats every maximal stretch of consecutive
    rows that share the same configuration key (see ``hash_item``) as one run.
    Runs that fail the completeness check are dropped; a run whose key was
    already stored replaces the earlier one, so at most one run is kept per
    configuration.

    Attributes:
        runs: dict mapping a configuration key to the DataFrame slice that
            holds the rows of the stored run.
        landscapes: defaultdict(set) mapping ``landscape_id`` to the set of
            ``start_id`` values for which a run has been stored.
        dir: directory that file names passed to ``parse_file`` are joined to.
        verbose: when true, print where each run starts while parsing.
        warnings: when true, print a message for dropped / replaced runs.
    """

    def __init__(self, verbose=False, warnings=True):
        self.runs = dict()
        self.verbose = verbose
        self.warnings = warnings
        self.dir = "./"

        self.landscapes = defaultdict(set)

    def set_dir(self, d):
        """Set the directory that ``parse_file`` resolves file names against."""
        self.dir = d

    def hash_item(self, r):
        """Return the configuration key of a single row.

        Args:
            r: object exposing ``landscape_id``, ``start_id``, ``model_type``,
                ``virtual_screen`` and ``explorer_type`` attributes (e.g. a
                row Series produced by ``DataFrame.iterrows``).

        Returns:
            Comma-joined string of those five values.
        """
        return (f"{r.landscape_id},"
                f"{r.start_id},"
                f"{r.model_type},"
                f"{r.virtual_screen},"
                f"{r.explorer_type}")

    def parse_file(self, filename):
        """Read ``filename`` (relative to ``self.dir``) and store its runs.

        Rows are scanned in file order; whenever the configuration key
        changes, the run that just ended is handed to ``store_run`` under its
        own key. The run still open at the end of the file is stored too,
        including the last row.

        Args:
            filename: name of the CSV file inside ``self.dir``.
        """
        data = pd.read_csv(os.path.join(self.dir, filename))

        start = 0
        current_task = None
        # enumerate() yields positions for iloc slicing; the index label that
        # iterrows() returns is only a position for a default RangeIndex.
        for pos, (_, row) in enumerate(data.iterrows()):
            h = self.hash_item(row)
            if h == current_task:
                continue

            # Key changed: close the previous run under the key it was
            # collected with, not the key of the row that starts the next one.
            if current_task is not None:
                self.store_run(data, start=start, end=pos, h=current_task)

            current_task = h
            start = pos
            if self.verbose:
                print(f"Start: {start}, Task: {h}")

        # Flush the run that extends to the end of the file.
        if current_task is not None:
            self.store_run(data, start=start, end=len(data), h=current_task)

    @staticmethod
    def is_incomplete_run(df):
        """Return True when ``df`` does not look like a finished run.

        A run counts as complete when the ``horizon`` value of its last row
        equals 1; an empty frame is always incomplete.

        NOTE(review): the original author marked this check as a "bad
        metric" — confirm what ``horizon`` encodes before relying on it.
        """
        if len(df) == 0:
            return True
        return bool(df.iloc[-1].horizon != 1)

    def store_run(self, df, start, end, h):
        """Store rows ``start:end`` of ``df`` as the run for key ``h``.

        Nothing is stored when the slice is empty or the run is incomplete.
        If ``h`` is already present the earlier run is replaced, so only one
        run per configuration is kept.

        start: starting index of run (positional, inclusive)
        end: ending index of run (positional, exclusive)
        h: hash of run
        """
        if end <= 0 or end <= start:
            return

        candidate = df.iloc[start:end]

        if self.is_incomplete_run(candidate):
            if self.warnings:
                print(f"WARNING: not a complete run")
            return

        if h in self.runs:
            if self.warnings:
                print(f"WARNING: {h} already in runs")

        self.runs[h] = candidate
        self.store_completed_landscape(candidate)

    def store_completed_landscape(self, df):
        """Record the landscape / start pair of a stored run.

        Both values are read from the first row of ``df``.
        """
        first = df.iloc[0]
        self.landscapes[first.landscape_id].add(first.start_id)

In [99]:
# Collect the RNA runs (density variant) from the three result files, in order.
cRNA = CRIData(warnings=False)
cRNA.set_dir("/Users/richard/Downloads/RichardRuns2/RNA/consistency_robustness_independence/")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num2_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num3_dens.csv",
):
    cRNA.parse_file(csv_name)

In [100]:
# landscape_id -> set of start_ids for which a run passed the completeness check and was stored.
cRNA.landscapes

defaultdict(set,
            {'B1L14RNA1': {'startRNAL14_0', 'startRNAL14_1', 'startRNAL14_2'},
             'B1L50RNA1': {'startRNAL50_0', 'startRNAL50_1', 'startRNAL50_2'},
             'B2L14RNA1+2': {'startRNAL14_0',
              'startRNAL14_1',
              'startRNAL14_2'},
             'B2L50RNA1+4': {'startRNAL50_0', 'startRNAL50_1'}})

In [103]:
# Collect the TF runs (density variant) from the three result files, in order.
cTF = CRIData()
cTF.set_dir("/Users/richard/Downloads/RichardRuns2/TF/consistency_robustness_independence/")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num2_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num3_dens.csv",
):
    cTF.parse_file(csv_name)



In [104]:
# landscape_id -> set of start_ids for which a run passed the completeness check and was stored.
cTF.landscapes

defaultdict(set,
            {'POU3F4_REF_R1': {'TF0', 'TF1', 'TF2'},
             'PAX3_G48R_R1': {'TF0', 'TF1', 'TF2'},
             'SIX6_REF_R1': {'TF0', 'TF1', 'TF2'}})

In [106]:
# Collect the RNA runs (non-density variant) from the two result files, in order.
cRNA_nodens = CRIData(warnings=False)
cRNA_nodens.set_dir("/Users/richard/Downloads/RichardRuns2/RNA/consistency_robustness_independence/")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20.csv",
    "DynaPPO_Agent_0.5_10_20_num2.csv",
):
    cRNA_nodens.parse_file(csv_name)

In [107]:
# landscape_id -> set of start_ids for which a run passed the completeness check and was stored.
cRNA_nodens.landscapes

defaultdict(set,
            {'B1L14RNA1': {'startRNAL14_0', 'startRNAL14_1', 'startRNAL14_2'},
             'B1L50RNA1': {'startRNAL50_0', 'startRNAL50_1', 'startRNAL50_2'},
             'B2L14RNA1+2': {'startRNAL14_0',
              'startRNAL14_1',
              'startRNAL14_2'},
             'B2L50RNA1+4': {'startRNAL50_0',
              'startRNAL50_1',
              'startRNAL50_2'}})

In [108]:
# Collect the TF runs (non-density variant) from the two result files, in order.
cTF_nodens = CRIData()
cTF_nodens.set_dir("/Users/richard/Downloads/RichardRuns2/TF/consistency_robustness_independence/")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20.csv",
    "DynaPPO_Agent_0.5_10_20_num2.csv",
):
    cTF_nodens.parse_file(csv_name)



In [109]:
# landscape_id -> set of start_ids for which a run passed the completeness check and was stored.
cTF_nodens.landscapes

defaultdict(set,
            {'POU3F4_REF_R1': {'TF0'},
             'SIX6_REF_R1': {'TF0', 'TF1', 'TF2', 'TF3', 'TF4'},
             'VAX2_REF_R1': {'TF0'}})