In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import editdistance
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Task: write a script that takes a bunch of CRI dataframes and
# (1) takes out incomplete runs (e.g. a run that stopped in the middle of batch 5), and
# (2) removes "duplicates" of the same run (averaging may be too tricky): if the same
#     algo is run on the same landscape with the same parameters and the same
#     starting position, just keep one run.

In [None]:
# Load a raw CRI runs table into `data` for a quick manual inspection.
# NOTE(review): "path_to_CRI_runs" looks like a placeholder; the same string is
# later passed to CRIData.set_dir() as a directory — confirm the real CSV path.
data = pd.read_csv("path_to_CRI_runs")

In [None]:
# Show the column labels of the loaded table.
data.columns

In [None]:
# Number of rows in the loaded table.
len(data)

In [None]:
# Preview the first and last rows of the table.
# A notebook cell only renders its final expression, so a bare `data.head()`
# followed by `data.tail()` silently drops the head; display both explicitly.
# (`display` is injected into the namespace by the IPython kernel.)
display(data.head(), data.tail())

In [None]:
import os
from collections import defaultdict

class CRIData:
    """Collects complete runs parsed from CRI result CSV files.

    A "run" is a maximal block of consecutive CSV rows that share the same
    hash (see ``hash_item``). Runs whose last row does not satisfy the
    completeness check are dropped, and runs with an identical hash are
    stored only once (the most recently parsed one is kept).
    """

    def __init__(self, verbose=False, warnings=True):
        """
        verbose: print a line every time a new run starts while parsing
        warnings: print a warning when a run is skipped or replaced
        """
        # run hash -> DataFrame slice holding the rows of that run
        self.runs = dict()
        self.verbose = verbose
        self.warnings = warnings
        # directory that file names given to parse_file are resolved against
        self.dir = "./"

        # landscape_id -> set of start_id values that have a stored run
        self.landscapes = defaultdict(set)

    def set_dir(self, d):
        """Set the directory that ``parse_file`` reads CSV files from."""
        self.dir = d

    def hash_item(self, r):
        """Return the string key identifying the run that row ``r`` belongs to.

        The key joins the row's landscape_id, start_id, model_type,
        virtual_screen and explorer_type fields with commas.
        """
        return (f"{r.landscape_id},"
                f"{r.start_id},"
                f"{r.model_type},"
                f"{r.virtual_screen},"
                f"{r.explorer_type}")

    def parse_file(self, filename):
        """Read ``filename`` from ``self.dir`` and store every run it contains.

        Rows are scanned in order; each time the row hash changes, the block
        of rows that just ended is handed to ``store_run`` under *its own*
        hash. The final block is flushed after the loop so that it includes
        the last row of the file.
        """
        data = pd.read_csv(os.path.join(self.dir, filename))

        start = 0
        current_task = None
        # enumerate() gives the positional index that store_run's iloc slice
        # expects, independent of the frame's index labels.
        for pos, (_, row) in enumerate(data.iterrows()):
            h = self.hash_item(row)
            if h == current_task:
                continue

            # The previous run ended on the row before this one; store it
            # under the previous hash (not the hash of the row that follows).
            if current_task is not None:
                self.store_run(data, start=start, end=pos, h=current_task)

            # start new run
            current_task = h
            start = pos
            if self.verbose:
                print(f"Start: {start}, Task: {h}")

        # Flush the last run; end=len(data) keeps the file's final row.
        if current_task is not None:
            self.store_run(data, start=start, end=len(data), h=current_task)

    @staticmethod
    def is_incomplete_run(df):
        """Return True unless the last row of ``df`` has ``horizon == 1``.

        The original author flagged this as a "bad metric"; it is kept as the
        only completeness signal available here. An empty frame counts as
        incomplete.
        """
        if df.empty:
            return True
        return bool(df.iloc[-1].horizon != 1)

    def store_run(self, df, start, end, h):
        """
        Store rows ``df.iloc[start:end]`` as the run with hash ``h``.

        start: starting (positional) index of run, inclusive
        end: ending (positional) index of run, exclusive
        h: hash of run

        Empty ranges are ignored. Incomplete runs are skipped. If ``h`` is
        already stored, the new run replaces it, so exactly one run per hash
        is kept.
        """
        if end <= 0 or end <= start:
            return

        candidate = df.iloc[start:end]

        if self.is_incomplete_run(candidate):
            if self.warnings:
                print("WARNING: not a complete run")
            return

        if h in self.runs:
            if self.warnings:
                print(f"WARNING: {h} already in runs")

        self.runs[h] = candidate
        self.store_completed_landscape(candidate)

    def store_completed_landscape(self, df):
        """Record the (landscape_id, start_id) of the first row of ``df``."""
        first_row = df.iloc[0]
        self.landscapes[first_row.landscape_id].add(first_row.start_id)

In [None]:
# Parse the three density ("_dens") DynaPPO result files into one collection,
# with skip/duplicate warnings silenced.
cRNA = CRIData(warnings=False)
cRNA.set_dir("path_to_CRI_runs")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num2_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num3_dens.csv",
):
    cRNA.parse_file(csv_name)

In [None]:
# Completed runs found so far: landscape_id -> set of start_id values stored.
cRNA.landscapes

In [None]:
# Parse the three density ("_dens") DynaPPO result files with warnings enabled.
# NOTE(review): directory and file names are identical to the cRNA cell; only the
# `warnings` flag differs — confirm the intended input directory for this set.
cTF = CRIData()
cTF.set_dir("path_to_CRI_runs")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num2_dens.csv",
    "DynaPPO_Agent_0.5_10_20_num3_dens.csv",
):
    cTF.parse_file(csv_name)

In [None]:
# Completed runs found so far: landscape_id -> set of start_id values stored.
cTF.landscapes

In [None]:
# Parse the two DynaPPO result files without the "_dens" suffix,
# with skip/duplicate warnings silenced.
cRNA_nodens = CRIData(warnings=False)
cRNA_nodens.set_dir("path_to_CRI_runs")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20.csv",
    "DynaPPO_Agent_0.5_10_20_num2.csv",
):
    cRNA_nodens.parse_file(csv_name)

In [None]:
# Completed runs found so far: landscape_id -> set of start_id values stored.
cRNA_nodens.landscapes

In [None]:
# Parse the two DynaPPO result files without the "_dens" suffix, warnings enabled.
# NOTE(review): directory and file names are identical to the cRNA_nodens cell;
# only the `warnings` flag differs — confirm the intended input directory.
cTF_nodens = CRIData()
cTF_nodens.set_dir("path_to_CRI_runs")
for csv_name in (
    "DynaPPO_Agent_0.5_10_20.csv",
    "DynaPPO_Agent_0.5_10_20_num2.csv",
):
    cTF_nodens.parse_file(csv_name)

In [None]:
# Completed runs found so far: landscape_id -> set of start_id values stored.
cTF_nodens.landscapes