## Cross-transfer and test/train size experiments

In [1]:
import os
import sys
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import json
import pickle
import itertools
import matplotlib as mpl
from collections import defaultdict
mpl.rcParams['font.family'] = 'Arial'

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
def make_hash(depth=None, type=None):
    """Build an auto-vivifying, multi-level ``defaultdict``.

    Args:
        depth: Number of additional nested ``defaultdict`` levels to create
            below the returned one. When both ``depth`` and ``type`` are
            ``None`` the nesting is unbounded: every missing key creates
            another nested dict. Must be an int whenever ``type`` is given.
        type: Zero-argument factory used for the values of the innermost
            level (for example ``list`` or ``int``).

    Returns:
        A ``collections.defaultdict`` whose missing keys are filled according
        to ``depth`` and ``type``.
    """
    # Imported here so the function is self-contained; ``partial`` is not
    # among the names imported at the top of the notebook.
    from functools import partial

    if depth is None and type is None:
        # Unbounded nesting: each missing key spawns another such dict.
        return defaultdict(make_hash)
    if depth == 0:
        # Innermost level: missing keys are created by the leaf factory.
        return defaultdict(type)
    # Intermediate level: missing keys create a dict that is one level shallower.
    return defaultdict(partial(make_hash, depth - 1, type))

In [3]:
from metalearn.datasets.loaders import load_dataset
from metalearn.models.factory import ModelFactory
from metalearn.utils.metric import mse, vse, r2, pcc
from collections import OrderedDict

def unflatten(dictionary):
    """Turn a flat dict with dot-separated keys into nested dicts.

    Each key is split on ``"."``; every part except the last names a nested
    dict (created on first use) and the last part is the key under which the
    value is stored. For example ``{"a.b": 1}`` becomes ``{"a": {"b": 1}}``.

    Args:
        dictionary: Mapping whose keys are strings.

    Returns:
        A new nested ``dict``; the input mapping is not modified.
    """
    nested = {}
    for dotted_key, value in dictionary.items():
        *parents, leaf = dotted_key.split(".")
        node = nested
        # Walk down the tree, creating intermediate levels as needed.
        for name in parents:
            node = node.setdefault(name, {})
        node[leaf] = value
    return nested

def save_stats(scores_dict, outfile=sys.stdout):
    """Aggregate per-entry metric scores and write them as a TSV table.

    ``scores_dict`` must contain the keys ``'name'`` and ``'size'``; every
    other key is treated as a metric name. Iterating ``scores_dict['name']``
    yields the indices used to look up the name, the size and each metric's
    scores. For every metric the mean, median and standard deviation of the
    scores are reported in columns named ``<metric>mean``, ``<metric>median``
    and ``<metric>std``.

    Args:
        scores_dict: Mapping as described above.
        outfile: Path or file-like object handed to ``DataFrame.to_csv``.

    Returns:
        The ``pandas.DataFrame`` that was written, one row per entry.

    Raises:
        ValueError: If ``'size'`` or ``'name'`` is missing from ``scores_dict``.
    """
    metrics = list(scores_dict.keys())
    metrics.remove('size')
    metrics.remove('name')
    names = scores_dict['name']
    sizes = scores_dict['size']
    aggregators = (np.mean, np.median, np.std)

    rows = []
    for idx in names:
        row = OrderedDict()
        row['name'] = names[idx]
        row['size'] = sizes[idx]
        # One column per (metric, aggregator) pair, metrics in dict order.
        for metric_name in metrics:
            for aggregator in aggregators:
                column = metric_name + aggregator.__name__
                row[column] = aggregator(scores_dict[metric_name][idx])
        rows.append(row)

    results = pd.DataFrame(rows)
    results.to_csv(outfile, index=False, sep='\t')
    return results
    
def load_model_and_metatest(folder, return_params=False):
    """Load a saved model and its meta-test set from an experiment folder.

    The folder is searched for a file ending in ``_params.json`` and one
    ending in ``_ckp.ckp``; the first match of each is used. The JSON
    parameters are un-flattened, the model is built through ``ModelFactory``
    and restored from the checkpoint, and the third element returned by
    ``load_dataset`` is taken as the meta-test set.

    Args:
        folder: Directory containing the parameter and checkpoint files.
        return_params: When true, also return the un-flattened parameters.

    Returns:
        ``None`` if either file is missing; otherwise ``(model, meta_test)``
        or ``(model, meta_test, params)`` when ``return_params`` is true.
    """
    entries = os.listdir(folder)
    param_files = [os.path.join(folder, e) for e in entries if e.endswith('_params.json')]
    model_files = [os.path.join(folder, e) for e in entries if e.endswith('_ckp.ckp')]
    if not param_files or not model_files:
        # Nothing to load from this folder.
        return None

    with open(param_files[0]) as fd:
        flat_params = json.load(fd)
    params = unflatten(flat_params)

    model_name, model_params = params['model_name'], params['model_params']
    dataset_name, dataset_params = params['dataset_name'], params['dataset_params']

    model = ModelFactory()(model_name, **model_params)
    model.load(model_files[0])
    _, _, meta_test = load_dataset(dataset_name, **dataset_params)

    return (model, meta_test, params) if return_params else (model, meta_test)
            

In [4]:
def expts_changing_size(folder, res_tag=''):
    """Evaluate a saved model on meta-test sets of varying episode size.

    For each ``k`` in 5, 10, 20, 40, 60, 80, 100 the dataset is reloaded with
    ``max_examples_per_episode=k``, the model is evaluated with the mse, vse,
    r2 and pcc metrics, and the aggregated statistics are written to
    ``{res_tag}_me_{orig_k}_to_{k}_res.csv`` in the current working directory,
    where ``orig_k`` is the value read from the initially loaded meta-test set.

    Args:
        folder: Experiment folder passed to ``load_model_and_metatest``.
        res_tag: Prefix for the result file names.
    """
    model, meta_test, params = load_model_and_metatest(folder, return_params=True)
    dataset_name = params['dataset_name']
    dataset_params = params['dataset_params']
    orig_k = meta_test.dataset.max_examples_per_episode

    episode_sizes = (5, 10, 20, 40, 60, 80, 100)
    for k in episode_sizes:
        meta_test.dataset.max_examples_per_episode = k
        dataset_params.update(max_examples_per_episode=k)
        # Reload so the meta-test set is built with the new episode size.
        _, _, meta_test = load_dataset(dataset_name, **dataset_params)

        scores = model.evaluate(meta_test, metrics=[mse, vse, r2, pcc])
        result_fname = f'{res_tag}_me_{orig_k}_to_{k}_res.csv'
        with open(result_fname, "w") as outfile:
            save_stats(scores, outfile)

In [5]:
def expts_cross_transfer(folder_1, folder_2, tag_1, tag_2):
    """Evaluate two saved models on each other's meta-test sets.

    The model from ``folder_1`` is evaluated on the meta-test set loaded from
    ``folder_2`` and vice versa, using the mse, vse, r2 and pcc metrics. Each
    direction is written to ``cross_transfert_{source}_to_{target}_res.csv``
    in the current working directory, where ``source`` is the tag of the
    model and ``target`` is the tag of the meta-test set.

    Args:
        folder_1: Experiment folder of the first model.
        folder_2: Experiment folder of the second model.
        tag_1: Label for the first experiment, used in file names.
        tag_2: Label for the second experiment, used in file names.
    """
    model1, metatest1 = load_model_and_metatest(folder_1, return_params=False)
    model2, metatest2 = load_model_and_metatest(folder_2, return_params=False)

    # (model, meta-test set, source tag, target tag) for each direction.
    directions = [
        (model1, metatest2, tag_1, tag_2),
        (model2, metatest1, tag_2, tag_1),
    ]
    for model, meta_test, source, target in directions:
        scores = model.evaluate(meta_test, metrics=[mse, vse, r2, pcc])
        # Both directions use the same naming scheme; the reverse direction
        # previously had a stray "f" and lacked the "_to_" separator.
        result_fname = f'cross_transfert_{source}_to_{target}_res.csv'
        with open(result_fname, "w") as outfile:
            save_stats(scores, outfile)

In [6]:
# Run the episode-size experiment for every entry found in the results
# folder; each entry's name doubles as the tag for its result files.
res_folder = '/home/prtos/.invivo/invivoai-sagemaker-artifacts/iscb-expts4/'
expts_folders = os.listdir(res_folder)
for ef in expts_folders:
    print(ef)
    expts_changing_size(
        folder=os.path.join(res_folder, ef, 'output/model'),
        res_tag=ef,
    )

iscb-2019-01-27-17-26-53-249
iscb-2019-01-27-17-27-37-056
iscb-2019-01-27-17-27-14-607
iscb-2019-01-27-17-27-03-795
iscb-2019-01-27-17-27-25-825
iscb-2019-01-27-17-28-06-442
iscb-2019-01-27-17-26-47-923
iscb-2019-01-27-17-26-58-498
iscb-2019-01-27-17-27-31-422
iscb-2019-01-27-17-27-54-444
iscb-2019-01-27-17-27-42-930
iscb-2019-01-27-17-27-20-081
iscb-2019-01-27-17-27-09-179
iscb-2019-01-27-17-27-48-661
iscb-2019-01-27-17-28-00-450
iscb-2019-01-27-17-28-12-429


In [8]:
import os
import tarfile
from ivbase.utils.datacache import DataCache
import click
import shutil

def extract_all_model_archives(dir_path, rem_fail=False):
    """Extract every ``.tar.gz`` archive found under ``dir_path``.

    Each archive ``<path>/<name>.tar.gz`` is extracted into the sibling
    directory ``<path>/<name>``. Only directories produced from an archive
    are considered for removal; other files and directories are left alone.

    Args:
        dir_path: Root directory that is walked recursively.
        rem_fail: When true, an extracted directory that contains an entry
            named ``failure`` is deleted again after extraction.
    """
    suffix = '.tar.gz'
    for root, _dirs, files in os.walk(dir_path):
        for fname in files:
            if not fname.endswith(suffix):
                # Not an archive: deriving a directory name from it would be
                # meaningless and could point at an unrelated directory.
                continue
            fpath = os.path.join(root, fname)
            dir_name = fpath[:-len(suffix)]
            # NOTE(review): extractall trusts the member paths inside the
            # archive; only use on archives from a trusted source.
            with tarfile.open(fpath, 'r:gz') as tar:
                tar.extractall(path=dir_name)
            if rem_fail and os.path.exists(os.path.join(dir_name, 'failure')):
                shutil.rmtree(dir_name, ignore_errors=True)
                    
# Fetch the two experiment folders through the local data cache, unpack the
# model archives next to the ChEMBL folder, then evaluate each model on the
# other experiment's meta-test set.
datacache = DataCache('/home/prtos/.invivo')
chembl_folder = datacache.get_dir("s3://invivoai-sagemaker-artifacts/iscb-expts3/iscb-2019-01-26-09-19-02-877")
pubchem_folder = datacache.get_dir("s3://invivoai-sagemaker-artifacts/iscb-expts3/iscb-2019-01-26-09-19-48-642")
extract_all_model_archives(os.path.dirname(chembl_folder))
expts_cross_transfer(
    folder_1=os.path.join(pubchem_folder, 'output/model'),
    folder_2=os.path.join(chembl_folder, 'output/model'),
    tag_1='pubchem',
    tag_2='chembl',
)

NameError: name 'tag1' is not defined