In [None]:
# Enable autoreload so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Record which host this kernel is running on.
!echo $HOSTNAME

import sys
# Show which Python interpreter backs this kernel.
print('Python path: ', sys.executable)

In [None]:
from pathlib import Path
from collections import namedtuple
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys

In [None]:
# One parsed training log: hyperparameter tag, last reported epoch, the list of
# reported values and the difference between its last and first entry.
Parameter = namedtuple("Parameter", ['hyperparameter_str', 'epoch', 'result', 'improvement'])
# Same fields as Parameter plus a trailing `performance` slot, so a Parameter can be
# expanded with `Results(*param, performance)`.
Results = namedtuple("Results", Parameter._fields + ('performance',))

def parse_log_path(filepath):
    """Parse one training log file into a ``Parameter`` record.

    The hyperparameter tag is the last four ``_``-separated tokens of the file
    stem. The file is scanned line by line; when several matching lines exist,
    the last one wins.

    * ``"Best loss meta training:"`` lines supply the values: the text after the
      first ``[`` is read as whitespace-separated floats.
      NOTE(review): this assumes the whole bracketed list sits on a single line.
    * ``"epoch:"`` lines supply the epoch as an int.

    Parameters
    ----------
    filepath : pathlib.Path
        Path of the log file (must expose ``.stem``).

    Returns
    -------
    Parameter
        ``improvement`` is ``result[-1] - result[0]``. ``epoch`` is ``None``
        when the log contains no ``epoch:`` line.

    Raises
    ------
    ValueError
        If the log contains no ``"Best loss meta training:"`` values. This used
        to surface as an opaque ``TypeError``/``IndexError``.
    """
    hyperparameter_str = '_'.join(filepath.stem.split("_")[-4:])

    result = None
    epoch = None
    with open(filepath) as f:
        # Iterate lazily instead of loading the whole log with readlines().
        for line in f:
            if line.startswith("Best loss meta training:"):
                values = line.split('[')[1].replace(']', '').strip()
                result = [float(value) for value in values.split()]

            if line.startswith("epoch:"):
                epoch = int(line.split(':')[1].strip())

    if not result:
        raise ValueError(
            f"No 'Best loss meta training:' values found in log file {filepath}"
        )

    return Parameter(hyperparameter_str, epoch, result, result[-1] - result[0])

def _prediction_correlation(predict, actual):
    """Correlation coefficient between a saved prediction array and its truth array."""
    # Place the two arrays side by side and transpose so that each becomes a row;
    # entry [0, 1] of the correlation matrix is then row 0 against row 1.
    # NOTE(review): assumes both arrays are single-column (n, 1) - confirm against the writer.
    stacked = np.hstack([predict, actual]).T
    return np.corrcoef(stacked)[0, 1]


def get_predictions(path, n_samples=20, max_k=10):
    """Load saved predictions under ``path`` and score them by correlation.

    Expects ``zero_shot_predict.npy`` / ``zero_shot_true.npy`` plus, for every
    ``k`` in ``1..max_k`` and every ``sample`` in ``0..n_samples-1``, the pair
    ``{k}_{sample}_shot_predict.npy`` / ``{k}_{sample}_shot_true.npy``.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory holding the ``.npy`` files.
    n_samples : int, optional
        Number of resampling runs to read (default 20, the previous hard-coded value).
    max_k : int, optional
        Largest shot count to read (default 10, the previous hard-coded value).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, max_k + 1)``. Column 0 is the zero-shot
        correlation (repeated on every row); column ``k`` is the k-shot one.

    Raises
    ------
    FileNotFoundError
        Propagated from ``np.load`` when an expected file is missing.
    """
    path = Path(path)

    zero = _prediction_correlation(
        np.load(path / "zero_shot_predict.npy"),
        np.load(path / "zero_shot_true.npy"),
    )

    data = []
    for sample in range(n_samples):
        ks = [zero]
        for k in range(1, max_k + 1):
            predict = np.load(path / f"{k}_{sample}_shot_predict.npy")
            actual = np.load(path / f"{k}_{sample}_shot_true.npy")
            ks.append(_prediction_correlation(predict, actual))
        data.append(ks)

    if not data:
        # np.vstack rejects an empty list; keep the column count consistent instead.
        return np.empty((0, max_k + 1))

    return np.vstack(data)

def parse_logs(log_directory):
    """Parse every log file directly inside ``log_directory``.

    Parameters
    ----------
    log_directory : str or pathlib.Path
        Directory containing the log files.

    Returns
    -------
    list
        One ``parse_log_path`` result per regular file, ordered by path.
        Sub-directories (for example ``.ipynb_checkpoints``) are skipped rather
        than handed to ``open()``.
    """
    log_path = Path(log_directory)

    # glob() order depends on the filesystem; sort so repeated runs agree.
    log_files = sorted(entry for entry in log_path.glob("*") if entry.is_file())

    return [parse_log_path(log) for log in log_files]

## Gathering all results

In [None]:
from multiprocessing import Pool

In [None]:
def get_all_performances(folders):
    """Collect prediction correlations for every tissue and hyperparameter setting.

    Parameters
    ----------
    folders : tuple
        ``(log_folder, predictions_folder)`` as ``pathlib.Path`` objects. Each
        sub-directory of ``log_folder`` is treated as one tissue whose logs are
        parsed with ``parse_logs``.

    Returns
    -------
    dict
        Maps tissue directory name to a list of ``Results``, one per parsed
        log, whose last field is the array returned by ``get_predictions``.
    """
    log_folder, predictions_folder = folders
    print(log_folder)

    outer = {}
    for tissue_dir in log_folder.glob("*"):
        # Stray files next to the tissue folders are not tissues.
        if not tissue_dir.is_dir():
            continue

        # `.name`, not `.stem`: a directory called "a.b" must stay "a.b",
        # otherwise the paths built below point at the wrong folder.
        tissue = tissue_dir.name

        # Get the prediction performance for each hyperparameter setting tuned
        all_results = []
        for param in parse_logs(tissue_dir):
            predictions = get_predictions(predictions_folder / tissue / param.hyperparameter_str / f'epochs_{param.epoch}')
            all_results.append(Results(*param, predictions))

        outer[tissue] = all_results

    return outer

In [None]:
# Root folders read by the cells below: one sub-folder per drug under each.
# NOTE(review): these are absolute paths on a specific filesystem - adjust before running elsewhere.
predictions_directory = Path("/cellar/users/shfong/projects/TCRP-refactored/tcrp-original/output/210726_complete-drug-run-v2/predictions")
logs_directory = Path("/cellar/users/shfong/projects/TCRP-refactored/tcrp-original/output/210726_complete-drug-run-v2/run-logs")

In [None]:
%%time 

paths = []
drugs = []
for drug_path in predictions_directory.glob("*"): 
    drug = drug_path.stem    
    log_path = logs_directory / drug
    paths.append((log_path, drug_path))
    drugs.append(drug)
    
with Pool(64) as pool: 
    results = pool.map(get_all_performances, paths)
    
all_results = {d: r for d, r in zip(drugs, results)}

In [None]:
import pickle

In [None]:
# Persist the full drug -> tissue -> results mapping so the expensive gathering
# step above does not have to be repeated.
with open("tcrp-all-performance.pkl", "wb") as pickle_file:
    pickle.dump(all_results, pickle_file)

In [None]:
%%bash 

# List the working directory with human-readable sizes (e.g. to check the pickle written by the previous cell).
ls -alh

## Getting results based on best improvement

In [None]:
%%time

output = {}
failed_paths = []
for drug in predictions_directory.glob("*"): 
    print(drug)
    drug = drug.stem
    
    output[drug] = {}
    
    directory = logs_directory / drug
    for tissue in directory.glob("*"): 
        tissue = tissue.stem
        
        p = directory / tissue
        
        try: 
            results = parse_logs(p)
            param = sorted(results, key=lambda x: x.improvement)[-1]
            predictions = get_predictions(predictions_directory / drug / tissue / param.hyperparameter_str / f'epochs_{param.epoch}').mean(axis=0)

            output[drug][tissue] = predictions
            
        except KeyboardInterrupt: 
            raise KeyboardInterrupt
        
        except: # None for results
            failed_paths.append(p)
            print(f"Path {p} failed!")
            continue

In [None]:
# NOTE(review): `results` is whatever was last bound to that name - presumably the parse_logs
# output for the last tissue handled by the loop above. Looks like a leftover sanity check.
len(results)

In [None]:
import pickle

In [None]:
# Persist the drug -> tissue -> averaged correlation curve mapping built above.
with open("tcrp-performance.pkl", "wb") as pickle_file:
    pickle.dump(output, pickle_file)

## Results

In [None]:
# Flatten the nested drug -> tissue mapping into one row per (drug, tissue) pair.
# Note: this rebinds `all_results` to a 2-D array.
all_results = np.vstack([
    curve
    for per_tissue in output.values()
    for curve in per_tissue.values()
])

In [None]:
# Median (ignoring NaNs) over all drug/tissue rows, one point per column.
# Column 0 is the zero-shot score and column k the k-shot score.
fig, ax = plt.subplots()
ax.plot(np.nanmedian(all_results, axis=0))
ax.set(
    xlabel="Number of shots k (0 = zero-shot)",
    ylabel="Median correlation (predicted vs. actual)",
    title="Median performance across drug/tissue pairs",
);