In [1]:
import config_readwrite as crw
import glob
import json
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import pandas as pd
from scipy import stats
import seaborn as sns

# files

In [2]:
cfn = os.path.join(os.path.dirname(os.getcwd()), "config.ini")
config, cfn = crw.read(cfn)

In [3]:
# strategy : [add_layers (bool), freeze_layers (bool), unfreeze_layers (int)]
strategies = {"first_n_last": [True, True, 0], 
             "everything":[True, False, 10],
             "unfreeze_1layer":[True, True, 1] # TBD
              #"multioutput":[]
             }

In [5]:
CL = 'hepg2'
SIZE = "0.9"
EPOCHS = 10
BATCHSIZE=10
SEQ_SIZE = 200

STRATEGY = "unfreeze_1layer"

ADD_LAYERS, FREEZE, UNFREEZE_N = strategies[STRATEGY]


section = f"LegNet.agarwal.{STRATEGY}.EPOCHS.{EPOCHS}.BATCHSIZE.{BATCHSIZE}"
crw.check(config, section)

config[section]["SEQ_SIZE"] = str(SEQ_SIZE)

In [6]:

PREFIX = "AGARWAL.seqs"
section = "agarwal_mpra"

TRUTH = config[section]["MPRA_ACTIVITY"] # US activity in HepG2

TRAIN = config[section][f"training.{CL}.{SIZE}"] # training data
TEST = config[section][f"test.{CL}.{SIZE}"] # training data

PRED_PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/"
PRED = os.path.join(PRED_PATH, f"inference.{PREFIX}.{STRATEGY}.EPOCHS.{EPOCHS}.BATCHSIZE.{BATCHSIZE}.{CL}.retrain.tsv")  # legnet predictions
INFO = config[section]["MPRA_INFO"]

In [7]:
ADD_LAYERS, FREEZE, UNFREEZE_N

(True, True, 1)

In [8]:
section = f"LegNet.agarwal.{STRATEGY}.EPOCHS.{EPOCHS}.BATCHSIZE.{BATCHSIZE}"
crw.check(config, section)
# legnet 
infile = os.path.split(TRAIN)[1]
model_dir = os.path.join(PRED_PATH, "re" + ".".join(infile.split(".")[:-1]) + "." + STRATEGY
        + ".epochs" + EPOCHS ".batchsize-" + BATCHSIZE ".unfreeze-" + UNFREEZE)

MODEL = os.path.join(model_dir, "model_9.pth")
INFERENCE = os.path.join(PRED_PATH, f"inference.test.{CL.upper()}.{SEQ_SIZE}.{STRATEGY}.EPOCHS.{EPOCHS}.BATCHSIZE.{BATCHSIZE}.tsv")

config[section]["path"] = PRED_PATH
config[section]["model_dir"]=model_dir
config[section]["model_10"]= MODEL
config[section]["inference"]= INFERENCE
config[section]["epochs"] = str(EPOCHS)
config[section]["batchsize"] = str(BATCHSIZE)
config[section]["freeze_layers"] = str(FREEZE)
config[section]["unfreeze_layers"] = str(UNFREEZE_N)

crw.write(config, cfn)

# run legnet

In [13]:
def parseTrainArgs(train_input, seqsize, new_layers, freeze, model_dir, epochs=10, batchsize=100, unfreeze=0):

    args = [
        "--train_valid_path", train_input,
        "--foldify",
        "--delimiter tab",
        "--seed 42",
        "--train_batch_size 1024",
        "--train_workers 8",
        "--valid_batch_size 4098",
        "--valid_workers 8",
        "--epoch_num", str(epochs), 
        "--batch_per_epoch", str(batchsize),
        "--weights uniform",
        "--seqsize", str(seqsize),
        "--temp .TEMPDIR",
        "--use_single_channel",
        "--singleton_definition integer",
        "--gpu 0",
        "--model_dir",model_dir, 
        "--ks 7",
        "--blocks 256 128 128 64 64 64 64",
        "--resize_factor 4",
        "--se_reduction 4",
        "--shift 0.5",
        "--scale 0.5",
        "--loss kl",
        "--final_ch 18",
        "--optimizer adamw",
        "--model /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/model_300.pth"]
    
    print(unfreeze)
    if new_layers is True:
        args.append("--tl_layers")
    if freeze is True:
        args.append("--freeze_layers")
    if unfreeze >0:
        args.append(f"--unfrozen_layers {unfreeze}")

    return " ".join(args)

In [16]:
def launchLegNetTrain(train_input, seqsize, new_layers, freeze, strategy, epochs, batchsize, unfreeze):
    """ predict 18 bins using input sequence, write to output file, specify sequence size"""

    PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/bin/"
    TRAIN_LEGNET_SH = os.path.join(PATH, "train_transfer_lastlayer.sh")

    OUTPUT_PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/"

    # str split to make output file a copy of the input file.
    infile = os.path.split(train_input)[1]
    model_dir = os.path.join(
        OUTPUT_PATH, "re" + ".".join(infile.split(".")[:-1]) + "." + strategy
        + ".epochs" + epochs ".batchsize-" + batchsize ".unfreeze-" + unfreeze)

    # legnet training arguments
    args = parseTrainArgs(train_input, seqsize, new_layers, 
                          freeze, model_dir, epochs, batchsize, unfreeze)  # get arguments for LegNet
   
    cmd = " ".join(["qsub -q gpu.q",
                   TRAIN_LEGNET_SH               
                    ])
    
    # add arguments to command
    cmd = cmd + " " + args
    
    print(cmd)
    
    #os.system(cmd)
    if os.path.exists(model_dir) is False:
        os.system(cmd)
    else:
        print("trained already?")

    return model_dir

# Retrain?

In [17]:
UNFREEZE_N

1

In [18]:
model_dir = launchLegNetTrain(TRAIN, SEQ_SIZE, ADD_LAYERS, FREEZE, STRATEGY, EPOCHS, BATCHSIZE, 2)

2
qsub -q gpu.q /wynton/home/ahituv/fongsl/EMF/US/ml_emf/bin/train_transfer_lastlayer.sh --train_valid_path /wynton/home/ahituv/fongsl/EMF/US/data/training.HepG2.0.9.txt --foldify --delimiter tab --seed 42 --train_batch_size 1024 --train_workers 8 --valid_batch_size 4098 --valid_workers 8 --epoch_num 10 --batch_per_epoch 10 --weights uniform --seqsize 200 --temp .TEMPDIR --use_single_channel --singleton_definition integer --gpu 0 --model_dir /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/retraining.HepG2.0.9.unfreeze_1layer --ks 7 --blocks 256 128 128 64 64 64 64 --resize_factor 4 --se_reduction 4 --shift 0.5 --scale 0.5 --loss kl --final_ch 18 --optimizer adamw --model /wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/model_300.pth --tl_layers --freeze_layers --unfrozen_layers 2


# test

In [None]:
def parseTestArgs(test_input, test_output, seqsize, model, new_layers):
    args = [
        "--target",  test_input,
        "--output", test_output,
        "--seed 42",
        "--valid_batch_size 124",
        "--valid_workers 8",
        "--seqsize", str(seqsize),
        "--temp .TEMPDIR",
        "--use_single_channel",
        "--singleton_definition integer",
        "--gpu 0", "--ks 7",
        "--blocks 256 128 128 64 64 64 64",
        " --resize_factor 4",
        "--se_reduction 4",
        "--final_ch 18",
        "--delimiter tab",
        "--output_format tsv",
        "--model",  model
    ]
    if new_layers is True:
        args.append("--tl_layers")
    return " ".join(args)

In [None]:
def launchLegNetInference(test_input, seqsize, model, cl, strategy, add_layers):
    """ predict 18 bins using input sequence, write to output file, specify sequence size"""

    PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/bin/"
    LEGNET_SH = os.path.join(PATH, "legnet_inference.sh")

    OUTPUT_PATH = "/wynton/home/ahituv/fongsl/EMF/US/ml_emf/data/legnet/output/"

    # str split to make output file a copy of the input file.
    infile = os.path.split(test_input)[1]
    test_output = os.path.join(OUTPUT_PATH, f"inference.test.{cl.upper()}.{seqsize}.{strategy}.tsv")

    cmd = " ".join(["qsub -q gpu.q",
                   LEGNET_SH
                   ])
    args = parseTestArgs(test_input, test_output, seqsize, model, add_layers)
    
    
    cmd = cmd + " " + args
    
    print(cmd)
    #os.system(cmd)
    if os.path.exists(test_output) is False:
        print("running...")
        os.system(cmd)
    else:
        print("skipping...", test_output)

## Inference 

In [None]:
launchLegNetInference(TEST, SEQ_SIZE, MODEL, CL, STRATEGY, ADD_LAYERS)

## plot training scores

In [None]:
scores = glob.glob(os.path.join(model_dir, "scores*"))
score_dict={}
for score in scores:
    with open(score, "r") as reader:
        line=json.load(reader)
        score_cols = list(line)
        pearson = line[list(line)[0]]
        mse = line[list(line)[1]]
        spearman = line[list(line)[2]]
        score_dict[os.path.split(score)[1]] = [pearson, mse, spearman, os.path.split(score)[1]]
score_cols.append("iter")  # add iter number column
scoredf = pd.DataFrame(score_dict.values())  # name dataframe
scoredf.columns = score_cols  # plot names

scoredf.head()

In [None]:
fig, ax = plt.subplots(figsize =(4,4))
x = scoredf.index
y = 'train_pearson'
sns.lineplot(x=x, y=y, data= scoredf, label=y)

y = 'train_spearman'
sns.lineplot(x=x, y=y, data= scoredf, label=y)
ax.set(xlabel = 'epoch', ylabel="corr", title=STRATEGY + "-" + CL)

out =os.path.join()