In [4]:
import pandas as pd
import numpy as np
from torcheval.metrics.functional import binary_precision_recall_curve
import torch
from proteinfertorch.utils import read_pickle, read_fasta,save_to_fasta
import matplotlib.pyplot as plt

In [45]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, LinearLR, SequentialLR, LambdaLR

# Simple DNN used only to give the optimizer some parameters to manage:
# 10 inputs -> 10 hidden units (ReLU) -> 1 sigmoid output.
dnn_layers = [
    torch.nn.Linear(10, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 1),
    torch.nn.Sigmoid(),
]
model = torch.nn.Sequential(*dnn_layers)


In [46]:

# --- Schedule hyperparameters -------------------------------------------
lr = 0.001                # peak learning rate, reached at the end of warmup
lr_decay = 0.997          # multiplicative decay factor
lr_warmup_steps = 3_000   # number of linear warmup steps
lr_decay_steps = 1_000    # steps per decay interval
iters = 500_000           # total number of training iterations

optim = Adam(model.parameters(), lr=lr)

# Linear warmup: the LR factor ramps from 1/lr_warmup_steps up to 1 over
# the first lr_warmup_steps scheduler steps.
lr_warmup_scheule = LinearLR(
    optimizer=optim,
    start_factor=1 / lr_warmup_steps,
    end_factor=1,
    total_iters=lr_warmup_steps,
)

class ExponentialDecay:
    """Callable returning the multiplicative factor ``decay_rate ** (step / decay_steps)``.

    Args:
        decay_steps: Number of steps per decay interval.
        decay_rate: Base of the exponential.
        staircase: If truthy, the exponent is ``step // decay_steps`` (floor
            division, so the factor changes only every ``decay_steps`` steps);
            otherwise it is the true quotient ``step / decay_steps``.
    """

    def __init__(self, decay_steps, decay_rate, staircase):
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase

    def __call__(self, step):
        """Return the decay factor for ``step``."""
        exponent = (
            step // self.decay_steps
            if self.staircase
            else step / self.decay_steps
        )
        return self.decay_rate ** exponent

# Staircase exponential decay: the LR factor is multiplied by lr_decay once
# every lr_decay_steps scheduler steps.
lr_decay_schedule = LambdaLR(optimizer = optim,
                                  lr_lambda=ExponentialDecay(decay_steps=lr_decay_steps, decay_rate=lr_decay, staircase=True)
                                  )

# Warmup first, then hand over to the decay schedule at step lr_warmup_steps.
lr_scheduler = SequentialLR(optimizer=optim,
                            schedulers = [lr_warmup_scheule, lr_decay_schedule],
                            milestones=[lr_warmup_steps])

# Preview only the first iterations (0..4000 inclusive) rather than all `iters`.
preview_iters = min(iters, 4_001)

lrs = []
for iteration in range(preview_iters):
    # Record the LR in effect for THIS iteration before the scheduler
    # advances; recording after stepping drops the first warmup value and
    # shifts the whole curve by one step.
    lrs.append(optim.param_groups[0]['lr'])
    # do train
    # The parameters have no gradients in this dry run, so this does not
    # change them; it keeps the optimizer.step() -> scheduler.step() order
    # that PyTorch expects.
    optim.step()
    # do val
    lr_scheduler.step()
# import matplotlib.pyplot as plt
# plt.plot(lrs)



In [None]:
# For each split, write the first 40 entries returned by read_fasta to a
# "<split>_GO_1_batch.fasta" file next to the source file.
for split in ("train", "dev", "test"):
    source_fasta = f"../data/random_split/{split}_GO.fasta"
    one_batch_fasta = f"../data/random_split/{split}_GO_1_batch.fasta"
    data = read_fasta(source_fasta)
    save_to_fasta(data[:40], one_batch_fasta)

Saved FASTA file to ../data/random_split/train_GO_1_batch.fasta
Saved FASTA file to ../data/random_split/dev_GO_1_batch.fasta
Saved FASTA file to ../data/random_split/test_GO_1_batch.fasta


: 

In [37]:
from collections import defaultdict
# Variable name -> list of ensemble-member IDs (as strings) found in the listing.
model_ids = defaultdict(list)

# Archive-name prefix (everything before the trailing "-<id>") -> variable
# name under which the IDs are collected.
file_name2var_name = {
    "noxpd2_cnn_swissprot_go_clustered_swiss-cnn_for_swissprot_go_clustered":"GO_CLUSTERED_ENSEMBLE_ELEMENT_EXPERIMENT_IDS",
    "noxpd2_cnn_swissprot_go_random_swiss-cnn_for_swissprot_go_random":"GO_RANDOM_ENSEMBLE_ELEMENT_EXPERIMENT_IDS",
    "noxpd2_cnn_swissprot_ec_clustered_swiss-cnn_for_swissprot_ec_clustered":"EC_CLUSTERED_ENSEMBLE_ELEMENT_EXPERIMENT_IDS",
    "noxpnd_cnn_swissprot_ec_random_swiss-cnn_for_swissprot_ec_random":"EC_RANDOM_ENSEMBLE_ELEMENT_EXPERIMENT_IDS",
}

archive_paths = pd.read_csv('../zipped_models.txt', header=None).values.flatten()
for archive_path in archive_paths:
    # Keep only the last path component and drop the archive extension, e.g.
    # ".../<prefix>-13704042.tar.gz" -> "<prefix>-13704042".
    file_name = archive_path.split('/')[-1].replace('.tar.gz', '')
    # Split at the LAST dash: the prefix itself contains dashes ("swiss-cnn").
    prefix, id_text = file_name.rpartition('-')[::2]
    # Only the GO/EC ensemble archives are listed in the mapping; skip the rest.
    if prefix not in file_name2var_name:
        continue
    # The int() round-trip checks that the suffix is numeric and normalises it.
    model_ids[file_name2var_name[prefix]].append(str(int(id_text)))


In [None]:
# Load ground-truth labels and predicted probabilities for the test set.
# NOTE(review): flattening both frames assumes they share the same row and
# column order — confirm against the code that wrote these files.
labels = pd.read_hdf("../outputs/test_labels_ProteInfer.h5", key="labels_df")
probabilities = pd.read_hdf("../outputs/test_probabilities_ProteInfer.h5", key="probabilities_df")

# Use the GPU when one is present, but do not fail on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Despite the name, `logits_binary` holds the values from the probabilities
# frame; every (row, column) pair becomes one binary prediction.
logits_binary = torch.tensor(probabilities.values.flatten(),device=device)
labels_binary = torch.tensor(labels.values.flatten(),device=device)
precision, recall, thresholds = binary_precision_recall_curve(logits_binary, labels_binary)

# F1 is 0/0 = NaN wherever precision and recall are both zero. torch.argmax
# and torch.max treat NaN as the maximum, so map those points to 0 first.
f1 = torch.nan_to_num(2 * precision * recall / (precision + recall), nan=0.0)

# Per the torcheval docs the curve has one more precision/recall point than
# thresholds (the final precision=1, recall=0 point); restrict the search to
# points that have a threshold so the index is always valid.
f1_at_thresholds = f1[:thresholds.numel()]
best_idx = torch.argmax(f1_at_thresholds)

best_th,best_f1 = thresholds[best_idx].item(),f1_at_thresholds[best_idx].item()
print(best_th,best_f1)