# Fine-tune Model

In [1]:
## DATASET 1416
import random
import numpy as np
import pandas as pd
from utils import valid_report, read_fasta, negtive_sampling, save_fituned_model, model_fituning, cross_validation

############################################
##               settings                 ##
############################################

# Seed both Python's and NumPy's global random number generators with the
# same fixed value so that anything drawing from either one is repeatable.
seed_val = 42
np.random.seed(seed_val)
random.seed(seed_val)

############################################
##               load data                ##
############################################

MAX_LEN = 512

# Parse the positive and negative FASTA files with the project's reader.
positive_1416 = read_fasta('./positive_1416.fasta')
negtive_16590 = read_fasta('./negtive_16590.fasta')

# Keep at most MAX_LEN - 2 leading characters of every positive sequence.
# NOTE(review): the two spare positions are presumably reserved for the
# start/end tokens added at encoding time -- confirm against utils.
positive_sentences = [seq[:MAX_LEN - 2] for seq in positive_1416.values()]
positive_labels = [1] * len(positive_sentences)

# Request as many negatives as there are positives.
# NOTE(review): how negtive_sampling picks (and whether it truncates) the
# negative sequences is not visible here -- verify in utils.
negtive_sentences, negitive_labels = negtive_sampling(
    negtive_16590, len(positive_labels)
)

# Positives come first, negatives second; labels follow the same order.
sentences = positive_sentences + negtive_sentences
labels = positive_labels + negitive_labels
print(len(labels), len(sentences))


############################################
##          save one-fold to test         ##
############################################

fold_index = 0
k = 10

train, train_labels = [], []
test, test_labels = [], []

# Every sample whose position modulo k equals fold_index goes to the held-out
# test fold; all remaining samples form the training set.
for position, sequence in enumerate(sentences):
    if position % k == fold_index:
        target_seqs, target_labels = test, test_labels
    else:
        target_seqs, target_labels = train, train_labels
    target_seqs.append(sequence)
    target_labels.append(labels[position])

# Two-column frames ('label', 'seq') handed to the training routine.
train_set = pd.DataFrame({'label': train_labels, 'seq': train})
test_set = pd.DataFrame({'label': test_labels, 'seq': test})


############################################
##                training                ##
############################################

# Run the project's fine-tuning routine on the train/test frames. The first
# argument is a run tag of the form 'K<fold_index>'; save_model=True is
# passed through to the helper.
# NOTE(review): what exactly is saved, and where, is decided inside
# utils.model_fituning -- check there.
model, result, confusion_matrix = model_fituning(
    f'K{fold_index}', train_set, test_set, save_model=True
)


############################################
##          print the result              ##
############################################

# valid_report receives lists of results and confusion matrices; here each
# list holds the single fold evaluated above.
valid_report('[DATASET 1416]', [result], [confusion_matrix])

2832 2832


# Appendix

In [8]:
from utils import get_prediction, load_fituned_model, read_fasta
from proteinbert import load_pretrained_model

############################################
##               load data                ##
############################################

# Single source of truth for the sequence length used in this cell: it drives
# both the truncation below and the seq_len given to the loaded model.
MAX_LEN = 512

human_20386 = read_fasta('./human_20386.fasta')

# Keep at most MAX_LEN - 2 leading characters of every sequence.
# NOTE(review): the two spare positions are presumably reserved for the
# start/end tokens added at encoding time -- confirm against utils.
sentences = [seq[:MAX_LEN - 2] for seq in human_20386.values()]


############################################
##       load model and get prediction    ##
############################################

# Pass MAX_LEN rather than repeating the literal 512, so the truncation
# length and the model's sequence length cannot drift apart.
model = load_fituned_model(model_path='./default/checkpoint', seq_len=MAX_LEN)

ypred = get_prediction(sentences, model)

# Bare expression: displayed as the cell's output when run in a notebook.
ypred

array([[0.04064   ],
       [0.02551973],
       [0.05227202],
       [0.03922716],
       [0.00764081],
       [0.08009022],
       [0.01084805],
       [0.4980178 ],
       [0.21998864],
       [0.00953716]], dtype=float32)