In [1]:
import torch 
from torch import nn
from torch.nn import functional as F
import numpy as np
import pandas as pd
from vocab import Vocab
from dataprocessor import DataProcessor
from conv_model import ConvProtein
from utils import *
from tqdm import trange
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_data(filename):
    data = pd.read_csv(filename)
    return data['protein_sequence'].values , data['tm'].values 

x,y = get_data('./data/train_fixed.csv')

min_v,max_v = min(y),max(y)

trainx,testx,trainy ,testy = train_test_split(x,y,test_size=0.2)

vocab = Vocab()
vocab.from_file()

vectors = load_ptvectors('./vectors.npy')

train_datafeeder = DataProcessor(vocab , trainx,trainy, [min_v,max_v])
test_datafeeder  = DataProcessor(vocab , testx,testy, [min_v,max_v])

In [None]:
model = ConvProtein(vectors,lr=1e-4,use_cuda=True)
model = model.to('cuda')

In [4]:
for i in trange(2000):
    x,y = train_datafeeder.sample(100)
    model.updates(x,y)
    if i% 100 == 0:
        predictions , trues = [] , []
        for testx,testy in test_datafeeder.export(10 , 100):
            preds = model.predicts(testx)
            predictions.extend(preds)
            trues.extend(testy)
        perf = np.corrcoef(predictions,trues)[0,1]
        print('correlation on test dataset is ',perf)


  0%|          | 2/2000 [00:01<17:22,  1.92it/s]

correlation on test dataset is  0.7423499612516262


  5%|▌         | 100/2000 [00:21<06:23,  4.96it/s]

  2100--0.027


  5%|▌         | 102/2000 [00:23<10:59,  2.88it/s]

correlation on test dataset is  0.7680754769657135


 10%|█         | 200/2000 [00:43<05:56,  5.05it/s]

  2200--0.013


 10%|█         | 202/2000 [00:44<10:09,  2.95it/s]

correlation on test dataset is  0.7463676603933888


 15%|█▌        | 300/2000 [01:04<05:35,  5.07it/s]

  2300--0.011


 15%|█▌        | 302/2000 [01:05<09:32,  2.96it/s]

correlation on test dataset is  0.7598293126910356


 20%|██        | 400/2000 [01:24<05:41,  4.68it/s]

  2400--0.016


 20%|██        | 402/2000 [01:26<10:58,  2.43it/s]

correlation on test dataset is  0.7627695374237331


 25%|██▌       | 500/2000 [01:46<05:01,  4.97it/s]

  2500--0.014


 25%|██▌       | 501/2000 [01:47<10:27,  2.39it/s]

correlation on test dataset is  0.747430974173412


 30%|███       | 600/2000 [02:07<04:47,  4.88it/s]

  2600--0.013


 30%|███       | 601/2000 [02:08<10:10,  2.29it/s]

correlation on test dataset is  0.747703690590829


 35%|███▌      | 700/2000 [02:29<04:22,  4.95it/s]

  2700--0.016


 35%|███▌      | 701/2000 [02:30<09:30,  2.28it/s]

correlation on test dataset is  0.7522500083648981


 40%|████      | 800/2000 [02:51<04:34,  4.36it/s]

  2800--0.019


 40%|████      | 802/2000 [02:52<07:46,  2.57it/s]

correlation on test dataset is  0.7659168302291075


 45%|████▌     | 900/2000 [03:13<04:28,  4.09it/s]

  2900--0.021


 45%|████▌     | 901/2000 [03:14<08:39,  2.11it/s]

correlation on test dataset is  0.765826769186622


 50%|█████     | 1000/2000 [03:34<03:47,  4.40it/s]

  3000--0.009
saved parameters to ckpt.21-32-32_64_128_256_512.pkl


 50%|█████     | 1002/2000 [03:35<06:01,  2.76it/s]

correlation on test dataset is  0.7571784078358768


 55%|█████▌    | 1100/2000 [03:55<03:00,  4.98it/s]

  3100--0.011


 55%|█████▌    | 1101/2000 [03:56<06:50,  2.19it/s]

correlation on test dataset is  0.7552603594652619


 60%|██████    | 1200/2000 [04:18<03:02,  4.38it/s]

  3200--0.014


 60%|██████    | 1201/2000 [04:19<08:23,  1.59it/s]

correlation on test dataset is  0.7452820816798404


 65%|██████▌   | 1300/2000 [04:39<02:18,  5.04it/s]

  3300--0.011


 65%|██████▌   | 1302/2000 [04:40<04:03,  2.86it/s]

correlation on test dataset is  0.7468751625199248


 70%|███████   | 1400/2000 [05:00<02:08,  4.68it/s]

  3400--0.014


 70%|███████   | 1402/2000 [05:01<03:46,  2.64it/s]

correlation on test dataset is  0.749521259583908


 75%|███████▌  | 1500/2000 [05:21<01:39,  5.02it/s]

  3500--0.011


 75%|███████▌  | 1502/2000 [05:22<02:51,  2.90it/s]

correlation on test dataset is  0.7556222365026563


 80%|████████  | 1600/2000 [05:42<01:18,  5.08it/s]

  3600--0.022


 80%|████████  | 1602/2000 [05:43<02:14,  2.96it/s]

correlation on test dataset is  0.7469222545138542


 85%|████████▌ | 1700/2000 [06:02<01:00,  4.99it/s]

  3700--0.013


 85%|████████▌ | 1702/2000 [06:03<01:43,  2.88it/s]

correlation on test dataset is  0.7474803705342549


 90%|█████████ | 1800/2000 [06:23<00:40,  5.00it/s]

  3800--0.009


 90%|█████████ | 1802/2000 [06:24<01:10,  2.83it/s]

correlation on test dataset is  0.7453291669003698


 95%|█████████▌| 1900/2000 [06:43<00:20,  5.00it/s]

  3900--0.013


 95%|█████████▌| 1902/2000 [06:44<00:33,  2.89it/s]

correlation on test dataset is  0.7471242773624889


100%|██████████| 2000/2000 [07:04<00:00,  4.71it/s]

  4000--0.011
saved parameters to ckpt.21-32-32_64_128_256_512.pkl



