In [1]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math

# Single notebook-level seed, applied to both PyTorch's and NumPy's global RNGs.
seed = 7
torch.manual_seed(seed=seed)
np.random.seed(seed=seed)

In [2]:
# Load the three input tables from the shared data directory.
pfamA_motors = pd.read_csv(filepath_or_buffer="../../data/pfamA_motors.csv")
df_dev = pd.read_csv(filepath_or_buffer="../../data/df_dev.csv")
motor_toolkit = pd.read_csv(filepath_or_buffer="../../data/motor_tookits.csv")

In [3]:
# Balance the clans: draw 4500 rows from every clan with a fixed per-group
# seed, then flatten the (clan, original index) MultiIndex produced by
# groupby.apply into a plain 0..n-1 index.
# reset_index(drop=True) is called directly on the frame instead of being
# applied column by column, which gives the same result without the detour.
pfamA_motors_balanced = (
    pfamA_motors
    .groupby('clan')
    .apply(lambda _df: _df.sample(4500, random_state=1))
    .reset_index(drop=True)
)

In [18]:
# Pfam family accessions to keep for the target set.
pfamA_target_name = [
    "PF00349", "PF00022", "PF03727", "PF06723",
    "PF14450", "PF03953", "PF12327", "PF00091", "PF10644",
    "PF13809", "PF14881", "PF00063", "PF00225", "PF03028",
]

# Keep only the rows whose family accession is one of the target families.
pfamA_target = pfamA_motors[pfamA_motors["pfamA_acc"].isin(pfamA_target_name)]

In [19]:
# Non-null row counts per clan for the selected target families.
(
    pfamA_target
    .groupby("clan")
    .count()
)

Unnamed: 0_level_0,Unnamed: 0,id,description,seq,pfamA_acc
clan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
actin_like,21034,21034,21034,21034,21034
p_loop_gtpase,25725,25725,25725,25725,25725
tubulin_binding,7635,7635,7635,7635,7635
tubulin_c,4755,4755,4755,4755,4755


In [20]:
# Shuffle the rows, then keep at most 396 rows per Pfam family.
# random_state pins the shuffle to the notebook-level seed, so the subsample
# (and the CSV written from it) no longer depends on how many draws were taken
# from NumPy's global RNG beforehand, i.e. on cell execution order.
pfamA_target = pfamA_target.sample(frac=1, random_state=seed)
pfamA_target_sub = pfamA_target.groupby("pfamA_acc").head(396)

In [21]:
# Non-null row counts per Pfam family in the subsample.
(
    pfamA_target_sub
    .groupby("pfamA_acc")
    .count()
)

Unnamed: 0_level_0,Unnamed: 0,id,description,seq,clan
pfamA_acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PF00022,396,396,396,396,396
PF00063,396,396,396,396,396
PF00091,396,396,396,396,396
PF00225,396,396,396,396,396
PF00349,396,396,396,396,396
PF03028,396,396,396,396,396
PF03727,396,396,396,396,396
PF03953,396,396,396,396,396
PF06723,396,396,396,396,396
PF10644,396,396,396,396,396


In [27]:
# Persist the per-family subsample; the row index is not written to the file.
pfamA_target_sub.to_csv(
    "../../data/pfamA_target_sub.csv",
    index=False,
)

## Filter out motor_toolkit entries with Length >= 5000

In [30]:
# Keep only the entries whose "Length" is strictly below 5000.
motor_toolkit = motor_toolkit[motor_toolkit["Length"] < 5000]

In [33]:
# Preview the first rows of motor_toolkit. The previous content was an
# assignment with no right-hand side, which is a SyntaxError.
motor_toolkit.head(10)

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,seq,type
0,P52732,KIF11_HUMAN,reviewed,Kinesin-like protein KIF11 (Kinesin-like prote...,KIF11 EG5 KNSL1 TRIP5,Homo sapiens (Human),1056,MASQPNSSAKKKEEKGKNIQVVVRCRPFNLAERKASAHSIVECDPV...,kinesin
1,Q9LX99,KN14A_ARATH,reviewed,Kinesin-like protein KIN-14A (Geminivirus Rep-...,KIN14A GRIMP KAC1 KCA1 KSN1 TH65 At5g10470 F12...,Arabidopsis thaliana (Mouse-ear cress),1273,MADQRSKTNRWNWEVSGFEPRKSSSNASFAESTGHRTTGPLLRRNS...,kinesin
2,Q9FKP4,KN14B_ARATH,reviewed,Kinesin-like protein KIN-14B (Kinesin CDKA-1-a...,KIN14B KAC2 KCA2 At5g65460 MNA5.20,Arabidopsis thaliana (Mouse-ear cress),1264,MAEQKSTNMWNWEVTGFESKKSPSSEEGVHRTPSSMLRRYSIPKNS...,kinesin
3,Q9FZ06,KINUA_ARATH,reviewed,Kinesin-like protein KIN-UA (AtKINUa) (Protein...,KINUA ARK3 PAK At1g12430 F5O11.15,Arabidopsis thaliana (Mouse-ear cress),919,MSTTSGTGGVSYRNGTQRSSLRTQSSASTSSGGQKASVKSKSVLRK...,kinesin
4,P33176,KINH_HUMAN,reviewed,Kinesin-1 heavy chain (Conventional kinesin he...,KIF5B KNS KNS1,Homo sapiens (Human),963,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,kinesin
5,Q12840,KIF5A_HUMAN,reviewed,Kinesin heavy chain isoform 5A (Kinesin heavy ...,KIF5A NKHC1,Homo sapiens (Human),1032,MAETNNECSIKVLCRFRPLNQAEILRGDKFIPIFQGDDSVVIGGKP...,kinesin
6,P33175,KIF5A_MOUSE,reviewed,Kinesin heavy chain isoform 5A (Kinesin heavy ...,Kif5a Kiaa4086 Kif5 Nkhc1,Mus musculus (Mouse),1027,MAETNNECSIKVLCRFRPLNQAEILRGDKFIPIFQGDDSVIIGGKP...,kinesin
7,Q61768,KINH_MOUSE,reviewed,Kinesin-1 heavy chain (Conventional kinesin he...,Kif5b Khcs Kns1,Mus musculus (Mouse),963,MADPAECNIKVMCRFRPLNESEVNRGDKYVAKFQGEDTVMIASKPY...,kinesin
8,Q61771,KIF3B_MOUSE,reviewed,Kinesin-like protein KIF3B (Microtubule plus e...,Kif3b,Mus musculus (Mouse),747,MSKLKSSESVRVVVRCRPMNGKEKAASYDKVVDVDVKLGQVSVKNP...,kinesin
9,O15066,KIF3B_HUMAN,reviewed,Kinesin-like protein KIF3B (HH0048) (Microtubu...,KIF3B KIAA0359,Homo sapiens (Human),747,MSKLKSSESVRVVVRCRPMNGKEKAASYDKVVDVDVKLGQVSVKNP...,kinesin
