In [46]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math

# Single seed shared by every RNG seeded in this notebook.
seed = 7
# Seed PyTorch's default generator.
torch.manual_seed(seed)
# Seed NumPy's global RNG; pandas' DataFrame.sample falls back to this state
# when it is called without an explicit random_state.
np.random.seed(seed)

In [54]:
# Load the three input tables in one pass; the tuple order on the left matches
# the order of the paths below.
pfamA_motors, df_dev, motor_toolkit = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/pfamA_motors.csv",
        "../../data/df_dev.csv",
        "../../data/motor_tookits.csv",
    )
)

In [48]:
# Balance the data set across clans: draw exactly 4500 rows from every clan,
# using a fixed random_state so each per-clan draw is repeatable.
# NOTE: DataFrame.sample raises ValueError if a clan has fewer than 4500 rows
# (sampling is without replacement).
# The group-wise apply does not return a plain 0..n-1 row index, so finish with
# a single reset_index(drop=True). This replaces the previous column-by-column
# `apply(lambda x: x.reset_index(drop=True))`, which produced the same frame
# but rebuilt every column separately.
pfamA_motors_balanced = (
    pfamA_motors
    .groupby('clan')
    .apply(lambda _df: _df.sample(4500, random_state=1))
    .reset_index(drop=True)
)

In [55]:
# Pfam family accessions to keep.
pfamA_target_name = [
    "PF00349", "PF00022", "PF03727", "PF06723",
    "PF14450", "PF03953", "PF12327", "PF00091", "PF10644",
    "PF13809", "PF14881", "PF00063", "PF00225", "PF03028",
]

# Keep only rows whose pfamA_acc is one of the target families.
# reset_index() is called without drop=True, so the previous row labels are
# kept as a new leading "index" column.
pfamA_target = pfamA_motors[pfamA_motors["pfamA_acc"].isin(pfamA_target_name)].reset_index()

In [56]:
# Display the filtered target-family rows for a quick visual check.
pfamA_target

Unnamed: 0.1,index,Unnamed: 0,id,description,seq,pfamA_acc,clan
0,6434,6434,A0A287K457_HORVV/66-471,A0A287K457_HORVV/66-471 A0A287K457.1 PF00022.2...,PDSAPIVIDNGASTFRIGWAGEAEPRVSFRNIVQRPRHRSSGETVT...,PF00022,actin_like
1,6435,6435,A0A0A2V8D8_BEABA/123-560,A0A0A2V8D8_BEABA/123-560 A0A0A2V8D8.1 PF00022....,NSGTGFSKLGFAGNDSPSFVFPTAIATKGPAAGGGGSGSGRPAVGN...,PF00022,actin_like
2,6436,6436,A0A1I7VHD9_LOALO/3-418,A0A1I7VHD9_LOALO/3-418 A0A1I7VHD9.1 PF00022.20...,AGRLPACVIDNGTGYTKLGYAGNSEPQFIIPSAIAIREKVGSQSSA...,PF00022,actin_like
3,6437,6437,A0A1U8GWJ5_CAPAN/535-710,A0A1U8GWJ5_CAPAN/535-710 A0A1U8GWJ5.1 PF00022....,DDDKPDQDEAELTRISSRLQEIDPTFFPGSESGASATEAPRFHPLT...,PF00022,actin_like
4,6438,6438,A0A409XX09_9AGAR/1-396,A0A409XX09_9AGAR/1-396 A0A409XX09.1 PF00022.20...,MRKPVIVLDNGASTIKVGIAQKDPDPRIIPNAVVRSKGDKMTYFGH...,PF00022,actin_like
5,6439,6439,A0A3Q4HCL1_NEOBR/13-386,A0A3Q4HCL1_NEOBR/13-386 A0A3Q4HCL1.1 PF00022.2...,ADFKSPIVLDSGSGLIKAGFADQDLPSIIFPTIIGVPKYEEVLNGN...,PF00022,actin_like
6,6440,6440,A0A0V0SES9_9BILA/7-376,A0A0V0SES9_9BILA/7-376 A0A0V0SES9.1 PF00022.20...,IANQPIVIDNGSGTIKAGFAGDQAPKCHFPNYVGRPKHVRVMAGAL...,PF00022,actin_like
7,6441,6441,T0QSF7_SAPDV/1303-1432,T0QSF7_SAPDV/1303-1432 T0QSF7.1 PF00022.20;Actin;,VWTASETPETALDRLFAPDAAQCAMGVVTAILQVLGLSTPVIHGTL...,PF00022,actin_like
8,6442,6442,A0A2P6N9A2_9MYCE/10-390,A0A2P6N9A2_9MYCE/10-390 A0A2P6N9A2.1 PF00022.2...,QPVGSGILKAGLAGVDHPKTLFPSYVGRPKHTRVMAGSVEGDLSIC...,PF00022,actin_like
9,6443,6443,A0A423T7C2_PENVA/1-360,A0A423T7C2_PENVA/1-360 A0A423T7C2.1 PF00022.20...,MCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDAYVGDEAQAK...,PF00022,actin_like


In [57]:
# Shuffle every row, then keep the first 396 shuffled rows of each Pfam family,
# so each family contributes at most 396 sequences.
#
# Reproducibility:
#   * random_state=seed makes the shuffle independent of how much of NumPy's
#     global RNG earlier cells have consumed (cell execution order).
#   * sort_index() restores the canonical row order first, so re-running this
#     cell reproduces the same shuffle instead of reshuffling the previous one
#     (pfamA_target is rebound to its own shuffled copy below).
# NOTE(review): this selects different rows than earlier unseeded runs did;
# regenerate any files derived from pfamA_target_sub.
pfamA_target = pfamA_target.sort_index().sample(frac = 1, random_state = seed)
pfamA_target_sub = pfamA_target.groupby("pfamA_acc").head(396)

In [58]:
# Sanity check: non-null count of every column per Pfam family in the subset.
pfamA_target_sub.groupby(by="pfamA_acc").count()

Unnamed: 0_level_0,index,Unnamed: 0,id,description,seq,clan
pfamA_acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PF00022,396,396,396,396,396,396
PF00063,396,396,396,396,396,396
PF00091,396,396,396,396,396,396
PF00225,396,396,396,396,396,396
PF00349,396,396,396,396,396,396
PF03028,396,396,396,396,396,396
PF03727,396,396,396,396,396,396
PF03953,396,396,396,396,396,396
PF06723,396,396,396,396,396,396
PF10644,396,396,396,396,396,396


In [59]:
# Drop the leading "index" column that reset_index() added when pfamA_target
# was built. Dropping by name instead of by position (`iloc[:, 1:]`) keeps this
# cell idempotent: re-running it no longer strips one more real column each
# time, and errors="ignore" makes the second run a no-op.
pfamA_target_sub = pfamA_target_sub.drop(columns=["index"], errors="ignore")

In [60]:
# Preview the first five rows of the per-family subset.
pfamA_target_sub.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,description,seq,pfamA_acc,clan
25466,179519,A0A098S4B7_9BACT/12-174,A0A098S4B7_9BACT/12-174 A0A098S4B7.1 PF00091.2...,IIKVLGVGGGGSNAVTHMFRQGIVGVDFAICNTDSQAMELSPVTTR...,PF00091,tubulin_binding
43290,1414859,A0A0A1SN99_9HYPO/43-437,A0A0A1SN99_9HYPO/43-437 A0A0A1SN99.1 PF00225.2...,RASDEDSRTAVRVAIRIRPPLKPTDPGYELIPQRFQRSMVQTTSDT...,PF00225,p_loop_gtpase
6486,12920,M7C1E0_CHEMY/152-523,M7C1E0_CHEMY/152-523 M7C1E0.1 PF00022.20;Actin;,MGKVAVVIDNGSCFTRAGFAGEDKPKSVLKTTSMPPTCPAVMREIP...,PF00022,actin_like
43689,1415258,H9J8N3_BOMMO/16-329,H9J8N3_BOMMO/16-329 H9J8N3.1 PF00225.24;Kinesin;,NQTFAMDKRKKQVSLCEATSAASAPEDRKVGVTAPKMFAFDAIFSQ...,PF00225,p_loop_gtpase
6951,13385,A0A075AWM4_ROZAC/39-659,A0A075AWM4_ROZAC/39-659 A0A075AWM4.1 PF00022.2...,IDTSKVIVLHPGSETLKFGMATEGLPRTIPNVIARLDPTKGDTMEA...,PF00022,actin_like


In [62]:
# Persist the per-family subset; the row index is not written to the file.
pfamA_target_sub.to_csv(
    path_or_buf="../../data/pfamA_target_sub.csv",
    index=False,
)

## Filter out motor_toolkit entries with Length >= 5000

In [30]:
# Keep only entries whose Length is strictly below 5000. Rows with a missing
# Length compare as False and are therefore dropped as well.
motor_toolkit = motor_toolkit[motor_toolkit["Length"] < 5000]

In [34]:
# Persist the length-filtered toolkit table. The row index is written too
# (pandas' default), so reading the file back yields an extra unnamed leading
# column.
motor_toolkit.to_csv(
    path_or_buf="../../data/motor_toolkit_short.csv",
    index=True,
)