# Documentation
- Generate the datasets used for evotuning the esm model
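- For each dataset, handle sequences longer than 1024 residues: pfamA entries that are too long are filtered out, while motor_toolkit and kinesin_labelled sequences are truncated to their first 1024 residues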
- pfamA_motors_balanced (pfamA_balanced): 18000 entries in total, 4500 sampled from each of the 4 motor-related clans
- motor_toolkit: motor toolkit
- kinesin_labelled: kinesin labelled dataset
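- pfamA_target (pfamA_target_shuffled): all length-filtered pfamA entries that belong to the target Pfam families; it is saved in its original order, and a shuffled copy is only used to draw pfamA_target_sub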
- pfamA_target_sub: at most 396 randomly selected sequences from each target protein family, for embedding visualization only

In [1]:
import torch
import torch.nn as nn 
import torch.optim as optim 

import torchvision 
import torchvision.transforms as transforms 
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, IterableDataset, DataLoader
# import tqdm
import numpy as np
import pandas as pd

import math

# Use one fixed seed for every random number generator this notebook touches
# (PyTorch's default generator and NumPy's global generator).
seed = 7
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(seed)

In [3]:
# Read the named pfamA motor table from disk and preview its first rows.
pfamA_motors_csv = "../../data/pfamA_motors_named.csv"
pfamA_motors = pd.read_csv(pfamA_motors_csv)
pfamA_motors.head(5)

Unnamed: 0.1,Unnamed: 0,id,description,seq,pfamA_acc,clan_x,pfamA_name
0,0,A0A495CYV6_9MYCO/3-388,A0A495CYV6_9MYCO/3-388 A0A495CYV6.1 PF00871.18...,AVLVVNSGSSSIKYQVIDEQSGDRLAQGLVERIGESGRGRVVYKGA...,PF00871,actin_like,Acetate_kinase
1,1,A0A3A6QL58_9VIBR/4-390,A0A3A6QL58_9VIBR/4-390 A0A3A6QL58.1 PF00871.18...,LVLVLNCGSSSLKFAIVDAETGAEHLTGLAECLGLPEARMKWKLDG...,PF00871,actin_like,Acetate_kinase
2,2,A0A2T0AKP1_9THEO/2-389,A0A2T0AKP1_9THEO/2-389 A0A2T0AKP1.1 PF00871.18...,KILVLNCGSSSVKYQLFDMQREEVMARGLVERIGITGSMLTHRPAG...,PF00871,actin_like,Acetate_kinase
3,3,H1XW95_9BACT/146-327,H1XW95_9BACT/146-327 H1XW95.1 PF00871.18;Aceta...,ISGMPLIPRKSIFHALNQKAVARETAKKLGKKYRESSIIVAHMGGG...,PF00871,actin_like,Acetate_kinase
4,4,A0A396TZH3_9GAMM/13-397,A0A396TZH3_9GAMM/13-397 A0A396TZH3.1 PF00871.1...,AILVINCGSSSVKFSLIHPKTGQTILSGLAECLLANDAVIKIKFDN...,PF00871,actin_like,Acetate_kinase


In [6]:
# Number of pfamA sequences that fit within the 1024-character limit.
pfamA_seq_lengths = np.array([len(seq) for seq in pfamA_motors["seq"]])
(pfamA_seq_lengths < 1025).sum()

1907329

In [7]:
# Number of pfamA sequences that exceed the 1024-character limit.
pfamA_seq_lengths = np.array([len(seq) for seq in pfamA_motors["seq"]])
(pfamA_seq_lengths >= 1025).sum()

7502

In [8]:
# Ratio of over-length to within-limit pfamA sequences. Both literals are copied by
# hand from the outputs of the two counting cells above, so they must be updated if
# the input table changes.
7502/1907329

0.00393324906190804

In [11]:
# Keep only the pfamA rows whose sequence is at most 1024 characters long.
pfamA_seq_lengths = np.array([len(seq) for seq in pfamA_motors["seq"]])
pfamA_motors = pfamA_motors[pfamA_seq_lengths < 1025]

In [5]:
# Read the motor toolkit table from disk and preview its first rows.
motor_toolkit_csv = "../../data/motor_tookits.csv"
motor_toolkit = pd.read_csv(motor_toolkit_csv)
motor_toolkit.head(5)

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,seq,type
0,P52732,KIF11_HUMAN,reviewed,Kinesin-like protein KIF11 (Kinesin-like prote...,KIF11 EG5 KNSL1 TRIP5,Homo sapiens (Human),1056,MASQPNSSAKKKEEKGKNIQVVVRCRPFNLAERKASAHSIVECDPV...,kinesin
1,Q9LX99,KN14A_ARATH,reviewed,Kinesin-like protein KIN-14A (Geminivirus Rep-...,KIN14A GRIMP KAC1 KCA1 KSN1 TH65 At5g10470 F12...,Arabidopsis thaliana (Mouse-ear cress),1273,MADQRSKTNRWNWEVSGFEPRKSSSNASFAESTGHRTTGPLLRRNS...,kinesin
2,Q9FKP4,KN14B_ARATH,reviewed,Kinesin-like protein KIN-14B (Kinesin CDKA-1-a...,KIN14B KAC2 KCA2 At5g65460 MNA5.20,Arabidopsis thaliana (Mouse-ear cress),1264,MAEQKSTNMWNWEVTGFESKKSPSSEEGVHRTPSSMLRRYSIPKNS...,kinesin
3,Q9FZ06,KINUA_ARATH,reviewed,Kinesin-like protein KIN-UA (AtKINUa) (Protein...,KINUA ARK3 PAK At1g12430 F5O11.15,Arabidopsis thaliana (Mouse-ear cress),919,MSTTSGTGGVSYRNGTQRSSLRTQSSASTSSGGQKASVKSKSVLRK...,kinesin
4,P33176,KINH_HUMAN,reviewed,Kinesin-1 heavy chain (Conventional kinesin he...,KIF5B KNS KNS1,Homo sapiens (Human),963,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,kinesin


In [9]:
# Count the motor_toolkit entries already within the 1024 limit (per the "Length"
# column); the truncation itself happens in the next cell.
motor_within_limit = motor_toolkit["Length"] <= 1024
int(motor_within_limit.sum())

2184

In [12]:
# Cut every motor_toolkit sequence flagged as longer than 1024 (per the "Length"
# column) down to its first 1024 characters; shorter rows are left untouched.
motor_over_limit = motor_toolkit["Length"] > 1024
motor_toolkit.loc[motor_over_limit, "seq"] = (
    motor_toolkit.loc[motor_over_limit, "seq"].apply(lambda seq: seq[:1024])
)

In [13]:
# Recompute "Length" from the (now truncated) sequences so the column stays accurate.
motor_toolkit["Length"] = motor_toolkit["seq"].apply(len)

In [14]:
# Sanity check: no motor_toolkit entry should be longer than 1024 any more (expect 0).
motor_still_over = motor_toolkit["Length"] > 1024
int(motor_still_over.sum())

0

In [15]:
# Read the labelled kinesin table from disk.
kinesin_labelled_csv = "../../data/kinesin_labelled.csv"
kinesin_labelled = pd.read_csv(kinesin_labelled_csv)

In [16]:
# Preview the first rows to check the columns, including the `label` column.
kinesin_labelled.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,seq,type,label
0,P52732,KIF11_HUMAN,reviewed,Kinesin-like protein KIF11 (Kinesin-like prote...,KIF11 EG5 KNSL1 TRIP5,Homo sapiens (Human),1056,MASQPNSSAKKKEEKGKNIQVVVRCRPFNLAERKASAHSIVECDPV...,kinesin,kinesin_5
1,Q9LX99,KN14A_ARATH,reviewed,Kinesin-like protein KIN-14A (Geminivirus Rep-...,KIN14A GRIMP KAC1 KCA1 KSN1 TH65 At5g10470 F12...,Arabidopsis thaliana (Mouse-ear cress),1273,MADQRSKTNRWNWEVSGFEPRKSSSNASFAESTGHRTTGPLLRRNS...,kinesin,unlabeled
2,Q9FKP4,KN14B_ARATH,reviewed,Kinesin-like protein KIN-14B (Kinesin CDKA-1-a...,KIN14B KAC2 KCA2 At5g65460 MNA5.20,Arabidopsis thaliana (Mouse-ear cress),1264,MAEQKSTNMWNWEVTGFESKKSPSSEEGVHRTPSSMLRRYSIPKNS...,kinesin,unlabeled
3,Q9FZ06,KINUA_ARATH,reviewed,Kinesin-like protein KIN-UA (AtKINUa) (Protein...,KINUA ARK3 PAK At1g12430 F5O11.15,Arabidopsis thaliana (Mouse-ear cress),919,MSTTSGTGGVSYRNGTQRSSLRTQSSASTSSGGQKASVKSKSVLRK...,kinesin,unlabeled
4,P33176,KINH_HUMAN,reviewed,Kinesin-1 heavy chain (Conventional kinesin he...,KIF5B KNS KNS1,Homo sapiens (Human),963,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,kinesin,kinesin_1


In [17]:
# Cut every kinesin_labelled sequence flagged as longer than 1024 (per the "Length"
# column) down to its first 1024 characters; shorter rows are left untouched.
kinesin_over_limit = kinesin_labelled["Length"] > 1024
kinesin_labelled.loc[kinesin_over_limit, "seq"] = (
    kinesin_labelled.loc[kinesin_over_limit, "seq"].apply(lambda seq: seq[:1024])
)

In [18]:
# Recompute "Length" from the (now truncated) sequences so the column stays accurate.
kinesin_labelled["Length"] = kinesin_labelled["seq"].apply(len)

In [19]:
# Sanity check: no kinesin_labelled entry should be longer than 1024 any more (expect 0).
kinesin_still_over = kinesin_labelled["Length"] > 1024
int(kinesin_still_over.sum())

0

In [21]:
# Build a clan-balanced subset: 4500 rows from every clan, each clan drawn with the
# same fixed random_state so the selection is repeatable.
#
# The groups are iterated explicitly instead of going through groupby.apply: each
# sub-frame handed to sample() then always carries the clan_x column, whereas newer
# pandas releases deprecate (and later stop) passing the grouping column to apply.
# Groups come out in sorted clan order, so the sampled rows and their order are the
# same as with the apply-based version.
# sample() raises ValueError if a clan has fewer than 4500 rows.
pfamA_motors_balanced = pd.concat(
    [clan_df.sample(4500, random_state=1) for _, clan_df in pfamA_motors.groupby("clan_x")]
)
# Replace the original row labels with a clean 0..n-1 index.
pfamA_motors_balanced = pfamA_motors_balanced.reset_index(drop=True)

In [22]:
# Sanity check on the balanced subset size: 4500 sampled rows per clan.
pfamA_motors_balanced.shape

(18000, 7)

In [29]:
# Sanity check: the balanced subset must not contain over-length sequences (expect 0).
balanced_seq_lengths = np.array([len(seq) for seq in pfamA_motors_balanced["seq"]])
(balanced_seq_lengths >= 1025).sum()

0

In [23]:
# Pfam accessions of the target families (same order as originally listed).
pfamA_target_name = [
    "PF00349", "PF00022", "PF03727", "PF06723",
    "PF14450", "PF03953", "PF12327", "PF00091", "PF10644",
    "PF13809", "PF14881", "PF00063", "PF00225", "PF03028",
]

# Keep only the rows of those families. reset_index() (without drop) moves the old
# row labels into a leading "index" column.
in_target_family = pfamA_motors["pfamA_acc"].isin(pfamA_target_name)
pfamA_target = pfamA_motors[in_target_family].reset_index()

In [26]:
# Remove the helper "index" column that reset_index() added when pfamA_target was
# built. Dropping it by name instead of by position keeps this cell idempotent:
# re-running it is a no-op rather than stripping one more real column each time.
pfamA_target = pfamA_target.drop(columns="index", errors="ignore")

In [27]:
# Shuffle all target rows, then keep the first 396 rows encountered for each Pfam
# family (families with fewer rows keep everything they have).
# The shuffle is pinned to the notebook-wide `seed` so the subset does not depend on
# how much of NumPy's global random state earlier cells consumed, i.e. on the order
# in which cells were executed.
pfamA_target_sub = (
    pfamA_target.sample(frac=1, random_state=seed)
    .groupby("pfamA_acc")
    .head(396)
)

In [28]:
# Per-family row counts of the subsample; each family should show at most 396.
# NOTE(review): the recorded output lists 10 accessions while pfamA_target_name holds
# 14 - confirm the 4 missing families are really absent from the filtered input.
pfamA_target_sub.groupby("pfamA_acc").count()

Unnamed: 0_level_0,Unnamed: 0,id,description,seq,clan_x,pfamA_name
pfamA_acc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PF00022,396,396,396,396,396,396
PF00063,396,396,396,396,396,396
PF00091,396,396,396,396,396,396
PF00225,396,396,396,396,396,396
PF00349,396,396,396,396,396,396
PF03028,396,396,396,396,396,396
PF03727,396,396,396,396,396,396
PF03953,396,396,396,396,396,396
PF06723,396,396,396,396,396,396
PF10644,396,396,396,396,396,396


In [30]:
# Sanity check: the visualization subset must not contain over-length sequences (expect 0).
target_sub_seq_lengths = np.array([len(seq) for seq in pfamA_target_sub["seq"]])
(target_sub_seq_lengths >= 1025).sum()

0

In [31]:
# Write every prepared dataset to the esm data folder, without the row index.
for esm_frame, esm_csv_path in (
    (pfamA_target_sub, "../../data/esm/pfamA_target_sub.csv"),
    (pfamA_target, "../../data/esm/pfamA_target.csv"),
    (kinesin_labelled, "../../data/esm/kinesin_labelled.csv"),
    (motor_toolkit, "../../data/esm/motor_toolkit.csv"),
    (pfamA_motors_balanced, "../../data/esm/pfamA_motors_balanced.csv"),
):
    esm_frame.to_csv(esm_csv_path, index=False)