In [27]:
# import esm
import torch
from argparse import Namespace
from esm.constants import proteinseq_toks
import math
import torch.nn as nn
import torch.nn.functional as F
from esm.modules import TransformerLayer, PositionalEmbedding  # noqa
from esm.model import ProteinBertModel

# model, alphabet = torch.hub.load("facebookresearch/esm", "esm1_t34_670M_UR50S")
import esm
import io
import pandas as pd

In [28]:
motor_toolkit = pd.read_csv("../../data/esm/motor_toolkit.csv")
# motor_toolkit = motor_toolkit.sample(frac = 1)
motor_toolkit.groupby("type").count().iloc[:,1]

type
dynein       665
kinesin     1055
myosin_v    1535
Name: Entry name, dtype: int64

In [29]:
kinesin_labelled = pd.read_csv("../../data/esm/kinesin_labelled.csv")
# kinesin_labelled = kinesin_labelled.sample(frac = 1)
kinesin_labelled.shape

(1055, 10)

In [30]:
kif_uniprot = pd.read_csv("../../data/kif/kif_uniprot.csv")

In [31]:
kif_uniprot.columns = ['Entry', 'Entry name', 'Status', 'Protein names', 'Gene names',
       'Organism', 'Organism ID', 'Taxonomic lineage (all)', 'seq',
       'Length'] 

In [32]:
kinesin_labelled.loc[:,["Entry","seq"]].head(1)

Unnamed: 0,Entry,seq
0,P52732,MASQPNSSAKKKEEKGKNIQVVVRCRPFNLAERKASAHSIVECDPV...


In [33]:
kif_uniprot.loc[:,["Entry","seq"]].head(1)

Unnamed: 0,Entry,seq
0,Q7PG43,MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCL...


In [34]:
#prepare 3 set of data with dynein+kin/kin_all/kin_kif only
kin_both = pd.concat([kinesin_labelled.loc[:,["Entry","seq"]],kif_uniprot.loc[:,["Entry","seq"]]], ignore_index=True)

In [35]:
kin_both.head()

Unnamed: 0,Entry,seq
0,P52732,MASQPNSSAKKKEEKGKNIQVVVRCRPFNLAERKASAHSIVECDPV...
1,Q9LX99,MADQRSKTNRWNWEVSGFEPRKSSSNASFAESTGHRTTGPLLRRNS...
2,Q9FKP4,MAEQKSTNMWNWEVTGFESKKSPSSEEGVHRTPSSMLRRYSIPKNS...
3,Q9FZ06,MSTTSGTGGVSYRNGTQRSSLRTQSSASTSSGGQKASVKSKSVLRK...
4,P33176,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...


In [36]:
kin_both.shape

(1616, 2)

In [37]:
dyn = motor_toolkit.loc[motor_toolkit["type"]=="dynein",:]

In [38]:
dyn.shape

(665, 9)

In [39]:
dyun_kin = pd.concat([dyn.loc[:,["Entry","seq"]], kin_both],ignore_index=True)


In [40]:
dyun_kin.shape

(2281, 2)

In [41]:
dyun_kin.head()

Unnamed: 0,Entry,seq
0,Q45VK7,MAGSLGDVRKLFLFTTTQNYFGLRPELWDQPPLSNCPEVNNFLDDG...
1,Q9JJ79,MAGSLSDVRKLFLFTTTQNYFGLRPELWDQTPLSNCPEVNNFLDDG...
2,Q8NCM8,MANGTADVRKLFIFTTTQNYFGLMSELWDQPLLCNCLEINNFLDDG...
3,P38650,MSETGGGEDGSAGLEVSAVQNVADVSVLQKHLRKLVPLLLEDGGDA...
4,Q63100,MSDKSDLKAELERKKQRLAQIREEKKRKEEERKKKEADMQQKKEPV...


In [42]:
kin_kif = kif_uniprot.loc[:,["Entry","seq"]]
kin_kif.shape

(561, 2)

In [43]:
kin_kif.head()

Unnamed: 0,Entry,seq
0,Q7PG43,MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCL...
1,Q7PNB7,LLTNVFSLSLFLGHRFRIRPQIPRELIDMCRVCTQVTPGEPQVLLG...
2,Q7PTK6,MDSRIPKPSFLKKPTGPLSLPGNARLPLTRDLLNLPSANSTMFAKV...
3,Q7QJN4,MSDKIRVAVRVRPFNRRELELATENVIEMNGTQTILKYPASLDKME...
4,Q7QDS6,MDRTIKTRSNLSNTKNECVQVVVRCRPLNNKELTGNFQKVVDVFPS...


In [None]:
dyun_kin.to_csv("../../data/kif/dyn_kin.csv",index = False)
kin_both.to_csv("../../data/kif/kin_both.csv",index = False)
kin_kif.to_csv("../../data/kif/kin_kif.csv",index = False)

## Organize documentation of kinesin proteins from different sources

In [44]:
kif_list_jp = pd.read_csv("../../data/kif/KIFlist.txt", sep = "\t")
kif_list_jp.head()
kif_list_jp_acc = pd.read_csv("../../data/kif/AccNo.txt", sep = "\t",header = None)
kif_list_jp_acc.head()
kif_list_jp_acc.columns = ["KIF","prot_acc","ensembl"]
# merge the accession table and the information table of japan database
kif_jp = kif_list_jp_acc.merge(kif_list_jp,on = "KIF")



In [45]:
kif_jp.head()

Unnamed: 0,KIF,prot_acc,ensembl,Family,Subfamily,Alias
0,AgKHC,XP_310522,ENSANGG00000014820,Kinesin-1,KIF5(KHC),
1,AgKlp31E,XP_317685,ENSANGP00000002307,Kinesin-4,KIF21,
2,AgNcd,XP_307936,ENSANGP00000006252,Kinesin-14A,NCD,
3,AgKin73,XP_308280,ENSANGP00000009361,Kinesin-3,KIF13,
4,AgKlp68D,XP_311552,ENSANGP00000010166,Kinesin-2,KIF3B/C,


In [51]:
def get_uniprot_entry(acc):
    url = "https://www.uniprot.org/uniprot/?query="+acc+"&sort=score&columns=id,entry name,reviewed,protein names,genes,organism,organism-id,lineage(all),sequence,length&format=tab"
    try:
        r = requests.get(url)
        urlData = r.content
#         print(urlData)
        rawData = pd.read_csv(io.StringIO(urlData.decode('utf-8')),sep = '\t')
    except:
        rawData = pd.DataFrame()
    return rawData 

In [52]:
kif_jp_uniprot = pd.DataFrame()
kif_jp_uniprot_acc = []
for i in range(kif_jp.shape[0]):
    kif_id = kif_jp["KIF"].iloc[i]
#     print(kif_jp["prot_acc"].iloc[i])
    curr = get_uniprot_entry(kif_jp["prot_acc"].iloc[i])
#     print(curr)
    if not (curr.empty):
        kif_jp_uniprot = pd.concat([kif_jp_uniprot,curr], ignore_index=True)
        for j in range(curr.shape[0]):
            kif_jp_uniprot_acc.append(kif_id)
    if i%100 ==0:
        print(kif_jp_uniprot.shape)
#     break

(1, 10)
(94, 10)
(172, 10)
(218, 10)
(295, 10)
(386, 10)
(448, 10)


In [53]:
kif_jp.shape[0]

626

In [90]:
len(kif_jp_uniprot_acc)

481

In [91]:
kif_jp_uniprot.shape

(481, 10)

In [92]:
kif_jp_acc = kif_jp_uniprot.loc[:,["Entry"]]

In [100]:
kif_jp_uniprot["KIF"] = kif_jp_uniprot_acc

In [94]:
kif_jp_acc["db_acc"] = pd.Series(kif_jp_uniprot_acc)

In [95]:
kif_jp_acc["db_name"] = "kif_jp"

In [104]:
kif_jp_acc["db_acc"].unique().shape

(381,)

In [97]:
# number of kif_jp entries not mapped to uniprot
len(set(kif_jp["KIF"]) - set(kif_jp_acc["db_acc"]))

245

In [60]:
list(set(kif_jp["KIF"]) - set(kif_jp_acc["db_acc"]))[:5]

['AgKIF3C', 'CbUnc116', 'GlP436_12181_14100', 'PfMAL1P2.36', 'DrKIF3C']

In [61]:
kif_jp.loc[kif_jp["KIF"]=="CbKlp15",:]

Unnamed: 0,KIF,prot_acc,ensembl,Family,Subfamily,Alias
103,CbKlp15,CAE67236,CBG12675,Kinesin-14A,NCD,


Manual inspection suggests that the prot_acc is still tractable in Uniparc but the accession is already obsolete in the current uniprot database, other entries of the entries not matching should be of similar reason. Coudl query through https://www.uniprot.org/uniparc/ if needed in the future. For now, leave the 160 unmapped entries 

In [105]:
# get the entries from kif_doc
kif_speed = pd.read_csv("../../data/kif/kif_speed.csv")
kif_speed.head()

Unnamed: 0,Species/protein,Molecular mass (kDa),Motor polarity & velocity,Subcellular localization,Comments,Seq,Type,KIF,aa_seq,kif_acc,uniprot_acc
0,D. melanogaster KHC,110,"Plus, 54 µm/min",Diffuse in cytoplasm,Required for viability & neuromuscular function,DmKHC AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEEN...,1,DmKHC,AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEENCISIA-----...,DmKHC,P17210;P21613;P35978;P28738;O60282;Q61768;Q2PQ...
1,L. pealii KHC,109,"Plus, 30 µm/min",Membranous vesicles,,LpKHC SECNIKVICRVRPLNEAEERAGSKFILKFP---TDD...,1,LpKHC,SECNIKVICRVRPLNEAEERAGSKFILKFP---TDDSISIA-----...,LpKHC,P21613;P17210;O60282;P28738;P33175;Q12840;P359...
2,N. crassa KHC (NKin),103,"Plus, 120-180 µm/min",ND,High velocity of microtubule transport; no cop...,NcKHC SANSIKVVARFRPQNRVEIESGGQPIVTFQ---GPD...,1,NcKHC,SANSIKVVARFRPQNRVEIESGGQPIVTFQ---GPDTCTVD---SK...,NcKHC,P48467;Q86Z98;Q86ZC1;O43093;Q9US60;Q54UC9;P331...
3,D. melanogaster Klp68D,88,"Plus, 18 µm/min",ND,,Dmklp68D PNECVQVVVRCRPMSNRERSERSPEVVNVYPNRGVV...,2,Dmklp68D,PNECVQVVVRCRPMSNRERSERSPEVVNVYPNRGVVELQNVV----...,Dmklp68D,
4,M. musculus KIF3A,80,"Plus, 36 µm/min",Microsomes/ synaptic vesicles,,MmKIF3A SCDNVKVVVRCRPLNEREKSMCYRQAVSVDEMRGTI...,2,MmKIF3A,SCDNVKVVVRCRPLNEREKSMCYRQAVSVDEMRGTITVHKTD----...,MmKIF3A,P28741;Q4R628;Q9Y496;Q5R4H3;P46872;P46871;Q617...


In [106]:
speed_acc = []
for i in range(kif_speed.shape[0]):
    speed_acc.extend(str(kif_speed["uniprot_acc"][i]).split(";")[:-1])

In [107]:
len(set(speed_acc) - set(kif_jp_acc["Entry"]))

85

In [108]:
# add the 85 more accessions not in kif_jp into the accession inventory
speed_acc_packed = []
for i in range(kif_speed.shape[0]):
    speed_acc_packed.append(str(kif_speed["uniprot_acc"][i]).split(";")[:-1])

In [109]:
db_accs = []
entry_accs = []
for acc in (set(speed_acc) - set(kif_jp_acc["Entry"])):
    db_acc = ''
    for i in range(len(speed_acc_packed)):
        if acc in speed_acc_packed[i]:
            db_acc = db_acc + kif_speed["kif_acc"].iloc[i] + ';'
    print(db_acc)
    db_accs.append(db_acc)
    entry_accs.append(acc)

HsMKLP1;Xlklp2;
DmKHC;MmKIF3A;MmKIF3B;SpKRP85;SpKRP95;HsKSP;DmKLP61F;XlEg5;HsMKLP1;
MmKIF4;
HsMKLP1;
MmKIF2;
Xlklp2;
MmKIF3B;SpKRP85;MmKIF1A;MmKIF1B;
MmKIF1A;MmKIF1B;
DmKHC;LpKHC;NcKHC;MmKIF3A;MmKIF3B;SpKRP85;SpKRP95;MmKIF1A;MmKIF1B;HsKSP;DmKLP61F;XlEg5;HsMKLP1;Xlklp2;
HsMKLP1;MmKIF2;
DmKHC;LpKHC;NcKHC;MmKIF3A;MmKIF3B;SpKRP85;SpKRP95;MmKIF1A;MmKIF1B;MmKIF4;HsKSP;DmKLP61F;XlEg5;ScKAR3;Xlklp2;
DmKHC;LpKHC;NcKHC;Xlklp2;
MmKIF2;
HsMKLP1;Xlklp2;
DmNcd;ScKAR3;CgCHO2;
DmKHC;LpKHC;NcKHC;
MmKIF1A;MmKIF1B;
DmKHC;MmKIF3A;HsKSP;DmKLP61F;XlEg5;
MmKIF2;
DmKHC;LpKHC;NcKHC;
DmNcd;ScKAR3;CgCHO2;
ScKAR3;CgCHO2;
MmKIF3A;MmKIF3B;SpKRP85;SpKRP95;MmKIF4;HsKSP;DmKLP61F;XlEg5;
MmKIF2;
MmKIF1A;MmKIF1B;
MmKIF1A;MmKIF1B;
MmKIF1A;HsMKLP1;Xlklp2;
MmKIF4;
MmKIF2;
MmKIF4;
MmKIF2;
MmKIF2;
MmKIF4;HsMKLP1;
NcKHC;MmKIF3A;MmKIF3B;SpKRP95;HsKSP;DmKLP61F;XlEg5;HsMKLP1;
NcKHC;MmKIF3A;MmKIF3B;SpKRP85;SpKRP95;HsKSP;DmKLP61F;XlEg5;
HsMKLP1;
HsMKLP1;MmKIF2;
NcKHC;MmKIF3A;MmKIF3B;SpKRP85;SpKRP95;HsKSP;DmKLP61F;XlEg5;HsMKLP1;MmKI

In [110]:
len(db_accs)

85

In [111]:
kif_speed_acc = pd.DataFrame()
kif_speed_acc["Entry"] = entry_accs
kif_speed_acc["db_acc"] = db_accs
kif_speed_acc["db_name"] = "kif_duke"

In [112]:
kif_speed_acc.shape

(85, 3)

In [113]:
kif_acc_all = pd.concat([kif_jp_acc,kif_speed_acc], ignore_index=True)
print(kif_acc_all.shape)
kif_acc_all.head()

(566, 3)


Unnamed: 0,Entry,db_acc,db_name
0,Q7PG43,AgKHC,kif_jp
1,Q7PNB7,AgKlp31E,kif_jp
2,Q7PTK6,AgNcd,kif_jp
3,Q7QJN4,AgKin73,kif_jp
4,Q7QDS6,AgKlp68D,kif_jp


In [71]:
# insert the 85 uniprot sequences in kif_speed that are not in kif_jp
kif_speed_uniprot = pd.DataFrame()
for acc in (set(speed_acc) - set(kif_jp_acc["Entry"])):
    curr = get_uniprot_entry(acc)
    if not (curr.empty):
        kif_speed_uniprot = pd.concat([kif_speed_uniprot,curr], ignore_index=True)

In [114]:
kif_speed_uniprot = kif_speed_uniprot.loc[kif_speed_uniprot["Entry"].isin(kif_speed_acc["Entry"]),:]
kif_speed_uniprot.shape

(85, 10)

In [None]:
# get the kinesin/dyeinin entry from motor_toolkit search 
boo = kinesin_labelled.loc[~kinesin_labelled["Entry"].isin(kif_acc_all["Entry"]),"Protein names"].apply(lambda s: len(re.findall("kinesin",s.lower()))>0)

In [None]:
motor_uniprot = kinesin_labelled.loc[~kinesin_labelled["Entry"].isin(kif_acc_all["Entry"]),:].loc[boo,:]

In [None]:
motor_uniprot.to_csv("../../data/kif/motor_uniprot.csv",index = False)

In [115]:
motor_uniprot = pd.read_csv("../../data/kif/motor_uniprot.csv")
print(motor_uniprot.shape)
motor_uniprot.head(1)

(57, 10)


Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,seq,type,label
0,Q6FXI5,CIN8_CANGA,reviewed,Kinesin-like protein CIN8,CIN8 CAGL0B03641g,Candida glabrata (strain ATCC 2001 / CBS 138 /...,988,MVVTTEAVTSRSERDAEPVQEPLVEKLATEELNILVAVRCRGRNER...,kinesin,kinesin_5


In [116]:
motor_uniprot_acc = motor_uniprot.loc[:,["Entry","Entry name"]]

In [117]:
motor_uniprot_acc["db_name"] = "motor_toolkit"

In [118]:
motor_uniprot_acc.columns = ["Entry","db_acc","db_name"]

In [119]:
kif_acc_all = pd.concat([kif_acc_all,motor_uniprot_acc], ignore_index=True)

In [120]:
motor_uniprot_acc.shape

(57, 3)

In [121]:
kif_speed_uniprot.shape[0] + kif_jp_uniprot.shape[0] + motor_uniprot.shape[0]

623

In [122]:
kif_acc_all.shape

(623, 3)

In [123]:
motor_uniprot_cat = motor_uniprot.loc[:,['Entry','Entry name','Status','Protein names','Gene names','Organism','Length',"seq"]]


In [124]:
kif_speed_uniprot_cat = kif_speed_uniprot.loc[:,['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Sequence']]

In [125]:
kif_jp_uniprot = kif_jp_uniprot.loc[:,['Entry','Entry name','Status','Protein names','Gene names','Organism','Length','Sequence']]

In [126]:
motor_uniprot_cat.columns = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length',"seq"]
kif_speed_uniprot_cat.columns = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length',"seq"]
kif_jp_uniprot.columns = ['Entry','Entry name','Status','Protein names','Gene names','Organism','Length',"seq"]

In [127]:
kif_speed_uniprot_cat.shape[0] + kif_jp_uniprot.shape[0] + motor_uniprot_cat.shape[0]

623

In [128]:
kif_uniprot_all = pd.concat([kif_jp_uniprot,kif_speed_uniprot_cat,motor_uniprot_cat], ignore_index=True)

In [129]:
kif_uniprot_all.shape

(623, 8)

In [130]:
kif_uniprot_all.head(2)

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,seq
0,Q7PG43,Q7PG43_ANOGA,unreviewed,Kinesin-like protein,1271664 AgaP_AGAP000561,Anopheles gambiae (African malaria mosquito),983,MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCL...
1,Q7PNB7,Q7PNB7_ANOGA,unreviewed,AGAP007815-PA (Fragment),AgaP_AGAP007815,Anopheles gambiae (African malaria mosquito),1033,LLTNVFSLSLFLGHRFRIRPQIPRELIDMCRVCTQVTPGEPQVLLG...


In [135]:
kif_jp.loc[kif_jp["Family"] == "Orphans","Family"] = "Kinesin-15"
kif_jp.loc[kif_jp["Family"] == "Kinesin-14A","Family"] = "Kinesin-14"
kif_jp.loc[kif_jp["Family"] == "Kinesin-14B","Family"] = "Kinesin-14"
kif_jp.groupby("Family").count()
kif_jp_acc_with_fam = kif_jp_acc.merge(kif_jp,left_on = "db_acc",right_on = "KIF")

In [136]:
jp_type = kif_jp_acc_with_fam["Family"].apply(lambda s: int(s.split("Kinesin-")[1]))

In [137]:
kif_speed

Unnamed: 0,Species/protein,Molecular mass (kDa),Motor polarity & velocity,Subcellular localization,Comments,Seq,Type,KIF,aa_seq,kif_acc,uniprot_acc
0,D. melanogaster KHC,110,"Plus, 54 µm/min",Diffuse in cytoplasm,Required for viability & neuromuscular function,DmKHC AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEEN...,1,DmKHC,AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEENCISIA-----...,DmKHC,P17210;P21613;P35978;P28738;O60282;Q61768;Q2PQ...
1,L. pealii KHC,109,"Plus, 30 µm/min",Membranous vesicles,,LpKHC SECNIKVICRVRPLNEAEERAGSKFILKFP---TDD...,1,LpKHC,SECNIKVICRVRPLNEAEERAGSKFILKFP---TDDSISIA-----...,LpKHC,P21613;P17210;O60282;P28738;P33175;Q12840;P359...
2,N. crassa KHC (NKin),103,"Plus, 120-180 µm/min",ND,High velocity of microtubule transport; no cop...,NcKHC SANSIKVVARFRPQNRVEIESGGQPIVTFQ---GPD...,1,NcKHC,SANSIKVVARFRPQNRVEIESGGQPIVTFQ---GPDTCTVD---SK...,NcKHC,P48467;Q86Z98;Q86ZC1;O43093;Q9US60;Q54UC9;P331...
3,D. melanogaster Klp68D,88,"Plus, 18 µm/min",ND,,Dmklp68D PNECVQVVVRCRPMSNRERSERSPEVVNVYPNRGVV...,2,Dmklp68D,PNECVQVVVRCRPMSNRERSERSPEVVNVYPNRGVVELQNVV----...,Dmklp68D,
4,M. musculus KIF3A,80,"Plus, 36 µm/min",Microsomes/ synaptic vesicles,,MmKIF3A SCDNVKVVVRCRPLNEREKSMCYRQAVSVDEMRGTI...,2,MmKIF3A,SCDNVKVVVRCRPLNEREKSMCYRQAVSVDEMRGTITVHKTD----...,MmKIF3A,P28741;Q4R628;Q9Y496;Q5R4H3;P46872;P46871;Q617...
5,M. musculus KIF3B,85,"Plus, 18 µm/min","Neurons, cell bodies, axons and dendrites",,MmKIF3B SSESVRVVVRCRPMNGKEKAASYDKVVDVDVKLGQV...,2,MmKIF3B,SSESVRVVVRCRPMNGKEKAASYDKVVDVDVKLGQVSVKNPK----...,MmKIF3B,Q61771;O15066;P46871;Q5R706;O35066;O55165;O147...
6,S. purpuratus KRP85,79,"Plus, 24 µm/min",ND,,SpKRP85 GNDNVRVVVRCRPLNSKETGQGFKSVVKMDEMRGTV...,2,SpKRP85,GNDNVRVVVRCRPLNSKETGQGFKSVVKMDEMRGTVQVTNPN----...,SpKRP85,P46872;Q4R628;Q9Y496;Q5R4H3;P28741;P46871;Q617...
7,S. purpuratus KRP95,84,"Plus, 24 µm/min",ND,,SpKRP95 SAETVKVVVRCRPMNSKEISQGHKRIVEMDNKRGLV...,2,SpKRP95,SAETVKVVVRCRPMNSKEISQGHKRIVEMDNKRGLVEVTNPK----...,SpKRP95,P46871;O15066;Q61771;P46872;Q5R706;O14782;O350...
8,M. musculus KIF1A,192,"Plus, 72 µm/min",Enriched in axons,"Associated with synaptic vesicles, monomeric?",MmKIF1A AGASVKVAVRVRPFNSREMSRDSKCIIQMSGSTTTI...,3,MmKIF1A,AGASVKVAVRVRPFNSREMSRDSKCIIQMSGSTTTIVNPK-----Q...,MmKIF1A,P33173;F1M4A4;Q12756;O60333;Q60575;O88658;O438...
9,M. musculus KIF1B,130,"Plus, 40 µm/min",Co-localizes with mitochondria,"Transports mitochondria, monomeric?",MmKIF1B SGASVKVAVRVRPFNSRETSKESKCIIQMQGNSTSI...,3,MmKIF1B,SGASVKVAVRVRPFNSRETSKESKCIIQMQGNSTSIINPK-----N...,MmKIF1B,Q60575;O88658;O60333;Q12756;F1M4A4;P33173;O350...


In [141]:
import scipy
import scipy.stats
speed_types = []
for kif_id in kif_acc_all.loc[kif_acc_all["db_name"] == "kif_duke","db_acc"]:
    ids = kif_id.split(";")[:-1]
    types = list(kif_speed.loc[kif_speed["KIF"].isin(ids),"Type"])
    print(types)
    speed_types.append(scipy.stats.mode(types)[0][0])

[6, 15]
[1, 2, 2, 2, 2, 5, 5, 5, 6]
[4]
[6]
[13]
[15]
[2, 2, 3, 3]
[3, 3]
[1, 1, 1, 2, 2, 2, 2, 3, 3, 5, 5, 5, 6, 15]
[6, 13]
[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5, 5, 5, 14, 15]
[1, 1, 1, 15]
[13]
[6, 15]
[14, 14, 14]
[1, 1, 1]
[3, 3]
[1, 2, 5, 5, 5]
[13]
[1, 1, 1]
[14, 14, 14]
[14, 14]
[2, 2, 2, 2, 4, 5, 5, 5]
[13]
[3, 3]
[3, 3]
[3, 6, 15]
[4]
[13]
[4]
[13]
[13]
[4, 6]
[1, 2, 2, 2, 5, 5, 5, 6]
[1, 2, 2, 2, 2, 5, 5, 5]
[6]
[6, 13]
[1, 2, 2, 2, 2, 5, 5, 5, 6, 13]
[13]
[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5, 5, 5, 15]
[13]
[3, 3]
[6]
[14, 14, 14]
[14, 14, 14]
[1, 1, 15]
[13]
[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5, 5, 5, 15]
[1, 6, 15]
[1, 2, 3, 3, 4, 5, 5, 5, 13, 15]
[1, 1]
[1, 2, 2, 2, 2, 4, 5, 5, 5, 15]
[13]
[14, 14, 14]
[1, 1, 1, 2, 2, 4, 5, 14]
[6, 15]
[4]
[13]
[2, 4, 5, 13]
[4]
[14, 14, 14]
[14, 14, 14]
[6, 13]
[3, 3]
[2, 3, 3]
[4]
[1, 1, 1]
[14, 14, 14]
[1, 1, 1]
[2, 3, 3, 4]
[14, 14, 14]
[13]
[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 5, 5, 5, 15]
[14, 14, 14]
[3, 3]
[6, 13]
[1, 1, 2, 3, 15]
[14, 14, 14]
[13

In [142]:
speed_types[1:5]

[2, 4, 6, 13]

In [143]:
motor_uniprot.loc[motor_uniprot["label"] == "unlabeled","label"] = "kinesin_0"
motor_uniprot.loc[motor_uniprot["label"] == "KLC","label"] = "kinesin_16"
motor_uniprot.loc[motor_uniprot["label"] == "kinesin_14b","label"] = "kinesin_14"

In [144]:
motor_toolkit_type = motor_uniprot["label"].apply(lambda s: int(s.split("kinesin_")[1]))

In [145]:
type_all = list(jp_type) + list(speed_types) + list(motor_toolkit_type)

In [146]:
len(type_all)

623

In [147]:
kif_acc_all.groupby("db_name").count()

Unnamed: 0_level_0,Entry,db_acc
db_name,Unnamed: 1_level_1,Unnamed: 2_level_1
kif_duke,85,85
kif_jp,481,481
motor_toolkit,57,57


In [148]:
kif_acc_all["kinesin_family"] = type_all

In [161]:
kif_acc_all.groupby(["db_name","kinesin_family"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Entry,db_acc
db_name,kinesin_family,Unnamed: 2_level_1,Unnamed: 3_level_1
kif_duke,1,11,11
kif_duke,2,14,14
kif_duke,3,10,10
kif_duke,4,8,8
kif_duke,5,2,2
kif_duke,6,10,10
kif_duke,13,14,14
kif_duke,14,15,15
kif_duke,15,1,1
kif_jp,1,33,33


In [156]:
kif_acc_all.groupby(["db_name","kinesin_family"])

In [150]:
kif_acc_all.groupby("kinesin_family").count()

Unnamed: 0_level_0,Entry,db_acc,db_name
kinesin_family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15,15,15
1,44,44,44
2,45,45,45
3,62,62,62
4,41,41,41
5,56,56,56
6,32,32,32
7,51,51,51
8,24,24,24
9,11,11,11


In [151]:
kif_acc_all

Unnamed: 0,Entry,db_acc,db_name,kinesin_family
0,Q7PG43,AgKHC,kif_jp,1
1,Q7PNB7,AgKlp31E,kif_jp,4
2,Q7PTK6,AgNcd,kif_jp,14
3,Q7QJN4,AgKin73,kif_jp,3
4,Q7QDS6,AgKlp68D,kif_jp,2
...,...,...,...,...
618,Q6Z9D2,KN7H_ORYSJ,motor_toolkit,0
619,Q6H638,KN7C_ORYSJ,motor_toolkit,0
620,Q292S8,COS_DROPS,motor_toolkit,4
621,Q6RT24,CENPE_MOUSE,motor_toolkit,8


In [153]:
kif_acc_all.to_csv("../../data/kif/kif_acc_all.csv",index = False)
kif_uniprot_all.to_csv("../../data/kif/kif_uniprot_all.csv", index = False)
