In [60]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm 
import glob
import re
import requests
import io
from Bio.Blast import NCBIWWW

In [61]:
# Load the Duke kinesin table and preview it. Each "Seq" value is a short
# kinesin name followed by a gapped (aligned) amino-acid sequence.
kif_duke = pd.read_csv("../../data/kif/kif_duke.csv")
kif_duke.head()

Unnamed: 0,Species/protein,Molecular mass (kDa),Motor polarity & velocity,Subcellular localization,Comments,Seq,Type
0,H. sapiens KHC,109,ND,ND,,HsxKHC AECSIKVMCRFRPLNEAEILRGDKFIPKFK---GDE...,1
1,H. sapiens neuronal KHC,117,ND,Cell body in cultured neurons,Specific to neural tissue,HsnKHC NECSIKVLCRFRPLNQAEILRGDKFIPIFQ---GDD...,1
2,H. sapiens KHC,110,ND,Uniform in cultured neurons,Ubiquitously expressed,HsKHC AECNIKVMCRFRPLNESEVNRGDKYIAKFQ---GED...,1
3,D. melanogaster KHC,110,"Plus, 54 µm/min",Diffuse in cytoplasm,Required for viability & neuromuscular function,DmKHC AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEEN...,1
4,C. elegans KHC (Unc-116),92,ND,ND,"Axonal transport of synaptic vesicles, also or...",CeKHC AECGVQVFCRIRPLNKTEEKNADRFLPKFP---SED...,1


In [62]:
def get_uniprot_entry(acc, timeout=30):
    """Search UniProt for ``acc`` and return the hits as a DataFrame.

    The query is a free-text search sorted by score, not an exact accession
    lookup, so the result may contain several rows per accession.

    Parameters
    ----------
    acc : str
        Accession (or any search term) to send as the UniProt query.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per hit with the requested columns, or an empty DataFrame
        when the request fails or the response has no parseable rows.
    """
    # NOTE(review): this is the legacy UniProt query endpoint; confirm it is
    # still served before relying on a fresh run.
    url = "https://www.uniprot.org/uniprot/"
    # Let requests URL-encode the values (the column list contains spaces and
    # parentheses, and ``acc`` may contain reserved characters).
    params = {
        "query": acc,
        "sort": "score",
        "columns": "id,entry name,reviewed,protein names,genes,organism,organism-id,lineage(all),sequence,length",
        "format": "tab",
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        # Treat HTTP errors as "no data" instead of parsing an error page as TSV.
        response.raise_for_status()
        rawData = pd.read_csv(io.StringIO(response.content.decode("utf-8")), sep="\t")
    except (
        requests.RequestException,   # connection problems, timeouts, HTTP errors
        UnicodeDecodeError,          # body is not valid UTF-8
        pd.errors.EmptyDataError,    # empty body, i.e. no hits
        pd.errors.ParserError,       # body is not well-formed TSV
    ):
        # Best-effort lookup: callers test ``.empty`` to skip failed queries.
        # KeyboardInterrupt is deliberately not caught, so long loops can be stopped.
        rawData = pd.DataFrame()
    return rawData

In [63]:
# Rows whose motor polarity/velocity is actually documented (neither missing nor "ND").
kif_duke[
    kif_duke["Motor polarity & velocity"].notna()
    & kif_duke["Motor polarity & velocity"].ne("ND")
]

Unnamed: 0,Species/protein,Molecular mass (kDa),Motor polarity & velocity,Subcellular localization,Comments,Seq,Type
3,D. melanogaster KHC,110,"Plus, 54 µm/min",Diffuse in cytoplasm,Required for viability & neuromuscular function,DmKHC AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEEN...,1
6,L. pealii KHC,109,"Plus, 30 µm/min",Membranous vesicles,,LpKHC SECNIKVICRVRPLNEAEERAGSKFILKFP---TDD...,1
10,N. crassa KHC (NKin),103,"Plus, 120-180 µm/min",ND,High velocity of microtubule transport; no cop...,NcKHC SANSIKVVARFRPQNRVEIESGGQPIVTFQ---GPD...,1
22,D. melanogaster Klp68D,88,"Plus, 18 µm/min",ND,,Dmklp68D PNECVQVVVRCRPMSNRERSERSPEVVNVYPNRGVV...,2
25,M. musculus KIF3A,80,"Plus, 36 µm/min",Microsomes/ synaptic vesicles,,MmKIF3A SCDNVKVVVRCRPLNEREKSMCYRQAVSVDEMRGTI...,2
26,M. musculus KIF3B,85,"Plus, 18 µm/min","Neurons, cell bodies, axons and dendrites",,MmKIF3B SSESVRVVVRCRPMNGKEKAASYDKVVDVDVKLGQV...,2
29,S. purpuratus KRP85,79,"Plus, 24 µm/min",ND,,SpKRP85 GNDNVRVVVRCRPLNSKETGQGFKSVVKMDEMRGTV...,2
30,S. purpuratus KRP95,84,"Plus, 24 µm/min",ND,,SpKRP95 SAETVKVVVRCRPMNSKEISQGHKRIVEMDNKRGLV...,2
44,M. musculus KIF1A,192,"Plus, 72 µm/min",Enriched in axons,"Associated with synaptic vesicles, monomeric?",MmKIF1A AGASVKVAVRVRPFNSREMSRDSKCIIQMSGSTTTI...,3
45,M. musculus KIF1B,130,"Plus, 40 µm/min",Co-localizes with mitochondria,"Transports mitochondria, monomeric?",MmKIF1B SGASVKVAVRVRPFNSRETSKESKCIIQMQGNSTSI...,3


In [64]:
# (rows, columns) of the Duke table.
kif_duke.shape

(182, 7)

In [65]:
# Load the tab-separated kinesin list (family, subfamily, name, alias) and preview it.
kif_list_jp = pd.read_csv("../../data/kif/KIFlist.txt", sep = "\t")
kif_list_jp.head()



Unnamed: 0,Family,Subfamily,KIF,Alias
0,Kinesin-1,KIF5(KHC),HsKIF5A,HsnKHC
1,Kinesin-1,KIF5(KHC),MmKIF5A,
2,Kinesin-1,KIF5(KHC),RnKIF5A,
3,Kinesin-1,KIF5(KHC),DrKIF5A,
4,Kinesin-1,KIF5(KHC),MsFKIF5,


In [66]:
# Non-null entries per column for each kinesin family.
kif_list_jp.groupby("Family").count()

Unnamed: 0_level_0,Subfamily,KIF,Alias
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kinesin-1,42,42,3
Kinesin-10,16,16,1
Kinesin-11,25,25,0
Kinesin-12,25,25,1
Kinesin-13,43,43,1
Kinesin-14A,44,44,1
Kinesin-14B,63,63,1
Kinesin-2,43,43,0
Kinesin-3,88,88,4
Kinesin-4,53,53,2


In [67]:
# (rows, columns) of the kinesin list.
kif_list_jp.shape

(626, 4)

In [68]:
# Load the accession table. The file is read with header=None, so the columns
# are numbered 0..2 here and get their names in the following cell.
kif_list_jp_acc = pd.read_csv("../../data/kif/AccNo.txt", sep = "\t",header = None)
kif_list_jp_acc.head()

Unnamed: 0,0,1,2
0,AgKHC,XP_310522,ENSANGG00000014820
1,AgKlp31E,XP_317685,ENSANGP00000002307
2,AgNcd,XP_307936,ENSANGP00000006252
3,AgKin73,XP_308280,ENSANGP00000009361
4,AgKlp68D,XP_311552,ENSANGP00000010166


In [69]:
# Name the three columns; "KIF" is the key shared with the kinesin list.
kif_list_jp_acc.columns = ["KIF","prot_acc","ensembl"]

In [70]:
# (rows, columns) of the accession table.
kif_list_jp_acc.shape

(626, 3)

In [71]:
# Combine the accession table with the family/subfamily table of the Japan
# database; rows are paired on the shared "KIF" name (inner join).
kif_jp = pd.merge(kif_list_jp_acc, kif_list_jp, on="KIF")

In [72]:
# The kinesin name is everything before the first space of "Seq"
# (a missing Seq is stringified first, so it yields "nan").
kif_duke["KIF"] = kif_duke["Seq"].map(lambda seq: str(seq).partition(" ")[0])

In [73]:
# Kinesins present in both the Japan tables and the Duke table (inner join on "KIF").
# NOTE(review): in the saved output some rows pair a sequence name with a
# different "Species/protein" label (e.g. AtKCBP with "H. sapiens KIFC2",
# DmKlp10A with "P. falciparum L2165w") -- confirm the Duke CSV rows are aligned.
kif_jp.merge(kif_duke,on = "KIF")

Unnamed: 0,KIF,prot_acc,ensembl,Family,Subfamily,Alias,Species/protein,Molecular mass (kDa),Motor polarity & velocity,Subcellular localization,Comments,Seq,Type
0,AtKCBP,AAC49901,,Kinesin-14B,KCBP,,H. sapiens KIFC2,,,,,AtKCBP MKGKIRVYCRIRPLNEKESSEREKQMLTTVDEFTVE...,14
1,AtT5A14.3,AAD10640,,Kinesin-14B,TBK5,,A. thaliana T5A14.3,100.0,,,,AtT5A14.3 LKGNIRVFCRVKPLGATEKLRPPVASDTRNVIIKL-...,14
2,CfDSK1,Q39493,,Kinesin-13,Others,,C. fusiformis DSK1,63.0,ND,Spindle,,CfDSK1 TNSNICIAVRKRPISDKERQKLDHDSVSCFQNKVWI...,13
3,CgCHO1,CAA58558,,Kinesin-6,KIF23/CHO1,,C. griseus CHO1,109.0,ND,"Spindle, midbody",Separates poles at anaphase B?,CgCHO1 LKDPVGVYCRVRPLSCPDQECCVEVVSNTTVQLHTP...,6
4,CgCHO2,CAA58559,,Kinesin-14A,KIFC1,,C. griseus CHO2,69.0,"Minus, 1-8 µm/min",Spindle/centrosomes,"Spindle at mitosis, centrosome in interphase",CgCHO2 LKGNIRVFCRVRPVLAGEPTPSPGFLLFPHGPAGPS...,14
5,CgMCAK,P70096,,Kinesin-13,KIF2,,C. griseus MCAK,83.0,ND,Kinetochore,"Localizes to kinetochore, anchors kinetochore ...",CgMCAK EEHRICVCVRKRPLNKQELAKKEIDVISVPSKCLLF...,13
6,CrFLA10,P46869,,Kinesin-2,KIF3B/C,,C. reinhardtii FLA10,87.0,ND,Flagella,,CrFLA10 GSESVKVVVRCRPLNGKEKADGRSRIVDMDVDAGQV...,2
7,DmKlp10A,NP_727494,,Kinesin-13,KIF2,,P. falciparum L2165w,,,,,DmKlp10A DDHQITVCVRKRPISRKEVNRKEIDVISVPRKDMLI...,13
8,DmKlp59C,NP_611759,,Kinesin-13,KIF2,,D. melanogaster Klp59D,,,,,DmKlp59C NCHQIMVCVRKRPLRRKELADREQDVVSIPSKHTLV...,13
9,DmKlp31E,NP_609398,,Kinesin-4,KIF21,,D. melanogaster Klp31E,,ND,,,DmKlp31E KDSSVRVAVRIRPQNSRELIDMCRICTTVTLGEPQI...,4


In [74]:
# Look up MmKIF2 in the merged Japan table (the saved output has no matching row).
kif_jp.loc[kif_jp["KIF"] == "MmKIF2",:]

Unnamed: 0,KIF,prot_acc,ensembl,Family,Subfamily,Alias


In [75]:
# Protein accessions that will be sent to UniProt.
kif_jp["prot_acc"]

0      XP_310522
1      XP_317685
2      XP_307936
3      XP_308280
4      XP_311552
         ...    
621     AAK91817
622     AAK91818
623     AAK91820
624     AAK91822
625     AAK91823
Name: prot_acc, Length: 626, dtype: object

In [76]:
# Spot-check the UniProt lookup with a single accession from the table.
get_uniprot_entry("AAK91823")

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Organism ID,Taxonomic lineage (all),Sequence,Length
0,Q93XF2,Q93XF2_MAIZE,unreviewed,Kinesin heavy chain (Fragment),KIN16,Zea mays (Maize),4577,"cellular organisms, Eukaryota, Viridiplantae, ...",TRPYGQTGSGKTYTMSGPGTSKEDWGVNYRALNDLFDISLSRRNAF...,405


In [77]:
# Query UniProt once per accession and stack all hits into one table.
# Frames are collected in a list and concatenated once at the end, which avoids
# re-copying the growing table on every iteration.
# NOTE(review): the lookup is a free-text search, so one accession can return
# several rows and the result does not record which accession produced them.
uniprot_frames = []
n_rows_fetched = 0
for i, acc in enumerate(kif_jp["prot_acc"]):
    curr = get_uniprot_entry(acc)
    if not curr.empty:
        uniprot_frames.append(curr)
        n_rows_fetched += curr.shape[0]
    # Lightweight progress report every 100 accessions.
    if i > 0 and i % 100 == 0:
        print(f"{i}/{kif_jp.shape[0]} accessions queried, {n_rows_fetched} UniProt rows so far")

# pd.concat raises on an empty list, so fall back to an empty frame when no
# lookup returned anything.
kif_jp_uniprot = (
    pd.concat(uniprot_frames, ignore_index=True) if uniprot_frames else pd.DataFrame()
)

(94, 10)
(170, 10)
(200, 10)
(280, 10)
(371, 10)
(433, 10)


In [102]:
# Preview the Duke table with its derived columns.
# NOTE(review): the saved output already shows "aa_seq", which is only created
# in a cell further down -- this cell was run out of order and will look
# different after Restart & Run All.
kif_duke.head()

Unnamed: 0,Species/protein,Molecular mass (kDa),Motor polarity & velocity,Subcellular localization,Comments,Seq,Type,KIF,aa_seq
0,H. sapiens KHC,109,ND,ND,,HsxKHC AECSIKVMCRFRPLNEAEILRGDKFIPKFK---GDE...,1,HsxKHC,AECSIKVMCRFRPLNEAEILRGDKFIPKFK---GDETVVIG-----...
1,H. sapiens neuronal KHC,117,ND,Cell body in cultured neurons,Specific to neural tissue,HsnKHC NECSIKVLCRFRPLNQAEILRGDKFIPIFQ---GDD...,1,HsnKHC,NECSIKVLCRFRPLNQAEILRGDKFIPIFQ---GDDSVVIG-----...
2,H. sapiens KHC,110,ND,Uniform in cultured neurons,Ubiquitously expressed,HsKHC AECNIKVMCRFRPLNESEVNRGDKYIAKFQ---GED...,1,HsKHC,AECNIKVMCRFRPLNESEVNRGDKYIAKFQ---GEDTVVIA-----...
3,D. melanogaster KHC,110,"Plus, 54 µm/min",Diffuse in cytoplasm,Required for viability & neuromuscular function,DmKHC AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEEN...,1,DmKHC,AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEENCISIA-----...
4,C. elegans KHC (Unc-116),92,ND,ND,"Axonal transport of synaptic vesicles, also or...",CeKHC AECGVQVFCRIRPLNKTEEKNADRFLPKFP---SED...,1,CeKHC,AECGVQVFCRIRPLNKTEEKNADRFLPKFP---SEDSISLG-----...


In [79]:
def find_aa(s):
    """Extract the sequence token from a "<name> <sequence> " style string.

    Returns a run of non-whitespace characters that has whitespace on both
    sides (for a single-line string, the last such run), or "ND" when the
    string contains no such run.
    """
    hit = re.search(r'.*[\s]([\S]+)[\s].*', s, re.IGNORECASE)
    return "ND" if hit is None else hit.group(1)

# Derive the gapped amino-acid sequence for every row; values are stringified
# first, so a missing "Seq" ends up as "ND".
kif_duke["aa_seq"] = kif_duke["Seq"].map(str).map(find_aa)
kif_duke["aa_seq"]

0      AECSIKVMCRFRPLNEAEILRGDKFIPKFK---GDETVVIG-----...
1      NECSIKVLCRFRPLNQAEILRGDKFIPIFQ---GDDSVVIG-----...
2      AECNIKVMCRFRPLNESEVNRGDKYIAKFQ---GEDTVVIA-----...
3      AEDSIKVVCRFRPLNDSEEKAGSKFVVKFPNNVEENCISIA-----...
4      AECGVQVFCRIRPLNKTEEKNADRFLPKFP---SEDSISLG-----...
                             ...                        
177    DTYTGSITVTIRPKPRSVGTSRDHVGLKSPRYSQPRSNSHHGSNTF...
178    EPCHIEVILRAIPEKGLQNNESTFKIDPYENTVLFRTNNP------...
179    VGSGIITSIRIRPIGKNQGVWSHGKLSNDPYGREYIRQQT------...
180    PTEHHPAIAKRTTSSERAGAGASIAAAPSSHDLDHEDPTS------...
181    EEDAIKVFVRIRPPVEGTLTGVDGEQGLCLTALSSTTIRL------...
Name: aa_seq, Length: 182, dtype: object

In [80]:
# Number of rows for which no sequence could be extracted.
kif_duke["aa_seq"].eq("ND").sum()

54

In [81]:
# First sequence with the alignment gap characters ("-") removed.
kif_duke.loc[0, "aa_seq"].replace("-", "")

'AECSIKVMCRFRPLNEAEILRGDKFIPKFKGDETVVIGQGKPYVFDRVLPPNTTQEQVYNACAKQIVKDVLEGYNGTIFAYGQTSSGKTHTMEGKLHDPQLMGIIPRIAHDIFDHIYSMDENLEFHIKVSYFEIYLDKIRDLLDVSKTNLAVHEDKNRVPYVKGCTERFVSSPEEVMDVIDEGKANRHVAVTNMNEHSSRSHSIFLINIKQENVETEKKLSGKLYLVDLAGSEKVSKTGAEGAVLDEAKNINKSLSALGNVISALAEGTKTHVPYRDSKMTRILQDSLGGNCRTTIVICCSPSVFNEAETKSTLMFGQRAKTIKNTVSVN'

In [82]:
# Trial run: BLAST the first Duke sequence (gaps removed) against Swiss-Prot.
# Despite its name, fasta_string is a bare residue string with no FASTA header.
fasta_string = kif_duke["aa_seq"][0].replace("-","")
# Remote call to NCBI; the returned handle is consumed by the parser in the next cell.
result_handle = NCBIWWW.qblast("blastp", "swissprot", fasta_string)

In [83]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from Bio.Blast import NCBIXML
# Iterator over the BLAST records contained in the XML result.
blast_records = NCBIXML.parse(result_handle)

In [84]:
# Take the first record from the iterator (only one query was submitted).
blast_record = next(blast_records)

In [85]:
# Sanity check that the record parsed: database size reported by BLAST.
blast_record.database_length

179746915

In [100]:
# Look up one BLAST hit in UniProt.
# NOTE(review): the saved output also contains entries other than P33175
# (e.g. O08788, Q96NW4), so the lookup behaves as a text search rather than
# an exact accession match -- filter on "Entry" if exactly one row is needed.
get_uniprot_entry("P33175")

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Organism ID,Taxonomic lineage (all),Sequence,Length
0,P33175,KIF5A_MOUSE,reviewed,Kinesin heavy chain isoform 5A (Kinesin heavy ...,Kif5a Kiaa4086 Kif5 Nkhc1,Mus musculus (Mouse),10090,"cellular organisms, Eukaryota, Opisthokonta, M...",MAETNNECSIKVLCRFRPLNQAEILRGDKFIPIFQGDDSVIIGGKP...,1027
1,O08788,DCTN1_MOUSE,reviewed,Dynactin subunit 1 (150 kDa dynein-associated ...,Dctn1,Mus musculus (Mouse),10090,"cellular organisms, Eukaryota, Opisthokonta, M...",MAQSRRHMSSRTPSGSRMSTEASARPLRVGSRVEVIGKGHRGTVAY...,1281
2,Q96NW4,ANR27_HUMAN,reviewed,Ankyrin repeat domain-containing protein 27 (V...,ANKRD27 PP12899,Homo sapiens (Human),9606,"cellular organisms, Eukaryota, Opisthokonta, M...",MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...,1050
3,O70585,DTNB_MOUSE,reviewed,Dystrobrevin beta (DTN-B) (mDTN-B) (Beta-dystr...,Dtnb,Mus musculus (Mouse),10090,"cellular organisms, Eukaryota, Opisthokonta, M...",MIEEGGNKRKTMAEKRQLFIEMRAQNFDVIRLSTYRTACKLRFVQK...,659


In [110]:
# BLAST out more sequences related to kinesins with documented functions.
# Keep the rows that have both a documented motor polarity/velocity and a
# sequence entry (neither missing nor "ND"). Every condition is evaluated on
# kif_duke and applied to kif_duke in a single step, so the boolean mask always
# matches the frame it indexes instead of relying on implicit index alignment
# against an already-filtered frame.
kif_doc = kif_duke.loc[
    kif_duke["Motor polarity & velocity"].notna()
    & (kif_duke["Motor polarity & velocity"] != "ND")
    & kif_duke["Seq"].notna()
    & (kif_duke["Seq"] != "ND"),
    :,
].copy()
kif_doc.shape

(20, 9)

In [None]:
# Hits with an HSP e-value below this threshold are kept.
E_VALUE_THRESH = 0.0001

# For every kinesin in kif_doc, BLAST its ungapped sequence against Swiss-Prot
# and record the accessions of the significant hits.
# Each element of all_kif_accs is (KIF name, "acc1;acc2;...;"). The trailing
# ";" is kept so that splitting on ";" and dropping the last piece still works.
all_kif_accs = []
for kif, aligned_seq in zip(kif_doc["KIF"], kif_doc["aa_seq"]):
    print(kif)
    fasta_string = aligned_seq.replace("-", "")  # strip alignment gap characters
    result_handle = NCBIWWW.qblast("blastp", "swissprot", fasta_string)
    try:
        # One query was submitted, so the first record is the only one needed.
        blast_record = next(NCBIXML.parse(result_handle))
    finally:
        # Release the result handle even if parsing fails.
        result_handle.close()
    print("record fetched")
    # List each hit once, even when several of its HSPs pass the threshold.
    accs = "".join(
        alignment.accession + ";"
        for alignment in blast_record.alignments
        if any(hsp.expect < E_VALUE_THRESH for hsp in alignment.hsps)
    )
    print(accs)
    all_kif_accs.append((kif, accs))


DmKHC
record fetched
P17210;P21613;P35978;P28738;O60282;Q61768;Q2PQA9;P33176;P34540;Q12840;Q5R9K7;P33175;Q6QLM7;P56536;O43093;Q86Z98;P48467;Q54UC9;Q86ZC1;Q9US60;Q8T135;F9W301;Q61771;O15066;P46872;Q8GW44;Q54TL0;P46871;Q965T6;Q9Y496;Q4R628;Q5R4H3;P28741;P46873;Q29DY1;P46867;Q9P2E2;Q99PW8;Q2R2P7;P46869;F4J2K4;Q9S7P3;B9F7C8;Q86VH2;F4IIS5;P82266;B9G3M6;Q54NP8;Q91783;P33174;
LpKHC
record fetched
P21613;P17210;O60282;P28738;P33175;Q12840;P35978;Q5R9K7;Q6QLM7;P33176;Q61768;Q2PQA9;P34540;P56536;O43093;P48467;Q86ZC1;Q86Z98;Q54UC9;Q9US60;Q8T135;F9W301;Q54TL0;Q8GW44;P46872;Q5R4H3;P28741;Q9Y496;Q4R628;Q61771;Q965T6;P46871;O15066;P46873;Q9P2E2;P46869;Q29DY1;Q99PW8;P46867;B9FUF9;Q9S7P3;Q2R2P7;P33174;Q91784;Q86VH2;Q2VIQ3;Q9LDN0;F4J2K4;Q27IK6;F4IIS5;
NcKHC
record fetched
P48467;Q86Z98;Q86ZC1;O43093;Q9US60;Q54UC9;P33176;Q61768;Q2PQA9;P33175;Q12840;P21613;O60282;P28738;Q5R9K7;Q6QLM7;P17210;P35978;P34540;Q61771;O15066;P46871;Q8T135;Q8GW44;Q54TL0;F9W301;Q9Y496;Q4R628;Q5R4H3;P46872;P28741;P46869;O35066;P468

In [None]:
# Accession list for the first query; [:-1] drops the empty string left by the trailing ";".
all_kif_accs[0][1].split(";")[:-1]