In [2]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm 
import glob
import re
import requests
import io

import torch
from argparse import Namespace
from esm.constants import proteinseq_toks
import math
import torch.nn as nn
import torch.nn.functional as F
from esm.modules import TransformerLayer, PositionalEmbedding  # noqa
from esm.model import ProteinBertModel
import esm
import time

import tape
from tape import ProteinBertModel, TAPETokenizer,UniRepModel

In [3]:
# Load the two kinesin tables from CSV (paths are relative to the notebook's working directory).
kif_acc_all = pd.read_csv("../../data/kif/kif_acc_all.csv")
kif_uniprot_all = pd.read_csv("../../data/kif/kif_uniprot_all.csv")
# Place the two tables side by side (axis=1 concatenates columns; rows are aligned on the index).
# NOTE(review): this presumes both CSVs list the same entries in the same order -- confirm.
kif_all = pd.concat([kif_acc_all,kif_uniprot_all],axis = 1)

In [4]:
# Sanity check: (rows, columns) of the combined table.
kif_all.shape

(623, 12)

In [5]:
# Preview rows 1-4 of the column at position 11 (the recorded output names it `seq`).
kif_all.iloc[1:5,11]

1    LLTNVFSLSLFLGHRFRIRPQIPRELIDMCRVCTQVTPGEPQVLLG...
2    MDSRIPKPSFLKKPTGPLSLPGNARLPLTRDLLNLPSANSTMFAKV...
3    MSDKIRVAVRVRPFNRRELELATENVIEMNGTQTILKYPASLDKME...
4    MDRTIKTRSNLSNTKNECVQVVVRCRPLNNKELTGNFQKVVDVFPS...
Name: seq, dtype: object

In [3]:
# Load the pretrained TAPE UniRep model ('babbler-1900' weights) and the tokenizer
# built with the matching 'unirep' vocabulary.
model = UniRepModel.from_pretrained('babbler-1900')
tokenizer = TAPETokenizer(vocab='unirep')

In [19]:
# `seq` is not assigned in any earlier cell, so this cell only worked through leftover
# kernel state. Pick one example sequence from the `seq` column (position 11 of kif_all)
# so the cell runs after Restart Kernel -> Run All.
seq = kif_all.iloc[1, 11]
# The model expects a batch dimension, hence the extra list around the encoded sequence.
token_ids = torch.tensor([tokenizer.encode(seq)])
# Inference only: skip autograd bookkeeping, as the batch embedding function does.
with torch.no_grad():
    output = model(token_ids)

In [20]:
# Average the first output of the first (only) batch item over axis 0 and check the
# shape of the resulting vector (the recorded output is (1900,)).
output[0][0].mean(0).cpu().detach().numpy().shape

(1900,)

In [23]:
# Default number of rows between progress reports / checkpoint saves.
print_every = 10


def _save_embeddings(embeddings, out_dir, dat_name):
    """Stack the collected vectors, print shape and target path, and write `<out_dir>/<dat_name>.npy`."""
    import os

    sequence_em = np.array(embeddings)
    out_path = os.path.join(out_dir, dat_name + ".npy")
    print(sequence_em.shape)
    print(out_path)
    np.save(out_path, sequence_em)
    return sequence_em


def generate_embedding_unirep(model, tokenizer, dat, dat_name, out_dir, seq_col, save_every=None):
    """Embed every sequence in `dat` with `model` and save the result as a .npy file.

    For each row, the sequence is encoded with `tokenizer.encode`, passed through
    `model` as a batch of one, and `output[0][0]` is averaged over axis 0 to get one
    vector per sequence. The stacked vectors are written to
    `<out_dir>/<dat_name>.npy`; the same file is overwritten with the partial
    result at every checkpoint so an interrupted run still leaves usable output.

    Parameters
    ----------
    model : callable
        Called as `model(token_ids)`; `output[0][0]` must be a 2-D tensor.
    tokenizer : object
        Must provide `encode(seq)` returning the token ids of one sequence.
    dat : pandas.DataFrame
        Table holding the sequences, one per row.
    dat_name : str
        File name (without extension) of the output array.
    out_dir : str
        Output directory; it is created if it does not exist.
    seq_col : int
        Positional index of the sequence column in `dat`.
    save_every : int, optional
        Rows between progress reports / checkpoint saves. Defaults to the
        module-level `print_every`. A value of 0 disables intermediate saves.

    Returns
    -------
    None
        The embeddings are only written to disk.
    """
    import os

    interval = print_every if save_every is None else save_every

    # np.save does not create missing directories.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Run the forward pass on whatever device the model's weights live on
    # (falls back to CPU for objects that expose no parameters).
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    device = first_param.device if first_param is not None else torch.device("cpu")

    sequence_embeddings = []
    for row_idx in range(dat.shape[0]):
        seq = dat.iloc[row_idx, seq_col]
        # The model expects a batch dimension, hence the extra list.
        token_ids = torch.tensor([tokenizer.encode(seq)]).to(device)
        with torch.no_grad():
            output = model(token_ids)
        # .cpu() keeps this working when the model runs on a GPU.
        sequence_embeddings.append(output[0][0].mean(0).cpu().numpy())
        if interval and row_idx % interval == 0:
            # Progress report plus checkpoint of everything embedded so far.
            print("At Epoch: %.2f" % row_idx)
            print(seq)
            _save_embeddings(sequence_embeddings, out_dir, dat_name)

    # Final save with the complete set of embeddings.
    _save_embeddings(sequence_embeddings, out_dir, dat_name)
    return

In [None]:
# Embed every sequence of `pdt_motor` and write the result to
# <out_dir>/pdt_motor_unirep1900.npy.
# NOTE(review): `pdt_motor` is not defined in any visible cell of this notebook, so this
# cell fails on a fresh kernel -- add the cell that loads it before this one.
# NOTE(review): seq_col = 4 is a positional index; presumably the sequence column of
# `pdt_motor` -- confirm against that table.
out_dir = "../../out/201120/"
generate_embedding_unirep(model,tokenizer,pdt_motor,"pdt_motor_unirep1900",out_dir,seq_col = 4)

At Epoch: 0.00
MSSIVVVGANWGDEGKGRIVDYLAGQAGASIRFQGGNNAGHTVVNDLGTFKLHQVPSGVFNPDCLVVLGPGMVISPEKLTVELEEVKASGVTPKLAISDRATLCLPLHALEDTLEEQRLGDGAYGSTRQGIAPAYGDRVMKKAILVGWLKQPDVLVERIQFMLDWKLPQMKAIYPSFEFTQTAQEMADWLLEVSAPWIDAVCNVSMPLKALQAEGKTLLFEAQLGAGRDLIYGEYPWVTSSHVSGAYAGIGGGLPGLRPERVIAVAKAFSSSVGTGTLLTAMENQDEFRKITNEFGATTGRPRDVGYFDAVATKNGVELQAATEVALTKLDCLTGLPDLKICVAYEGAHTENPIWPQTAALKPVYEQMESWSEDITGCRTFEELPKAAQQYVLRIEELLGVPVPMVSVGPGRDEMILR
(1, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 10.00
MTPSSPAARAANDAPPDDARISAWSLIKPYWVSSEWKLAWSLLIAIIAMNLTVVWINVRLNAWNGAFYNALQQKNAAQVPHLLMIFAGYAFSFIIIAVYSRYLRQLLGFRWRQWITTQALDDWFGDRAFYRIERDRLADNPDQRITDDLNSLATSTLSLTLDLLSTVVTLFSFIVILWSIAGAATISLGGTSFVIPGYMVWAAAIYALIGSYVTYKAGHPLVSITYQQQRVEADLRFGLIRIRENAEQIAFYDGMSRERQGALDLFGHIRENWRRVMSITKRMTFVISFYAQLANIFPIAVASPRYFAGAYSFGVLMQIVGAFGTVSDSFSWFVNSYGTLVDWRATVNRLREFKRVTRASHLKESMSPATEHGGINLHYVDAQNLATHGLRLALPDGKPLSRIEDISIEPGSRWLVRGPSGAGKSTLMRAMAGLWPFGEGAIDAPVGAAMMFVPQRSYLPIGTLKGALAYPSAVERFSDEDCRAALRDSGLEDYAGR

At Epoch: 210.00
MAFSDRLLEAWYKGHPLLALLSPLEALYRRIVRGKRQRFLAGEGRIYRAPVPVLVVGNITVGGTGKTPLILFLVEHFRQRGLKVGVVSRGYGARPPQLPWRVRADQPAAHAGDEPLLIVQRSGVPLMIDPDRSRAVRALLEAEPLDLILCDDGLQHYRLARDLELVLIDAARGLGNRRCLPAGPLREPVERLGEVDALLYNGALADRDDGYAFTLQPTALVNLRSGERRPLDHFPAGQALHAVAGIGNPQRFFDTLRTLGWAPVEHPFADHAVFSGEALAFVPSLPLVMTEKDAVKCRSFAGDDWWYLAVDAQPSPAFVSWLDERLARLLPDSRQP
(211, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 220.00
MEKTKKIAVMSGKGGVGKTTVAVNLAVALAAEGYQVGLLDLDLHGPNVQRMLGVSLPPSEGEKIVPAKYGDSLKVFSLAMILQEGAPVIWRGPLKHKAIEQLTRDVEWGDLDYLICDLPPGTGDEALSTFQIIKPDAVIVVSTPQKVAGDDVRRAINFVKRLSGKILGLVENMSYLVCPNCGEKIYVFGKGETEKLAEEFGIPLLARIPMDPEVVSLSDEGRPAVVYKRGTTIEEEFKKIVEKVLSL
(221, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 230.00
MMSKPLLIGIAGGTGSGKSTVAKEIFQSLPGENIVVIEQDSYYKDQSHLSLEDRVKTNYDHPDAFDTELLIEHLKTLLNGNPIDKPIYDFEVHNRKKETVKVEPKDIIILEGILILAEPEIRNLLDIKIFVDTDADVRIIRRIRRDIKERGRTIDSVIEQYMGVVRPMHLQFVEPTKRYADIIIPEGGYNKVAIDILIAKVKHILSIK
(231, 1900)
../../out/201120//pdt_motor_unire

At Epoch: 410.00
MSREILKVENLSMRFGGLLAVNGVALTVKEKQVVALIGPNGAGKTTVFNCLTGFYKPSGGSILLDGQPIQGLAGHEIARKGVVRTFQNVRLFKDMTAVENLLIAQHRHLNTNFFAGLFKTPAFRKSEREAMEYAEYWLDKVNLTEFANRPAGTLAYGQQRRLEIARCMMTRPRILMLDEPAAGLNPKETEDLKALISVLREENNATVLLIEHDMKLVMSISDHIVVINQGTPLADGTPEQIRDNPEVIKAYLGEA
(411, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 420.00
MTAPIIIGVAGGTGSGKTTVAREIYRQFEDMSVVMIEQDAYYKKQSHLTFEERIKTNYDHPLAFDNDLLIEQLECLRRREPVAKPVYDYKLHTRSTETVWIEPKDVIILEGILILEDERLRSMMDIKVFVDTDADVRIIRRMLRDIAERGRTMESVVQQYLQVVRPMHMQFIEPTKRYADIIVPEGGQNRVAIDLLTTKINTVLQSL
(421, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 430.00
MEEENILIESHDSNENISCWLLALDDLDNHFSRIETNREVSYQVSYSSNLNDDINPKVNDFVIGYISDTASPKLLKYVFKITDVLDGDEASENQSSVSITFEKLFETSIYRDANSISEIAPNTYNNINSSSEINRLIAKIQQSEFNILVKFMLETLASNLFEITLDIETGHTTEEKTSDNFDISNKIDDKNELNNIKTDQSKIDDLTRNKIVYGAPGTGKSHKINDFKKKFFKNDFLFERVTFHPAYTYGQFVGTYKPSPIYSKIESDTETSWYGADKEKNDALMNPHIDYTLVAGPFLNILCKALNNKDYNFLLVIEEMNRANAAAVFGDVFQLLDRDNDNRSTYSVKFNADITNFLIENVKDHVSKDLFDRET

At Epoch: 630.00
MTSNARLIYTGCMNRGHKLSARDRRFFELVSRATFCNPFSDRRTELDLRIADCSPATPYEERIDRVIANVAEEVRKLERADRADLRFYRGEDRYVVQNAFLFDVYHRVSDRLDELILREIASGHERRPLPFARDTLATLTGRGFSVPEAHRFFAMFYQVRRAFYFINHGLVGASPSMKQLRLHLWDDIFTHDIRWYEKYLWNRMEDFSTLLLGETGTGKGTAAAAIGRSGFIPYDVGKGCFTESFTQNFISINLSQYPEALIESELFGHRKGAFTGAIEHHQGIFARCSAHGSIFLDEIGDVSIPVQVKLLQVLQDRTFSPVGSHEKLRFNGRVIAATNKPLDDLRRRGLFRDDLFYRLCSDMIVVPPLRQRLGEDPAELDVMVAHVIRRIVGEASRELAVMVREAIGANLGAHYAWPGNVRELEQAVRGILLTSRYHGDRGVAATAPEAELISGIRDGVLTAHELVASYCALLYDRFGTYEEVARRTGLDRRTVKKHVQERRGAERAC
(631, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 640.00
MFDPLSSRQRRALHLAWRFVRPYRRQALLALLALVVTAGITLSMGQGIRLLVNQGFVTRSPDLLGQSIALFMALVLALAVGTFSRFYLVSWIGERVVADIRRQVFDHLLGLHPGFFEHNRSSEIQSRLTADTTLLQSVIGSSLSLFLRNVLMVVGGIILLFVTNPKLTSIVVLALPLVLAPILVFGRRVRSLSRQSQDRIADVGSYVAETLGQIKTVQAYNHQAQDRQRFGETVEAAFATARKRIVQRAWLITLVIVLVLGAVGVMLWVGGMDVIAGRISGGDLAAFVFYSLIVGSAFGTVSEVIGELQRAAGAAERIAELLAARSEILAPVDGGVRLPERVRGELQLQGVRFAYPTRPDRPAIDGLDLRIEPGQTLALVGPSGAGKSTLFDLLLRFHDPQ

At Epoch: 830.00
MFEPMMEMSDDAVIKVVGVGGGGGNAVEHMVRESIEGVEFISINTDAQALRKTSVNSVIQIGGDMTKGLGAGANPQVGRDAALEDRDRIKESICGADMVFIAAGMGGGTGTGAAPVIAEVAKELGILTVAVVTKPFSFEGKKRMAFAEQGIEELSKHVDSLITIPNEKLLKVLGRGITLLEAFASANDVLKNAVQGIAELITRPGMINVDFADVRTVMSEMGHAMMGSGVAKGEDRAEEAAEMAISSPLLEDIDLAGARGVLVNITAGLDMRLDEFETVGNTVKAFASDNATVVIGTSLDPDMADEIRVTVVATGIGNEKKPDITLVAGGKAKVAPVAQPQSQVATQQPAAVKVEEKPAQTLQEKPAVTAQPATQSTSASNGSGQSTAPKVEKDGYLDIPAFLRRQAD
(831, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 840.00
MCTILQVNNLMMKFEDRLIFDRVSFTLKKGSTTALLGSNGTGKTTLIKILMGMLPATSGDFQYSPKIKVAYVPQFRNLDADYPLSIKAFVELNMPLFKSKKDKAEINRILAETHLSQIQHLRMGAASGGQRQRAYLAQALLDRPDLIILDEATASLDPTAKEELMLLIKHLNEKHQMTVLFTTHDIMLAKKYMHEYLIFHNKTLIHGQMKDFAEEEI
(841, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 850.00
MSQYIFTMNRVSKVVDNQRFILKDISLSFFPGAKIGVLGLNGSGKSTLLRIMAGVDQQYEGEARPQPGIKIGYLAQEPQLDLAKTVREVVEEGVAEIKSKLAQFDEISMRFAEPMSDDEMNALLAQQGELQNEIETCGGWDLDRKLDIAADALRLPDWDAKINQLSGGERRRVALCRLLLSSPDMLLLDEPTNHLDAESVAWLERYLEEFRG

At Epoch: 1030.00
MSEQTTKPLLSLRNVQSGYGDLRVVWDVSLDVWPGKVTALLGRNGAGKTSTLRAISGLNKVSAGTIEFDGRDISKVAPHRRVRQGMAYVQEGKRVFHRQTVEQNLILGGYVRKMRRSAVREEVARIYDLFPVLGRKRDLLAGAMSGGQQQMLAIGQALMAEPTLLLLDEPSGGLAPVIVNEVMERVTALKEAGLAVLLVEQAVDAAMSVADHVTVLDVGRVVMDADAGEIDDRAVIRDAYFGRVG
(1031, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1040.00
MTLNLTPTPNSHVELAPRITVVGVGGGGTNAVNNMIDSELKGVEFVVANTDAQQLAHSKAERRVQLGPHLTRGLGAGAKPEIGREAAEEAAQEIERQLEGANLVFITAGMGGGTGTGAAPVIARMARERGVLTVGVVSKPFNFEGRRRTLAAESGIAELQKHVDTLIVIPNQNLFNSANQNTTFREAFKMADNVLNMGVRGITDLMVSPGLINLDFADVKAVMEEMGKAMMGTGEASSEEDGEDRAVAAAERAISNPLLEDASMAGARGLLINITGGEDLTLYEVNAAADRIREEVAEEANIIFGALIDENMNGRIRVSVVATGIDTQSKNGPQGPETVSAQPPVQPQASSPSSAPRDNTSQPRSAPQNFQPGSTYAGGPARPNFSLDPNAQPQQSQPAPAAESPVVPPAHVAPQQHVPAPQDEMSQRSPRAGLFSEPPRNPAPEAPRSLFGLMTGAFRNRPAPQQPTQSAPRSEPHARDYPSDQHGTQDQRHHQAPTDGSDDSTLDIPTYLRR
(1041, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1050.00
MSKVQQPLPIRKLWPTLKRLLSYGSPYRKPLGLAVFMLWVAAAAEVSGPLLISYFIDHVVAKGTLPLGLVSGL

At Epoch: 1230.00
MSSLLHILRFTRSLTPLYVAVVVCSILTSATTLVVPFLIGRATDSVSGAVSGQTPTDTAVRTVIVLALAVLVAELATTLISNVGGWFGDVMSNRMRTILSVRYYDKLLHLPQRWFDTEITGTIVARLNRSIAEVTGFAKTMSNAFASMLITTAAVLVISAWYAWPLAVLLLVIFPVYVWLTALTSKKWQRLEGEKNEQVDIASGRFAEVIGQIRVVKSFVRERSELEDFSRRFGSTDATTREQSSHWHRMDVLRRSVLNVVFFAIYAIIFVRTVQGHFTLGEMVLLVQLMNMAKSPVESMSWVIDASQRAIAGSRDYFRVMATADDPRTEVLPDAAARLEPVPGAPALEFHHVDFAYESDEDVLHDIDFRLEHGERIALVGESGGGKSTIVNLLLGLYEPGRGSIDVAGHDGRTIPLEQLRSGIGVVFQDASLFSGTIRENIAYGRPDASEEEVRAAAVRANADGFIRRFPGGYDQLIGERGLKLSGGQRQRIAVARAILKDAPILVLDEATSALDTRAERQVQKGLDELMSGRTSLIIAHRLSTIAGVDRIITLREGHIDEIGTPAELAASGGIYAQLLELQNSGRTRALAKFDITG
(1231, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1240.00
MISNALEVTDLSYRKNQKTILTDVNLQISPGKIVGLLGENGAGKTTLMRLIASVAKGERGTIAVNQATRGVERRALVSFSESLQGFRGNDKLSQIRDFYATVYPDFSAKKYLDLITFLQIDDDQKLATLSKGTREKFIIALTLSREAAVYLLDEPFSGIDSMSRKRIISSIIKWKQADATMIISDHYVTEIAALLDEVVIVKDHTICTHKSSDQIRSEFGLGIEAFYESVYGGDIHDDEL
(1241, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 12

At Epoch: 1410.00
MIEIRNLNKKYGNLEVIKNISTKISKGDIISVIGPSGGGKSTFLRCINRLELADSGEILIKGHNILDEQTDINKIRQKVSMVFQHFNLFANKNVLQNLCLAPIKTGILNEDEAVKKAEILLKKVGLSDKKEVMPHKLSGGQKQRIAIARSLMMNPDVILFDEPTSALDPEMIGEVLSIMKDLAAEGLTMIVVTHEMGFARNVANRIFFMDKGELAVDASPKEVFENPKNERLKEFLNKVLNH
(1411, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1420.00
MIPTAAPARSAQLLAVSGLQMRFGGLLAVDGIDFEVRPREVFAIIGPNGAGKTTVFNCVGGFYQPTGGQVMFDGQRIAGLPSHLVARKGLVRTFQNIRLFKQLTVLENLLVAQHLQVETGLLHGLFATPAYRRAERQALERAAQWLERMGLGKVANQEAGTLSYGHQRRLEIARCMITEPRLLMLDEPAAGLNPQEKIALQQLIDGLRREFGISVLLIEHDMSLVMGVSDRIMVMEHGRPIVTAKPDEVRSDRRVINAYLGED
(1421, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1430.00
MSEVIAMTIVLGLTGGIATGKSTADQYFRRKGIPVIDADQISHDIIDIGKPAWEKIRAHFGPKFLNEDQSINRRKLGQFVFQNANELKVLNNITHPLIHEEIIQQIAVAKRKGVDLIVLDVPVLFETNGDLDCDQTLVISLPPQLQLERLIERNHYSIEEAKARIASQMPLRDKEARATYVIENTGTIKELEEKLTKVLNKIKVEG
(1431, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1440.00
MKPTISLLSRQKALTLLTTLLILIESLGKVASSLTLAPLTDQLINK

At Epoch: 1620.00
MAPPLIELSGITKSYGPVKSLQGVDLRLAKGEVLGVVGDNGAGKSTLMKILAGAVQHDGGEMRVNGEPVRFSTPLDAQQKKIGIVYQDLALCDTLDVASNLFLGREPRKGPFLDRHAMHEKAAHILADLHVKVKSTYQEIGQLSGGQRQTVAIARAVSFRPDVLILDEPTAALAVAEVESVLKLITDVAAAGVGVILVTHRLQDLFRVCDRITVMYEGQSVDDVPIGDLNIESLVGLITRTPVPQRTSAATQGATA
(1621, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1630.00
MSKVIGLTGGIASGKSTVSELLTAFGFKVVDADTAAREAVAKGTPGIAKVKEVFGEEAIDENGEMDRKYMGELVFNNPGERIKLNEIVHPKVREIMEEKKQQFLNKGHNVIMDIPLLFENELQDTVDEVWLVYTSESIQIDRLMERNDLTQEEAKARVYSQISIDKKSRMADHVIDNLGDKLELKQNLEKLLSEKGYIEK
(1631, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1640.00
MRQIADTLALKSPTVQSWKKRDGWDAIAPISRVETSMEARLIQLIMKDAKEGRDFKEIDLLGRQIERLARVNRYSQTGSEADLNPNVANRNKGERKTPDKNLFSESAIEKLESIFHENIFDYQRNWFEAGLTHRIRNILKSRQIGATFFFAREALLDAIVTGRNQIFLSASKAQAHVFKSYIIDFARMVDVDLKGDPMVLPNGARLFFLGTNVRTAQSYTGNLYLDEYFWIPKFQELRKVASGMSLHKKWRTTYFSTPSSLAHSAYPFWSGELFNKGRRNKSDHIQLDLSHSHLARGALCADGQWRQIVTVEDALAGGCNLFDLNQLSLEYGPSEYQNLLMCEFVDDQASVFPFKELQACMVDSLEEWEDYNPYSL

At Epoch: 1830.00
MLSATSILPLTVEAAAFSGDGKLLVEPNSFTIPAGGLTVLLGPNGAGKSLTLRLCHGLLTPSRGAVRWAAGAEGRAKRHAMVFQKPIMLRRSVEANITHALAAAGANSAERKARAAQALQRFGLAERASQPARLLSGGEQQRLAIARAWALRPELLFLDEPTSQLDPAATRQIEELLSGLVAEGITVMMSTHDLGQARRLADRVLFLHRGRLVEDAPAKDFFAGPHSAEARAFLAGELLW
(1831, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1840.00
MSALIRLDNLTVTYERHPAVHHVSGAFQAGSLTAIAGPNGAGKSTLLKAIIGELRVAEGSIDRGQLTRNDFGYLPQAAEVNRRFPISVADTIMLGAWKNSGAFGRFSREDAERARDALAAVGLSGFERRHIGSLSAGQFQRVLFARLLLQDARVILLDEPFTAIDQRTTRDLLDIVLRWHGDGRTVIAVLHDFDQVRAHFPETLLIARELVGWGRTQDVMSPANLIRARAMAESWDEDAESCSPEPDHAGHELVKREPAE
(1841, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1850.00
MKDIIKIRNLNFSYDKQVVLEDINLDYSSDEFLAIIGPNGGGKSTLLKLILGLLKPQSGEIKLFGKEPSEVSKFIGYVPQNFLSNQSFPMMVLEVVLMGLIDKKIFGFYSKDEKALALSALEKVGMREFANARIGELSGGQRQRVYIARALCANAKVLILDEPTASIDTKGQAEIYEILKGINANGVGVVLVSHDLNIVLNYATKIAYVSKNLHIHKTHENLAKREFIEHLARTHSHFCDVEIALGECGCEKTKSNVFKF
(1851, 1900)
../../out/201120//pdt_motor_unirep1900.npy
At Epoch: 1860.

In [17]:
# Completion marker printed once the preceding cells have finished.
print("done")

done
