# Documentation
- The following script generates embeddings using the ESM models, the UniRep 1900 (babbler) model, and the TAPE bert-base model. Note that embeddings are generated only for the Pfam domain sequences, not for the full UniProt protein sequences, which may contain multiple domains.
- The UniRep 1900 and bert-base models are embedded directly through the TAPE interface; the ESM embeddings are taken from layer 12 of the 12-layer model and layer 34 of the 34-layer model, respectively.

In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm 
import glob
import re
import requests
import io

import torch
from argparse import Namespace
from esm.constants import proteinseq_toks
import math
import torch.nn as nn
import torch.nn.functional as F
from esm.modules import TransformerLayer, PositionalEmbedding  # noqa
from esm.model import ProteinBertModel
import esm
import time

In [2]:
# Load the domain table (one row per sequence; includes a "seq" column with the amino-acid string).
pdt_motor = pd.read_csv("../../data/thermo/pdt_motor.csv")

In [3]:
# Preview the first rows to check the column layout.
pdt_motor.head()

Unnamed: 0,uniprot_id,pfam_id,is_thermophilic,token,seq,clan
0,K1INX9,PF00709,0,[11 16 16 8 18 18 18 6 1 12 19 6 3 4 6 ...,MSSIVVVGANWGDEGKGRIVDYLAGQAGASIRFQGGNNAGHTVVND...,p_loop_gtpase
1,A9AVV2,PF00006,0,[11 1 18 17 1 4 3 8 10 16 15 10 9 1 16 ...,MAVTAEDILSRLKASIQQPVGGDPTAVNVGTVASVGDGVARISGLR...,p_loop_gtpase
2,L9Y3C9,PF13361,0,[11 8 17 17 15 2 3 3 18 18 4 3 17 20 7 ...,MITTRCDDVVEDTYHVRVLTIHASKGAEATDDCCFDGITGEIAREM...,p_loop_gtpase
3,A0A062ACI1,PF03618,0,[11 16 4 16 9 14 5 9 15 16 18 5 5 8 16 ...,MSESKQFKRSVFFISDGTAITAETLGHSLLAQFPNVDFDIHIMPYI...,p_loop_gtpase
4,A0A102DZZ7,PF00480,0,[11 14 12 8 18 6 8 3 8 6 6 16 7 8 17 ...,MQNIVGIDIGGSHITLAQVDPDKHEIITSTYVRERVDSFADPETIF...,actin_like


In [4]:
# (number of rows, number of columns) of the loaded table.
pdt_motor.shape

(501461, 6)

In [5]:
# esm_t12_thermo, esm_t34_thermo, unirep_thermo and bert_thermo should be averaged hidden states,
# stored as ndarrays with shape (# rows in pdt_motor, # hidden states).
# Print the first sequence of the table as a quick sanity check.
seq = pdt_motor["seq"][0]
print(seq)

MSSIVVVGANWGDEGKGRIVDYLAGQAGASIRFQGGNNAGHTVVNDLGTFKLHQVPSGVFNPDCLVVLGPGMVISPEKLTVELEEVKASGVTPKLAISDRATLCLPLHALEDTLEEQRLGDGAYGSTRQGIAPAYGDRVMKKAILVGWLKQPDVLVERIQFMLDWKLPQMKAIYPSFEFTQTAQEMADWLLEVSAPWIDAVCNVSMPLKALQAEGKTLLFEAQLGAGRDLIYGEYPWVTSSHVSGAYAGIGGGLPGLRPERVIAVAKAFSSSVGTGTLLTAMENQDEFRKITNEFGATTGRPRDVGYFDAVATKNGVELQAATEVALTKLDCLTGLPDLKICVAYEGAHTENPIWPQTAALKPVYEQMESWSEDITGCRTFEELPKAAQQYVLRIEELLGVPVPMVSVGPGRDEMILR


In [6]:
# Download the 12-layer ESM-1 checkpoint and rebuild the model from it.
model_name = "esm1_t12_85M_UR50S"
alphabet = esm.Alphabet.from_dict(proteinseq_toks)
url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"

# Without a GPU, the checkpoint tensors are remapped to the CPU while loading.
if not torch.cuda.is_available():
    model_data = torch.hub.load_state_dict_from_url(
        url, progress=False, map_location=torch.device('cpu'))
else:
    print("cuda")
    model_data = torch.hub.load_state_dict_from_url(url, progress=False)


def pra(s):
    """Rename a checkpoint argument: keep only the text after 'decoder_'.

    Names that do not contain 'decoder' are returned unchanged.
    """
    if 'decoder' not in s:
        return s
    return ''.join(s.split('decoder_')[1:])


def prs(s):
    """Rename a checkpoint state key: keep only the text after 'decoder.'.

    Keys that do not contain 'decoder' are returned unchanged.
    """
    if 'decoder' not in s:
        return s
    return ''.join(s.split('decoder.')[1:])


# Translate the checkpoint's argument names and state-dict keys into the
# names the model constructor / load_state_dict are given below.
model_args = {pra(key): value for key, value in vars(model_data["args"]).items()}
model_state_12 = {prs(key): value for key, value in model_data["model"].items()}

model_t12 = esm.ProteinBertModel(
    Namespace(**model_args),
    len(alphabet),
    padding_idx=alphabet.padding_idx,
)
model_t12.load_state_dict(model_state_12)

cuda


<All keys matched successfully>

In [7]:

# Download the 34-layer ESM-1 checkpoint and rebuild the model from it.
model_name = "esm1_t34_670M_UR50S"
alphabet = esm.Alphabet.from_dict(proteinseq_toks)
url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"

# Without a GPU, the checkpoint tensors are remapped to the CPU while loading.
if not torch.cuda.is_available():
    model_data = torch.hub.load_state_dict_from_url(
        url, progress=False, map_location=torch.device('cpu'))
else:
    print("cuda")
    model_data = torch.hub.load_state_dict_from_url(url, progress=False)


def pra(s):
    """Rename a checkpoint argument: keep only the text after 'decoder_'.

    Names that do not contain 'decoder' are returned unchanged.
    """
    if 'decoder' not in s:
        return s
    return ''.join(s.split('decoder_')[1:])


def prs(s):
    """Rename a checkpoint state key: keep only the text after 'decoder.'.

    Keys that do not contain 'decoder' are returned unchanged.
    """
    if 'decoder' not in s:
        return s
    return ''.join(s.split('decoder.')[1:])


# Translate the checkpoint's argument names and state-dict keys into the
# names the model constructor / load_state_dict are given below.
model_args = {pra(key): value for key, value in vars(model_data["args"]).items()}
model_state_34 = {prs(key): value for key, value in model_data["model"].items()}
model_t34 = esm.ProteinBertModel(
    Namespace(**model_args),
    len(alphabet),
    padding_idx=alphabet.padding_idx,
)
model_t34.load_state_dict(model_state_34)

cuda


<All keys matched successfully>

In [8]:
print_every = 2000
def generate_embedding_transformer_t12(model,batch_converter,dat,dat_name,out_dir,seq_col,repr_layer=12):
    """Embed every sequence in ``dat`` and save the mean-pooled vectors as ``.npy``.

    Each row is run through ``model`` on its own. The per-token
    representations of layer ``repr_layer`` are averaged over the sequence
    positions, and the resulting matrix (one row per input row) is written to
    ``<out_dir>/<dat_name>.npy``.

    Args:
        model: module called as ``model(tokens, repr_layers=[...])`` that
            returns a dict with a ``"representations"`` entry keyed by layer.
            It is moved to the selected device and left in eval mode.
        batch_converter: callable mapping ``[(label, sequence)]`` to a
            3-tuple whose last item is the token tensor.
        dat: DataFrame holding the sequences; column 1 (by position) is
            passed to the converter as the label.
        dat_name: file name (without extension) of the saved array.
        out_dir: output directory; created if it does not exist.
        seq_col: positional index of the sequence column in ``dat``.
        repr_layer: layer whose representations are pooled (default 12).

    Returns:
        None. The embeddings are only written to disk.
    """
    import os  # local import: ``os`` is not among the notebook's imports

    # Use the GPU when present, otherwise fall back to the CPU instead of failing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Resolve and prepare the output location up front, so a bad path fails
    # before the (long) embedding loop rather than after it.
    out_path = os.path.join(out_dir, dat_name + ".npy")
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    model = model.to(device)
    model.eval()  # inference mode: disables dropout, if the model has any

    sequence_embeddings = []
    with torch.no_grad():
        for epoch in range(dat.shape[0]):
            seq = dat.iloc[epoch, seq_col]
            data = [(dat.iloc[epoch, 1], seq)]
            _, _, batch_tokens = batch_converter(data)
            results = model(batch_tokens.to(device), repr_layers=[repr_layer])
            token_embeddings = results["representations"][repr_layer]
            # Average over positions 1..len(seq); position 0 is skipped
            # (presumably a start token added by the converter - confirm).
            sequence_embeddings.append(token_embeddings[0, 1:len(seq) + 1].mean(0).cpu().numpy())
            if epoch % print_every == 0:
                print(f"At Epoch: {epoch:.2f}")
                print(seq)
    sequence_embeddings = np.array(sequence_embeddings)
    print(sequence_embeddings.shape)
    print(out_path)
    np.save(out_path, sequence_embeddings)
    return

In [9]:
# Converter that turns (label, sequence) pairs into the token batch fed to the model.
batch_converter = alphabet.get_batch_converter()
# Directory handed to the embedding function as the destination of the saved .npy file.
out_dir = "../../out/201120/"

In [10]:
# Embed every row of pdt_motor with the 12-layer model; seq_col=4 selects the sequence column by position.
generate_embedding_transformer_t12(model_t12,batch_converter,pdt_motor,"pdt_motor_t12",out_dir,seq_col = 4)

At Epoch: 0.00
MSSIVVVGANWGDEGKGRIVDYLAGQAGASIRFQGGNNAGHTVVNDLGTFKLHQVPSGVFNPDCLVVLGPGMVISPEKLTVELEEVKASGVTPKLAISDRATLCLPLHALEDTLEEQRLGDGAYGSTRQGIAPAYGDRVMKKAILVGWLKQPDVLVERIQFMLDWKLPQMKAIYPSFEFTQTAQEMADWLLEVSAPWIDAVCNVSMPLKALQAEGKTLLFEAQLGAGRDLIYGEYPWVTSSHVSGAYAGIGGGLPGLRPERVIAVAKAFSSSVGTGTLLTAMENQDEFRKITNEFGATTGRPRDVGYFDAVATKNGVELQAATEVALTKLDCLTGLPDLKICVAYEGAHTENPIWPQTAALKPVYEQMESWSEDITGCRTFEELPKAAQQYVLRIEELLGVPVPMVSVGPGRDEMILR
At Epoch: 2000.00
MTSTTLERTSTDPRPPARVTGVLEIAQGGQGHLRVDSGMPNPADPQVPAALIRRHGLRKGDTVEGVRDGRRGLIDVERINGRTPEELRRRPHFHDLTPLHPRERLRLEHPASGLTGRLIDLVAPVGKGQRGLIVAPPKTGKTVLLQQVAAAIAANHPEARLMVVLLDERPEEVTDMRRSVRGEVFASTFDRSPKQHIALAELVVERAKRLVEQGEDVVILLDSLTRLCRAHNNAAAAGGRTLSGGVDAAALHGPKRLFGAARLTEEAGSLTILATALVETGSRADGFFFEELKGTGNMELRLDRALADRRVFPAVDITPSGTRREELLLAPGELTAVRGLRRALGSREGQANLETLLERMRATPDNATFLRQVQPTLPAA
At Epoch: 4000.00
MNNSHNYGDGGDIVVSIKGVGKTYKSGFTALKGVDLDIRRGEILALLGPNGAGKTTLIGIVTGLVNASEGTILVDGDDVRRDYRRVRPKIGLVPQELFTDSFETVWDTVSFSRGLFGKPADPAHIERVLKALSLWNKKDNKILSLSGGM

At Epoch: 42000.00
MASIKPAEVSAILKEQLTNFEAQASLSEVGTVLQVGDGIARVYGLSNVQYGELVEFENGLEGIVLNLEEDNAGVVLLGASTSVREGSTVKRTERIASLRAGEGIVGRVVDTLGSPIDGKGPIEGTTYEMPLERRAPGVIYREPVTEPLQTGIKSIDAMIPVGRGQRELIIGDRQTGKSTVALDTILNQKEFYDAGNPVYCIYVAIGQKASTVAAIANMLEERGALAYTTIVAANASDPAAMQVYAPFAGAAIGEYFRDTGRPALIIYDDLSKQAVAYREISLLLRRPPGREAYPGDVFYLHSRLLERAAKVINDNKIASEMNDLPDSLKGIVKGGGSLTALPIIETQAGDVSAYIPTNVISITDGQIFLDGDLFNSGVRPAINVGISVSRVGGNAQIKSMKKVSGTLKLDQAQFRELEAFAKFGSDLDAATMSVISKGQRNVEILKQAQNDPFTVEDQIAIIYAGSKNLLKDVPVNKVKQFEKDYIDYLNAKHRDTLDVLKSGKLTPEATATLEEAAAEISKHFA
At Epoch: 44000.00
MSDERYQQRQQRVKEKVDARVAQAQDERGIIIVFTGNGKGKTTAAFGTATRAVGHGKKVGVVQFIKGTWPNGERNLLEPHGVEFQVMATGFTWDTQNRESDTAACREVWQHAKRMLADSSLDMVLLDELTYMVAYDYLPLEEVVQALNERPHQQTVIITGRGCHRDILELADTVSELRPIKHAFDAGVKAQIGIDY
At Epoch: 46000.00
MIFEANNLSIEINKKKIVTNFNLSVKAGDFILITGKSGTGKTTLINNLSLLEKVYTGSLNYEQFENTKKNCQRIRKNVISYMFQNYGLLENMTVLENLKLAIKYNKSFKKSDLTLLLEKFSLSESILTKKVFLLSGGEQQRIALIRSLLKPFDIIFADEPTGNLDDENASFIIKYFQYLVTEKNKAVVMVTHDKQLLKYASLVIDLDLKQTQ
At Epoc

At Epoch: 90000.00
KSQVFSTAEDNQNAVTIRVFQGEREMAADNKVLGQFDLMGIPPAPRGMPQIEVTFDIDANGIVNVSAKDKATGKEQQIRIQASGGLSESDIQKMVKDAEANAAEDKKRREAVDAKNHADALVHSTEKALAEHGSKVEESERRAIEDAVSDLKEALKGEDAEAIKAKTNTLAQASMKLGEAMYKQQAEADAKRDAAKDDV
At Epoch: 92000.00
MGFQCGIVGLPNVGKSTLFNALTKTAAAQAANYPFCTIEPNTGEVAVPDPRLYAIRDIAQSKEVIPTRITFVDIAGLVRGASKGEGLGNQFLANIREVDAIAHVLRCFEDDDITHVDGAIDPIADADTVETELMVSDMESLEKRIAPLKKKATGGDKEAKAVLPIMEGALALLQDGKPARMLETADAEEAKMLQGLNLLTSKPVLYVCNVDEASAGEGNELSRKVAEKAAAEGAVAVVISAAIEAEISQLDKEEQEEFLETIGLEEPGLDRMIRAGYELLGLITYFTAGPKETRAWTITDGTKAPGAAGVIHTDFERGFIRAQTIAYDDYTSLGGESAAKEAGKARDEGKEYIVKDGDVLLFKFNT
At Epoch: 94000.00
MMIGLTGGIASGKSSVAKMMEELGLPIVDADQVARDVVEPGMPAYEAIVAHFGTGVVNDDGTLNRKALGSIVFQQEEERRVLNEIVHPAVRRQMQQQKEQLIRSGEKTIVFDIPLLYESNLFYLVEKVLLVYVDEHTQLQRLMNRDQAGKDDAIHRIRSQRPLESKRDRADAIIDNSGTLDATKRQLIDILKRWKVIPEDQ
At Epoch: 96000.00
MTRPLPGTLGIDFGTSNSAMAWAAPGGTARLIPLEGAATAMPTAVFYNHEDLSTHFGRDAVALYLEGTEGRLMRSLKSLLGSPLLLETTVVNNRQVSFQDIIATFLATLRDRATLALGAAPLRVVMGRPVHFVDDDAERDALAQQSLLQAAQAVG

At Epoch: 136000.00
MSENSPAATQRDLWMRVRARLKAAVGEDVFTSWFARLELEELVDDLVHLSAPTRFLCSWVQSNYSDRIVEAFRHDVPDAARLHVTLRVNGQARPRLAPAPVVAAEPAEETPAAKPAIQAAVEAPSAAPVARLVQQAQKGDALAGSAIDQRMTFDTFVSGEANEMAFGVAKQIANAAINNTVTFNPVYIHSTVGLGKSHLLNAIAHAVSQADSSKNIVYLTADHFMYHFITAVQRQSALGFKEWLRKVDLLLIDDMQFLQGKSATEFGHTLGALLTGAKQVVVAGDAPPRDLEMLDERVRSRLSGGLVVPISTFDMDLRRAIVQRRADQATQRFGMHFPPAVLDYVARAVTSHGRDLDGAVNRLVAANQLTGELITVPLAEKTLADLIRARDAKRVRIEDILKIVSRHYKVPRNELLSARRSRDVVRPRQIAMYLAKALTSRSLPEIGRRFGGRDHTTVLHSVRKVEQMIKDDLELGQEIELLKRMLEE
At Epoch: 138000.00
MAQDSSSPGSGPQRILLVTGLLGAGKTTALRTLEDLGWEAIDNFPIRLLDRLLETEPGSARMESGAPMAIGFDTRTRGFDPARTIQLVKKLSQRKDIEVTTLFLDCGGAELERRYNETRRRHPMSEDKPASTGIMAERELLEPLRRWADVVISTTSFTSNDLQQAIRDQFGDVTASAPTLTISSFGFSRGTPPLADLVFDMRFLANPHWVDELRPQTGQDPAVGDFIRQDPAFAETFQRIRDLLLLLLPRYRQQGKAYVHVAFGCTGGKHRSVFMAEQIASALRAEGFSPTLLHRNLASRAADLLEGRKA
At Epoch: 140000.00
MAILTVENLGHSFGDRTLFKDVSFRLVEGDHIGLVGANGVGKSTLMGIITGQTIHDTGKVEWLPGTHYGYLDQHTVLTAGRTMRDILRDAFLPLYKKEEEMNEITAKMADATPEELETLLEQMADIQDALDAGDFYTLDI

At Epoch: 180000.00
MNHSETIVAQATPPGRGGVGILRVSGPQAADVARLVLGKLPRPRHADYLPFMDNHNVKLDQGIALWFPGPNSFTGEDVLELQGHGGPVILDLLLKRITSISGVRIARPGEFSERAFLNDKLDLAQAEAIADLIDASSEQAARSALNSLQGAFSLRINELVEALTHLRIYVEAAIDFPDEEIDFLSDGKIEAQLNQVIARVNDVRAEARQGSLLREGMKVVIAGRPNAGKSSLLNALAGREAAIVTDIAGTTRDVLREHIHIDGMPLHIIDTAGLREASDEVERIGIERAWQEIEQADRVLFMVDGTTTRETEPAVIWPDFITRLPASLPVTVVRNKADITGEEARIEKVNGHSLIRLSARSGEGIEVLRNHLKASMGFAGNTEGGFLARRRHLQALALASTHLLQGKEQLTGTKAGELLAEELRAAQLALSEITGEFTSDDLLGRIFSSFCIGK
At Epoch: 182000.00
MDIRAAEISAILKSQIANFGVEADVSDVGQVLSVGDGIARIHGLDNVQAGEMIEFPKAGVKGMALNLERDNVGAVIFGADAQIAEGDEVRRLGEIVDVPVGKGLLGRVVNPLGEPIDGKGPIQFTERRRVDVKAPGIIPRKSVHEPMQTGLKAIDTLIPVGRGQRELIIGDRQIGKTAVAIDAILNQKSINVEGASEGEKLYCIYVAIGQKRSTVAQIVKTLEERGALDYTIVVAATASEPAPLQFLAPFAGTAMGEFFRDNGMHALIIYDDLSKQAVAYRQMSLLLRRPPGREAYPGDVFYLHSRLLERSAKLNEDNGSGSLTALPIIETQANDVSAYIPTNVISITDGQIFLESDLFYQGIRPAVNVGISVSRVGSSAQIKAMKQVAGSIKGELAQYREMAAFAKFGSDLDVSTQQLLARGARLTELLKQPQYSPLTVEEQVVSVYAGTRGFLDKIAVADIGRFEAELLARMHSAHQSTLDAIKSTKALSKDLEAELKAAIEA

At Epoch: 224000.00
MIAFENVNKWYGDYHALNDISAEIGRGEVVVLCGPSGSGKSTLIRTVNRLEEIQNGRIEFDGEDIHSRRLDLNKFRSHVGFVFQSFNLFPHLSVAENIMLAPVTVLKKKRNEARERALQLLARVGLAAKADAYPGQLSGGQQQRVAIARALAMDPPAMLFDEPTSALDPEMVGEVLQVMKSLARDGMTMMCVTHEMNFAREVADRVWFLDQGRIVESGSPAEFFSNPQSDRAKKFLSDLRSH
At Epoch: 226000.00
MAAKKDKSVPDSKITDKEGKEKAVKDAMAAITKGFGSGLIMKLGEKSSMNVESIPTGSINLDIALGIGGVPKGRIIEIYGAESSGKTTLALHVIAEAQKQGGTVAFIDAEHALDPVYAKALGVDIDELLISQPDYGEQALEIADTLVRSGAIDLIVIDSVAALVPKAEIDGEMSDQQMGLQARLMSKGLRKLTGNLNKYKTTMIFINQIREKIGVTYGPTTTTTGGKALKFYSSVRMEVKKMGTVKQGDDPIGSEVIVKVTKNKVAPPFKEAAFEILYGKGISKVGEIIDAAVAKDVIVKAGSWFSFRDQSIGQGKEKVRAELEINPELLAQVEKDLKEAIAKGPVDKKKKKSKKEASSDDTDDENLEIDDAIDENND
At Epoch: 228000.00
MGIINAEKLKAIESAMSHIEKQFGKGSVMKLGDHNISNMDAISTGCLDLDIALGIGGVPKGRIIEIYGPESSGKTTIALHIAAESQKKGGAVGYIDAEHALDPSYAQKLGVDVDSLIISQPDTGEQGLEIAEALVRSGAIDVLVVDSVAALVPKAEIEGEMGDSHIGLQARLMSQALRKLAGTINKTNCVAIFINQLREKVGVMFGSPETTTGGRALKFYASVRLDIRRIDSIKQGDSIIGNRTRIKVMKNKVAPPFKQAEFDIMYNEGISRCGNIVDVGVKEEIVQKSGAWFSYGDIRLGQGRENAKLYLKENPEVA

At Epoch: 268000.00
MAKLYYRYGTMQSNKSNQIITTHHQYTTQGKQCLAYSTPIDTRSGHKRIKSRIGLELVCEYITENIYEEVKAIHEKDRVHAVVVDEAQFLSRADVHRLSDIADVLDIPVICFGLKTDFRNHLFEGSRVLLELSDAIDELKTICQFCNKKATLNMRLLNGVPTNIGETIQIGDEEYVPVCRKCYKERLELV
At Epoch: 270000.00
MTNENKNTAHFTDFPIHSALLEALEDIHFTKTTPIQAQTLPLTLAGYDVMGIAQTGTGKTAAFLLSLMHYLMTNPVHPKAKGPWAIVLAPTRELAIQIKKEMDLLGAYTGLVSLAIYGGTSIEHQKKLFQACNVDVIIGTPGRIIDLFKQKVFRLKNIEVCVLDEADRMFDLGFIDDVRYLLRQMPPAYERLNLLFSATMPQKVQELAYEHFNAPKVVAIESQQTTANNITHYLYHTAKHEKTPLLLGLFARERPERSMIFLNTKHDLERLSLVLTANGYHNAALSGDVAQKKREQIIRDFQEGTVNIVVATDVAARGIHIDGITHVFNYDLPQIAEDYVHRIGRTARAGASGTAISFACEEYVYSLPEIEHFLGEKIPVKPIEESLIVDVIPPSDEALDALRNEKESRIRIHNRGNRKPMTKSIRRRRYQK
At Epoch: 272000.00
MVDYDRLVPYGWTEAVASEYLPVLAEGLSPARIVRMDRSECDVVTAHGPTRAACPRDITGLCTGDWVGIDAERTVRQLLPRRSVIMRSSVSGRSVAQVLATNVDTVLICTAADGDVDLGRIERMLALAWESNAQPVVVLTKADAAVDIPLDEVRAIAPGATVLAVSAAADVGLDVLRAVLTGTVALLGPSGAGKSTLANALLGADVFATNAVRAVDGKGRHTTVHRELRPLPGGGTLIDTPGLRGIGLYDAAEGIGKTFSDIESLAADCRFDDCAHETEPGCAVLAALADGTLPDRRLASYRKLAKENEWMAARTD

At Epoch: 314000.00
LGSNENVVEVETVSTGSLSLDIALGIGGLPRGRIVEIYGPESSGKTTLALQTIAECQKKGGICAFVDAEHALDPIYARKLGVDLQNLLISQPDTGEQALEITDTLVRSGAVDILVVDSVAALTPRAEIEGEMGDSLPGLQARL
At Epoch: 316000.00
MLKRFKSQNRSMIGMDITSLSIHALQITKIAATYHIDNFLSESLDPHVIVDNQIRDQEALFNSIKKLLFRANWSCKSIVLAVPDAAIITKVIQIKNNLPKDEMDEIIFLEASKHFSCPLHEINMDFAIRGPSSIHGNMLDVLITACRAEHVNTRVEAITRAGLTAQVVESESSALERVIPLLTTRLSKEEKKGIAIIHINELFTHLKMAQDGKIIFAHEEASGCNQWGEVLMPQHDNSKEGNYCKMERSITLGEQKSQVIRMEQLIRQIKRMLQFFYSANSSKVIHHLFLAGDTARLPNFCQRLQEKVGIPTSLANPFKEMCFSNNAFSQNFIEKAPSLLIACGLALRE
At Epoch: 318000.00
MSNSAHDTIAAQATAPGRGGVGIIRVSGPQALHIAEQVIGHTPKPRYAHYGDFHAAEGQVIDQGIALFFPGPHSFTGEDVLELQAHGGPVVMDFILQRVITLGARPARPGEFSERAFLNDKLDLAQAEAIADLINSTSEQAARCALRSLQGAFSARIQALLERLIQLRIYVEAAIDFPEEEIDFLADGKVRDDLLQIITDLEAVQQEAHQGSLMREGMQVVIAGKPNAGKSSLLNALAGRETAIVTDIAGTTRDVLREHIHLDGMPLHIIDTAGLRDAPDQVERIGIDRAWQEIHKADRILMVVDSSETSAQSPEQIWPEFVAQLADASKITVIRNKIDLQAEKPTLQQQGEYSLISLSAKHGQGVELLREHLKACIGFSNTTEGGFMARRRHLDALQRCHQQLDAGLAQLDGFSAGELLAEDLRMAQQSLGEITGEFTPDDLLGR

At Epoch: 356000.00
MRFTIQNQNNTQILYPELLVSKDTQNQQSFQSIYKIANLLPQKLKEEFQYSQELQQHLQGKRLLLDELPFPLSDIHKHYLNGYIQYHHGIIITPKKQFICQRCGNQNQHIFASFHCARCCEQGCTYCRNCIMMGRVSQCTLLITWTGPSSQKTAHSKSVLNWSGQLSPGQQTASDKVIEAINNQQDLLVWAVCGSGKTEVLFQGIKTALQHGKQVCITTPRTDVVHELTPRLKQAFPNTTLISLYGGSEDRSKTAQLTISTTHQLLRYYKNFDVIIIDEVDAFPYSADPSLQYAVHQAKKEISAMLYLTATPNAEFKQKSNKNKIQTVTIPARYHRQPLPVPIFRWCGNWKNRLKKKHDIPTNLKNWITNQLQNNKQAFLFVPHITTLNEVIPSLKKINPSIEGVHSEDLNRKQKVQAFREGTIPILVTTTILERGVTVPNTDVAVLGAEDDIFTESALVQIAGRVGRSPKYPTGEVIFFHYGKTKAMNDARTQILKMNKQGRSLGFLD
At Epoch: 358000.00
MPTLVAGIDSSTQSCKVVIRDAHTGELVRTGSAKHPEGTEVAPAAWWDALLDAIAAAGGLADVAAASVGGQQHGMVALDAEGEVIRDALLWNDTRSADAAAQLIAEFGGGAEGAAAWTSMTGSVPVASLTVTKLRWLADAEPENAGRVAAVALPHDWLTWKLSGSTELADLVTDRSDASGTGYFDAATGSYRYDLLARALRISEDAARAIILPRVAGPQEQVGTGDAARDWAHLLLGPGAGDNAAAALGLGMRTGDVAISIGTSGVVSAVSPKPIQDPSGMVTGFADATGEFLPLAVTLNGSRVLDGAAKMLGVDHAGLADLALAAAPGANGLTLVPYLEGERTPNLPHATGSFIGLTLASMNPEDVARAAVEGLLCGLADGLEAMTSQGVPVESITLIGGAARSAAVQQIAPAILGREVSVPAPGEYVADGAARQAAWVLSGAAQPPAW

At Epoch: 398000.00
MPEGYKSGFVSIVGNPNVGKSTLMNYLVGERISIITSKAQTTRHRILGIVNSDRMQVVYSDTPGVLQPSYKLQERMRAYSEQALEDADLLLYVTDTMEERDKHHDFVERVQRLSCPIIIVINKVDLTEQKHLEELVDYWHSVIPQAEIIPVSALRQFNLAPLKKRIEELLPVSPPYFEQDALTDRPARFFVSEIIREKALQYYHQEVPYAVEVVVEEFVESPDRIDMRCVILVERESQKGIIIGHKGSAIKRLGISARKDLERFFDKHIHLTLLVRVDKDWRQSDQALTQFGYDI
At Epoch: 400000.00
MMTVIVLSGPIGAGKSSLTSILAEHLGSNAFYEDVSKNPVLPLYYKDMKRYTFLLNTYLLNKRLAQINEALSENNPNVVLDRSIYEDALFFKMNADSGIADPTECDIYVDLVHNMMEDVPGNPHKKPDLLIYINVSLDNMLSRIKRRGREFEQLETDPSLKEYYARLRNYYGPWYENYSESPKMVIDGDKYNFVDNEDDRKAVLELIDNKLKELGNL
At Epoch: 402000.00
VIFINQIRMKIGVMFGNPETTTGGNALKFYSSVRLDIRRIGSIKKNDEVIGNETRVKVVKNKVSPPFREAIFDILYGEGISRQGEIIDLGVQAKIVDKAGAWYSYNGEKIGQGKDNAREFLRENPEIAREIENRIRESLGVVTMPDGAAQDEAEAMDE
At Epoch: 404000.00
MALVEIRRLTKQFRKGDEVITPLLDVDLDIERGDFVSLMGASGSGKSTLLNSVAGIDRPTSGEIVINGSDITRLSRGALADWRAANIGYIFQMHNLIPVLTAYENVELPLLLLPISSAERHKRVEIALEAVNLRDRAQHYPRQLSGGQEQRVGIARAIVASPTIIVGDEPTGDLDADTTEQILELVQRLNDELGMTLLLVTHDPKVAQLARRQIRLEKGKLIEKGRDLALASTPYPLEKRA
At Ep

At Epoch: 446000.00
MGKNVVVLGTQWGDEGKGKIVDLLTEHATAVVRYQGGHNAGHTLVIDGEKTVLHLIPSGVLREGVQCLIGNGVVVAPDALMREITKLEEKGIPVRERLRISPSCPLILSYHVALDQAREKARGEHKIGTTGRGIGPAYEDKVARRGLRIGDLFHRERFAAKLGELLDYHNFVLVNYYKEPAIDFQKTLDECMEYAELLKPMMLDVTAVLHEMRRAGKDIMFEGAQGSLLDIDHGTYPYVTSSNTTAGGIATGSGMGPMYLDYILGITKAYTTRVGSGPFPTELFDDVGAFLAKRGHEFGATTGRARRCGWFDAVILRRAIDVNSISGICLTKLDVLDGLETINICVGYKNENGAVIDAPTDADSYIGLEPVYEEVPGWSESTLGAKTLEELPANARAYIKRLEELVGAPIDIISTGPDRNETIVLRHPFD
At Epoch: 448000.00
MRREDRAEGSASLGDRLLALFGEVGTRAKRRRPGVWRGLIGGLDDGGILNLSLVALLSAVCGTAVLLLLNAEAREVEYHGYSDLIAFGFLALLVIYRWSQNYLIREATQAIELALHERRLSTTRNVLRLSLEDVQALGLRSVIDGIGAHYGTLSQTLVPIVAGVEGVVLLVFMFGYLVILSPMAAALTTVVVAVTVMGFLASRSRLDEDLSLAVRSEEAFRDLAEGLVRGKKELRLNPDKRSAFQNDMIERSEELAICRSNAAAHFASMLATGNSASYLMAGAVVFVLPLITGVEQTDISRIVVAVIFLLGPISSVVQTFQQVTTARFALAEIGAFQDRVEALADRHDSDRSNGRVPAFQELRLESLSYSHAGENGFSIRDIDLQLQKGEILFVTGGNGSGKTTLLRIITGLYPRHEGAMRLNGERLAHYPPQSYRNLFASVFSDFHVFQHPYGLDDAGLERLEQWLVEFDIRHKLGDDLGFIDANALSTGQKKRVGLALALAEQRQILVLDEWAADQDPATRRRFYHE

At Epoch: 486000.00
MFLGLDIGTGGTRAVLVSPAGELVASASAEHESFRSPEPGWAEQDPHDWWRAAQQAIRETLAQVPGVKIDAVGLTGQMHGAVMLSKDGSVLRPSLIWCDQRTQPQCDWLHEQFGGGEAGREKLIELTANPALPNFTLTKLLWVRDHEPEIFARIAHVLCPKDYVRFRMTGTYAIDVQEASGTLLLDVAHRRWSSEVARVAGIPESWLPEVFESPEVCAHISAEAAQLLGIPQGTPVVAGAGDQGAGAVGMGILEPGSVSATIGTSGVVFAATAAPTRDPRGRLHTFCHAVPGRWHVMGVTQAAGLSLRWFRDTFSPCVDYNDFTTGAANIPAGSEGLLWTPYLLGERTPHLDPEARAAFVGIHANHTRDHFVRAVMEGVAYSLRDTFTLFAELGIPVKGVRLGGGGARGKLWRQIQADIYNHTVDVLTAEEGGAFGAALLAGVGAGAWANLDEACKAGIHVAQQIPPQPAAVERYERAYKAFRAVYPALKNIHA
At Epoch: 488000.00
KSQVFSTAEDSQNAVTIRVFQGEREMAADNKMLGQFDLMGIPPAPRGMPQIEVTFDIDANGIVNVSAKDKATSKEQQIRIQASGGLSEADIEKMVKDAEANAEADKKRREAVTAKNDADGLVHSTEKALAEHGSKVAETERRAIEDAVSDLKEALKGDDAEAIKAKTQTLAQASMKLGEAMYKQQAEADAKKDAAKDDV
At Epoch: 490000.00
MSGTIPAISVRGLRKRYGDQQVLGGLDLSVPAGSVYALLGANGAGKTTAVRILTTLIPFDGGEVTVAGHSLPGDPQNVRRRISVTGQYAAVDTLLTGFENLILLGELNHLGRKNSRRRAEELLDQFDLTDFGDKRAGDYSGGMLRRLDIAMSLVARPDVVFLDEPTTGLDPRSRHEMWSIVRGLTASGVTIFLTTQYLDEADELADTVGVLSGGRLVAEGTPDELKRLVPGGHVALQFENLAQLE

In [11]:
print_every = 5000
def generate_embedding_transformer_t34(model,batch_converter,dat,dat_name,out_dir,seq_col,repr_layer=34):
    """Embed every sequence in ``dat`` and save the mean-pooled vectors as ``.npy``.

    Each row is run through ``model`` on its own. The per-token
    representations of layer ``repr_layer`` are averaged over the sequence
    positions, and the resulting matrix (one row per input row) is written to
    ``<out_dir>/<dat_name>.npy``.

    Args:
        model: module called as ``model(tokens, repr_layers=[...])`` that
            returns a dict with a ``"representations"`` entry keyed by layer.
            It is moved to the selected device and left in eval mode.
        batch_converter: callable mapping ``[(label, sequence)]`` to a
            3-tuple whose last item is the token tensor.
        dat: DataFrame holding the sequences; column 1 (by position) is
            passed to the converter as the label.
        dat_name: file name (without extension) of the saved array.
        out_dir: output directory; created if it does not exist.
        seq_col: positional index of the sequence column in ``dat``.
        repr_layer: layer whose representations are pooled (default 34).

    Returns:
        None. The embeddings are only written to disk.
    """
    import os  # local import: ``os`` is not among the notebook's imports

    # Use the GPU when present, otherwise fall back to the CPU instead of failing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Resolve and prepare the output location up front, so a bad path fails
    # before the (long) embedding loop rather than after it.
    out_path = os.path.join(out_dir, dat_name + ".npy")
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    model = model.to(device)
    model.eval()  # inference mode: disables dropout, if the model has any

    sequence_embeddings = []
    with torch.no_grad():
        for epoch in range(dat.shape[0]):
            seq = dat.iloc[epoch, seq_col]
            data = [(dat.iloc[epoch, 1], seq)]
            _, _, batch_tokens = batch_converter(data)
            results = model(batch_tokens.to(device), repr_layers=[repr_layer])
            token_embeddings = results["representations"][repr_layer]
            # Average over positions 1..len(seq); position 0 is skipped
            # (presumably a start token added by the converter - confirm).
            sequence_embeddings.append(token_embeddings[0, 1:len(seq) + 1].mean(0).cpu().numpy())
            if epoch % print_every == 0:
                print(f"At Epoch: {epoch:.2f}")
                print(seq)
    sequence_embeddings = np.array(sequence_embeddings)
    print(sequence_embeddings.shape)
    print(out_path)
    np.save(out_path, sequence_embeddings)
    return

In [12]:
# Converter that turns (label, sequence) pairs into the token batch fed to the model.
batch_converter = alphabet.get_batch_converter()
# Directory handed to the embedding function as the destination of the saved .npy file.
out_dir = "../../out/201120/"
# Embed every row of pdt_motor with the 34-layer model; seq_col=4 selects the sequence column by position.
generate_embedding_transformer_t34(model_t34,batch_converter,pdt_motor,"pdt_motor_t34",out_dir,seq_col = 4)

At Epoch: 0.00
MSSIVVVGANWGDEGKGRIVDYLAGQAGASIRFQGGNNAGHTVVNDLGTFKLHQVPSGVFNPDCLVVLGPGMVISPEKLTVELEEVKASGVTPKLAISDRATLCLPLHALEDTLEEQRLGDGAYGSTRQGIAPAYGDRVMKKAILVGWLKQPDVLVERIQFMLDWKLPQMKAIYPSFEFTQTAQEMADWLLEVSAPWIDAVCNVSMPLKALQAEGKTLLFEAQLGAGRDLIYGEYPWVTSSHVSGAYAGIGGGLPGLRPERVIAVAKAFSSSVGTGTLLTAMENQDEFRKITNEFGATTGRPRDVGYFDAVATKNGVELQAATEVALTKLDCLTGLPDLKICVAYEGAHTENPIWPQTAALKPVYEQMESWSEDITGCRTFEELPKAAQQYVLRIEELLGVPVPMVSVGPGRDEMILR
At Epoch: 5000.00
MSIDEKPIKIKVEKVSKVFGKQTKKAIQMLDSGKNKKEILKATGSTVGVNQADFDVYDGEIFVIMGLSGSGKSTLVRLLNRLIEPTAGKIYIDGDMITNMSKDQLREVRRKKISMVFQKFALFPHRTILENTEYGLELQGIEKGKRQQIAIESLKLVGLEGYEEQYPDQLSGGMQQRVGLARALTNDPDILLMDEAFSALDPLIRKDMQDELLDLHDSVGKTIIFITHDLDEALRIGDRIALMKDGNIVQIGTPEEILMSPSNEYVEKFVEDVDLSKVLTAGHIMKRAETVRIDKGPRVALTLMKNLGISSIYAVDKQKQLLGVINAADAKKAAESDLSLQDILNTEFTTVPESTYLTEIFDVVSDANIPIAVVDEKQRMKGIVVRGALIGALAGNNDYINVESTDEQAQNPSIQEVK
At Epoch: 10000.00
MQLSGPQLSHEDLQRVGGTVHYLQNALSAKIVGQKTLQQSLLIGLIASGHVLLESVPGLAKTTAAKTLAESLAARFQRIQCTPDLLPSDITGGQIWDQKNGEFKVSFGPV

At Epoch: 105000.00
MFDNLSERLERSFKILKGEGKITEINVAETLKDVRKALLDADVNYKVAKGFTDTVKEKALGQNVLTAVKPSQLMVKIVHDELTQLMGGETVEIDTKGQPAVILMSGLQGSGKTTFSGKLARMLKTKKNKRPLLVACDVYRPAAIEQLRVLAEQIDVPMYSEIDSKDPVAIAQNAIKEARAKGYDLVIVDTAGRLAVDEQMMNEIAAIKEAIQPNEILFVVDSMTGQDAVNTAKEFNERLDFDGVVLTKLDGDTRGGAALSIRSVVNKPIKFVGTGEKLDAIDQFHPARMADRILGMGDIVSLVERAQEQYDEEEAKRLQKKIAKNQFDFNDFLSQIGQIKKMGNLKELASMIPGVGKAIKDIDIDDNAFKSIEAIIYSMTPEERSNPAILNGSRRTRIAKGSGTTIQEVNRLLKQFDQTRKMMKMVTSSKMGKMMPKMK
At Epoch: 110000.00
MSAGVAAQNLRVKIGGKEILHGLNVSVAPGRRTAIIGPNGAGKTTLLRALSGLNARYTGEILLGGKELGSYAEKELARVRAILPQERGAAQGLTVEQLVSYGRFSHRSVFQARDGADDRAAVAWAMETAHVDAFAEREVHTLSGGERQRVYLAMALSQRPRLLLLDEPTTYLDVAHQLRVMEIITNLNRDSGITILMVLHDMAHAMQYADDIVLMRHGEIVATGTPADVLTEERIADVFGVRVEIFTNSLGVRVPSPVSLV
At Epoch: 115000.00
MPILNVNHLSKVYGSKQKYKALHDINFSVDKGEFVAIMGPSGSGKTTLLNVISSIDSISGGTVEVSGNEINQLKDKQLAQFRKKELGFIFQDYSVLPTLTVKENIMLPLSVQKVSKDEMEQNYKEVTEALGIYELSDKYPSEISGGQQQRTAAARAFVHKPAIIFADEPTGALDSKSAQDLLNRLEDMNERFNSTIIMVTHDPSAASFAQRVIMLKDGNIHSEIHQEDKSKREFYNEI

At Epoch: 210000.00
MTKSIEKAISAKNLAFKYKDSEKYALKDFNLDVNKGEFVVIMGPSGAGKSTFANSLNGLIPNFIKGTCSGELSVFDKDPRKESVSTMAREVGLVFQDFESQLFCTNIRLEVAFGPENFLVPRQDMDRRIDKVLDVVDLVGFDERQPATLSGGQKQRLAIGSVLSSEPQLICMDEPTTDLDPYGKMGIFNIARALHDEGEMTLVVIEHETEEALRADRLVLMNEGKIVKEGKPREVLADVETMDELGLQSLHVPKFFKEMGDETLPITPEEGKEEFLKKFKINEEKYQKLLEKDKKIEDSYGETLIEVKDLSFTYPNGKKALNNINLEVKKGEFLAVLGHNGSGKTTLVKHFNGLLSPTEGNVIVKGIDTKKSSIFEIGKIVGYTFQNPDHQIFSDTVYDEVAFSPKIRGLSDEEIKKRVEESLAAVDMSGYEKEDPFTLTKGMRQRIAVASILSSRPDVLILDEPTTGLDYKGQKQMMELIKKLNEEGHTIIMITHTMWVVAEYAHRVCVIENGDVVMMGRTRDIFKKEEKLEKVYLKNPHIVSMSNKLGKTVLSIDEMKSITEAK
At Epoch: 215000.00
MNENAIEVHNLSKDYFISSRGENWRAVVKNIFKPEKTAVQAVKALNFTIKNGEKVGFIGKNGAGKTTTIKMLTGTLFPSAGTCRVNGYDPTKRINDFKKSISVVMGNRSQLFPDLTPRDYLKLLQSMYDIPEEIFQKTVNDLAQILNVTSKLDVQTRKLSLGERMKVEFLAGVATRPKILFLDEPTIGLDVLAKRDIRKFLLRLNQEEKLTVFLTSHDMEDIATICDHLIIVNSGQIMWDGPKTDLLERFNQNKYITFIKSENFNESKLGAKIIDQDDLTVTIKVPVEKVDDQIAQLARDNQGSDYQINDLKLEDIILELFAEEKEK
At Epoch: 220000.00
MPALSLEVENLSAGYGPTRVLEDISFSIPAGARLAVLGRNGMGKT

At Epoch: 315000.00
MSNTKEVFCMSLYIIETRHLTKQYGTQKSVADLNIHVKKGRVYGLLGRNGAGKTTTMKMLLGLTKPTIGEVLLWGEPLHGNENKVLPRIGSMIESPGFYPNLTGTENLSIFAILRGVPNNHSIQDALNLVGLPYKDKKLFSQYSLGMKQRLAIALAVMHDPELLILDEPINGLDPIGIAEVRKFIRELCNKRGKTILLSSHILSEIALLADDIGIIDRGVLLEEETFAELEQKSSRHIHFTVSDSAQAAMVLEKTFNETHFLVHDDHNLNLYNLDLPVGKIVAAYVETGLEVSEAHTCEETLEDYFKRVTGGEGIA
At Epoch: 320000.00
MEIELRKLTKEFQKTKAVDNITLTLNNGVYGLLGANGAGKTTLMRMLCTLLKPTSGEILCNGKDIFEMDGGYRKRAGGHSKGTGTEKRMGRGADRRKDTKGHREKRRNFQHTGGAVGGFPAE
At Epoch: 325000.00
MSQLKKALAENRLEIRLKHFSKYKVLIIDEVGYLPIDTDASNIFFQLISKRYEKHSTIITTNKPFSNWAEVFGSATLANAILDRLLHHSHVISIKGPSYRLKSKVEYFNSSSNAS
At Epoch: 330000.00
MDTTIQNDERVIITGLDTGQDDYNYSMTELAELAQANHMEVVQRVDQVIDRPNPATYFGKGKVAEIAELAAANDVTTIITNDELSPSQLRNLEDETGKRILDRTALILEIFATRAQTKEAKLQVQIAELQYRLPRLQTSASQRLDQQTGGGSGFTNRGAGETKLEMDRRTIQHHITHLRHELAAIDKSEETKRKQRAKSNIPTAALVGYTNAGKSTIMNGLVRRYGAVEDKTVFEKDMLFATLDTSVRRLTLPGRKDFLLSDTVGFVSKLPTNLVESFKSTLAEAANADLLIQVIDYSDPNYEEMMQTTKETLKQIGIDNIPMVNVFNKADKTEIEFPVLEGDDQVVISAKQDESLDLLVDVIR

At Epoch: 430000.00
MQPIISFEQFTFQYWHAAQPTLSDITFHIYPGEKVLIAGRSGSGKSTLAHCINGLIPFSYEGTSTGNVLIAGKDPREGSIFEQSKQVGTILQDQDAQFIGLTVEEDVAFYLENECVKQDDMKKIVSDSLRKVKMHTFHKQSPHELSGGQKQTVSLAGLLTTNADILLFDEPLANLDPLSAIHTIELMKDIHEQYNKTIVIIEHRIEEILKLDLDKVILIDEGKVIAIGTPKEILASNILPRIGLREPIYIEALKRLHFDSNNDVIYPMENLQKEKVSNVIKEWMEKQVILKGNTKNKELLKVENLSFSYSNKQKVLENVNLSIYEGEIVALLGHNGAGKSTLAHSLIGINKMKNGKILLKGEDISSWSIRKRGEIISYVMQNPNHMITQPTVFEEVSFTLTLHNFSKEEIKNRVEGTLKICGLYPFRNWPIQALSYGQKKRFTIASVLTTNPKLIILDEPTAGQDYYHYKQFMSFIKNLAKKGISFVFITHDMNLVLEYADRAVVLHEGKIIADNTVFDVLGDQETLQRANLRESSLTKLVKFSGIACPEKFMELYLDSNRREEGA
At Epoch: 435000.00
MEELYCIGCGAKIQSQDKEAPGYTPQSSIEKSQETGELYCQRCFRLRHYNEIVDVAISDDDFLKLLHEVGDSDALILNVVDIFDFNGSVIPSLSRFVAGNDVLLVGNKKDILPKSVKNGKVTQWLTERAHEEGLRPVDVLLTSAQNGQAIKELIARIEKLRAGRDVYVVGVTNVGKSTLINAIIKEITGDQEVITTSRFPGTTLDKIEIPLDDGAYIFDTPGIIHRHQMAHYLSAKDLKFVSPKKEIKPKTYQLNPEQTLFLGGLGRFDFVEGDKQGFTAYFDNNLKLHRTKLEGADDFYLKHAGSLLTPPSAKEVANFPKLVRHEFTITGKTDVVFSGLGWVRVNGSARVAAWAPEGVGVMVRKAII
At Epoch: 440000.00
MIPV

In [13]:
# Marker showing that the preceding cells finished running.
print("done")

done


## Generate Embedding using Unirep-1900