In [89]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm 
import glob
import re
import requests
import io

import torch
from argparse import Namespace
from esm.constants import proteinseq_toks
import math
import torch.nn as nn
import torch.nn.functional as F
from esm.modules import TransformerLayer, PositionalEmbedding  # noqa
from esm.model import ProteinBertModel
import esm
import time

In [2]:
alphabet = esm.Alphabet.from_dict(proteinseq_toks)
model_name = "esm1_t34_670M_UR50S"
url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
if torch.cuda.is_available():
    print("cuda")
    model_data = torch.hub.load_state_dict_from_url(url, progress=False)
else:
    model_data = torch.hub.load_state_dict_from_url(url, progress=False, map_location=torch.device('cpu'))

pra = lambda s: ''.join(s.split('decoder_')[1:] if 'decoder' in s else s)
prs = lambda s: ''.join(s.split('decoder.')[1:] if 'decoder' in s else s)
model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
model_state_34 = {prs(arg[0]): arg[1] for arg in model_data["model"].items()}
model_t34 = esm.ProteinBertModel(Namespace(**model_args), len(alphabet), padding_idx=alphabet.padding_idx)
model_t34.load_state_dict(model_state_34)

cuda


<All keys matched successfully>

In [90]:
kif_acc_all = pd.read_csv("../../data/kif/kif_acc_all.csv")
kif_uniprot_all = pd.read_csv("../../data/kif/kif_uniprot_all.csv")

In [91]:
kif_acc_all.head()

Unnamed: 0,Entry,db_acc,db_name,kinesin_family
0,Q7PG43,AgKHC,kif_jp,1
1,Q7PNB7,AgKlp31E,kif_jp,4
2,Q7PTK6,AgNcd,kif_jp,14
3,Q7QJN4,AgKin73,kif_jp,3
4,Q7QDS6,AgKlp68D,kif_jp,2


In [92]:
kif_uniprot_all.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,seq
0,Q7PG43,Q7PG43_ANOGA,unreviewed,Kinesin-like protein,1271664 AgaP_AGAP000561,Anopheles gambiae (African malaria mosquito),983,MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCL...
1,Q7PNB7,Q7PNB7_ANOGA,unreviewed,AGAP007815-PA (Fragment),AgaP_AGAP007815,Anopheles gambiae (African malaria mosquito),1033,LLTNVFSLSLFLGHRFRIRPQIPRELIDMCRVCTQVTPGEPQVLLG...
2,Q7PTK6,Q7PTK6_ANOGA,unreviewed,Kinesin-like protein,1269313 AgaP_AGAP002248,Anopheles gambiae (African malaria mosquito),762,MDSRIPKPSFLKKPTGPLSLPGNARLPLTRDLLNLPSANSTMFAKV...
3,Q7QJN4,Q7QJN4_ANOGA,unreviewed,AGAP007592-PA,AgaP_AGAP007592,Anopheles gambiae (African malaria mosquito),1944,MSDKIRVAVRVRPFNRRELELATENVIEMNGTQTILKYPASLDKME...
4,Q7QDS6,Q7QDS6_ANOGA,unreviewed,Kinesin-like protein (Fragment),AgaP_AGAP010396,Anopheles gambiae (African malaria mosquito),781,MDRTIKTRSNLSNTKNECVQVVVRCRPLNNKELTGNFQKVVDVFPS...


In [93]:
kif_all = pd.concat([kif_acc_all,kif_uniprot_all],axis = 1)

In [94]:
kif_all.shape

(623, 12)

In [95]:
kif_all.iloc[1:5,11]

1    LLTNVFSLSLFLGHRFRIRPQIPRELIDMCRVCTQVTPGEPQVLLG...
2    MDSRIPKPSFLKKPTGPLSLPGNARLPLTRDLLNLPSANSTMFAKV...
3    MSDKIRVAVRVRPFNRRELELATENVIEMNGTQTILKYPASLDKME...
4    MDRTIKTRSNLSNTKNECVQVVVRCRPLNNKELTGNFQKVVDVFPS...
Name: seq, dtype: object

In [96]:
kif_acc_all.shape

(623, 4)

In [97]:
kif_uniprot_all.shape

(623, 8)

In [21]:
print_every = 100
def generate_embedding_transformer_t34(model,batch_converter,dat,dat_name,out_dir,seq_col):
    # initialize network 
    model.cuda()
    sequence_embeddings = []
    for epoch in range(dat.shape[0]):
        data = [(dat.iloc[epoch, 1], dat.iloc[epoch, seq_col])]
        _, _, batch_tokens = batch_converter(data)
        with torch.no_grad():
            results = model(batch_tokens.to('cuda'), repr_layers=[34])
            # last layer
            token_embeddings = results["representations"][34]
            seq = dat.iloc[epoch,seq_col]
            sequence_embeddings.append(token_embeddings[0, 1:len(seq) + 1].mean(0).cpu().detach().numpy())
        if epoch % print_every == 0:
            print(f"At Epoch: %.2f"% epoch)
            print(seq)
    sequence_embeddings = np.array(sequence_embeddings)
    print(sequence_embeddings.shape)
    print(out_dir + '/' + dat_name + ".npy")
    np.save(out_dir + '/' + dat_name + ".npy", sequence_embeddings)
    return 

In [23]:
batch_converter = alphabet.get_batch_converter()
out_dir = "../../data/kif/"
generate_embedding_transformer_t34(model_t34,batch_converter,kif_all,"kif_all_t34",out_dir,seq_col = 11)

At Epoch: 0.00
MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCLSIGGKVYLFDKVFKPNATQEKVYNEAAKSIVSDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDPAKQGIIPRIVNDIFNHIYTMEMNIEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGASERFVSSPEEVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENMENEKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNVVCVNEELTAEEWKRRYEREKEKNTKLKGKIEKLEAELARWRAGETVNVEEQLDLQQDAMEASTPNVEVLLAQPADLPVPATPGGGGLPLSAERDTLEVERERLYQQLDEKDEEINQQSQYVEKLKEQIIDQEELIANTRRDYENLQSEMTRIQQENENAKEEVKEVLQALEELAVNYDQKSQEIELKNKEIDMVNDELLQKQTTLNSVQSELQQLKDMSSHQKKRINEMLTNLLRDLSEVGQALAADQNEMKMNVEASAGKVEEEFTVARLYISKMKSEAKNLSARCANLETLQQDTCRKVGDYEKDLSECRLLISQHEARMKSLQESMREAENKKRTLEENIDALREECAKLKAAEQVSAVNAEEKQRADQLKVAFESQMDQLRDVHTKQVSALRDEISEKQELINELKDTNQKLTLAHQQMTADYEKLKQEEQEKSAKLQTLMLTDERREQARKDLKGLEDTVAKELQSLHALRKLFVLDLQARIKKSLNSEDTEDDGGSLAQKQKISFLENNLEQLTKVHKQLVRDNADLRCELPKLEKRLRTTVERVKALETALKEAKEGAMRDRKRYQYEVDRIKEAVRQKNLARRGPQAQIAKPIRAGQGQYLFKSGGTGAATTAPGGGTAITPKAMADEKRKSQIKDMDS
A

## Generate embedding for t12 esm model without evotune

In [25]:
alphabet = esm.Alphabet.from_dict(proteinseq_toks)
model_name = "esm1_t12_85M_UR50S"
url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
if torch.cuda.is_available():
    print("cuda")
    model_data = torch.hub.load_state_dict_from_url(url, progress=False)
else:
    model_data = torch.hub.load_state_dict_from_url(url, progress=False, map_location=torch.device('cpu'))

pra = lambda s: ''.join(s.split('decoder_')[1:] if 'decoder' in s else s)
prs = lambda s: ''.join(s.split('decoder.')[1:] if 'decoder' in s else s)
model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
model_state_12 = {prs(arg[0]): arg[1] for arg in model_data["model"].items()}
model_t12 = esm.ProteinBertModel(Namespace(**model_args), len(alphabet), padding_idx=alphabet.padding_idx)
model_t12.load_state_dict(model_state_12)

cuda


<All keys matched successfully>

In [26]:
kif_acc_all = pd.read_csv("../../data/kif/kif_acc_all.csv")
kif_uniprot_all = pd.read_csv("../../data/kif/kif_uniprot_all.csv")

In [27]:
kif_acc_all.head()

Unnamed: 0,Entry,db_acc,db_name,kinesin_family
0,Q7PG43,AgKHC,kif_jp,1
1,Q7PNB7,AgKlp31E,kif_jp,4
2,Q7PTK6,AgNcd,kif_jp,14
3,Q7QJN4,AgKin73,kif_jp,3
4,Q7QDS6,AgKlp68D,kif_jp,2


In [28]:
kif_uniprot_all.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,seq
0,Q7PG43,Q7PG43_ANOGA,unreviewed,Kinesin-like protein,1271664 AgaP_AGAP000561,Anopheles gambiae (African malaria mosquito),983,MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCL...
1,Q7PNB7,Q7PNB7_ANOGA,unreviewed,AGAP007815-PA (Fragment),AgaP_AGAP007815,Anopheles gambiae (African malaria mosquito),1033,LLTNVFSLSLFLGHRFRIRPQIPRELIDMCRVCTQVTPGEPQVLLG...
2,Q7PTK6,Q7PTK6_ANOGA,unreviewed,Kinesin-like protein,1269313 AgaP_AGAP002248,Anopheles gambiae (African malaria mosquito),762,MDSRIPKPSFLKKPTGPLSLPGNARLPLTRDLLNLPSANSTMFAKV...
3,Q7QJN4,Q7QJN4_ANOGA,unreviewed,AGAP007592-PA,AgaP_AGAP007592,Anopheles gambiae (African malaria mosquito),1944,MSDKIRVAVRVRPFNRRELELATENVIEMNGTQTILKYPASLDKME...
4,Q7QDS6,Q7QDS6_ANOGA,unreviewed,Kinesin-like protein (Fragment),AgaP_AGAP010396,Anopheles gambiae (African malaria mosquito),781,MDRTIKTRSNLSNTKNECVQVVVRCRPLNNKELTGNFQKVVDVFPS...


In [29]:
kif_all = pd.concat([kif_acc_all,kif_uniprot_all],axis = 1)

In [30]:
kif_all.shape

(623, 12)

In [31]:
kif_all.iloc[1:5,11]

1    LLTNVFSLSLFLGHRFRIRPQIPRELIDMCRVCTQVTPGEPQVLLG...
2    MDSRIPKPSFLKKPTGPLSLPGNARLPLTRDLLNLPSANSTMFAKV...
3    MSDKIRVAVRVRPFNRRELELATENVIEMNGTQTILKYPASLDKME...
4    MDRTIKTRSNLSNTKNECVQVVVRCRPLNNKELTGNFQKVVDVFPS...
Name: seq, dtype: object

In [32]:
kif_acc_all.shape

(623, 4)

In [33]:
kif_uniprot_all.shape

(623, 8)

In [98]:
print_every = 100
def generate_embedding_transformer_t12(model,batch_converter,dat,dat_name,out_dir,seq_col):
    # initialize network 
    model.cuda()
    sequence_embeddings = []
    for epoch in range(dat.shape[0]):
        data = [(dat.iloc[epoch, 1], dat.iloc[epoch, seq_col])]
        _, _, batch_tokens = batch_converter(data)
        with torch.no_grad():
            results = model(batch_tokens.to('cuda'), repr_layers=[12])
            # last layer
            token_embeddings = results["representations"][12]
            seq = dat.iloc[epoch,seq_col]
            sequence_embeddings.append(token_embeddings[0, 1:len(seq) + 1].mean(0).cpu().detach().numpy())
        if epoch % print_every == 0:
            print(f"At Epoch: %.2f"% epoch)
            print(seq)
    sequence_embeddings = np.array(sequence_embeddings)
    print(sequence_embeddings.shape)
    print(out_dir + '/' + dat_name + ".npy")
    np.save(out_dir + '/' + dat_name + ".npy", sequence_embeddings)
    return 

In [38]:
batch_converter = alphabet.get_batch_converter()
out_dir = "../../data/kif/"
generate_embedding_transformer_t12(model_t12,batch_converter,kif_all,"kif_all_t12",out_dir,seq_col = 11)

At Epoch: 0.00
MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCLSIGGKVYLFDKVFKPNATQEKVYNEAAKSIVSDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDPAKQGIIPRIVNDIFNHIYTMEMNIEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGASERFVSSPEEVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENMENEKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNVVCVNEELTAEEWKRRYEREKEKNTKLKGKIEKLEAELARWRAGETVNVEEQLDLQQDAMEASTPNVEVLLAQPADLPVPATPGGGGLPLSAERDTLEVERERLYQQLDEKDEEINQQSQYVEKLKEQIIDQEELIANTRRDYENLQSEMTRIQQENENAKEEVKEVLQALEELAVNYDQKSQEIELKNKEIDMVNDELLQKQTTLNSVQSELQQLKDMSSHQKKRINEMLTNLLRDLSEVGQALAADQNEMKMNVEASAGKVEEEFTVARLYISKMKSEAKNLSARCANLETLQQDTCRKVGDYEKDLSECRLLISQHEARMKSLQESMREAENKKRTLEENIDALREECAKLKAAEQVSAVNAEEKQRADQLKVAFESQMDQLRDVHTKQVSALRDEISEKQELINELKDTNQKLTLAHQQMTADYEKLKQEEQEKSAKLQTLMLTDERREQARKDLKGLEDTVAKELQSLHALRKLFVLDLQARIKKSLNSEDTEDDGGSLAQKQKISFLENNLEQLTKVHKQLVRDNADLRCELPKLEKRLRTTVERVKALETALKEAKEGAMRDRKRYQYEVDRIKEAVRQKNLARRGPQAQIAKPIRAGQGQYLFKSGGTGAATTAPGGGTAITPKAMADEKRKSQIKDMDS
A

## Generate embedding for t12 esm model with dynein and kinesin evotune

In [99]:
alphabet = esm.Alphabet.from_dict(proteinseq_toks)
model_name = "esm1_t12_85M_UR50S"
url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
if torch.cuda.is_available():
    print("cuda")
    model_data = torch.hub.load_state_dict_from_url(url, progress=False)
else:
    model_data = torch.hub.load_state_dict_from_url(url, progress=False, map_location=torch.device('cpu'))

pra = lambda s: ''.join(s.split('decoder_')[1:] if 'decoder' in s else s)
prs = lambda s: ''.join(s.split('decoder.')[1:] if 'decoder' in s else s)
model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
model_state_12 = torch.load("../../data/esm_t12_85M_UR50S_dyn_kin_201204.pt")
model_t12 = esm.ProteinBertModel(Namespace(**model_args), len(alphabet), padding_idx=alphabet.padding_idx)
model_t12.load_state_dict(model_state_12)

cuda


<All keys matched successfully>

In [100]:
batch_converter = alphabet.get_batch_converter()
out_dir = "../../data/kif/"
generate_embedding_transformer_t12(model_t12,batch_converter,kif_all,"kif_all_t12_dyn_kin",out_dir,seq_col = 11)

At Epoch: 0.00
MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCLSIGGKVYLFDKVFKPNATQEKVYNEAAKSIVSDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDPAKQGIIPRIVNDIFNHIYTMEMNIEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGASERFVSSPEEVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENMENEKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNVVCVNEELTAEEWKRRYEREKEKNTKLKGKIEKLEAELARWRAGETVNVEEQLDLQQDAMEASTPNVEVLLAQPADLPVPATPGGGGLPLSAERDTLEVERERLYQQLDEKDEEINQQSQYVEKLKEQIIDQEELIANTRRDYENLQSEMTRIQQENENAKEEVKEVLQALEELAVNYDQKSQEIELKNKEIDMVNDELLQKQTTLNSVQSELQQLKDMSSHQKKRINEMLTNLLRDLSEVGQALAADQNEMKMNVEASAGKVEEEFTVARLYISKMKSEAKNLSARCANLETLQQDTCRKVGDYEKDLSECRLLISQHEARMKSLQESMREAENKKRTLEENIDALREECAKLKAAEQVSAVNAEEKQRADQLKVAFESQMDQLRDVHTKQVSALRDEISEKQELINELKDTNQKLTLAHQQMTADYEKLKQEEQEKSAKLQTLMLTDERREQARKDLKGLEDTVAKELQSLHALRKLFVLDLQARIKKSLNSEDTEDDGGSLAQKQKISFLENNLEQLTKVHKQLVRDNADLRCELPKLEKRLRTTVERVKALETALKEAKEGAMRDRKRYQYEVDRIKEAVRQKNLARRGPQAQIAKPIRAGQGQYLFKSGGTGAATTAPGGGTAITPKAMADEKRKSQIKDMDS
A

## Generate embedding for t12 esm model with kinesin both

In [101]:
alphabet = esm.Alphabet.from_dict(proteinseq_toks)
model_name = "esm1_t12_85M_UR50S"
url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
if torch.cuda.is_available():
    print("cuda")
    model_data = torch.hub.load_state_dict_from_url(url, progress=False)
else:
    model_data = torch.hub.load_state_dict_from_url(url, progress=False, map_location=torch.device('cpu'))

pra = lambda s: ''.join(s.split('decoder_')[1:] if 'decoder' in s else s)
prs = lambda s: ''.join(s.split('decoder.')[1:] if 'decoder' in s else s)
model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
model_state_12 = torch.load("../../data/esm_t12_85M_UR50S_kin_both_201204.pt")
model_t12 = esm.ProteinBertModel(Namespace(**model_args), len(alphabet), padding_idx=alphabet.padding_idx)
model_t12.load_state_dict(model_state_12)

cuda


<All keys matched successfully>

In [102]:
batch_converter = alphabet.get_batch_converter()
out_dir = "../../data/kif/"
generate_embedding_transformer_t12(model_t12,batch_converter,kif_all,"kif_all_t12_kin_both",out_dir,seq_col = 11)

At Epoch: 0.00
MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCLSIGGKVYLFDKVFKPNATQEKVYNEAAKSIVSDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDPAKQGIIPRIVNDIFNHIYTMEMNIEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGASERFVSSPEEVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENMENEKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNVVCVNEELTAEEWKRRYEREKEKNTKLKGKIEKLEAELARWRAGETVNVEEQLDLQQDAMEASTPNVEVLLAQPADLPVPATPGGGGLPLSAERDTLEVERERLYQQLDEKDEEINQQSQYVEKLKEQIIDQEELIANTRRDYENLQSEMTRIQQENENAKEEVKEVLQALEELAVNYDQKSQEIELKNKEIDMVNDELLQKQTTLNSVQSELQQLKDMSSHQKKRINEMLTNLLRDLSEVGQALAADQNEMKMNVEASAGKVEEEFTVARLYISKMKSEAKNLSARCANLETLQQDTCRKVGDYEKDLSECRLLISQHEARMKSLQESMREAENKKRTLEENIDALREECAKLKAAEQVSAVNAEEKQRADQLKVAFESQMDQLRDVHTKQVSALRDEISEKQELINELKDTNQKLTLAHQQMTADYEKLKQEEQEKSAKLQTLMLTDERREQARKDLKGLEDTVAKELQSLHALRKLFVLDLQARIKKSLNSEDTEDDGGSLAQKQKISFLENNLEQLTKVHKQLVRDNADLRCELPKLEKRLRTTVERVKALETALKEAKEGAMRDRKRYQYEVDRIKEAVRQKNLARRGPQAQIAKPIRAGQGQYLFKSGGTGAATTAPGGGTAITPKAMADEKRKSQIKDMDS
A

## Generate embedding for t12 esm model with kinesin kif

In [103]:
alphabet = esm.Alphabet.from_dict(proteinseq_toks)
model_name = "esm1_t12_85M_UR50S"
url = f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
if torch.cuda.is_available():
    print("cuda")
    model_data = torch.hub.load_state_dict_from_url(url, progress=False)
else:
    model_data = torch.hub.load_state_dict_from_url(url, progress=False, map_location=torch.device('cpu'))

pra = lambda s: ''.join(s.split('decoder_')[1:] if 'decoder' in s else s)
prs = lambda s: ''.join(s.split('decoder.')[1:] if 'decoder' in s else s)
model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
model_state_12 = torch.load("../../data/esm_t12_85M_UR50S_kin_kif_201204.pt")
model_t12 = esm.ProteinBertModel(Namespace(**model_args), len(alphabet), padding_idx=alphabet.padding_idx)
model_t12.load_state_dict(model_state_12)

cuda


<All keys matched successfully>

In [104]:
batch_converter = alphabet.get_batch_converter()
out_dir = "../../data/kif/"
generate_embedding_transformer_t12(model_t12,batch_converter,kif_all,"kif_all_t12_kin_kif",out_dir,seq_col = 11)

At Epoch: 0.00
MSGVREIPAEDSIKVVCRFRPLNDSEELAGSKFVVKFPSGPEENCLSIGGKVYLFDKVFKPNATQEKVYNEAAKSIVSDVLAGYNGTIFAYGQTSSGKTHTMEGVIGDPAKQGIIPRIVNDIFNHIYTMEMNIEFHIKVSYYEIYMDKIRDLLDVSKVNLSVHEDKNRVPYVKGASERFVSSPEEVFEVIEEGKSNRHIAVTNMNEHSSRSHSVFLINVKQENMENEKKLSGKLYLVDLAGSEKVSKTGAEGTVLDEAKNINKSLSALGNVISALADGNKTHIPYRDSKLTRILQESLGGNARTTIVICCSPASFNESETKSTLDFGRRAKTVKNVVCVNEELTAEEWKRRYEREKEKNTKLKGKIEKLEAELARWRAGETVNVEEQLDLQQDAMEASTPNVEVLLAQPADLPVPATPGGGGLPLSAERDTLEVERERLYQQLDEKDEEINQQSQYVEKLKEQIIDQEELIANTRRDYENLQSEMTRIQQENENAKEEVKEVLQALEELAVNYDQKSQEIELKNKEIDMVNDELLQKQTTLNSVQSELQQLKDMSSHQKKRINEMLTNLLRDLSEVGQALAADQNEMKMNVEASAGKVEEEFTVARLYISKMKSEAKNLSARCANLETLQQDTCRKVGDYEKDLSECRLLISQHEARMKSLQESMREAENKKRTLEENIDALREECAKLKAAEQVSAVNAEEKQRADQLKVAFESQMDQLRDVHTKQVSALRDEISEKQELINELKDTNQKLTLAHQQMTADYEKLKQEEQEKSAKLQTLMLTDERREQARKDLKGLEDTVAKELQSLHALRKLFVLDLQARIKKSLNSEDTEDDGGSLAQKQKISFLENNLEQLTKVHKQLVRDNADLRCELPKLEKRLRTTVERVKALETALKEAKEGAMRDRKRYQYEVDRIKEAVRQKNLARRGPQAQIAKPIRAGQGQYLFKSGGTGAATTAPGGGTAITPKAMADEKRKSQIKDMDS
A

## Generate normalized embeddings for both t12 and t34 embeddings

In [105]:
dat_t12_dyn_kin = np.load("../../data/kif//kif_all_t12_dyn_kin.npy")
dat_t12_kin_both = np.load("../../data/kif//kif_all_t12_kin_both.npy")
dat_t12_kin_kif = np.load("../../data/kif//kif_all_t12_kin_kif.npy")

In [106]:
from sklearn.preprocessing import StandardScaler

In [107]:
scaler = StandardScaler()
scaler.fit(dat_t12_dyn_kin)
dat_t12_dyn_kin_scaled = scaler.transform(dat_t12_dyn_kin)

scaler = StandardScaler()
scaler.fit(dat_t12_kin_both)
dat_t12_kin_both_scaled = scaler.transform(dat_t12_kin_both)

scaler = StandardScaler()
scaler.fit(dat_t12_kin_kif)
dat_t12_kin_kif_scaled = scaler.transform(dat_t12_kin_kif)


In [135]:
sigma = np.cov(dat_t12_dyn_kin_scaled.T)
u, s, v = np.linalg.svd(sigma)
s_ratio = np.cumsum(s)/sum(s)
print(s_ratio[90])
s_ratio[0:10]

0.9143122243393961


array([0.1037386 , 0.17346857, 0.22312034, 0.2711978 , 0.31501837,
       0.34959135, 0.3810746 , 0.41127113, 0.44036258, 0.46841588])

In [136]:
sigma = np.cov(dat_t12_kin_both_scaled.T)
u, s, v = np.linalg.svd(sigma)
s_ratio = np.cumsum(s)/sum(s)
print(s_ratio[90])
s_ratio[0:10]

0.9231702483504243


array([0.14355875, 0.21469045, 0.26714097, 0.31047324, 0.3457676 ,
       0.3784513 , 0.40961244, 0.43754314, 0.46433769, 0.4892312 ])

In [137]:
sigma = np.cov(dat_t12_kin_kif_scaled.T)
u, s, v = np.linalg.svd(sigma)
s_ratio = np.cumsum(s)/sum(s)
print(s_ratio[90])
s_ratio[0:10]

0.938463124525364


array([0.11628238, 0.21911502, 0.27074055, 0.3180739 , 0.35890227,
       0.39807475, 0.43098283, 0.46215509, 0.49194599, 0.5192833 ])

## Observation: Slight increase in the variance accounted for the specific kinesin dataset with the weight-updated models than the raw esm t12 model

In [138]:
from sklearn.decomposition import PCA

In [139]:
pca = PCA(n_components=90)
dat_t12_kin_kif_scaled_reduced = pca.fit_transform(dat_t12_kin_kif_scaled)

In [140]:
pca = PCA(n_components=90)
dat_t12_dyn_kin_scaled_reduced = pca.fit_transform(dat_t12_dyn_kin_scaled)

In [141]:
pca = PCA(n_components=90)
dat_t12_kin_both_scaled_reduced = pca.fit_transform(dat_t12_kin_both_scaled)

In [142]:
np.save("../../out/201212/embedding/esm_models/t12_evo/kif_all_t12_kin_kif_scaled_reduced.npy", dat_t12_kin_kif_scaled_reduced)
np.save("../../out/201212/embedding/esm_models/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy", dat_t12_dyn_kin_scaled_reduced)
np.save("../../out/201212/embedding/esm_models/t12_evo/kif_all_t12_kin_both_scaled_reduced.npy", dat_t12_kin_both_scaled_reduced)

np.save("../../out/201212/embedding/esm_models/t12_evo/kif_all_t12_kin_kif_scaled.npy", dat_t12_kin_kif_scaled)
np.save("../../out/201212/embedding/esm_models/t12_evo/kif_all_t12_dyn_kin_scaled.npy", dat_t12_dyn_kin_scaled)
np.save("../../out/201212/embedding/esm_models/t12_evo/kif_all_t12_kin_both_scaled.npy", dat_t12_kin_both_scaled)


In [51]:
scaler = StandardScaler()
scaler.fit(dat_t34)
dat_t34_scaled = scaler.transform(dat_t34)

In [52]:
sigma = np.cov(dat_t34_scaled.T)
sigma.shape

(1280, 1280)

In [53]:
u, s, v = np.linalg.svd(sigma)
s[0:10]

array([307.53562147, 107.41777344,  91.98352673,  69.9235464 ,
        63.3094603 ,  50.19162478,  35.88060167,  30.76523453,
        30.52840563,  27.52509972])

In [54]:
s_ratio = np.cumsum(s)/sum(s)

In [59]:
s_ratio[80]

0.933411760121551

In [60]:
from sklearn.decomposition import PCA
pca = PCA(n_components=80)
dat_t34_scaled_reduced = pca.fit_transform(dat_t34_scaled)

In [61]:
np.cumsum(pca.explained_variance_ratio_)

array([0.23987648, 0.32366195, 0.3954088 , 0.44994885, 0.49932992,
       0.5384792 , 0.5664659 , 0.5904627 , 0.6142747 , 0.63574415,
       0.65285987, 0.6685754 , 0.68184376, 0.69354457, 0.7049386 ,
       0.7149493 , 0.7243274 , 0.7330448 , 0.741408  , 0.74944043,
       0.7566267 , 0.7629791 , 0.76920086, 0.7750953 , 0.78077245,
       0.7863416 , 0.7916782 , 0.79678273, 0.80164117, 0.8063084 ,
       0.81074506, 0.81485856, 0.8188833 , 0.8228533 , 0.8266949 ,
       0.83033806, 0.8338366 , 0.837285  , 0.8406885 , 0.8440735 ,
       0.8473215 , 0.8505465 , 0.85370916, 0.8567171 , 0.85969514,
       0.8625895 , 0.8654622 , 0.86830443, 0.8710722 , 0.8738029 ,
       0.8763979 , 0.87896395, 0.8814524 , 0.8838775 , 0.88626313,
       0.8885633 , 0.89083505, 0.89304537, 0.89524364, 0.8973704 ,
       0.8994721 , 0.90152854, 0.9035551 , 0.905528  , 0.9074723 ,
       0.9093682 , 0.9112302 , 0.91301566, 0.91478926, 0.9165155 ,
       0.91820586, 0.9198587 , 0.92146903, 0.92303   , 0.92455

In [62]:
np.save("../../out/201212/embedding/esm_models/t12_normalized/kif_all.npy", dat_t12_scaled)
np.save("../../out/201212/embedding/esm_models/t12_normalized_reduced/kif_all.npy", dat_t12_scaled_reduced)
np.save("../../out/201212/embedding/esm_models/t34_normalized/kif_all.npy", dat_t34_scaled)
np.save("../../out/201212/embedding/esm_models/t34_normalized_reduced/kif_all.npy", dat_t34_scaled_reduced)

In [63]:
print(dat_t12_scaled.shape)
print(dat_t34_scaled.shape)

(623, 768)
(623, 1280)


# normalize within the subset to reduce affect of family 14 very large amount 

In [143]:
kif_acc_all_balanced = pd.read_csv("../../data/kif/kif_acc_all_balanced.csv")
kif_acc_all_balanced.head(1)

Unnamed: 0.1,Unnamed: 0,Entry,db_acc,db_name,kinesin_family
0,0,Q7PG43,AgKHC,kif_jp,1


In [68]:
dat_t12 = np.load("../../data/kif//kif_all_t12.npy")
dat_t34 = np.load("../../data/kif//kif_all_t34.npy")

dat_t12 = dat_t12[kif_acc_all_balanced["Unnamed: 0"],:]
dat_t34 = dat_t34[kif_acc_all_balanced["Unnamed: 0"],:]

In [69]:
scaler = StandardScaler()
scaler.fit(dat_t12)
dat_t12_scaled = scaler.transform(dat_t12)

In [70]:
sigma = np.cov(dat_t12_scaled.T)
sigma.shape

(768, 768)

In [71]:
u, s, v = np.linalg.svd(sigma)
s[0:10]

array([168.72236191,  68.65800771,  66.29760791,  53.1949032 ,
        44.148112  ,  36.24220486,  30.36354889,  28.9050794 ,
        23.92338204,  20.00333523])

In [72]:
s_ratio = np.cumsum(s)/sum(s)

In [74]:
s_ratio[50]

0.9721366424150297

In [75]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
dat_t12_scaled_reduced = pca.fit_transform(dat_t12_scaled)

In [76]:
np.cumsum(pca.explained_variance_ratio_)

array([0.21813229, 0.30689678, 0.39260942, 0.46138236, 0.51845914,
       0.5653148 , 0.60457027, 0.6419402 , 0.67286944, 0.69873077,
       0.72214085, 0.7439846 , 0.76469606, 0.7832524 , 0.798235  ,
       0.812644  , 0.8254138 , 0.8365268 , 0.84737563, 0.8573931 ,
       0.86653525, 0.8751833 , 0.88303316, 0.8901916 , 0.89684796,
       0.9027524 , 0.9082548 , 0.9131301 , 0.9178595 , 0.9222001 ,
       0.92650133, 0.9302965 , 0.9340038 , 0.93757296, 0.94067657,
       0.9437247 , 0.94669384, 0.9493839 , 0.95199037, 0.95437396,
       0.95660555, 0.9586458 , 0.9605108 , 0.9623289 , 0.96399635,
       0.96563214, 0.96707344, 0.9684071 , 0.96967125, 0.97089565],
      dtype=float32)

In [77]:
scaler = StandardScaler()
scaler.fit(dat_t34)
dat_t34_scaled = scaler.transform(dat_t34)

In [78]:
sigma = np.cov(dat_t34_scaled.T)
sigma.shape

(1280, 1280)

In [79]:
u, s, v = np.linalg.svd(sigma)
s[0:10]

array([290.26010168, 124.8123434 ,  82.56208729,  80.27729774,
        73.88226833,  66.29377796,  54.51859608,  46.55778631,
        34.96512113,  34.39871901])

In [80]:
s_ratio = np.cumsum(s)/sum(s)

In [82]:
s_ratio[50]

0.9669857937816018

In [83]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
dat_t34_scaled_reduced = pca.fit_transform(dat_t34_scaled)

In [84]:
np.cumsum(pca.explained_variance_ratio_)

array([0.22515742, 0.3219754 , 0.38601947, 0.44829124, 0.50560236,
       0.5570271 , 0.59931767, 0.63543296, 0.6625557 , 0.68923914,
       0.71403056, 0.736706  , 0.75732225, 0.77528924, 0.7912302 ,
       0.8059437 , 0.819023  , 0.83142656, 0.8420159 , 0.85180455,
       0.8610204 , 0.86941177, 0.8773987 , 0.884699  , 0.8911087 ,
       0.89700216, 0.90252906, 0.90767896, 0.9122206 , 0.9165678 ,
       0.9207648 , 0.9247405 , 0.92830014, 0.9315554 , 0.93467546,
       0.93752736, 0.9402053 , 0.9427276 , 0.9452007 , 0.9475157 ,
       0.9497772 , 0.9518697 , 0.95392406, 0.9558173 , 0.9576544 ,
       0.9593626 , 0.9610177 , 0.9625949 , 0.96407866, 0.96551305],
      dtype=float32)

In [85]:
np.save("../../out/201212/embedding/esm_balanced/t12_normalized/kif_all.npy", dat_t12_scaled)
np.save("../../out/201212/embedding/esm_balanced/t12_normalized_reduced/kif_all.npy", dat_t12_scaled_reduced)
np.save("../../out/201212/embedding/esm_balanced/t34_normalized/kif_all.npy", dat_t34_scaled)
np.save("../../out/201212/embedding/esm_balanced/t34_normalized_reduced/kif_all.npy", dat_t34_scaled_reduced)

In [86]:
print(dat_t12_scaled.shape)
print(dat_t34_scaled.shape)

(141, 768)
(141, 1280)


In [88]:
np.save("../../out/201212/embedding/esm_balanced/t12/kif_all.npy", dat_t12)
np.save("../../out/201212/embedding/esm_balanced/t34/kif_all.npy", dat_t34)


In [145]:
dat_t12_dyn_kin = np.load("../../data/kif//kif_all_t12_dyn_kin.npy")
dat_t12_kin_both = np.load("../../data/kif//kif_all_t12_kin_both.npy")
dat_t12_kin_kif = np.load("../../data/kif//kif_all_t12_kin_kif.npy")

dat_t12_dyn_kin = dat_t12_dyn_kin[kif_acc_all_balanced["Unnamed: 0"],:]
dat_t12_kin_both = dat_t12_kin_both[kif_acc_all_balanced["Unnamed: 0"],:]
dat_t12_kin_kif = dat_t12_kin_kif[kif_acc_all_balanced["Unnamed: 0"],:]

In [146]:
scaler = StandardScaler()
scaler.fit(dat_t12_dyn_kin)
dat_t12_dyn_kin_scaled = scaler.transform(dat_t12_dyn_kin)

scaler = StandardScaler()
scaler.fit(dat_t12_kin_both)
dat_t12_kin_both_scaled = scaler.transform(dat_t12_kin_both)

scaler = StandardScaler()
scaler.fit(dat_t12_kin_kif)
dat_t12_kin_kif_scaled = scaler.transform(dat_t12_kin_kif)


In [147]:
sigma = np.cov(dat_t12_dyn_kin_scaled.T)
u, s, v = np.linalg.svd(sigma)
s_ratio = np.cumsum(s)/sum(s)
print(s_ratio[90])
s_ratio[0:10]

0.983067490613334


array([0.09112677, 0.17288392, 0.23263112, 0.28678037, 0.33424339,
       0.37714119, 0.41491219, 0.45247715, 0.4862294 , 0.51632342])

In [148]:
sigma = np.cov(dat_t12_kin_both_scaled.T)
u, s, v = np.linalg.svd(sigma)
s_ratio = np.cumsum(s)/sum(s)
print(s_ratio[90])
s_ratio[0:10]

0.9851216024923916


array([0.13792925, 0.21290178, 0.27475619, 0.32502268, 0.37151055,
       0.41484069, 0.45324263, 0.48678509, 0.51799493, 0.54648689])

In [149]:
sigma = np.cov(dat_t12_kin_kif_scaled.T)
u, s, v = np.linalg.svd(sigma)
s_ratio = np.cumsum(s)/sum(s)
print(s_ratio[90])
s_ratio[0:10]

0.986893532956967


array([0.10400307, 0.18398372, 0.24928287, 0.3067526 , 0.35622227,
       0.39941743, 0.43967839, 0.47852036, 0.51382815, 0.54353139])

In [150]:
pca = PCA(n_components=90)
dat_t12_kin_kif_scaled_reduced = pca.fit_transform(dat_t12_kin_kif_scaled)

In [151]:
pca = PCA(n_components=90)
dat_t12_dyn_kin_scaled_reduced = pca.fit_transform(dat_t12_dyn_kin_scaled)

In [152]:
pca = PCA(n_components=90)
dat_t12_kin_both_scaled_reduced = pca.fit_transform(dat_t12_kin_both_scaled)

In [154]:
dat_t12_kin_both_scaled_reduced.shape

(141, 90)

In [153]:
np.save("../../out/201212/embedding/esm_balanced/t12_evo/kif_all_t12_kin_kif_scaled_reduced.npy", dat_t12_kin_kif_scaled_reduced)
np.save("../../out/201212/embedding/esm_balanced/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy", dat_t12_dyn_kin_scaled_reduced)
np.save("../../out/201212/embedding/esm_balanced/t12_evo/kif_all_t12_kin_both_scaled_reduced.npy", dat_t12_kin_both_scaled_reduced)

np.save("../../out/201212/embedding/esm_balanced/t12_evo/kif_all_t12_kin_kif_scaled.npy", dat_t12_kin_kif_scaled)
np.save("../../out/201212/embedding/esm_balanced/t12_evo/kif_all_t12_dyn_kin_scaled.npy", dat_t12_dyn_kin_scaled)
np.save("../../out/201212/embedding/esm_balanced/t12_evo/kif_all_t12_kin_both_scaled.npy", dat_t12_kin_both_scaled)
