In [47]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.decomposition import PCA
import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn settings: white style, notebook context, 14x10 default figure size.
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

In [48]:
import glob
import time 

In [32]:
# Configuration for the full-dataset ESM embeddings.
# data_names: base names of the .npy embedding files to project.
data_names = ["kif_all"]
# tuning_dir: sub-directory names; each one is both an input folder under
# hn_dir and an output folder under every *_out_dir below.
tuning_dir = [
    "t12",
    "t34",
    "t12_normalized",
    "t34_normalized",
    "t12_normalized_reduced",
    "t34_normalized_reduced",
]
# Root folder holding the input embeddings.
hn_dir = "../../out/201212/embedding/esm_models"
# Output roots for the 2D projections (note the trailing slash).
pca_out_dir = "../../out/201212/pca/"
tsne_out_dir = "../../out/201212/tsne/"
umap_out_dir = "../../out/201212/umap/"

In [34]:
# Make one output directory per tuning method under each of the PCA, t-SNE
# and UMAP output roots.
import os, sys

for tuning_dir_ in tuning_dir:
    pca_path = pca_out_dir + tuning_dir_
    tsne_path = tsne_out_dir + tuning_dir_
    umap_path = umap_out_dir + tuning_dir_
    print(pca_path)
    # makedirs(exist_ok=True) also creates missing parent folders and is a
    # no-op when the folder already exists, so this cell can be re-run safely
    # (os.mkdir raised FileExistsError / FileNotFoundError in those cases).
    for out_path in (pca_path, tsne_path, umap_path):
        os.makedirs(out_path, exist_ok=True)

../../out/201212/pca/t12
../../out/201212/pca/t34
../../out/201212/pca/t12_normalized
../../out/201212/pca/t34_normalized
../../out/201212/pca/t12_normalized_reduced
../../out/201212/pca/t34_normalized_reduced


In [37]:
# Project every embedding matrix to 2D with PCA, t-SNE and UMAP, then save
# the coordinates as .npy files under the matching output directory.

# Fixed seed: the randomized PCA solver, t-SNE and UMAP are all stochastic, so
# without it the saved coordinates change on every run. Note that umap-learn
# runs single-threaded when random_state is set.
random_seed = 0

for tuning_dir_ in tuning_dir:
    print(tuning_dir_)
    # Load one embedding matrix per dataset name for this tuning method.
    dats = []
    for dat in data_names:
        # Built from hn_dir so the configuration cell controls the input root.
        d_path = hn_dir + '/' + tuning_dir_ + '/' + dat + '.npy'
        d = np.load(d_path)
        dats.append(d)
    print(dats[0].shape)

    # first, embed each of the dataset themselves
    pca_res = []
    tsne_res = []
    umap_res = []
    for i, dat in enumerate(dats):
        print(data_names[i])
        start_time = time.time()
        pca_curr = PCA(n_components=2, random_state=random_seed).fit_transform(dat)
        pca_res.append(pca_curr)
        # t-SNE is run on a 20-component PCA projection, not on the raw features.
        tsne_curr = PCA(n_components=20, random_state=random_seed).fit_transform(dat)
        tsne_curr = TSNE(n_components=2, random_state=random_seed).fit_transform(tsne_curr)
        tsne_res.append(tsne_curr)
        reducer = umap.UMAP(random_state=random_seed)
        umap_full = reducer.fit_transform(dat)
        umap_res.append(umap_full)
        # Elapsed time covers all three projections for this dataset.
        print("--- %s seconds ---" % (time.time() - start_time))

    # Save each projection as <out_dir><tuning_dir_>/<dat_name>.npy
    for i, dat_name in enumerate(data_names):
        pca_out_path = pca_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        tsne_out_path = tsne_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        umap_out_path = umap_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        print(pca_out_path)
        print(tsne_out_path)
        print(umap_out_path)
        np.save(pca_out_path, pca_res[i])
        np.save(tsne_out_path, tsne_res[i])
        np.save(umap_out_path, umap_res[i])

t12
(623, 768)
kif_all
--- 4.01861047744751 seconds ---
../../out/201212/pca/t12/kif_all.npy
../../out/201212/tsne/t12/kif_all.npy
../../out/201212/umap/t12/kif_all.npy
t34
(623, 1280)
kif_all
--- 4.065727949142456 seconds ---
../../out/201212/pca/t34/kif_all.npy
../../out/201212/tsne/t34/kif_all.npy
../../out/201212/umap/t34/kif_all.npy
t12_normalized
(623, 768)
kif_all
--- 3.9619154930114746 seconds ---
../../out/201212/pca/t12_normalized/kif_all.npy
../../out/201212/tsne/t12_normalized/kif_all.npy
../../out/201212/umap/t12_normalized/kif_all.npy
t34_normalized
(623, 1280)
kif_all
--- 4.127148389816284 seconds ---
../../out/201212/pca/t34_normalized/kif_all.npy
../../out/201212/tsne/t34_normalized/kif_all.npy
../../out/201212/umap/t34_normalized/kif_all.npy
t12_normalized_reduced
(623, 75)
kif_all
--- 3.9038920402526855 seconds ---
../../out/201212/pca/t12_normalized_reduced/kif_all.npy
../../out/201212/tsne/t12_normalized_reduced/kif_all.npy
../../out/201212/umap/t12_normalized_redu

In [38]:
# Prints a fixed "done" string; presumably a marker that the preceding cell finished.
print("done")

done


## Generate 2D visualization embeddings for the evotuned t12 embeddings

In [49]:
# Configuration for the evotuned t12 embeddings.
# data_names: base names of the .npy embedding files inside the t12_evo folder.
data_names = [
    "kif_all_t12_dyn_kin_scaled",
    "kif_all_t12_dyn_kin_scaled_reduced",
    "kif_all_t12_kin_both_scaled",
    "kif_all_t12_kin_both_scaled_reduced",
    "kif_all_t12_kin_kif_scaled",
    "kif_all_t12_kin_kif_scaled_reduced",
]
# Single tuning sub-directory, used for both input and output paths.
tuning_dir = ["t12_evo"]
# Root folder holding the input embeddings.
hn_dir = "../../out/201212/embedding/esm_models"
# Output roots for the 2D projections (note the trailing slash).
pca_out_dir = "../../out/201212/pca/"
tsne_out_dir = "../../out/201212/tsne/"
umap_out_dir = "../../out/201212/umap/"

In [50]:
# Make one output directory per tuning method under each of the PCA, t-SNE
# and UMAP output roots.
import os, sys

for tuning_dir_ in tuning_dir:
    pca_path = pca_out_dir + tuning_dir_
    tsne_path = tsne_out_dir + tuning_dir_
    umap_path = umap_out_dir + tuning_dir_
    print(pca_path)
    # makedirs(exist_ok=True) also creates missing parent folders and is a
    # no-op when the folder already exists, so this cell can be re-run safely
    # (os.mkdir raised FileExistsError / FileNotFoundError in those cases).
    for out_path in (pca_path, tsne_path, umap_path):
        os.makedirs(out_path, exist_ok=True)

../../out/201212/pca/t12_evo


In [52]:
# Project every evotuned embedding matrix to 2D with PCA, t-SNE and UMAP, then
# save the coordinates as .npy files under the matching output directory.

# Fixed seed: the randomized PCA solver, t-SNE and UMAP are all stochastic, so
# without it the saved coordinates change on every run. Note that umap-learn
# runs single-threaded when random_state is set.
random_seed = 0

for tuning_dir_ in tuning_dir:
    print(tuning_dir_)
    # Load one embedding matrix per dataset name for this tuning method.
    dats = []
    for dat in data_names:
        # Built from hn_dir so the configuration cell controls the input root.
        d_path = hn_dir + '/' + tuning_dir_ + '/' + dat + '.npy'
        d = np.load(d_path)
        dats.append(d)
    print(dats[0].shape)

    # first, embed each of the dataset themselves
    pca_res = []
    tsne_res = []
    umap_res = []
    for i, dat in enumerate(dats):
        print(data_names[i])
        start_time = time.time()
        pca_curr = PCA(n_components=2, random_state=random_seed).fit_transform(dat)
        pca_res.append(pca_curr)
        # t-SNE is run on a 20-component PCA projection, not on the raw features.
        tsne_curr = PCA(n_components=20, random_state=random_seed).fit_transform(dat)
        tsne_curr = TSNE(n_components=2, random_state=random_seed).fit_transform(tsne_curr)
        tsne_res.append(tsne_curr)
        reducer = umap.UMAP(random_state=random_seed)
        umap_full = reducer.fit_transform(dat)
        umap_res.append(umap_full)
        # Elapsed time covers all three projections for this dataset.
        print("--- %s seconds ---" % (time.time() - start_time))

    # Save each projection as <out_dir><tuning_dir_>/<dat_name>.npy
    for i, dat_name in enumerate(data_names):
        pca_out_path = pca_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        tsne_out_path = tsne_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        umap_out_path = umap_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        print(pca_out_path)
        print(tsne_out_path)
        print(umap_out_path)
        np.save(pca_out_path, pca_res[i])
        np.save(tsne_out_path, tsne_res[i])
        np.save(umap_out_path, umap_res[i])

t12_evo
(623, 768)
kif_all_t12_dyn_kin_scaled
--- 3.7123987674713135 seconds ---
kif_all_t12_dyn_kin_scaled_reduced
--- 3.8351075649261475 seconds ---
kif_all_t12_kin_both_scaled
--- 3.8568196296691895 seconds ---
kif_all_t12_kin_both_scaled_reduced
--- 4.0791497230529785 seconds ---
kif_all_t12_kin_kif_scaled
--- 4.209481954574585 seconds ---
kif_all_t12_kin_kif_scaled_reduced
--- 4.006385326385498 seconds ---
../../out/201212/pca/t12_evo/kif_all_t12_dyn_kin_scaled.npy
../../out/201212/tsne/t12_evo/kif_all_t12_dyn_kin_scaled.npy
../../out/201212/umap/t12_evo/kif_all_t12_dyn_kin_scaled.npy
../../out/201212/pca/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy
../../out/201212/tsne/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy
../../out/201212/umap/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy
../../out/201212/pca/t12_evo/kif_all_t12_kin_both_scaled.npy
../../out/201212/tsne/t12_evo/kif_all_t12_kin_both_scaled.npy
../../out/201212/umap/t12_evo/kif_all_t12_kin_both_scaled.npy
../../out/20

## Subsample each kif family to build a balanced dataset, then compute the 2D embeddings (PCA, t-SNE, UMAP)

In [12]:
# Load the two kif tables from CSV; the reads are independent of each other.
kif_uniprot_all = pd.read_csv(filepath_or_buffer="../../data/kif/kif_uniprot_all.csv")
kif_acc_all = pd.read_csv(filepath_or_buffer="../../data/kif/kif_acc_all.csv")

In [20]:
# Keep only rows whose db_name is "kif_jp", then take the first 10 rows (in
# file order) of every (db_name, kinesin_family) group; groups with fewer
# than 10 rows are kept whole.
kif_acc_all_balanced = (
    kif_acc_all[kif_acc_all["db_name"] == "kif_jp"]
    .groupby(["db_name", "kinesin_family"])
    .head(10)
)

In [21]:
# Sanity check: number of rows kept per (db_name, kinesin_family) group in the
# balanced table. The mask is built from kif_acc_all_balanced itself; the
# earlier version took it from the full kif_acc_all frame and only worked
# because pandas aligns a boolean Series to the indexed frame's index.
kif_acc_all_balanced.loc[kif_acc_all_balanced["db_name"]=="kif_jp",:].groupby(["db_name","kinesin_family"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Entry,db_acc
db_name,kinesin_family,Unnamed: 2_level_1,Unnamed: 3_level_1
kif_jp,1,10,10
kif_jp,2,10,10
kif_jp,3,10,10
kif_jp,4,10,10
kif_jp,5,10,10
kif_jp,6,10,10
kif_jp,7,10,10
kif_jp,8,10,10
kif_jp,9,8,8
kif_jp,10,10,10


In [23]:
# Write the balanced table to CSV; index=True (the pandas default) stores the
# row index as the first, unnamed column.
kif_acc_all_balanced.to_csv(
    path_or_buf="../../data/kif/kif_acc_all_balanced.csv",
    index=True,
)

In [41]:
# Configuration for the balanced-subset ESM embeddings.
# data_names: base names of the .npy embedding files to project.
data_names = ["kif_all"]
# tuning_dir: sub-directory names; each one is both an input folder under
# hn_dir and an output folder under every *_out_dir below.
tuning_dir = [
    "t12",
    "t34",
    "t12_normalized",
    "t34_normalized",
    "t12_normalized_reduced",
    "t34_normalized_reduced",
]
# Root folder holding the balanced input embeddings.
hn_dir = "../../out/201212/embedding/esm_balanced"
# Output roots for the balanced 2D projections (note the trailing slash).
pca_out_dir = "../../out/201212/balanced/pca/"
tsne_out_dir = "../../out/201212/balanced/tsne/"
umap_out_dir = "../../out/201212/balanced/umap/"

In [42]:
# Make one output directory per tuning method under each of the PCA, t-SNE
# and UMAP output roots.
import os, sys

for tuning_dir_ in tuning_dir:
    pca_path = pca_out_dir + tuning_dir_
    tsne_path = tsne_out_dir + tuning_dir_
    umap_path = umap_out_dir + tuning_dir_
    print(pca_path)
    # makedirs(exist_ok=True) also creates missing parent folders (e.g. the
    # intermediate "balanced" level) and is a no-op when the folder already
    # exists, so this cell can be re-run safely (os.mkdir raised
    # FileExistsError / FileNotFoundError in those cases).
    for out_path in (pca_path, tsne_path, umap_path):
        os.makedirs(out_path, exist_ok=True)

../../out/201212/balanced/pca/t12
../../out/201212/balanced/pca/t34
../../out/201212/balanced/pca/t12_normalized
../../out/201212/balanced/pca/t34_normalized
../../out/201212/balanced/pca/t12_normalized_reduced
../../out/201212/balanced/pca/t34_normalized_reduced


In [45]:
# Project every balanced embedding matrix to 2D with PCA, t-SNE and UMAP, then
# save the coordinates as .npy files under the matching output directory.

# Fixed seed: the randomized PCA solver, t-SNE and UMAP are all stochastic, so
# without it the saved coordinates change on every run. Note that umap-learn
# runs single-threaded when random_state is set.
random_seed = 0

for tuning_dir_ in tuning_dir:
    print(tuning_dir_)
    # Load one embedding matrix per dataset name for this tuning method.
    dats = []
    for dat in data_names:
        # Built from hn_dir so the configuration cell controls the input root.
        d_path = hn_dir + '/' + tuning_dir_ + '/' + dat + '.npy'
        d = np.load(d_path)
        dats.append(d)
    print(dats[0].shape)

    # first, embed each of the dataset themselves
    pca_res = []
    tsne_res = []
    umap_res = []
    for i, dat in enumerate(dats):
        print(data_names[i])
        start_time = time.time()
        pca_curr = PCA(n_components=2, random_state=random_seed).fit_transform(dat)
        pca_res.append(pca_curr)
        # t-SNE is run on a 20-component PCA projection, not on the raw features.
        tsne_curr = PCA(n_components=20, random_state=random_seed).fit_transform(dat)
        tsne_curr = TSNE(n_components=2, random_state=random_seed).fit_transform(tsne_curr)
        tsne_res.append(tsne_curr)
        reducer = umap.UMAP(random_state=random_seed)
        umap_full = reducer.fit_transform(dat)
        umap_res.append(umap_full)
        # Elapsed time covers all three projections for this dataset.
        print("--- %s seconds ---" % (time.time() - start_time))

    # Save each projection as <out_dir><tuning_dir_>/<dat_name>.npy
    for i, dat_name in enumerate(data_names):
        pca_out_path = pca_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        tsne_out_path = tsne_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        umap_out_path = umap_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        print(pca_out_path)
        print(tsne_out_path)
        print(umap_out_path)
        np.save(pca_out_path, pca_res[i])
        np.save(tsne_out_path, tsne_res[i])
        np.save(umap_out_path, umap_res[i])

t12
(141, 768)
kif_all
--- 2.2646279335021973 seconds ---
../../out/201212/balanced/pca/t12/kif_all.npy
../../out/201212/balanced/tsne/t12/kif_all.npy
../../out/201212/balanced/umap/t12/kif_all.npy
t34
(141, 1280)
kif_all
--- 2.3942694664001465 seconds ---
../../out/201212/balanced/pca/t34/kif_all.npy
../../out/201212/balanced/tsne/t34/kif_all.npy
../../out/201212/balanced/umap/t34/kif_all.npy
t12_normalized
(141, 768)
kif_all
--- 2.3643734455108643 seconds ---
../../out/201212/balanced/pca/t12_normalized/kif_all.npy
../../out/201212/balanced/tsne/t12_normalized/kif_all.npy
../../out/201212/balanced/umap/t12_normalized/kif_all.npy
t34_normalized
(141, 1280)
kif_all
--- 2.2972822189331055 seconds ---
../../out/201212/balanced/pca/t34_normalized/kif_all.npy
../../out/201212/balanced/tsne/t34_normalized/kif_all.npy
../../out/201212/balanced/umap/t34_normalized/kif_all.npy
t12_normalized_reduced
(141, 50)
kif_all
--- 2.432793378829956 seconds ---
../../out/201212/balanced/pca/t12_normalize

In [46]:
# Prints a fixed "done" string; presumably a marker that the preceding cell finished.
print("done")

done


In [54]:
# Configuration for the balanced-subset evotuned t12 embeddings.
# hn_dir names the esm_balanced tree: this section loads the balanced
# embeddings and writes under the "balanced" output roots, so the previous
# esm_models value was a stale copy from the unbalanced section.
hn_dir = "../../out/201212/embedding/esm_balanced"
# Single tuning sub-directory, used for both input and output paths.
tuning_dir = ["t12_evo"]
# Output roots for the balanced 2D projections (note the trailing slash).
tsne_out_dir = "../../out/201212/balanced/tsne/"
pca_out_dir = "../../out/201212/balanced/pca/"
umap_out_dir = "../../out/201212/balanced/umap/"
# data_names: base names of the .npy embedding files inside the t12_evo folder.
data_names = [
    "kif_all_t12_dyn_kin_scaled",
    "kif_all_t12_dyn_kin_scaled_reduced",
    "kif_all_t12_kin_both_scaled",
    "kif_all_t12_kin_both_scaled_reduced",
    "kif_all_t12_kin_kif_scaled",
    "kif_all_t12_kin_kif_scaled_reduced",
]

In [55]:
# Make one output directory per tuning method under each of the PCA, t-SNE
# and UMAP output roots.
for tuning_dir_ in tuning_dir:
    pca_path = pca_out_dir + tuning_dir_
    tsne_path = tsne_out_dir + tuning_dir_
    umap_path = umap_out_dir + tuning_dir_
    print(pca_path)
    # makedirs(exist_ok=True) also creates missing parent folders and is a
    # no-op when the folder already exists, so this cell can be re-run safely
    # (os.mkdir raised FileExistsError / FileNotFoundError in those cases).
    for out_path in (pca_path, tsne_path, umap_path):
        os.makedirs(out_path, exist_ok=True)

../../out/201212/balanced/pca/t12_evo


In [56]:
# Project every balanced evotuned embedding matrix to 2D with PCA, t-SNE and
# UMAP, then save the coordinates as .npy files under the matching output
# directory.

# Fixed seed: the randomized PCA solver, t-SNE and UMAP are all stochastic, so
# without it the saved coordinates change on every run. Note that umap-learn
# runs single-threaded when random_state is set.
random_seed = 0

for tuning_dir_ in tuning_dir:
    print(tuning_dir_)
    # Load one embedding matrix per dataset name for this tuning method.
    dats = []
    for dat in data_names:
        # The esm_balanced root is spelled out here on purpose: these inputs
        # must come from the balanced tree to match the balanced output roots.
        d_path = '../../out/201212/embedding/esm_balanced/'+tuning_dir_ +'/'+dat+ '.npy'
        d = np.load(d_path)
        dats.append(d)
    print(dats[0].shape)

    # first, embed each of the dataset themselves
    pca_res = []
    tsne_res = []
    umap_res = []
    for i, dat in enumerate(dats):
        print(data_names[i])
        start_time = time.time()
        pca_curr = PCA(n_components=2, random_state=random_seed).fit_transform(dat)
        pca_res.append(pca_curr)
        # t-SNE is run on a 20-component PCA projection, not on the raw features.
        tsne_curr = PCA(n_components=20, random_state=random_seed).fit_transform(dat)
        tsne_curr = TSNE(n_components=2, random_state=random_seed).fit_transform(tsne_curr)
        tsne_res.append(tsne_curr)
        reducer = umap.UMAP(random_state=random_seed)
        umap_full = reducer.fit_transform(dat)
        umap_res.append(umap_full)
        # Elapsed time covers all three projections for this dataset.
        print("--- %s seconds ---" % (time.time() - start_time))

    # Save each projection as <out_dir><tuning_dir_>/<dat_name>.npy
    for i, dat_name in enumerate(data_names):
        pca_out_path = pca_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        tsne_out_path = tsne_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        umap_out_path = umap_out_dir + tuning_dir_ + "/" + dat_name + ".npy"
        print(pca_out_path)
        print(tsne_out_path)
        print(umap_out_path)
        np.save(pca_out_path, pca_res[i])
        np.save(tsne_out_path, tsne_res[i])
        np.save(umap_out_path, umap_res[i])

t12_evo
(141, 768)
kif_all_t12_dyn_kin_scaled
--- 2.466611623764038 seconds ---
kif_all_t12_dyn_kin_scaled_reduced
--- 2.776846408843994 seconds ---
kif_all_t12_kin_both_scaled
--- 2.4373815059661865 seconds ---
kif_all_t12_kin_both_scaled_reduced
--- 2.5017733573913574 seconds ---
kif_all_t12_kin_kif_scaled
--- 2.773343801498413 seconds ---
kif_all_t12_kin_kif_scaled_reduced
--- 2.815920829772949 seconds ---
../../out/201212/balanced/pca/t12_evo/kif_all_t12_dyn_kin_scaled.npy
../../out/201212/balanced/tsne/t12_evo/kif_all_t12_dyn_kin_scaled.npy
../../out/201212/balanced/umap/t12_evo/kif_all_t12_dyn_kin_scaled.npy
../../out/201212/balanced/pca/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy
../../out/201212/balanced/tsne/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy
../../out/201212/balanced/umap/t12_evo/kif_all_t12_dyn_kin_scaled_reduced.npy
../../out/201212/balanced/pca/t12_evo/kif_all_t12_kin_both_scaled.npy
../../out/201212/balanced/tsne/t12_evo/kif_all_t12_kin_both_scaled.npy
../.