# Documentation
#### 201111
This code chunk normalizes the hidden states of the transformer_enocoder_mlm model and the ESM-related models, then generates new PCA and t-SNE embeddings for the corresponding hidden states so that they can be readily visualized as before.

The folder structure should follow the same layout as before so that the results can be readily visualized:

pca/tsne       
---> all the model names ---> motor_toolkit, kinesin_labelled, balanced, random, target    
   

In [5]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Part one: Normalize the hidden states

In [6]:
# load hidden states to be processed
# /out/201102/embedding/esm_models/*/*.npy


## A test trial on the pfamA_balanced embedding from the t12 ESM model

In [8]:
# Load the raw (unstandardized) pfamA_balanced hidden states of the t12 model.
# NOTE(review): this cell uses a "../out/" prefix while later cells use
# "../../out/" — presumably it was run from a different working directory; confirm.
hn_balanced = np.load("../out/201102/embedding/esm_models/t12/pfamA_balanced.npy")

In [9]:
# Singular value decomposition of the raw embedding; only the singular values
# are inspected here (u and v are not used further in this cell).
u, s, v = np.linalg.svd(hn_balanced)
# Display the ten leading singular values (np.linalg.svd returns them in descending order).
s[0:10]

array([4554.556  ,  907.02954,  556.1    ,  469.8737 ,  382.7925 ,
        343.15863,  296.84637,  238.43242,  220.95883,  215.51518],
      dtype=float32)

In [10]:
# Standardize the t12 embedding with a scaler fitted on this same array,
# then recompute the decomposition to compare the singular-value spectrum
# against the raw one.
scaler = StandardScaler()
hn_balanced = scaler.fit_transform(hn_balanced)
u, s, v = np.linalg.svd(hn_balanced)
# Ten leading singular values after standardization.
s[:10]

array([1836.8103 , 1149.1133 , 1001.7509 ,  830.58234,  811.7272 ,
        725.1434 ,  570.95764,  539.11926,  532.85425,  481.0784 ],
      dtype=float32)

## A test trial on the pfamA_balanced embedding from the t34 ESM model

In [11]:
# Load the raw (unstandardized) pfamA_balanced hidden states of the t34 model.
# NOTE(review): this cell uses a "../out/" prefix while later cells use
# "../../out/" — presumably it was run from a different working directory; confirm.
hn_balanced = np.load("../out/201102/embedding/esm_models/t34/pfamA_balanced.npy")

In [12]:
# Singular value decomposition of the raw embedding; only the singular values
# are inspected here (u and v are not used further in this cell).
u, s, v = np.linalg.svd(hn_balanced)
# Display the ten leading singular values (np.linalg.svd returns them in descending order).
s[0:10]

array([10160.743  ,  1458.9883 ,  1320.6609 ,  1001.17737,   841.4909 ,
         809.6085 ,   659.71814,   566.5445 ,   499.0098 ,   429.91965],
      dtype=float32)

In [13]:
# Standardize the t34 embedding with a scaler fitted on this same array,
# then recompute the decomposition to compare the singular-value spectrum
# against the raw one.
scaler = StandardScaler()
hn_balanced = scaler.fit_transform(hn_balanced)
u, s, v = np.linalg.svd(hn_balanced)
# Ten leading singular values after standardization.
s[:10]

array([1803.6572 , 1605.1532 , 1445.1599 , 1155.7971 ,  997.59686,
        961.9244 ,  880.9136 ,  751.4918 ,  638.22876,  594.2315 ],
      dtype=float32)

### From the above trials on the ESM models, it is clear that standardization reduced the dominance of the first principal component (the leading singular value shrinks markedly relative to the others), but it could potentially discard information carried by scale differences between features.

## A helper function to normalize each dataset

In [20]:
def normalize_hn(hn_path, out_path):
    """Standardize a saved hidden-state array and write the result to disk.

    The array stored at ``hn_path`` is loaded, rescaled with a
    ``StandardScaler`` fitted on that same array, and the transformed
    array is saved to ``out_path`` with ``np.save``.

    Args:
        hn_path: Path of the ``.npy`` file holding the hidden states.
        out_path: Destination path for the standardized array.

    Returns:
        None.
    """
    raw_hn = np.load(hn_path)
    # Fit and apply in one step; the scaler is not kept after the call.
    standardized_hn = StandardScaler().fit_transform(raw_hn)
    np.save(out_path, standardized_hn)

In [18]:
# Root directory of the raw hidden states; the normalization loop reads
# <hn_dir><model>/<dataset>.npy from here.
hn_dir = "../../out/201102/embedding/esm_models/"
# Root directory for the standardized hidden states (same <model>/<dataset>.npy layout).
out_dir = "../../out/201102/embedding/esm_models_normalized/"
# Model sub-folder names iterated over when creating folders and normalizing.
tuning_dir = ["t12_balanced","t12_kinesin","t12_motor_toolkit","t12","t34"]
# Output roots for the t-SNE and PCA embeddings of the normalized hidden states;
# presumably consumed by later cells — not used in the normalization step itself.
tsne_out_dir = "../../out/201102/normalized/tsne/"
pca_out_dir = "../../out/201102/normalized/pca/"
# Dataset file stems (without the ".npy" extension) processed for every model.
data_names = ["pfamA_random","motor_toolkit","pfamA_balanced","pfamA_target","kinesin_labelled"]

In [14]:
import os
import sys

# Create one output sub-folder per model under out_dir so that the
# standardized arrays can be saved there.
for tuning_dir_ in tuning_dir:
    out_path = out_dir+tuning_dir_
    print(out_path)
    # os.mkdir raised FileExistsError when this cell was re-run and
    # FileNotFoundError when out_dir itself was missing; makedirs with
    # exist_ok=True creates missing parents and is safe to repeat.
    os.makedirs(out_path, exist_ok=True)

../../out/201102/embedding/esm_models_normalized/t12_balanced
../../out/201102/embedding/esm_models_normalized/t12_kinesin
../../out/201102/embedding/esm_models_normalized/t12_motor_toolkit
../../out/201102/embedding/esm_models_normalized/t12
../../out/201102/embedding/esm_models_normalized/t34


In [21]:
# Standardize every (model, dataset) hidden-state file and echo where the
# result was written.
for t in tuning_dir:
    for d in data_names:
        hn_path = f"{hn_dir}{t}/{d}.npy"
        out_path = f"{out_dir}{t}/{d}.npy"
        normalize_hn(hn_path, out_path)
        print(out_path)

../../out/201102/embedding/esm_models_normalized/t12_balanced/pfamA_random.npy
../../out/201102/embedding/esm_models_normalized/t12_balanced/motor_toolkit.npy
../../out/201102/embedding/esm_models_normalized/t12_balanced/pfamA_balanced.npy
../../out/201102/embedding/esm_models_normalized/t12_balanced/pfamA_target.npy
../../out/201102/embedding/esm_models_normalized/t12_balanced/kinesin_labelled.npy
../../out/201102/embedding/esm_models_normalized/t12_kinesin/pfamA_random.npy
../../out/201102/embedding/esm_models_normalized/t12_kinesin/motor_toolkit.npy
../../out/201102/embedding/esm_models_normalized/t12_kinesin/pfamA_balanced.npy
../../out/201102/embedding/esm_models_normalized/t12_kinesin/pfamA_target.npy
../../out/201102/embedding/esm_models_normalized/t12_kinesin/kinesin_labelled.npy
../../out/201102/embedding/esm_models_normalized/t12_motor_toolkit/pfamA_random.npy
../../out/201102/embedding/esm_models_normalized/t12_motor_toolkit/motor_toolkit.npy
../../out/201102/embedding/esm_m

## sanity check to make sure that normalization worked

In [23]:
# Reload the standardized t34 pfamA_balanced array from disk and inspect its
# ten leading singular values; they should match the values obtained earlier
# from the in-memory StandardScaler run on the same data.
hn_balanced = np.load("../../out/201102/embedding/esm_models_normalized/t34/pfamA_balanced.npy")
u, s, v = np.linalg.svd(hn_balanced)
s[0:10]

array([1803.6572 , 1605.1532 , 1445.1599 , 1155.7971 ,  997.59686,
        961.9244 ,  880.9136 ,  751.4918 ,  638.22876,  594.2315 ],
      dtype=float32)