# Documentation
> From the t-SNE analysis of the transformer encoder model, the transformer may be outputting too many hidden dimensions, which could explain why the PCA and the subsequent t-SNE show a linear pattern. Perform an SVD to see how much of the variance each PC accounts for.

In [2]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

## Transformer Encoder

In [3]:
# Load the transformer-encoder hidden states for the three datasets.
# mmap_mode="r" maps each .npy file read-only instead of reading it fully into
# RAM; the SVD cells in this section only use row slices of the two large arrays.
hn_motortoolkit = np.load("../data/hn_transformerencoder_motortoolkit.npy", mmap_mode="r")
hn_pfammotors = np.load("../data/hn_transformerencoder_pfammotors.npy", mmap_mode="r")
hn_dfdev = np.load("../data/hn_transformerencoder_dfdev.npy", mmap_mode="r")
# Print each array's shape as a quick sanity check.
print(hn_motortoolkit.shape)
print(hn_pfammotors.shape)
print(hn_dfdev.shape)

(3235, 768)
(1914831, 768)
(1212912, 768)


In [4]:
# SVD of the full hn_motortoolkit matrix. full_matrices=False returns the
# reduced factors, so the square (n_rows x n_rows) left factor is never
# allocated; the singular values `s` are the same either way.
# NOTE(review): the matrix is not mean-centred first, so this is not strictly
# PCA -- the leading singular value also absorbs the mean of the hidden states.
u, s, v = np.linalg.svd(hn_motortoolkit, full_matrices=False)

In [7]:
# Display the ten leading singular values (np.linalg.svd returns them in descending order).
s[:10]

array([1.6293489e+05, 2.0418205e+00, 1.3351089e+00, 1.0232561e+00,
       8.3068514e-01, 6.5283227e-01, 5.3178918e-01, 4.0863904e-01,
       3.9062384e-01, 3.7214699e-01], dtype=float32)

In [11]:
# SVD of a row subset of hn_pfammotors (the full array is too large to
# decompose directly). full_matrices=False skips the square
# (n_rows x n_rows) left factor; the singular values `s` are unchanged.
# NOTE(review): the slice starts at index 1, so row 0 is excluded and 9,999
# rows are used -- confirm this is intended rather than an off-by-one.
# NOTE(review): the rows are not mean-centred, so s[0] also reflects the mean.
u, s, v = np.linalg.svd(hn_pfammotors[1:10000, :], full_matrices=False)

In [12]:
# Display the ten leading singular values of the hn_pfammotors subset.
s[:10]

array([8.9091117e+04, 7.2454149e-01, 3.5801572e-01, 3.0840611e-01,
       2.6150218e-01, 2.3170654e-01, 1.9952329e-01, 1.9444086e-01,
       1.7441043e-01, 1.5912345e-01], dtype=float32)

In [4]:
# SVD of rows 110000..119999 of hn_dfdev. full_matrices=False returns the
# reduced factors instead of a square (10000 x 10000) left factor; the
# singular values `s` are unchanged.
# NOTE(review): the rows are not mean-centred, so s[0] also reflects the mean.
u, s, v = np.linalg.svd(hn_dfdev[110000:120000, :], full_matrices=False)

In [5]:
# Display the ten leading singular values of the hn_dfdev subset.
s[:10]

array([4.9420723e+04, 4.5896345e-01, 4.5610082e-01, 2.8848442e-01,
       2.5892946e-01, 2.1507658e-01, 2.0485921e-01, 1.7426261e-01,
       1.5409079e-01, 1.3732859e-01], dtype=float32)

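From the analysis, we can tell that a single principal component dominates all the others, so the hidden states are very nearly linearly dependent. Note that the SVD above was run on the raw (uncentered) hidden states, so the first singular value also includes their mean. The network needs to be randomized/further inspected to see whether next-token prediction is ineffective.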

Given the above, perform the same analysis on the LSTM5 and Seq2Seq results to see whether a single PC dominates entirely there as well.

## Seq2Seq

In [6]:
# Load the Seq2Seq-encoder hidden states for the three datasets.
# mmap_mode="r" maps each .npy file read-only rather than reading it fully
# into RAM; the SVD cells in this section only slice the two large arrays.
hn_motortoolkit = np.load("../data/hn_s2sencoder_motortoolkit.npy", mmap_mode="r")
hn_pfammotors = np.load("../data/hn_s2sencoder_pfammotors.npy", mmap_mode="r")
hn_dfdev = np.load("../data/hn_s2sencoder_dfdev.npy", mmap_mode="r")

In [7]:
# SVD of the full Seq2Seq hn_motortoolkit matrix. full_matrices=False avoids
# building the square (n_rows x n_rows) left factor; `s` is unchanged.
# NOTE(review): no mean-centring is applied before the decomposition.
u, s, v = np.linalg.svd(hn_motortoolkit, full_matrices=False)

In [8]:
# Display the ten leading singular values (returned in descending order).
s[:10]

array([257.47748 ,  85.86048 ,  56.964283,  40.470497,  37.40044 ,
        30.117552,  25.10778 ,  23.10032 ,  22.317842,  20.853163],
      dtype=float32)

In [9]:
# SVD of a row subset of the Seq2Seq hn_pfammotors array.
# full_matrices=False avoids allocating a square (19999 x 19999) left factor;
# the singular values `s` are unchanged.
# NOTE(review): the slice starts at index 1, so row 0 is excluded and 19,999
# rows are used -- confirm this is intended rather than an off-by-one.
u, s, v = np.linalg.svd(hn_pfammotors[1:20000, :], full_matrices=False)

In [10]:
# Display the ten leading singular values of the hn_pfammotors subset.
s[:10]

array([585.71173 , 277.25177 , 176.3308  , 112.73699 , 102.46996 ,
        96.50056 ,  77.918335,  73.30223 ,  59.386765,  55.59358 ],
      dtype=float32)

In [11]:
# SVD of rows 110000..119999 of the Seq2Seq hn_dfdev array.
# full_matrices=False returns reduced factors instead of a square
# (10000 x 10000) left factor; the singular values `s` are unchanged.
u, s, v = np.linalg.svd(hn_dfdev[110000:120000, :], full_matrices=False)

In [12]:
# Display the ten leading singular values of the hn_dfdev subset.
s[:10]

array([400.16098 , 190.04813 , 113.94885 ,  84.33569 ,  70.39741 ,
        68.85091 ,  58.15323 ,  51.65772 ,  43.141678,  41.207542],
      dtype=float32)

## LSTM5

In [13]:
# Load the LSTM5 hidden states for the three datasets.
# mmap_mode="r" maps each .npy file read-only rather than reading it fully
# into RAM; the SVD cells in this section only slice the two large arrays.
hn_motortoolkit = np.load("../data/hn_lstm5_motortoolkit.npy", mmap_mode="r")
hn_pfammotors = np.load("../data/hn_lstm5_pfammotors.npy", mmap_mode="r")
hn_dfdev = np.load("../data/hn_lstm5_dfdev.npy", mmap_mode="r")
# Print each array's shape as a quick sanity check.
print(hn_motortoolkit.shape)
print(hn_pfammotors.shape)
print(hn_dfdev.shape)

(3255, 256)
(1914831, 256)
(1212912, 256)


In [14]:
# SVD of the full LSTM5 hn_motortoolkit matrix. full_matrices=False avoids
# building the square (n_rows x n_rows) left factor; `s` is unchanged.
# NOTE(review): no mean-centring is applied before the decomposition.
u, s, v = np.linalg.svd(hn_motortoolkit, full_matrices=False)

In [15]:
# Display the ten leading singular values (returned in descending order).
s[:10]

array([349.7835  , 112.17764 ,  96.96205 ,  86.31042 ,  74.649895,
        59.51239 ,  55.253628,  52.919804,  51.304253,  50.316635],
      dtype=float32)

In [16]:
# SVD of a row subset of the LSTM5 hn_pfammotors array.
# full_matrices=False avoids allocating a square (19999 x 19999) left factor;
# the singular values `s` are unchanged.
# NOTE(review): the slice starts at index 1, so row 0 is excluded and 19,999
# rows are used -- confirm this is intended rather than an off-by-one.
u, s, v = np.linalg.svd(hn_pfammotors[1:20000, :], full_matrices=False)

In [17]:
# Display the ten leading singular values of the hn_pfammotors subset.
s[:10]

array([762.4147 , 297.05518, 258.34515, 223.35733, 176.10948, 170.1825 ,
       158.89563, 154.27856, 149.13907, 138.52145], dtype=float32)

In [18]:
# SVD of rows 110000..119999 of the LSTM5 hn_dfdev array.
# full_matrices=False returns reduced factors instead of a square
# (10000 x 10000) left factor; the singular values `s` are unchanged.
u, s, v = np.linalg.svd(hn_dfdev[110000:120000, :], full_matrices=False)

In [19]:
# Display the ten leading singular values of the hn_dfdev subset.
s[:10]

array([526.8835  , 189.43001 , 160.37401 , 155.0355  , 129.91481 ,
       120.06838 , 117.729546, 114.029755, 110.14683 , 102.44181 ],
      dtype=float32)

From the PCA, it seems that only the transformer model has a PC of extremely large magnitude.

## Perform the analysis but first normalize each hidden dimension 

In [13]:
# Reload the transformer-encoder hidden states (note: this section reads from
# ../../data/first_try/, unlike the earlier sections).
# mmap_mode="r" maps each .npy file read-only instead of eagerly reading the
# multi-gigabyte arrays into RAM; data is paged in only when it is accessed.
hn_motortoolkit = np.load("../../data/first_try/hn_transformerencoder_motortoolkit.npy", mmap_mode="r")
hn_pfammotors = np.load("../../data/first_try/hn_transformerencoder_pfammotors.npy", mmap_mode="r")
hn_dfdev = np.load("../../data/first_try/hn_transformerencoder_dfdev.npy", mmap_mode="r")
# Print each array's shape as a quick sanity check.
print(hn_motortoolkit.shape)
print(hn_pfammotors.shape)
print(hn_dfdev.shape)

(3235, 768)
(1914831, 768)
(1212912, 768)


In [14]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Standardise every hidden dimension (per-column zero mean, unit variance)
# before the SVD. fit_transform is equivalent to the separate fit + transform.
# NOTE(review): hn_motortoolkit is overwritten with its standardised version,
# so re-run the load cell before re-running this one.
scaler = StandardScaler()
hn_motortoolkit = scaler.fit_transform(hn_motortoolkit)
# Reduced SVD: full_matrices=False avoids the square (n_rows x n_rows) left
# factor; the singular values `s` are unchanged.
u, s, v = np.linalg.svd(hn_motortoolkit, full_matrices=False)
# Display the ten leading singular values.
s[:10]

In [25]:
# Standardise each hidden dimension using statistics from ALL rows of
# hn_pfammotors, then decompose only a row subset.
# NOTE(review): because the scaler is fitted on the full array, the subset
# passed to the SVD is not guaranteed to have zero column means itself, so the
# leading singular value can still carry a mean offset -- confirm intent.
# NOTE(review): hn_pfammotors is overwritten with its standardised version,
# so re-run the load cell before re-running this one.
scaler = StandardScaler()
hn_pfammotors = scaler.fit_transform(hn_pfammotors)
# Reduced SVD: full_matrices=False avoids a square (19999 x 19999) left
# factor; the singular values `s` are unchanged.
# NOTE(review): the slice starts at index 1, so row 0 is excluded -- confirm.
u, s, v = np.linalg.svd(hn_pfammotors[1:20000, :], full_matrices=False)
# Display the ten leading singular values.
s[:10]

array([5.8841968e+03, 5.6800276e-01, 4.5774418e-01, 2.6847205e-01,
       1.6561021e-01, 1.6108577e-01, 1.3962923e-01, 1.3225430e-01,
       9.5220186e-02, 9.0758875e-02], dtype=float32)

In [26]:
# Standardization reduced the ratio of the first to the second singular value from roughly 1e5 to roughly 1e4, so it helped only to a limited extent.