In [None]:
!pip install sovai[full]

In [3]:
import sovai as sov

# Authenticate against the SovAI API -- swap the placeholder for your own token.
sov.token_auth(token="visit https://sov.ai/profile for your token")

# Pull the "accounting/weekly" dataset for the "mega" stock selection,
# starting at 2018-01-01. Expect the download to take around 5 minutes.
df_mega = (
    sov.data("accounting/weekly")
    .select_stocks("mega")
    .date_range("2018-01-01")
)

# Display (rows, columns) of the loaded frame as the cell output.
df_mega.shape

(22784, 81)

**PCA** (Principal Component Analysis): Reduces the data to `n_components` dimensions by projecting it onto the top `n_components` directions that maximize variance.

In [24]:
# Reduce the feature columns of df_mega to 10 PCA components
# (output columns component_0 ... component_9).
df_mega.reduce_dimensions(
    n_components=10,
    method="pca",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAPL,2018-01-05,-0.403,3.475,1.200,-2.321,-0.752,-1.960,-0.162,-0.535,-1.049,0.402
AAPL,2018-01-12,-0.375,3.614,1.248,-2.405,-0.783,-2.034,-0.178,-0.558,-1.118,0.404
AAPL,2018-01-19,-0.347,3.753,1.296,-2.490,-0.814,-2.108,-0.193,-0.581,-1.186,0.406
AAPL,2018-01-26,-0.319,3.892,1.344,-2.574,-0.845,-2.182,-0.209,-0.604,-1.254,0.407
AAPL,2018-02-02,-0.291,4.031,1.392,-2.659,-0.876,-2.255,-0.224,-0.628,-1.323,0.409
...,...,...,...,...,...,...,...,...,...,...,...
XOM,2024-06-28,-0.601,1.654,0.641,-0.753,-0.551,-1.016,0.064,-0.763,-1.206,0.390
XOM,2024-07-05,-0.601,1.654,0.641,-0.753,-0.551,-1.016,0.064,-0.763,-1.206,0.390
XOM,2024-07-12,-0.601,1.654,0.641,-0.753,-0.551,-1.016,0.064,-0.763,-1.206,0.390
XOM,2024-07-19,-0.601,1.654,0.641,-0.753,-0.551,-1.016,0.064,-0.763,-1.206,0.390


**Gaussian Random Projection**: Reduces the data to `n_components` dimensions by projecting it using a randomly generated Gaussian matrix, which approximately preserves the pairwise distances between points.

In [25]:
# Reduce the feature columns of df_mega to 10 components using a
# Gaussian random projection.
df_mega.reduce_dimensions(
    n_components=10,
    method="gaussian_random_projection",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAPL,2018-01-05,0.141,-1.048,-2.031,1.059,-0.799,-0.891,-0.409,-0.138,-1.308,0.902
AAPL,2018-01-12,0.134,-1.124,-2.112,1.090,-0.837,-0.930,-0.337,-0.135,-1.300,0.988
AAPL,2018-01-19,0.128,-1.200,-2.194,1.120,-0.874,-0.969,-0.266,-0.132,-1.293,1.074
AAPL,2018-01-26,0.121,-1.276,-2.276,1.150,-0.912,-1.009,-0.195,-0.129,-1.285,1.160
AAPL,2018-02-02,0.114,-1.351,-2.357,1.181,-0.950,-1.048,-0.124,-0.126,-1.278,1.246
...,...,...,...,...,...,...,...,...,...,...,...
XOM,2024-06-28,1.369,-0.499,-1.320,0.284,-0.902,0.281,0.014,-1.571,-0.845,-0.236
XOM,2024-07-05,1.369,-0.499,-1.320,0.284,-0.902,0.281,0.014,-1.571,-0.845,-0.236
XOM,2024-07-12,1.369,-0.499,-1.320,0.284,-0.902,0.281,0.014,-1.571,-0.845,-0.236
XOM,2024-07-19,1.369,-0.499,-1.320,0.284,-0.902,0.281,0.014,-1.571,-0.845,-0.236


**UMAP** (Uniform Manifold Approximation and Projection): Reduces the data to `n_components` dimensions using a non-linear technique that aims to preserve the global and local structure of the data manifold.

In [26]:
# Reduce the feature columns of df_mega to 10 UMAP components.
# verbose=True makes the call announce that the reduction has started.
df_mega.reduce_dimensions(
    n_components=10,
    method="umap",
    verbose=True,
)

Starting dimensionality reduction using umap


Unnamed: 0_level_0,Unnamed: 1_level_0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAPL,2018-01-05,0.206,3.453,5.423,5.430,2.583,6.505,5.977,2.108,-2.197,4.842
AAPL,2018-01-12,0.198,3.432,5.417,5.440,2.574,6.538,5.963,2.101,-2.205,4.841
AAPL,2018-01-19,0.203,3.444,5.422,5.435,2.579,6.518,5.971,2.103,-2.202,4.839
AAPL,2018-01-26,0.191,3.407,5.424,5.470,2.544,6.556,5.935,2.068,-2.238,4.804
AAPL,2018-02-02,0.198,3.426,5.421,5.448,2.567,6.539,5.954,2.089,-2.219,4.826
...,...,...,...,...,...,...,...,...,...,...,...
XOM,2024-06-28,8.477,3.981,6.332,9.618,8.755,11.296,2.745,3.173,-1.207,4.631
XOM,2024-07-05,8.477,3.981,6.332,9.618,8.755,11.296,2.744,3.173,-1.207,4.631
XOM,2024-07-12,8.477,3.981,6.332,9.618,8.755,11.296,2.744,3.173,-1.207,4.631
XOM,2024-07-19,8.477,3.981,6.332,9.618,8.755,11.296,2.744,3.173,-1.207,4.631


**Factor Analysis**: Reduces the data to `n_components` dimensions by representing the correlated observed variables with a smaller number of unobserved (latent) variables, known as factors.

In [4]:
# Reduce the feature columns of df_mega to 10 latent factors via factor analysis.
# verbose=True makes the call announce that the reduction has started.
df_mega.reduce_dimensions(
    n_components=10,
    method="factor_analysis",
    verbose=True,
)

Starting dimensionality reduction using factor_analysis


Unnamed: 0_level_0,Unnamed: 1_level_0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAPL,2018-01-05,-0.030,-0.016,-0.305,-0.216,0.621,-0.224,0.774,-0.801,-3.090,0.225
AAPL,2018-01-12,-0.024,-0.018,-0.324,-0.238,0.628,-0.240,0.793,-0.833,-3.246,0.206
AAPL,2018-01-19,-0.018,-0.020,-0.343,-0.260,0.634,-0.256,0.812,-0.865,-3.401,0.186
AAPL,2018-01-26,-0.012,-0.022,-0.362,-0.282,0.641,-0.272,0.831,-0.897,-3.557,0.166
AAPL,2018-02-02,-0.006,-0.024,-0.380,-0.304,0.647,-0.289,0.850,-0.929,-3.713,0.147
...,...,...,...,...,...,...,...,...,...,...,...
XOM,2024-06-28,-0.078,-0.028,0.047,-0.453,-0.162,0.084,0.007,-0.379,-1.252,0.172
XOM,2024-07-05,-0.078,-0.028,0.047,-0.453,-0.162,0.084,0.007,-0.379,-1.252,0.172
XOM,2024-07-12,-0.078,-0.028,0.047,-0.453,-0.162,0.084,0.007,-0.379,-1.252,0.172
XOM,2024-07-19,-0.078,-0.028,0.047,-0.453,-0.162,0.084,0.007,-0.379,-1.252,0.172
