In [None]:
!pip install sovai[full]

In [5]:
import os

import sovai as sov

# Read the API token from the SOVAI_TOKEN environment variable so a real
# credential never has to be saved inside the notebook. When the variable is
# unset, the original placeholder is passed unchanged; tokens are issued at
# https://sov.ai/profile.
sov.token_auth(
    token=os.environ.get("SOVAI_TOKEN", "visit https://sov.ai/profile for your token")
)

# Load ratios - takes around 5 mins to load data
df_accounting = sov.data("accounting/weekly")

# Choose long enough history for the model to train
df_mega = df_accounting.select_stocks("mega").date_range("2018-01-01")

### Feature Importance

**Random Projection:**
The feature importance reflects how much each feature contributes to the variance in the randomly projected space.


In [6]:
# Score every feature with the "random_projection" method. The displayed table
# has feature, importance and importance_percentile columns, highest importance first.
df_mega.importance("random_projection")

Unnamed: 0,feature,importance,importance_percentile
0,free_cash_flow,1.472,100.000
1,revenue_usd,1.318,98.765
2,total_operating_assets,1.255,97.531
3,ebitda,1.233,96.296
4,net_cash_flow_business,1.227,95.062
...,...,...,...
76,accum_other_comp_income,0.696,6.173
77,current_debt,0.685,4.938
78,cost_of_revenue,0.681,3.704
79,operating_accruals,0.677,2.469


**Random Fourier Features:**
The importance indicates how strongly each feature influences the approximation of non-linear relationships in the Fourier-transformed space.


In [7]:
# Score every feature with the "fourier" method. The displayed table has
# feature, importance and importance_percentile columns, highest importance first.
df_mega.importance("fourier")

Unnamed: 0,feature,importance,importance_percentile
0,operating_working_capital,1.073,100.000
1,net_cash_flow,1.070,98.765
2,net_income_non_controlling_int,1.063,97.531
3,total_revenue,1.056,96.296
4,dividends_total,1.055,95.062
...,...,...,...
76,total_investments,0.949,6.173
77,total_accruals,0.948,4.938
78,current_investments,0.944,3.704
79,accounts_payable,0.931,2.469



**Independent Component Analysis (ICA):**
The feature importance is based on the magnitude of each feature's contribution to the extracted independent components, which represent underlying independent signals in the data.


In [8]:
# Score every feature with the "ica" method. The displayed table has
# feature, importance and importance_percentile columns, highest importance first.
df_mega.importance("ica")

Unnamed: 0,feature,importance,importance_percentile
0,net_cash_flow_investing,1.678,100.000
1,equity_usd,1.444,98.765
2,bank_deposits,1.427,97.531
3,accum_other_comp_income,1.367,96.296
4,net_cash_flow,1.224,95.062
...,...,...,...
76,net_cash_flow_business,0.017,6.173
77,interest_expense,0.015,4.938
78,stock_based_compensation,0.012,3.704
79,net_income_discontinued_ops,0.001,2.469


**Truncated Singular Value Decomposition (SVD):**
The importance is determined by each feature's influence on the principal singular vectors, which represent directions of maximum variance in the data.

In [9]:
# Score every feature with the "svd" method. The displayed table has
# feature, importance and importance_percentile columns, highest importance first.
df_mega.importance("svd")

Unnamed: 0,feature,importance,importance_percentile
0,enterprise_value,0.996,100.000
1,net_cash_acquisitions_disposals,0.906,98.765
2,retained_earnings,0.889,97.531
3,bank_deposits,0.731,96.296
4,net_cash_flow,0.644,95.062
...,...,...,...
76,net_cash_flow_business,0.000,6.173
77,interest_expense,0.000,4.938
78,stock_based_compensation,0.000,3.704
79,net_income_discontinued_ops,0.000,2.469


**Sparse Random Projection:**
The feature importance is based on how much each feature contributes to the variance in the sparsely projected space, similar to standard Random Projection but with improved computational efficiency.

In [10]:
# Score every feature with the "sparse_projection" method. The displayed table has
# feature, importance and importance_percentile columns, highest importance first;
# features with equal importance share the same percentile in the output.
df_mega.importance("sparse_projection")

Unnamed: 0,feature,importance,importance_percentile
0,net_cash_flow_debt,2.100,98.765
1,cash_equivalents,2.100,98.765
2,total_liabilities,2.100,98.765
3,consolidated_income,1.800,93.210
4,net_income_common_stock,1.800,93.210
...,...,...,...
76,retained_earnings,0.300,6.173
77,current_investments,0.300,6.173
78,enterprise_value,0.300,6.173
79,operating_income,0.000,1.852


**Clustered SHAP Ensemble:** This method iteratively applies clustering, uses XGBoost to predict cluster membership, calculates SHAP values, and averages results across multiple runs to determine feature importance in identifying natural data structures.


In [11]:
# Score every feature with the "shapley" method. The displayed table has
# feature, importance and importance_percentile columns, highest importance first.
df_mega.importance("shapley")

Unnamed: 0,feature,importance,importance_percentile
0,cash_short_term,0.549,100.000
1,net_cash_acquisitions_disposals,0.358,98.765
2,net_cash_flow,0.343,97.531
3,bank_deposits,0.294,96.296
4,research_development_expenses,0.218,95.062
...,...,...,...
76,non_current_debt,0.001,6.173
77,net_income_discontinued_ops,0.000,4.938
78,net_income,0.000,2.469
79,operating_cash_flow,0.000,2.469


### Global Feature Importance

In [12]:
# Importance per observation: the displayed frame has one row per (ticker, date)
# and one column per feature.
df_mega.feature_importance()

Unnamed: 0_level_0,Unnamed: 1_level_0,cash_short_term,operating_working_capital,total_nonoperating_assets,total_operating_assets,accounts_receivable,cash_equiv_usd,cash_equivalents,current_assets,current_investments,intangible_assets,inventory_amount,non_current_assets,non_current_investments,property_plant_equipment_net,tax_assets,total_assets,total_investments,accum_other_comp_income,adjusted_parent_equity,book_equity_value,equity_usd,retained_earnings,total_equity,accounts_payable,bank_deposits,current_debt,current_liabilities,debt_usd,deferred_revenue,non_current_debt,non_current_liabilities,tax_liabilities,total_debt,total_liabilities,capital_expenditures,depreciation_amortization,net_cash_acquisitions_disposals,net_cash_flow,net_cash_flow_business,net_cash_flow_common,net_cash_flow_debt,net_cash_flow_dividends,net_cash_flow_financing,net_cash_flow_fx,net_cash_flow_investing,net_cash_flow_operating,operating_accruals,operating_cash_flow,stock_based_compensation,total_accruals,cost_of_revenue,dividends_total,interest_expense,operating_expenses,preferred_dividends,research_development_expenses,selling_general_admin_expenses,tax_expenses,cash_operating_profit,comprehensive_net_income,consolidated_income,ebit,ebit_usd,gross_profit,net_income,net_income_common_stock,net_income_common_stock_usd,net_income_discontinued_ops,net_income_excluding_discontinued,net_income_non_controlling_int,operating_income,revenue_usd,total_operating_net_income,total_revenue,tangible_assets,working_capital,free_cash_flow,invested_capital,earnings_before_tax,ebitda,enterprise_value
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1
AAPL,2018-01-05,0.323,0.061,0.090,0.117,0.017,0.166,0.118,0.021,0.277,0.019,0.188,0.004,0.004,0.032,0.015,0.060,0.114,0.023,0.005,0.038,0.037,0.013,0.020,0.009,0.218,0.017,0.111,0.114,0.205,0.025,0.001,0.017,0.023,0.030,0.042,0.058,0.426,0.159,0.181,0.048,0.027,0.027,0.108,0.046,0.022,0.080,0.002,0.000,0.119,0.093,0.012,0.022,0.010,0.009,0.000,0.288,0.092,0.014,0.017,0.020,0.051,0.044,0.039,0.043,0.000,0.017,0.070,0.000,0.005,0.020,0.333,0.078,0.010,0.008,0.083,0.031,0.022,0.043,0.016,0.109,0.148
AAPL,2018-01-12,0.324,0.052,0.087,0.116,0.018,0.147,0.129,0.027,0.281,0.021,0.202,0.004,0.003,0.033,0.014,0.056,0.106,0.023,0.006,0.015,0.049,0.007,0.025,0.009,0.215,0.018,0.112,0.113,0.151,0.027,0.000,0.017,0.015,0.024,0.040,0.058,0.426,0.156,0.168,0.030,0.031,0.039,0.100,0.049,0.061,0.079,0.002,0.000,0.099,0.095,0.007,0.023,0.011,0.011,0.000,0.283,0.076,0.009,0.016,0.029,0.070,0.044,0.078,0.082,0.000,0.013,0.093,0.000,0.005,0.022,0.333,0.098,0.009,0.015,0.082,0.029,0.020,0.037,0.017,0.101,0.182
AAPL,2018-01-19,0.303,0.056,0.068,0.115,0.020,0.159,0.186,0.020,0.278,0.022,0.197,0.004,0.002,0.032,0.015,0.061,0.133,0.023,0.006,0.011,0.039,0.013,0.017,0.008,0.221,0.015,0.111,0.114,0.174,0.025,0.001,0.017,0.035,0.026,0.041,0.057,0.451,0.172,0.176,0.031,0.027,0.025,0.109,0.050,0.048,0.079,0.002,0.000,0.145,0.092,0.006,0.029,0.010,0.009,0.000,0.366,0.046,0.010,0.022,0.023,0.063,0.045,0.032,0.042,0.000,0.017,0.067,0.002,0.010,0.022,0.333,0.094,0.008,0.010,0.087,0.023,0.019,0.042,0.015,0.092,0.140
AAPL,2018-01-26,0.324,0.055,0.088,0.115,0.018,0.187,0.135,0.026,0.277,0.021,0.190,0.003,0.003,0.032,0.015,0.061,0.127,0.021,0.008,0.012,0.051,0.014,0.016,0.012,0.200,0.018,0.112,0.112,0.150,0.025,0.001,0.016,0.029,0.025,0.041,0.057,0.426,0.158,0.177,0.037,0.026,0.029,0.111,0.048,0.038,0.080,0.002,0.000,0.154,0.094,0.007,0.022,0.011,0.009,0.000,0.282,0.068,0.012,0.021,0.023,0.065,0.044,0.030,0.104,0.000,0.017,0.073,0.000,0.011,0.021,0.331,0.080,0.007,0.016,0.096,0.030,0.022,0.042,0.015,0.118,0.149
AAPL,2018-02-02,0.325,0.057,0.086,0.115,0.023,0.169,0.131,0.027,0.277,0.018,0.198,0.004,0.006,0.033,0.015,0.060,0.112,0.027,0.009,0.012,0.057,0.009,0.015,0.009,0.211,0.015,0.112,0.114,0.201,0.025,0.001,0.017,0.024,0.029,0.041,0.058,0.425,0.159,0.178,0.031,0.031,0.027,0.104,0.048,0.036,0.081,0.002,0.000,0.130,0.095,0.019,0.023,0.010,0.009,0.000,0.287,0.115,0.013,0.017,0.024,0.072,0.045,0.042,0.056,0.000,0.014,0.068,0.000,0.011,0.022,0.332,0.085,0.008,0.009,0.085,0.024,0.022,0.042,0.017,0.075,0.153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XOM,2024-06-28,0.324,0.050,0.090,0.116,0.018,0.165,0.150,0.022,0.253,0.018,0.224,0.005,0.004,0.027,0.015,0.091,0.110,0.018,0.006,0.014,0.045,0.010,0.039,0.010,0.326,0.018,0.112,0.111,0.158,0.029,0.001,0.019,0.034,0.033,0.040,0.060,0.427,0.165,0.170,0.033,0.034,0.030,0.109,0.040,0.051,0.080,0.003,0.000,0.143,0.093,0.034,0.021,0.010,0.007,0.000,0.285,0.036,0.007,0.016,0.022,0.035,0.042,0.029,0.031,0.000,0.018,0.121,0.000,0.011,0.021,0.331,0.067,0.007,0.019,0.114,0.029,0.021,0.069,0.012,0.057,0.159
XOM,2024-07-05,0.324,0.052,0.163,0.115,0.019,0.205,0.163,0.025,0.277,0.019,0.198,0.003,0.003,0.028,0.016,0.062,0.130,0.025,0.006,0.024,0.036,0.012,0.021,0.009,0.215,0.016,0.112,0.116,0.151,0.025,0.000,0.017,0.038,0.024,0.039,0.058,0.425,0.158,0.148,0.041,0.030,0.025,0.125,0.047,0.027,0.079,0.002,0.000,0.177,0.093,0.007,0.025,0.010,0.010,0.000,0.271,0.038,0.009,0.015,0.022,0.067,0.043,0.036,0.047,0.000,0.021,0.077,0.000,0.004,0.023,0.331,0.076,0.006,0.017,0.120,0.026,0.021,0.048,0.017,0.100,0.191
XOM,2024-07-12,0.327,0.054,0.087,0.114,0.022,0.170,0.131,0.026,0.279,0.024,0.201,0.003,0.006,0.032,0.015,0.061,0.122,0.025,0.008,0.019,0.047,0.012,0.018,0.008,0.217,0.016,0.112,0.112,0.172,0.024,0.001,0.017,0.023,0.025,0.042,0.058,0.425,0.159,0.181,0.033,0.026,0.029,0.103,0.050,0.033,0.080,0.002,0.000,0.132,0.093,0.006,0.027,0.010,0.010,0.000,0.284,0.087,0.009,0.020,0.029,0.072,0.045,0.033,0.068,0.000,0.012,0.075,0.000,0.012,0.023,0.331,0.048,0.008,0.008,0.106,0.024,0.020,0.042,0.013,0.110,0.155
XOM,2024-07-19,0.324,0.052,0.087,0.115,0.020,0.172,0.155,0.025,0.278,0.021,0.196,0.004,0.004,0.032,0.015,0.062,0.117,0.026,0.007,0.018,0.034,0.015,0.015,0.013,0.212,0.018,0.111,0.115,0.154,0.025,0.001,0.017,0.024,0.024,0.039,0.059,0.424,0.159,0.179,0.033,0.027,0.027,0.102,0.047,0.034,0.080,0.002,0.000,0.112,0.095,0.011,0.021,0.011,0.009,0.000,0.282,0.042,0.010,0.021,0.022,0.068,0.045,0.031,0.065,0.000,0.016,0.071,0.000,0.013,0.023,0.331,0.076,0.009,0.019,0.138,0.023,0.020,0.044,0.018,0.125,0.145


### Feature Selection

An example of how you can select the top 25 features, using sparse projection. The feature selection process uses the importance scores to select the top features, reducing dimensionality while retaining the variables that have the most significant impact on the data's structure or variance.

In [10]:
# Keep the sparse-projection ranking in a variable so it can drive feature selection.
# NOTE(review): this repeats the earlier sparse_projection call; if the method is
# randomized the ranking may differ between runs — confirm whether a seed can be set.
feature_importance = df_mega.importance("sparse_projection")

In [11]:
# Keep only the 25 highest-ranked features. head(25) relies on the importance
# table being ordered from most to least important, as in its displayed output.
top_features = feature_importance["feature"].head(25).tolist()

# Index with an explicit list of column labels rather than a Series.
df_select = df_mega[top_features]
df_select

Unnamed: 0_level_0,Unnamed: 1_level_0,net_cash_flow_debt,cash_equivalents,total_liabilities,consolidated_income,net_income_common_stock,net_cash_flow_fx,net_cash_acquisitions_disposals,inventory_amount,total_equity,free_cash_flow,cash_operating_profit,preferred_dividends,operating_expenses,non_current_liabilities,current_debt,tangible_assets,tax_liabilities,tax_assets,property_plant_equipment_net,accounts_receivable,net_income_non_controlling_int,operating_cash_flow,net_income,net_cash_flow_financing,cost_of_revenue
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
AAPL,2018-01-05,6956230656.000,25275000832.000,258803302400.000,17187768320.000,17187768320.000,0.000,-9055769600.000,4554538496.000,138306076672.000,21121923072.000,106791305216.000,0.000,7383538688.000,147622690816.000,18476462080.000,389078450176.000,0.000,0.000,33710999552.000,46214078464.000,0.000,24404692992.000,17187768320.000,-6422538240.000,47693922304.000
AAPL,2018-01-12,6959923200.000,25828999168.000,260751228928.000,17907077120.000,17907077120.000,0.000,-9421076480.000,4521154048.000,138779312128.000,22173691904.000,110793228288.000,0.000,7447153664.000,148418772992.000,18476847104.000,391497842688.000,0.000,0.000,33703000064.000,47385309184.000,0.000,25376770048.000,17907077120.000,-6692153856.000,49365692416.000
AAPL,2018-01-19,6963615232.000,26382999552.000,262699155456.000,18626383872.000,18626383872.000,0.000,-9786384384.000,4487769088.000,139252531200.000,23225460736.000,114795151360.000,0.000,7510769152.000,149214838784.000,18477230080.000,393917235200.000,0.000,0.000,33695000576.000,48556539904.000,0.000,26348847104.000,18626383872.000,-6961769472.000,51037462528.000
AAPL,2018-01-26,6967307776.000,26936999936.000,264647081984.000,19345692672.000,19345692672.000,0.000,-10151692288.000,4454384640.000,139725766656.000,24277231616.000,118797074432.000,0.000,7574384640.000,150010920960.000,18477615104.000,396336627712.000,0.000,0.000,33686999040.000,49727770624.000,0.000,27320922112.000,19345692672.000,-7231384576.000,52709232640.000
AAPL,2018-02-02,6970999808.000,27491000320.000,266594992128.000,20064999424.000,20064999424.000,0.000,-10517000192.000,4421000192.000,140199002112.000,25329000448.000,122798997504.000,0.000,7638000128.000,150807003136.000,18478000128.000,398755987456.000,0.000,0.000,33678999552.000,50899001344.000,0.000,28292999168.000,20064999424.000,-7501000192.000,54380998656.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XOM,2024-06-28,-1003000000.000,33348999168.000,164865998848.000,8566000128.000,8220000256.000,-324000000.000,703000000.000,23491000320.000,205250002944.000,9589999616.000,99253002240.000,0.000,7478000128.000,92944998400.000,8226999808.000,377917997056.000,28269000704.000,0.000,213723004928.000,40365998080.000,346000000.000,14663999488.000,8220000256.000,-7982000128.000,63014998016.000
XOM,2024-07-05,-1003000000.000,33348999168.000,164865998848.000,8566000128.000,8220000256.000,-324000000.000,703000000.000,23491000320.000,205250002944.000,9589999616.000,99253002240.000,0.000,7478000128.000,92944998400.000,8226999808.000,377917997056.000,28269000704.000,0.000,213723004928.000,40365998080.000,346000000.000,14663999488.000,8220000256.000,-7982000128.000,63014998016.000
XOM,2024-07-12,-1003000000.000,33348999168.000,164865998848.000,8566000128.000,8220000256.000,-324000000.000,703000000.000,23491000320.000,205250002944.000,9589999616.000,99253002240.000,0.000,7478000128.000,92944998400.000,8226999808.000,377917997056.000,28269000704.000,0.000,213723004928.000,40365998080.000,346000000.000,14663999488.000,8220000256.000,-7982000128.000,63014998016.000
XOM,2024-07-19,-1003000000.000,33348999168.000,164865998848.000,8566000128.000,8220000256.000,-324000000.000,703000000.000,23491000320.000,205250002944.000,9589999616.000,99253002240.000,0.000,7478000128.000,92944998400.000,8226999808.000,377917997056.000,28269000704.000,0.000,213723004928.000,40365998080.000,346000000.000,14663999488.000,8220000256.000,-7982000128.000,63014998016.000
