In [1]:
from decision_tree import DecisionTree, TreeNode, Rule
import json
import os
import pickle

In [2]:
# Module-level accumulators that get_cols() fills in while walking a tree:
#   cols       - predictor names, in the order they are first seen
#   col_counts - predictor name -> number of rules that use that predictor
cols, col_counts = [], {}

In [3]:
def get_cols(node:TreeNode):
    """Tally the predictor used by every rule in the (sub)tree rooted at `node`.

    Nodes are visited in pre-order: the node itself, then its left subtree,
    then its right subtree. For each visited node that carries a rule, the
    rule's ``predictor`` is appended to the module-level ``cols`` list the
    first time it is encountered, and its entry in the module-level
    ``col_counts`` dict is incremented by one.

    A branch is abandoned when the node is None or when the node's ``rule``
    is None; in the latter case the node's children are not visited.
    """
    # Explicit stack instead of recursion; the right child is pushed first so
    # that the left child is popped (and therefore processed) first.
    pending = [node]
    while pending:
        current = pending.pop()
        if current is None:
            continue
        split: Rule = current.rule
        if split is None:
            continue
        name = split.predictor
        if name not in cols:
            cols.append(name)
        col_counts[name] = col_counts.get(name, 0) + 1
        pending.append(current.right)
        pending.append(current.left)

In [4]:
# Start from empty accumulators so that re-running this cell does not stack a
# second round of counts on top of the previous run (get_cols mutates these
# module-level objects).
cols = []
col_counts = {}

# Sort the listing: os.listdir returns entries in arbitrary order, and the order
# in which models are visited determines the order of `cols` and how ties are
# ordered in `col_counts`.
files = [
    os.path.join('saved_models', file_name)
    for file_name in sorted(os.listdir('saved_models'))
    if file_name.startswith('dt')
]

for model_path in files:
    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only load model files that come from a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # Accumulate the predictors used by this model's tree into cols / col_counts.
    get_cols(model.tree)

# Drop predictors whose name starts with 'row_num', and keep counts only for
# the predictors that remain.
cols = [col for col in cols if not col.startswith('row_num')]
col_counts = {col: col_counts[col] for col in cols}
# Order by count, highest first; sorted() is stable, so equal counts keep their
# first-seen order.
col_counts = dict(sorted(col_counts.items(), key=lambda x: -x[1]))

In [5]:
# Display the predictor -> occurrence-count mapping (bare expression: the notebook renders its value).
col_counts

{'Open-Close_ma_1-Open-Close_ma_10_val': 5,
 'Volume_ma_10_val': 4,
 'Close_slope_1_polarity': 3,
 'Open-Close_ma_1-Open-Close_ma_50_val': 3,
 'BBP_5_2.0_slope_1_changelen_val': 3,
 'Open-Close_slope_1_val': 2,
 'BBM_5_2.0_ma_1-BBM_5_2.0_ma_20_changelen_val': 2,
 'BBL_5_2.0_ma_10_slope_15_val': 2,
 'MACDs_12_26_9_ma_20_changelen_val': 1,
 'Open-High_ma_5-Open-High_ma_20_changelen_val': 1,
 'BBL_5_2.0_ma_5_slope_1_changelen_polarity': 1,
 'High_n-Low_n_ma_50_changelen_val': 1,
 'High_n-Low_n_ma_20_slope_15_changelen_val': 1,
 'Open_n-Close_n_ma_10_val': 1,
 'Volume_ma_5_slope_5_val': 1,
 'Close_n_slope_1_polarity': 1,
 'Low_ma_5_slope_3_polarity': 1,
 'Open_n_ma_10_slope_5_changelen_polarity': 1,
 'RSI_14_ma_1_val': 1,
 'Open-Close_polarity': 1,
 'Open-High_ma_1-Open-High_ma_50_val': 1,
 'Open-Close_ma_1-Open-Close_ma_20_val': 1,
 'Close_n_ma_1-Close_n_ma_50_changelen_polarity': 1,
 'High_n_ma_1-High_n_ma_20_changelen_polarity': 1,
 'Open-High_ma_1-Open-High_ma_50_polarity': 1}

In [6]:
# Display how many predictor names are held in `cols` (bare expression: the notebook renders its value).
len(cols)

25

In [9]:
# Remove a trailing '_val' and then a trailing '_polarity' from each name in `cols`.
# str.rstrip() is deliberately not used here: its argument is a *set of characters*
# to strip, not a suffix, so e.g. 'Total_val'.rstrip('_val').rstrip('_polarity')
# yields 'T' instead of 'Total'.
prediscrete_imp_cols = []
for col in cols:
    for suffix in ('_val', '_polarity'):
        if col.endswith(suffix):
            col = col[:-len(suffix)]
    prediscrete_imp_cols.append(col)

# Count how many names in `cols` map to each stripped name.
count_prediscrete = {}
for col in prediscrete_imp_cols:
    count_prediscrete[col] = count_prediscrete.get(col, 0) + 1

# Order by count, highest first; sorted() is stable, so ties keep first-seen order.
count_prediscrete = dict(sorted(count_prediscrete.items(), key=lambda x: -x[1]))

# De-duplicated stripped names, in the same order as the counts.
prediscrete_imp_cols = list(count_prediscrete.keys())
prediscrete_imp_cols, len(prediscrete_imp_cols)

(['Open-High_ma_1-Open-High_ma_50',
  'Open-Close_ma_1-Open-Close_ma_10',
  'Close_slope_1',
  'Volume_ma_10',
  'Open-Close_slope_1',
  'MACDs_12_26_9_ma_20_changelen',
  'Open-Close_ma_1-Open-Close_ma_50',
  'BBP_5_2.0_slope_1_changelen',
  'BBM_5_2.0_ma_1-BBM_5_2.0_ma_20_changelen',
  'Open-High_ma_5-Open-High_ma_20_changelen',
  'BBL_5_2.0_ma_5_slope_1_changelen',
  'High_n-Low_n_ma_50_changelen',
  'High_n-Low_n_ma_20_slope_15_changelen',
  'BBL_5_2.0_ma_10_slope_15',
  'Open_n-Close_n_ma_10',
  'Volume_ma_5_slope_5',
  'Close_n_slope_1',
  'Low_ma_5_slope_3',
  'Open_n_ma_10_slope_5_changelen',
  'RSI_14_ma_1',
  'Open-Close',
  'Open-Close_ma_1-Open-Close_ma_20',
  'Close_n_ma_1-Close_n_ma_50_changelen',
  'High_n_ma_1-High_n_ma_20_changelen'],
 24)

In [10]:
# Column names stored under the 'cols_to_use' key of cols.pkl.
# NOTE(review): these look like the base series that the engineered predictor
# names are built from - confirm against the feature-generation code.
cols_to_use = [
    'Open',
    'Close',
    'Open_n',
    'Close_n',
    'High_n',
    'Low_n',
    'High',
    'Low',
    'BBM_5_2.0',
    'Volume',
    'RSI_14',
    'MACDs_12_26_9',
    'BBL_5_2.0',
    'BBP_5_2.0',
]

In [11]:
# Read the existing contents of cols.pkl, add/overwrite the entries computed in
# this notebook, and write the whole object back to the same file.
with open('additional_utils/cols.pkl', 'rb') as f:
    d = pickle.load(f)

for entry_name, entry_value in (
    ('imp_cols', cols),
    ('cols_to_use', cols_to_use),
    ('col_counts', col_counts),
    ('prediscrete_imp_cols', prediscrete_imp_cols),
    ('prediscrete_col_counts', count_prediscrete),
):
    d[entry_name] = entry_value

with open('additional_utils/cols.pkl', 'wb') as f:
    pickle.dump(d, f)