# Feature Selection

In [1]:
import copy

from feature_search.SearchNodes import SearchNodeAdding
import configuration
from feature_search.SplitConfig import SplitConfig
from feature_search.ModelConfig import ModelConfigCNN
from modeling.Trainer import Trainer

In [2]:
from data_loading import load_data

data = load_data()

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1883 entries, 0 to 1882
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1883 non-null   datetime64[ns]
 1   EMP     1883 non-null   float64       
 2   PE      1883 non-null   float64       
 3   CAPE    1883 non-null   float64       
 4   DY      1883 non-null   float64       
 5   Rho     1883 non-null   float64       
 6   MOV     1883 non-null   float64       
 7   IR      1883 non-null   float64       
 8   RR      1883 non-null   float64       
 9   Y02     1883 non-null   float64       
 10  Y10     1883 non-null   float64       
 11  STP     1883 non-null   float64       
 12  CF      1883 non-null   float64       
 13  MG      1883 non-null   float64       
 14  RV      1883 non-null   float64       
 15  ED      1883 non-null   float64       
 16  UN      1883 non-null   float64       
 17  GDP     1883 non-null   float64       
 18  M2      

In [4]:
window_size = 52

model_config = ModelConfigCNN(window_size=window_size,
                              num_features= 1,
                              output_size=1,
                              num_conv = 2,
                              kernel_size = 9,
                              channels=2,
                              padding=True,
                              num_hidden_layers=2,
                              hidden_size=50,
                              dropout=0.1
                              )

    

split_config = SplitConfig(window_size=window_size,
                           prediction_length=1,
                           look_ahead=13,
                           val_split=configuration.validation_split,
                           test_split=configuration.test_split,
                           is_indexed=True,
                           scale_target=True,
                           fixed_feature_size=None,
                           batch_size=200,
                           )


model_trainer = Trainer(epochs = 50, learning_rate=1e-4)


In [5]:
root_node = SearchNodeAdding(data,
                             selection=[],
                             model_trainer=model_trainer,
                             target_column=configuration.target_column,
                             date_column=configuration.date_column,
                             model_config=model_config,
                             split_config=split_config,
                             num_iterations=8,
                             max_children=10,
                             parent=None
                             )

In [6]:
from queue import PriorityQueue
import time
import pickle

max_depth = 3
max_time = 3600 # in seconds
file_name = "Feature Selection V5"
out_path = configuration.output_path + "feature_selection/"

start_time = time.time()

finished_nodes = []
expanded_nodes = []
result = []

queue = PriorityQueue()

queue.put(root_node)
expanded_nodes.append(root_node)

h_mean, h_std, h_med = root_node.get_heuristic()
result.append((copy.deepcopy(root_node.selection), h_mean, h_std, h_med))


best_node = root_node


while max_time + start_time >= time.time():
    if queue.qsize() == 0:
        break
    
    
    current_node = queue.get()
    
    if current_node in finished_nodes:
        print("Backtracking, nodes was fully visited")
        continue
        
        
    if current_node.get_heuristic()[2] < best_node.get_heuristic()[2]:
        best_node = current_node      
        print(current_node.selection)
        print("New Best Node: ", best_node.selection, " with score: ", round(best_node.get_heuristic()[2], 3))
        
        
    if current_node.get_depth() >= max_depth:
        finished_nodes.append(current_node)
        print("Backtracking, max depth reached")
        continue
        
    
    if current_node.has_next_child():
        child_node = current_node.next_child()
        
        if (child_node not in expanded_nodes) and (child_node not in finished_nodes):
            h_mean, h_std, h_med = child_node.get_heuristic()
            
            queue.put(child_node)
            expanded_nodes.append(child_node)
            result.append((copy.deepcopy(child_node.selection), h_mean, h_std, h_med))

            print("New Child: ", child_node.selection, " with score: ", round(h_med,3))
            
            with open(out_path + file_name, "wb") as file:
                pickle.dump(result, file, protocol=pickle.HIGHEST_PROTOCOL)

    
    if not current_node.has_next_child():
        print("Backtracking, node is fully expanded")
        finished_nodes.append(current_node)
    else:
        queue.put(current_node)
    

New Child:  ['MOV ']  with score:  3.553
New Child:  ['YSS']  with score:  5.9
New Child:  ['CF']  with score:  1.949
['CF']
New Best Node:  ['CF']  with score:  1.949
New Child:  ['CF', 'MOV ']  with score:  2.483
New Child:  ['CF', 'YSS']  with score:  5.098
New Child:  ['CF', 'RV']  with score:  4.988
New Child:  ['CF', '_DXY']  with score:  2.54
New Child:  ['CF', 'PE']  with score:  2.25
New Child:  ['CF', 'IR']  with score:  2.361
New Child:  ['CF', 'CAPE']  with score:  3.689
New Child:  ['CF', 'Y02']  with score:  2.273
New Child:  ['CF', 'Y10']  with score:  2.87
New Child:  ['CF', '_TY']  with score:  1.286
Backtracking, node is fully expanded
['CF', '_TY']
New Best Node:  ['CF', '_TY']  with score:  1.286
New Child:  ['CF', '_TY', 'MOV ']  with score:  2.859
New Child:  ['CF', '_TY', 'RV']  with score:  3.37
New Child:  ['CF', '_TY', '_DXY']  with score:  5.784
New Child:  ['CF', '_TY', 'YSS']  with score:  5.8
New Child:  ['CF', '_TY', 'ED']  with score:  8.326
New Child:  

In [8]:
print("Best Node: ", best_node.selection, " with score: ", round(best_node.get_heuristic()[2], 3))

Best Node:  ['CF', 'Y10', 'MOV ']  with score:  1.126
