In [2]:
import random

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import random
from deap import base, creator, tools, algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
from tqdm import tqdm, trange

In [3]:
df = pd.read_csv('data/oil_merge_13.csv')
FEATURES = ["copper_close", "dji_index", "gold_close", "eur_close",
            "Henry Hub Natural Gas Spot Price Dollars per Million Btu", "rub_close", "silver_close", "nasdaq_close",
            "SP500", "pal_close", "corn_close", "heat_close"]
TARGET = "WTI_dollar_per_barrel"
X = df[FEATURES]
y = df[TARGET]

scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X

array([[0.        , 0.14571157, 0.01842924, ..., 0.01041586, 0.09341085,
        0.        ],
       [0.00381731, 0.14373759, 0.01908978, ..., 0.01186788, 0.09224806,
        0.00874084],
       [0.00558964, 0.14459744, 0.01922188, ..., 0.011713  , 0.09767442,
        0.00984107],
       ...,
       [0.33933197, 0.58859081, 0.72911025, ..., 0.54108264, 0.24689922,
        0.063478  ],
       [0.33933197, 0.58859081, 0.72911025, ..., 0.54108264, 0.24689922,
        0.063478  ],
       [0.33933197, 0.58859081, 0.72911025, ..., 0.54108264, 0.24689922,
        0.063478  ]])

In [4]:
# Define the evaluation function (fitness function)
def evaluate(individual):
    # Create a mask for selected features
    mask = np.array(individual, dtype=bool)
    # Use only selected features for regression
    X_selected = X[:, mask]

    # Use RandomForestRegressor for regression task
    reg = RandomForestRegressor(n_estimators=100, random_state=42)
    scores = cross_val_score(reg, X_selected, y, cv=3, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scores.mean())  # Root Mean Squared Error
    return rmse

In [5]:
# Create the fitness and individual classes using DEAP
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

# Create a toolbox for creating individuals and populations
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=12)  # 2 features in the CSV file
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the evaluation function, crossover, mutation, and selection operators
toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

In [6]:
# Define the population size, number of generations, crossover probability, and mutation probability
POP_SIZE = 30
NUM_GENERATIONS = 25
CXPB = 0.6  # Crossover probability
MUTPB = 0.2  # Mutation probability

# Create an initial population
population = toolbox.population(n=POP_SIZE)

In [7]:
# Evaluate the entire population
fitnesses = list(map(toolbox.evaluate, tqdm(population)))
for ind, fit in zip(population, fitnesses):
    ind.fitness.values = (fit,)

100%|██████████| 30/30 [04:17<00:00,  8.59s/it]


In [8]:
# Run the genetic algorithm
for gen in trange(NUM_GENERATIONS):
    # Select the next generation individuals
    offspring = algorithms.varAnd(population, toolbox, cxpb=CXPB, mutpb=MUTPB)
    # print(offspring)
    fits = toolbox.map(toolbox.evaluate, offspring)
    for ind, fit in zip(offspring, fits):
        ind.fitness.values = (fit,)

    # Replace the old population with the offspring
    population[:] = offspring

100%|██████████| 25/25 [1:45:01<00:00, 252.05s/it]


In [10]:
# Find the best individual in the final population
best_individual = tools.selBest(population, k=1)[0]
selected_features = [i for i in range(len(best_individual)) if best_individual[i] == 1]

print("Selected Features:", selected_features)
print([FEATURES[i] for i in selected_features])

Selected Features: [0, 1, 7, 9, 11]
['copper_close', 'dji_index', 'nasdaq_close', 'pal_close', 'heat_close']
