In [None]:
import numpy as np
import numpy.random as rand
from functools import cache
import polars as pl
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Name of the label column; every other column is treated as a feature.
TARGET = "quality"
# Load the CSV (path is relative to the notebook's working directory).
df = pl.read_csv("data/Wine_Quality_Data.csv", has_header=True)
# Remove the "color" column from the frame.
df = df.drop("color")
# Keep only the first 1000 rows (presumably to keep experiments fast — confirm).
df = df.head(1000)

In [None]:
# Summary statistics of the loaded frame (shown as the cell output).
df.describe()

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score
import polars as pl
import itertools

def fetch_data(df, use_split, rbf, include_mul):
    '''Return one (train, test) cross-validation fold with engineered features added.

    The frame is split with a shuffled, fixed-seed 4-fold KFold and fold
    ``use_split`` is selected. Every non-target column that is not listed in
    ``rbf`` is min-max scaled with training statistics, then:

    * each column in ``rbf`` gets three radial-basis features (rbf_column),
    * every other non-target column gets three cosine features (fourier_column),
    * each pair of original columns with at least one member in
      ``include_mul`` gets a product feature (mul_cols).

    Args:
        df: frame that supports indexing with an integer index array and
            contains the TARGET column.
        use_split: fold number, 0 <= use_split < 4.
        rbf: collection of column names to expand with rbf_column. These
            columns are not min-max scaled.
        include_mul: collection of column names whose products with the
            other columns are added.

    Returns:
        (df_train, df_test) with the original columns plus the new features.

    Raises:
        ValueError: if use_split does not name one of the folds.
    '''
    n_splits = 4
    # The previous check allowed use_split == 4, which passed the assertion
    # and then failed with an IndexError because only folds 0..3 exist.
    if not 0 <= use_split < n_splits:
        raise ValueError(
            f"use_split must be in [0, {n_splits - 1}], got {use_split!r}"
        )
    # 4-fold cross validation with a deterministic seed
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2023)
    splits = list(kf.split(df))
    train_idx, test_idx = splits[use_split]
    # Indexing the dataframe with an array returns the appropriate splits
    df_train, df_test = df[train_idx], df[test_idx]
    for col in df_train.columns:
        if col == TARGET or col in rbf:
            continue
        df_train, df_test = scale_column(df_train, df_test, col)
    # Snapshot the original column names so the augmentation steps below
    # never operate on the features they create.
    cols = list(df_train.columns)
    for col in rbf:
        if col == TARGET:
            continue
        df_train, df_test = rbf_column(df_train, df_test, col)  # Use radial basis function
    for col in cols:
        if col == TARGET:
            continue
        if col not in rbf:
            df_train, df_test = fourier_column(df_train, df_test, col)
    for (col_a, col_b) in itertools.combinations(cols, 2):
        if col_a == TARGET or col_b == TARGET:
            continue
        # A pair is multiplied when at least one of its columns was requested.
        if col_a not in include_mul and col_b not in include_mul:
            continue
        df_train, df_test = mul_cols(df_train, df_test, col_a, col_b)
    return df_train, df_test

def binary_columns(df):
    '''Return the names of the columns that hold exactly two distinct values.

    Column order follows ``df.columns``.
    '''
    return [
        name
        for name in df.columns
        if df.get_column(name).n_unique() == 2
    ]

def scale_column(df_train, df_test, col_name):
    '''Min-max scale one column using statistics of the training frame only.

    The training column is mapped onto [0, 1]; the test column is transformed
    with the same offset and range, so its values may fall outside [0, 1].

    Args:
        df_train: training frame; its min and max define the scaling.
        df_test: test frame, transformed with the training statistics.
        col_name: name of the column to scale in both frames.

    Returns:
        (df_train, df_test) with ``col_name`` replaced by its scaled version.
    '''
    train_col = df_train.get_column(col_name)
    lowest = train_col.min()
    highest = train_col.max()
    span = highest - lowest
    if span == 0:
        # Constant training column: dividing by zero would fill the feature
        # with NaN/inf. Divide by 1 instead (the convention scikit-learn's
        # MinMaxScaler uses), which maps the training column to 0.
        span = 1.0
    scaled = (pl.col(col_name) - lowest) / span
    return df_train.with_columns(scaled), df_test.with_columns(scaled)

def rbf_column(df_train, df_test, col_name):
    '''Add three radial-basis features of ``col_name`` to both frames.

    The centres are the 25%, 50% and 75% quantiles of the training column and
    the width comes from its standard deviation. The new columns are named
    ``{col_name}_0``, ``{col_name}_1`` and ``{col_name}_2``; the source column
    is kept.
    '''
    train_col = df_train.get_column(col_name)
    std = train_col.std()
    centres = [train_col.quantile(q) for q in (0.25, 0.5, 0.75)]
    # NOTE(review): the denominator is (2 * std)**2 = 4 * std**2, wider than
    # the textbook Gaussian 2 * std**2 — confirm this is intended.
    width = (2 * std)**2
    features = [
        (-1.0 * (pl.col(col_name) - centre)**2 / width).exp().alias(f"{col_name}_{k}")
        for k, centre in enumerate(centres)
    ]
    # The same expressions are applied to both frames, so the test features
    # use the training centres and width.
    return df_train.with_columns(features), df_test.with_columns(features)

def fourier_column(df_train, df_test, col_name):
    '''Add three cosine features of ``col_name`` to both frames.

    For k = 1, 2, 3 the feature cos(k * pi * x) is stored as
    ``{col_name}_{k - 1}``; the source column is kept.
    '''
    harmonics = [
        (pl.col(col_name) * pl.lit(k) * pl.lit(np.pi)).cos().alias(f"{col_name}_{k - 1}")
        for k in range(1, 4)
    ]
    return df_train.with_columns(harmonics), df_test.with_columns(harmonics)

def mul_cols(df_train, df_test, col_a, col_b):
    '''Add the element-wise product of two columns to both frames.

    The new column is named ``"{col_a}+{col_b}"``; note that the name uses
    ``+`` although the value is a product.
    '''
    product = (pl.col(col_a) * pl.col(col_b)).alias(f"{col_a}+{col_b}")
    return df_train.with_columns(product), df_test.with_columns(product)

def get_x_y(df):
    '''Split a frame into (features, labels) NumPy arrays.

    The labels are the TARGET column; the features are every other column.
    No constant/bias column is appended.
    '''
    labels = df.get_column(TARGET)
    features = df.drop(TARGET)
    return features.to_numpy(), labels.to_numpy()

# Alternative dataset, currently disabled:
# df = pl.read_csv("data/leaf.csv", has_header=False)
# df = df.drop("column_2")

# Number of columns minus one (the TARGET column is not a feature).
NUM_FEATURES = len(df.columns) - 1
# Build fold 0. Columns from position 11 onward are passed as the RBF list and
# no product features are requested.
# NOTE(review): if position 11 is the TARGET column, fetch_data skips it and
# no RBF features are produced at all — confirm this is intended.
df_train, df_test = fetch_data(df, 0, list(df.columns)[11:], [])
X_train, y_train = get_x_y(df_train)
X_test, y_test = get_x_y(df_test)
# Number of training rows (shown as the cell output).
len(y_train)

In [None]:
# Display the feature count computed above.
NUM_FEATURES

In [None]:
# Scratch check: all column names except the first, repeated twice, and the
# length of that vector.
a = np.array(list(df.columns[1:]) + list(df.columns[1:]))
len(a)

In [None]:
def evaluate(rbf):
    '''Score one feature-engineering configuration on fold 0.

    Args:
        rbf: sequence of 2 * F numbers, where F is the number of non-target
            columns of the global ``df``. Entry i > 0.5 (i < F) requests RBF
            features for feature i; entry F + i > 0.5 requests product
            features involving feature i.

    Returns:
        Mean accuracy on the test fold of a 20-tree random forest trained on
        the augmented training fold. The forest is not seeded, so repeated
        calls with the same input can return different scores.

    Raises:
        ValueError: if ``rbf`` does not have exactly 2 * F entries.
    '''
    flags = np.asarray(rbf) > 0.5
    # Selectable names are the feature columns. The previous code used
    # df.columns[1:], which left out the first column and included the last.
    feature_names = np.array([name for name in df.columns if name != TARGET])
    n_features = len(feature_names)
    if flags.shape != (2 * n_features,):
        raise ValueError(
            f"expected {2 * n_features} flags, got shape {flags.shape}"
        )
    # Split the flags BEFORE turning them into names. Slicing the already
    # masked name list (as before) mixed both halves and almost always left
    # the product selection empty.
    rbf_names = feature_names[flags[:n_features]].tolist()
    mul_names = feature_names[flags[n_features:]].tolist()

    df_train, df_test = fetch_data(df, 0, rbf_names, mul_names)
    X_train, y_train = get_x_y(df_train)
    X_test, y_test = get_x_y(df_test)
    reg = RandomForestClassifier(n_estimators=20)
    # reg = LogisticRegression(solver="newton-cg")
    reg.fit(X_train, y_train)
    return reg.score(X_test, y_test)

In [None]:
# Every entry is above the 0.5 threshold used by evaluate(), so every flag in
# both halves of the vector is switched on.
arr = np.ones(NUM_FEATURES * 2)
arr[NUM_FEATURES:] = 0.6
evaluate(arr)

In [None]:
# Random flag vector with one entry per feature. The length is derived from
# NUM_FEATURES instead of a hard-coded 14 so that it always matches the name
# array it is later used to index as a boolean mask.
arr = np.random.random(NUM_FEATURES)
arr

In [None]:
# Boolean mask of the entries above the 0.5 threshold.
arr > 0.5

In [None]:
# Column names without the first one, as a NumPy array.
np.array(list(df.columns[1:]))

In [None]:
# Select the names whose flag is above 0.5; `arr` must have exactly as many
# entries as `a` for this boolean index to work.
a = np.array(list(df.columns[1:]))
a[arr > 0.5]

In [None]:

# Column names from the third column onward.
list(df.columns)[2:]

In [None]:
# Baseline: logistic regression fitted on the global X_train / y_train built
# earlier, scored on X_test / y_test (the score is the cell output).
reg = LogisticRegression(solver="newton-cg")
# reg = RandomForestClassifier()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
# Summary statistics of the augmented training fold.
df_train.describe()

In [None]:
# First rows of the unaugmented frame.
df.head()

In [None]:
# Summary statistics of the unaugmented frame (same call as the earlier describe cell).
df.describe()

In [None]:
# Augmentation ideas: an RBF expansion per column,
# plus pairwise products of columns [X_1 * X_2, ...]

In [None]:
# NOTE(review): MNIST is not imported in any visible cell, so this fails on a
# fresh kernel. Presumably torchvision.datasets.MNIST — confirm and add the
# import to the first cell.
# Both calls use "data" as the root directory; one requests the training
# split, the other the test split.
train = MNIST("data", train=True, download=True)
test = MNIST("data", train=False, download=True)

In [None]:
# Flatten every dimension after the first into one axis and show the size.
foo = train.data.flatten(start_dim=1)
foo.size()

In [None]:
# Size of the training label tensor.
train.targets.size()

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# X, y = load_digits(return_X_y=True)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2023)

# model = RandomForestClassifier(
#     n_estimators=100, max_depth=38, random_state=2023, min_samples_split=0.01, min_samples_leaf=25,
#     max_features=1
# )
# Max Features
# Hand-picked random-forest settings, trained on the first `n_samples`
# training images only and scored on the whole test split.
crit = "gini"
n_samples = 100
model = RandomForestClassifier(criterion=crit, n_estimators=10, random_state=420, max_depth=5, min_samples_leaf=1, 
                               max_features=25, min_samples_split=2,ccp_alpha=0.01, min_impurity_decrease=0.01)
# Each image is flattened to one row before fitting.
model.fit(train.data[0:n_samples, :, :].flatten(start_dim=1), train.targets[0:n_samples])

# Score on the test split (the cell output).
model.score(test.data[:, :, :].flatten(start_dim=1), test.targets[:])

In [None]:
# Length of one solution vector: two flags per feature (see evaluate()).
NUM_DIM = NUM_FEATURES * 2
# Number of consecutive failed improvements after which abc() re-initialises
# an employed bee in its scout phase.
LIMIT = 4

def sigmoid(x):
    '''Logistic function 1 / (1 + exp(-x)), applied element-wise.

    Args:
        x: scalar or array-like of numbers.

    Returns:
        Value(s) in [0, 1] with the shape of ``x``.
    '''
    # Clamp the argument so np.exp cannot overflow for large negative x.
    # Inside [-500, 500] the result is unchanged; outside it differs from the
    # exact value by less than 1e-200.
    clipped = np.clip(x, -500.0, 500.0)
    return 1./(1. + np.exp(-clipped))

class Configuration:
    '''Hashable wrapper around a solution vector, usable as a cache key.

    The vector is copied and frozen on construction so the hash cannot change
    while the object sits in a dict or a functools cache. Wrapping a row view
    of the population without copying would let later writes to the population
    silently change the key.
    '''

    def __init__(self, array):
        # Private read-only copy: decouples the key from the caller's array.
        self.data = np.array(array, copy=True)
        self.data.setflags(write=False)

    def __hash__(self):
        return hash(self.data.tobytes())

    def __eq__(self, other):
        if not isinstance(other, Configuration):
            return NotImplemented
        # array_equal is False for different shapes instead of raising or
        # broadcasting.
        return bool(np.array_equal(self.data, other.data))

    def build_classifier(self):
        '''Map the first five entries of the vector to a random forest.

        Each entry is squashed with sigmoid() and then scaled:
        entry 0 picks the criterion ("gini" below 0.5, else "entropy"),
        entry 1 the number of trees, entry 2 the maximum depth, entry 3 the
        minimum samples per leaf and entry 4 the maximum number of features.
        The vector must therefore have at least five entries.
        '''
        data = sigmoid(self.data)
        crit = "gini" if data[0] < 0.5 else "entropy"

        n_est = int(data[1] * 100) + 1
        max_depth = int(data[2] * 10) + 1
        min_samp_leaf = int(data[3] * 3) + 1
        max_features = int(data[4] * 25) + 1
        return RandomForestClassifier(n_estimators=n_est, criterion=crit, min_samples_leaf=min_samp_leaf,
                                      max_features=max_features,max_depth=max_depth, random_state=8675309)
        
        

class Solutions:
    '''Population of candidate solution vectors ("bees") and their roles.

    Attributes:
        solutions: array with one row per bee, each of length NUM_DIM.
        employed: boolean mask, True for the employed bees (the first
            ``num // 2`` rows).
        failures: per-bee count of consecutive failed improvements.
        onlooker: boolean mask of the bees that are not employed.
        best_fitness / best_sol: best fitness seen by update_best() and a
            private copy of the corresponding vector.
        ring_size: number of neighbours inspected on each side by
            best_in_ring().
    '''

    def __init__(self, num, ring_size=4):
        self.solutions = np.stack([Solutions.new_solution() for _ in range(num)])
        n_employed = num // 2
        # The unemployed part takes the extra bee when num is odd, so the
        # masks always have exactly one entry per solution row.
        self.employed = np.array([True] * n_employed + [False] * (num - n_employed))
        self.failures = np.zeros_like(self.employed, np.int32)
        self.onlooker = np.logical_not(self.employed) # Unemployed
        self.best_fitness = -1
        self.best_sol = None
        self.ring_size = ring_size

    @staticmethod
    def new_solution():
        '''Draw a fresh vector of NUM_DIM uniform values in [0, 1).'''
        return rand.random(size=NUM_DIM)

    def best_in_ring(self, start_idx):
        '''Return the index of the fittest bee among start_idx and its ring.

        The ring consists of ``ring_size`` neighbours on each side, with
        wrap-around. Neighbours are visited left side first (nearest first),
        then right side; ties keep the bee found earlier.
        '''
        size = len(self.solutions)
        best_idx = start_idx
        best_fit = self.fitness(self.solutions[start_idx])
        for direction in (-1, 1):
            for step in range(1, 1 + self.ring_size):
                idx = (start_idx + direction * step) % size
                fit = self.fitness(self.solutions[idx])
                if fit > best_fit:
                    best_idx, best_fit = idx, fit
        return best_idx

    def random_sol(self, exclude=-1):
        '''Return a uniformly chosen solution row other than row ``exclude``.

        Raises:
            ValueError: if the only row of the population is the excluded
                one (this used to recurse without end).
        '''
        count = self.solutions.shape[0]
        if count == 1 and exclude == 0:
            raise ValueError("cannot exclude the only solution in the population")
        # Rejection sampling, done iteratively so long populations with an
        # unlucky streak cannot hit the recursion limit.
        while True:
            rand_idx = rand.randint(0, count)
            if rand_idx != exclude:
                return self.solutions[rand_idx]

    def get_employed(self):
        '''Rows of the employed bees (a copy).'''
        return self.solutions[self.employed]

    def get_unemployed(self):
        '''Rows of the bees that are not employed (a copy).'''
        return self.solutions[np.logical_not(self.employed)]

    def get_onlooker(self):
        '''Rows of the onlooker bees (a copy).'''
        return self.solutions[self.onlooker]

    def get_scout(self):
        '''Rows of the employed bees whose failure count has reached LIMIT.

        These are the bees that the scout phase of abc() re-initialises. The
        previous implementation read ``self.scout``, which was never defined.
        '''
        return self.solutions[self.employed & (self.failures >= LIMIT)]

    def fitness(self, x):
        '''Cached fitness of one solution vector.'''
        # Copy so the cache key does not alias a population row that is
        # overwritten later.
        return _fitness(Configuration(np.copy(x)))

    def most_fit(self):
        '''Return (fitness, vector) of the fittest bee; the vector is a copy.'''
        fit = np.array([self.fitness(x) for x in self.solutions])
        idx = fit.argmax()
        return fit[idx], self.solutions[idx].copy()

    def update_best(self):
        '''Record the fittest current bee if it beats the best seen so far.'''
        best_fit, best_sol = self.most_fit()
        if best_fit > self.best_fitness:
            # best_sol is a private copy, so later writes to the population
            # cannot change the recorded best vector.
            self.best_sol = best_sol
            self.best_fitness = best_fit

@cache
def _fitness(x: Configuration):
    '''Memoised fitness of a configuration: evaluate() on its raw vector.

    Results are cached per Configuration, so each distinct vector is scored
    once and the first score is reused afterwards.
    '''
    vector = x.data
    return evaluate(vector)


def basic_employed(sol: Solutions, initial_idx: int):
    '''Propose a neighbour of bee ``initial_idx`` for the employed phase.

    One randomly chosen coordinate is moved by ``phi * (own - partner)``,
    where ``phi`` is uniform in [-0.1, 0.1] and the partner is a random other
    bee; the absolute value of the result is stored. The population itself is
    not modified; the caller decides whether to keep the candidate.
    '''
    step_bound = 0.1 # Todo figure this out
    source = sol.solutions[initial_idx]
    dim = rand.randint(0, source.size)
    phi = rand.uniform(low=-step_bound, high=step_bound)
    partner = sol.random_sol(exclude=initial_idx)
    candidate = np.copy(source)
    candidate[dim] = candidate[dim] + phi * (candidate[dim] - partner[dim])
    # Todo make sure values stay within expected range
    candidate[dim] = abs(candidate[dim])
    return candidate # Greedy select this

def enhanced_employed(sol: Solutions, initial_idx: int):
    '''Propose a neighbour of bee ``initial_idx`` guided by its ring's best bee.

    Works like basic_employed, except that the partner is the fittest bee in
    the ring around ``initial_idx`` rather than a random bee.
    '''
    step_bound = 0.1 # Todo figure this out
    source = sol.solutions[initial_idx]
    dim = rand.randint(0, source.size)
    phi = rand.uniform(low=-step_bound, high=step_bound)
    candidate = np.copy(source)
    partner = sol.solutions[sol.best_in_ring(initial_idx)]
    candidate[dim] = candidate[dim] + phi * (candidate[dim] - partner[dim])
    candidate[dim] = abs(candidate[dim])
    return candidate


def basic_onlooker(sol: Solutions, _initial_idx: int):
    '''Propose a candidate for the onlooker phase.

    An employed bee is drawn with probability proportional to its fitness and
    a basic_employed neighbour of that bee is returned. ``_initial_idx`` is
    accepted for signature compatibility with the other onlooker functions
    and is not used.
    '''
    employed = sol.get_employed()
    # Population indices of the employed bees. The drawn position is mapped
    # through this array so that the index handed to basic_employed stays
    # correct even if employed bees are not the leading rows.
    employed_idx = sol.employed.nonzero()[0]
    fitnesses = np.array([sol.fitness(x) for x in employed])
    total_fitness = np.sum(fitnesses)
    # With zero total fitness the weights would be NaN; fall back to a
    # uniform draw in that case.
    weights = fitnesses / total_fitness if total_fitness > 0 else None
    bee_idx = rand.choice(employed_idx, p=weights)
    return basic_employed(sol, bee_idx)

def enhanced_onlooker(sol: Solutions, initial_idx: int):
    '''Propose a candidate built from the best bee in the ring of ``initial_idx``.

    One randomly chosen coordinate of that best bee is moved by
    ``phi * (best - random_bee)`` with ``phi`` uniform in [-0.1, 0.1], and the
    absolute value of the result is stored. The population is not modified.
    '''
    anchor = sol.solutions[sol.best_in_ring(initial_idx)]
    other = sol.solutions[rand.choice(len(sol.solutions))]
    step_bound = 0.1 # Todo figure this out
    dim = rand.randint(0, anchor.size)
    phi = rand.uniform(low=-step_bound, high=step_bound)
    candidate = np.copy(anchor)
    candidate[dim] = candidate[dim] + phi * (candidate[dim] - other[dim])
    candidate[dim] = abs(candidate[dim])
    return candidate

def vanilla_abc(num_bees, epoches):
    '''Run abc() with the basic employed and onlooker steps and plain scouts.

    Returns whatever abc() returns.
    '''
    return abc(
        num_bees,
        epoches,
        employ_fn=basic_employed,
        onlooker_fn=basic_onlooker,
        smart_scout=False,
    )



def _smart_scout_refine(sol, idx):
    '''Try two extra candidates for a freshly re-initialised scout; keep the fittest.

    U2 is the ring's best bee plus a randomly weighted difference of two
    random bees; U3 is the "opposite" of the ring's best bee (1 - best).
    '''
    current_fit = sol.fitness(sol.solutions[idx])
    # Candidate U2
    r1 = sol.random_sol()
    r2 = sol.random_sol()
    # Copy the ring's best bee: it may be row idx itself, and writing U2 into
    # that row must not change the vector that U3 is derived from.
    best = np.copy(sol.solutions[sol.best_in_ring(idx)])
    diff = r1 - r2
    candidate = best + np.random.random(diff.size) * diff
    u2_fit = sol.fitness(candidate)
    if u2_fit > current_fit:
        current_fit = u2_fit
        sol.solutions[idx, :] = candidate
    # Candidate U3
    # Assume in range 0 to 1
    opposite = np.ones_like(best) - best
    if sol.fitness(opposite) > current_fit:
        sol.solutions[idx, :] = opposite


def abc(num_bees, epoches, employ_fn, onlooker_fn, smart_scout=False):
    '''Run an artificial-bee-colony search over feature-flag vectors.

    Each epoch runs an employed phase, an onlooker phase and a scout phase,
    then prints the best fitness and best vector found so far.

    Args:
        num_bees: population size.
        epoches: number of epochs to run.
        employ_fn: ``f(sol, idx) -> candidate`` used in the employed phase.
        onlooker_fn: ``f(sol, idx) -> candidate`` used in the onlooker phase.
        smart_scout: when True, a re-initialised scout is additionally
            compared against the U2 and U3 candidates.

    Returns:
        (best_fitness, best_solution). ``best_solution`` is a copy, or None
        when no epoch was run.
    '''
    sol = Solutions(num_bees)
    # Private copy of the best vector, so that later writes to the population
    # cannot alter what is printed and returned.
    best_snapshot = None
    for _ in range(epoches):
        # Employed: greedy replacement, counting consecutive failures
        for idx in sol.employed.nonzero()[0]:
            candidate = employ_fn(sol, idx)
            if sol.fitness(candidate) > sol.fitness(sol.solutions[idx]):
                sol.solutions[idx] = candidate
                sol.failures[idx] = 0
            else:
                sol.failures[idx] += 1
        # Onlooker: greedy replacement of the onlooker's own slot
        for idx in sol.onlooker.nonzero()[0]:
            candidate = onlooker_fn(sol, idx)
            if sol.fitness(candidate) > sol.fitness(sol.solutions[idx]):
                sol.solutions[idx] = candidate
        # Scout: only bees that exhausted their failure budget restart
        for idx in sol.employed.nonzero()[0]:
            if sol.failures[idx] < LIMIT:
                continue
            sol.failures[idx] = 0
            sol.solutions[idx, :] = Solutions.new_solution()
            # The smart-scout comparison belongs to the restart. It used to
            # sit outside the failure check and ran for every employed bee in
            # every epoch.
            if smart_scout:
                _smart_scout_refine(sol, idx)

        # Mark best
        previous_best = sol.best_fitness
        sol.update_best()
        if sol.best_fitness > previous_best:
            best_snapshot = np.copy(sol.best_sol)
        print(sol.best_fitness)
        print(best_snapshot)
    return sol.best_fitness, best_snapshot


# vanilla_abc(24, 100)

In [None]:
# Baseline run: 30 bees, 100 epochs, basic employed/onlooker steps.
vanilla_abc(30, 100)

In [None]:
import warnings

# Enhanced run: 100 bees, 100 epochs, ring-guided steps and smart scouts.
# All warnings raised during the run are suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    abc(100, 100, enhanced_employed, enhanced_onlooker, smart_scout=True)

In [None]:
# Scratch check: index of the fittest bee in the ring around bee 0 of a fresh
# population of 100.
sols = Solutions(100)
sols.best_in_ring(0)