In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local packages are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import pathlib
import os
import numpy as np
from typing import Literal
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

from kego import plotting

In [None]:
# Folder with the competition files, given relative to the working directory.
FOLDER_COMPETITION = pathlib.Path(
    "../../data/um/um-game-playing-strength-of-mcts-variants/"
)
# Shell out to list the folder contents as a quick check that the path resolves.
!ls $FOLDER_COMPETITION
# Location of the training table inside the competition folder.
FILEPATH_TRAINING = FOLDER_COMPETITION / "train.csv"

In [None]:
# Load the unmodified training table; later cells derive cleaned frames from it.
train_raw = pd.read_csv(FILEPATH_TRAINING)

In [None]:
# Inspect only the first four columns of the raw table.
# presumably draws one value-count panel per column — confirm against kego.plotting.
# The trailing semicolon suppresses the cell's return-value repr.
plotting.value_counts.plot_value_counts(
    df=train_raw.iloc[:, :4], font_size=8, figure_size=[8, 3]
);

In [None]:
# Peek at the first two rows of the raw table.
train_raw.head(2)

In [None]:
# Distribution of the column later used as the prediction target.
train_raw["utility_agent1"].hist()

In [None]:
train_raw.shape  # (rows, columns) of the raw table; the target column is still included

In [None]:
def prepare_df(df):
    """Return a cleaned frame without object-dtype or entirely-missing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with every object-dtype column removed, then every column
        whose values are all missing removed, and a fresh 0..n-1 row index.
    """
    # Step 1: discard columns stored with the generic ``object`` dtype.
    object_columns = df.columns[df.dtypes == object]
    without_objects = df.drop(columns=object_columns)

    # Step 2: discard columns that carry no information (every entry is NaN).
    fully_missing = without_objects.columns[without_objects.isna().all(axis=0)]
    cleaned = without_objects.drop(columns=fully_missing)

    # Row labels are replaced by a default RangeIndex.
    return cleaned.reset_index(drop=True)


# Cleaned copy of the raw table (see prepare_df for the columns that get dropped).
train = prepare_df(train_raw)

# Target kept as a single-column DataFrame; every remaining column is a feature.
y = train[["utility_agent1"]]
X = train.drop(columns=["utility_agent1"])

In [None]:
# True if at least one cell anywhere in the feature matrix is missing.
X.isna().any().any()

In [None]:
# Fit the estimator whose importances are inspected here and in later cells.
# `clf` was previously read without ever being assigned, so this cell failed on a
# fresh kernel. The configuration mirrors the regressor used by FeatureSelector;
# the fixed seed keeps the ranking reproducible between runs.
clf = ExtraTreesRegressor(n_estimators=50, n_jobs=16, random_state=0)
clf = clf.fit(X, np.array(y).ravel())

# One importance value per feature column, labelled by column name.
feature_relevance = pd.Series(index=X.columns, data=clf.feature_importances_)
# Show the 20 most relevant features.
feature_relevance.sort_values(ascending=False).iloc[:20]

In [None]:
# Flatten the single-column target frame into a 1-D array; FeatureSelector applies
# the same conversion to its `y` argument.
np.array(y).ravel()

In [None]:
class FeatureSelector:
    def __init__(self, X, y) -> None:
        self.X = X
        self.y = np.array(y).ravel()
        self.feature_ranker = None
        self.feature_relevance = None

    def _rank_features(self):
        self.feature_ranker = ExtraTreesRegressor(n_estimators=50, n_jobs=16)
        self.feature_ranker = self.feature_ranker.fit(self.X, self.y)
        self.feature_relevance = pd.Series(
            index=X.columns, data=clf.feature_importances_
        )
        self.feature_relevance.sort_values(ascending=False)
        return self.feature_relevance

    def select_features(
        self,
        threshold: float | str | None = 1e-1,
        setup: Literal["selector", "threshold"] = "threshold",
    ):
        if self.feature_ranker is None:
            self._rank_features()
        if setup == "selector":
            model = SelectFromModel(
                self.feature_ranker, prefit=True, threshold=threshold
            )
            X_new = model.transform(self.X)
        elif setup == "threshold":
            X_new = X.loc[
                :, feature_relevance[feature_relevance.values >= threshold].index
            ]
        return X_new

In [None]:
# Build the selector on the prepared features and target.
feature_selector = FeatureSelector(X, y)
# No ranking exists yet, so this call runs it before selecting with the default
# threshold; the selection is shown as the cell output.
feature_selector.select_features()

In [None]:
# Select again with a lower cut-off (1e-3 instead of the default 1e-1).
X_reduced = feature_selector.select_features(threshold=1e-3)

In [None]:
# Same kego helper as used on the raw table, now applied to the reduced feature set.
# presumably draws one value-count panel per column — confirm against kego.plotting.
plotting.value_counts.plot_value_counts(df=X_reduced)

In [None]:
# Shape of the importance array read from `clf`.
# NOTE(review): relies on a fitted estimator being bound to `clf` in the kernel.
clf.feature_importances_.shape

In [None]:
# Plot every importance value against its positional index in the array.
# NOTE(review): relies on a fitted estimator being bound to `clf` in the kernel.
# NOTE(review): `log` is passed as the strings "false" and "log"; presumably
# per-axis scale settings for kego's plot_line — confirm the accepted values.
plotting.lines.plot_line(
    range(len(clf.feature_importances_)), clf.feature_importances_, log=["false", "log"]
)

In [None]:
# Raw importance values as stored on the estimator.
# NOTE(review): relies on a fitted estimator being bound to `clf` in the kernel.
clf.feature_importances_