In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split

from skrough.dataprep import prepare_factorized_data

# iris = datasets.load_iris()
# X = iris.data
# y = iris.target

In [2]:
TARGET_COLUMN = "Play"

# Raw records of the weather table; the fields follow the column order
# Outlook, Temperature, Humidity, Wind, Play.
weather_rows = [
    ["sunny", "hot", "high", "weak", "no"],
    ["sunny", "hot", "high", "strong", "no"],
    ["overcast", "hot", "high", "weak", "yes"],
    ["rain", "mild", "high", "weak", "yes"],
    ["rain", "cool", "normal", "weak", "yes"],
    ["rain", "cool", "normal", "strong", "no"],
    ["overcast", "cool", "normal", "strong", "yes"],
    ["sunny", "mild", "high", "weak", "no"],
    ["sunny", "cool", "normal", "weak", "yes"],
    ["rain", "mild", "normal", "weak", "yes"],
    ["sunny", "mild", "normal", "strong", "yes"],
    ["overcast", "mild", "high", "strong", "yes"],
    ["overcast", "hot", "normal", "weak", "yes"],
    ["rain", "mild", "high", "strong", "no"],
]
df = pd.DataFrame(
    np.array(weather_rows, dtype=object),
    columns=["Outlook", "Temperature", "Humidity", "Wind", "Play"],
)

# Detach the decision column from `df` and replace its labels with the
# integer codes of the corresponding pandas categories.
df_dec = df.pop(TARGET_COLUMN).astype("category").cat.codes


def _to_codes(column):
    """Return the integer category codes of a categorical column."""
    return column.cat.codes


# Encode every remaining attribute column the same way.
df = df.astype("category").apply(_to_codes)

In [3]:
# Hold out 20% of the rows for testing. The split is seeded so that a
# Restart & Run All produces the same partition -- and therefore the same
# downstream fits and predictions -- on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df, df_dec, test_size=0.2, random_state=42
)
# Wrap each partition, with its labels, in xgboost's DMatrix container.
D_train = xgb.DMatrix(X_train, label=Y_train)
D_test = xgb.DMatrix(X_test, label=Y_test)

In [65]:
# Work on an independent copy of the labels so edits to it leave `df_dec` alone.
df_dec_2 = df_dec.copy(deep=True)
# df_dec_2[:] = 0  # disabled experiment: would overwrite every label with 0
# NOTE: this rebinds D_train to a matrix over the *whole* table, replacing the
# split-based D_train built in the train/test split cell.
D_train = xgb.DMatrix(data=df, label=df_dec_2)

In [71]:
# Booster settings handed to xgboost as-is. With the "binary:hinge" objective
# the booster predicts hard 0/1 labels rather than probabilities.
param = dict(
    eta=0.3,
    max_depth=5,
    objective="binary:hinge",
    verbosity=1,
)
# Fit ten boosting rounds on whatever D_train is currently bound to.
model = xgb.train(params=param, dtrain=D_train, num_boost_round=10)

In [67]:
# Show how many boosting rounds the current `model` contains.
# NOTE(review): the saved output of this cell is 0 and its execution count (67)
# is lower than that of the training cell (71, num_boost_round=10), so the
# displayed value presumably predates the current model -- re-run to confirm.
model.num_boosted_rounds()

0

In [72]:
# Predict on D_train, i.e. the same matrix the booster was fitted on, so these
# are training-set predictions rather than a held-out evaluation.
model.predict(D_train)

array([0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.],
      dtype=float32)

In [44]:
# Per-feature importance measured as "total_gain".
# NOTE(review): the saved output has no 'Temperature' entry -- presumably
# features never used in a split are omitted from the dict; confirm.
model.get_score(importance_type="total_gain")

{'Outlook': 2.251736879348755,
 'Humidity': 3.964761257171631,
 'Wind': 1.0427981615066528}

In [61]:
# Express each feature's total gain as a fraction of the summed total gain,
# so the displayed values add up to 1. `pd` is provided by the import cell at
# the top of the notebook, so it is not re-imported here.
x = pd.Series(model.get_score(importance_type="total_gain"))
x / x.sum()

Outlook     0.310187
Humidity    0.546163
Wind        0.143650
dtype: float64

In [43]:
# Per-feature importance measured as "weight" (per the xgboost docs: the
# number of times a feature is used to split the data across all trees).
model.get_score(importance_type="weight")

{'Outlook': 3.0, 'Humidity': 5.0, 'Wind': 2.0}

In [7]:
from sklearn import tree

# Baseline: a single decision tree fitted on the training split and evaluated
# on the held-out rows. random_state is pinned because scikit-learn randomly
# permutes the candidate features at each split, so equally good splits could
# otherwise be resolved differently from run to run.
cl = tree.DecisionTreeClassifier(random_state=42)
cl.fit(X_train, Y_train)
cl.predict(X_test)

array([1, 1, 1], dtype=int8)

In [8]:
# Same train/test exercise with xgboost's scikit-learn style classifier at its
# default settings; fit() hands back the estimator, so it can be bound directly.
clx = xgb.XGBClassifier().fit(X_train, Y_train)
clx.predict(X_test)

array([1, 1, 0])

In [5]:
import more_itertools

# Enumerate every subset (from the empty tuple up to the full tuple) of the
# four single-letter labels.
attribute_initials = ["O", "T", "H", "W"]
list(more_itertools.powerset(attribute_initials))

[(),
 ('O',),
 ('T',),
 ('H',),
 ('W',),
 ('O', 'T'),
 ('O', 'H'),
 ('O', 'W'),
 ('T', 'H'),
 ('T', 'W'),
 ('H', 'W'),
 ('O', 'T', 'H'),
 ('O', 'T', 'W'),
 ('O', 'H', 'W'),
 ('T', 'H', 'W'),
 ('O', 'T', 'H', 'W')]