In [19]:
from collections import namedtuple
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize
from sklearn.model_selection import train_test_split
from typing import Dict, List, Tuple, Union
import pandas as pd


In [20]:
def find_best_split(
    task: str,
    feature_vector: np.ndarray, 
    target_vector: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, float, float]:
    dispersion = lambda y: np.sum((y - np.mean(y))**2)
    def gini(y):
        p1 = np.sum(y)
        p0 = 1. - p1 / y.shape[0]
        p1 /= y.shape[0]
        return 1 - p0**2 - p1**2
    def impurity(task, f_type, X, y, threshold):
        split = np.array(X <= threshold)
        l = y[split]
        r = y[np.logical_not(split)]
        if task == "regression":
            l_criteria = dispersion(l)
            r_criteria = dispersion(r)
        else:
            l_criteria = gini(l)
            r_criteria = gini(r)
        return - l.shape[0] / y.shape[0] * l_criteria - r.shape[0] / y.shape[0] * r_criteria
    #Просят отсортированные
    index = np.argsort(feature_vector)
    feature_vector = feature_vector[index]
    target_vector = target_vector[index]
    thresholds, impurities, threshold_best, impurity_best = [], [], -1, -np.inf
    for i in range(feature_vector.shape[0]):    
        if i == feature_vector.shape[0] - 1:
            break
        #Тут на всякий случай проверяем в обоих случаях, что не случилось разбиения, когда все уехали в 1 поддерево
        cur_threshold = (feature_vector[i] + feature_vector[i+1]) / 2
        if (target_vector[feature_vector <= cur_threshold].shape[0] == 0) or (target_vector[feature_vector > cur_threshold].shape[0] == 0):
            continue
        cur_impurity = impurity(feature_vector, target_vector, cur_threshold)
        thresholds.append(cur_threshold)
        impurities.append(cur_impurity)
        #Обычно надоо минимум, но тут формулка с минусом, поэтому фактически максимум. На картинке понятнее
        if cur_impurity > impurity_best:
            threshold_best, impurity_best = cur_threshold, cur_impurity
    return thresholds, impurities, threshold_best, impurity_best


In [21]:
data = pd.read_csv("sdss_redshift.csv")
x = data[['u', 'g', 'r', 'i', 'z']].to_numpy()
y = data['redshift'].to_numpy()
X_train0, X_test, y_train0, y_test = train_test_split(x, y, test_size=0.5, random_state=np.random.randint(0, np.random.randint(50, 1000)))


In [22]:
print(x)

[[21.37319 19.29492 17.83899 17.3035  16.98967]
 [19.11857 17.86208 17.04089 16.62898 16.32203]
 [20.57162 18.56461 17.61167 17.1362  16.78903]
 ...
 [16.9395  15.90051 15.51208 15.28988 15.13759]
 [20.84348 18.82782 17.77935 17.33892 17.04705]
 [19.89969 18.15171 17.00601 16.51634 16.17697]]


In [23]:
find_best_split("gini", np.ndarray(x), y)

ValueError: maximum supported dimension for an ndarray is 32, found 5878