In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler

### Реализация классификации на градиентном бустинге

In [2]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-np.clip(x, -250, 250)))


class MyGradientBoostingClassifier:
    def __init__(self, n_estimators=100, max_depth=3, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.trees = []

    def fit(self, X, y):
        self.lb_ = LabelBinarizer()
        y_encoded = self.lb_.fit_transform(y).ravel()
        self.classes_ = self.lb_.classes_

        initial_pos_prob = np.clip(y_encoded.mean(), 1e-5, 1 - 1e-5)
        initial_log_odds = np.log(initial_pos_prob / (1.0 - initial_pos_prob))
        F = np.full_like(y_encoded, initial_log_odds, dtype=np.float64)
        self.F0_ = initial_log_odds

        for _ in range(self.n_estimators):
            prob = sigmoid(F)
            grad = prob - y_encoded
            hess = prob * (1.0 - prob) + 1e-6

            pseudo_targets = -grad / hess
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, pseudo_targets, sample_weight=hess)

            F += self.learning_rate * tree.predict(X)
            self.trees.append(tree)

        return self

    def predict(self, X):
        F = np.full(X.shape[0], self.F0_, dtype=np.float64)
        for tree in self.trees:
            F += self.learning_rate * tree.predict(X)

        proba = sigmoid(F)
        threshold = 0.5
        return np.where(proba >= threshold, self.classes_[1], self.classes_[0])
    
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'learning_rate': self.learning_rate
        }

    def set_params(self, **params):
        for key, value in params.items():
            if key in {'n_estimators', 'max_depth', 'learning_rate'}:
                setattr(self, key, value)
            else:
                raise ValueError(f"Invalid parameter {key} for estimator MyDecisionTreeClassifier.")
        return self

### Реализация регрессии на градиентном бустинге

In [3]:
class MyGradientBoostingRegressor:
    def __init__(self, n_estimators=10, max_depth=3, learning_rate=0.1, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.trees_ = []

    def fit(self, X, y):
        self.trees_ = []

        self.F0_ = np.mean(y, dtype=np.float64)
        F = np.full(shape=y.shape, fill_value=self.F0_, dtype=np.float64)

        for _ in range(self.n_estimators):
            residuals = y - F
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                random_state=None
            )
            tree.fit(X, residuals)
            predictions = tree.predict(X)
            F += self.learning_rate * predictions
            self.trees_.append(tree)

        return self

    def predict(self, X):
        F = np.full(shape=X.shape[0], fill_value=self.F0_, dtype=np.float64)
        for tree in self.trees_:
            F += self.learning_rate * tree.predict(X)
        return F
    
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'learning_rate': self.learning_rate,
            'min_samples_split': self.min_samples_split,
        }

    def set_params(self, **params):
        for key, value in params.items():
            if key in {'n_estimators', 'max_depth', 'learning_rate', 'min_samples_split'}:
                setattr(self, key, value)
            else:
                raise ValueError(f"Invalid parameter {key} for estimator MyDecisionTreeClassifier.")
        return self

### Проверка классфикации

In [4]:
df = pd.read_csv('classification.csv')
classification_X = df.drop('Bankrupt?', axis=1)
classification_y = df['Bankrupt?']

classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X, classification_y,
    test_size=0.2,
    random_state=42,
    stratify=classification_y
)

sk_clf = GradientBoostingClassifier()
sk_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_sk = sk_clf.predict(classification_X_test)

my_clf = MyGradientBoostingClassifier()
my_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_my = my_clf.predict(classification_X_test)

print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}, f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}, f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

Sklearn accuracy=0.9692, f1=0.9654, recall=0.3181818181818182
Custom  accuracy=0.9714, f1=0.9670, recall=0.3181818181818182


### Проверка регрессии

In [5]:
df = pd.read_csv('regression.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

regression_X = df.drop(columns=['salary_in_usd', 'salary', 'salary_currency'], axis=1)
regression_y = df['salary_in_usd'].to_numpy()

regression_X = pd.get_dummies(regression_X, drop_first=True).to_numpy(dtype=np.float32)

regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X, regression_y, test_size=0.2, random_state=42
)


sk_reg = GradientBoostingRegressor()
sk_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_sk = sk_reg.predict(regression_X_test)

my_reg = MyGradientBoostingRegressor()
my_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_my = my_reg.predict(regression_X_test)
print(f"SkLearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_my)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_my):.4f}")

SkLearn RMSE=42456.6259, R2=0.5297
Custom  RMSE=47115.6763, R2=0.4208


### Улучшение

### Классификация

### Скейлинг

In [6]:
scaler = StandardScaler()
classification_X_train_scaled = scaler.fit_transform(classification_X_train)
classification_X_test_scaled = scaler.transform(classification_X_test)

### Подбор гиперпараметров

In [7]:
scorer = make_scorer(recall_score, pos_label=1)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

classification_sk_grid = GridSearchCV(
    GradientBoostingClassifier(),
    param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

classification_sk_grid.fit(classification_X_train_scaled, classification_y_train)
sk_best_clf = classification_sk_grid.best_estimator_

classification_my_grid = GridSearchCV(
    MyGradientBoostingClassifier(),
    param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

classification_my_grid.fit(classification_X_train_scaled, classification_y_train)
my_best_clf = classification_my_grid.best_estimator_

### Предсказание

In [8]:
classification_y_pred_sk = sk_best_clf.predict(classification_X_test_scaled)

classification_y_pred_my = my_best_clf.predict(classification_X_test_scaled)

### Метрики

In [9]:
print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}, f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}, f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

Sklearn accuracy=0.9597, f1=0.9585, recall=0.3181818181818182
Custom  accuracy=0.9677, f1=0.9643, recall=0.3181818181818182


### Регрессия

### Скейлинг

In [10]:
scaler = StandardScaler()
regression_X_train_scaled = scaler.fit_transform(regression_X_train)
regression_X_test_scaled = scaler.transform(regression_X_test)

### Подбор гиперпараметров

In [None]:
scorer = make_scorer(r2_score, pos_label=1)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

regression_sk_grid = GridSearchCV(
    GradientBoostingClassifier(),
    param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

regression_sk_grid.fit(regression_X_train_scaled, regression_y_train)
sk_best_reg = regression_sk_grid.best_estimator_

  and (type_of_target(y, input_name="y") in ("binary", "multiclass"))
  type_of_target_y = type_of_target(y)
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
  y_type = type_of_target(y, input_name="y")
Traceback (most recent call last):
  File "/Users/stdneprov/code/ml/4course/venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/stdneprov/code/ml/4course/venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 308, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/stdneprov/code/ml/4course/venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 400, in _score
    y_pred = method_caller(
        estimator,
    ...<2 

### Предсказание

In [None]:
regression_y_pred_sk = sk_best_reg.predict(regression_X_test_scaled)

### Метрики

In [None]:
print(f"SkLearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")