In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.preprocessing import StandardScaler
from collections import Counter

### Класификатор на случайном лесе

In [11]:

class MyRandomForestClassifier:
    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self._fitted_estimators = []

    def _draw_bootstrap(self, features, targets):
        population_size = len(targets)
        sampling_indices = np.random.randint(0, population_size, size=population_size)
        return features[sampling_indices], targets[sampling_indices]

    def fit(self, X, y):
        X_np = np.asarray(X)
        y_np = np.asarray(y)
        self._fitted_estimators.clear()

        for _ in range(self.n_estimators):
            boot_X, boot_y = self._draw_bootstrap(X_np, y_np)
            weak_learner = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            weak_learner.fit(boot_X, boot_y)
            self._fitted_estimators.append(weak_learner)

        return self

    def predict(self, X):
        votes_matrix = np.stack([
            estimator.predict(X) for estimator in self._fitted_estimators
        ], axis=0)

        predictions = []
        for sample_idx in range(votes_matrix.shape[1]):
            votes_for_sample = votes_matrix[:, sample_idx]
            winner_class, _ = Counter(votes_for_sample).most_common(1)[0]
            predictions.append(winner_class)

        return np.asarray(predictions)
    
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split
        }

    def set_params(self, **params):
        for key, value in params.items():
            if key in {'n_estimators', 'max_depth', 'min_samples_split'}:
                setattr(self, key, value)
            else:
                raise ValueError(f"Invalid parameter {key} for estimator MyDecisionTreeClassifier.")
        return self

### Регрессор на случайном лесе

In [12]:
class MyRandomForestRegressor:
    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self._base_learners = []

    def _resample_with_replacement(self, data_features, data_targets):
        dataset_size = data_targets.shape[0]
        bootstrap_indices = np.random.randint(low=0, high=dataset_size, size=dataset_size)
        return data_features[bootstrap_indices], data_targets[bootstrap_indices]

    def fit(self, X, y):
        X_num = np.asarray(X)
        y_num = np.asarray(y)
        self._base_learners.clear()

        for _ in range(self.n_estimators):
            bootstrapped_X, bootstrapped_y = self._resample_with_replacement(X_num, y_num)
            learner = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            learner.fit(bootstrapped_X, bootstrapped_y)
            self._base_learners.append(learner)

        return self

    def predict(self, X):
        individual_forecasts = np.column_stack([
            model.predict(X) for model in self._base_learners
        ])
        return np.mean(individual_forecasts, axis=1)
    
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split
        }

    def set_params(self, **params):
        for key, value in params.items():
            if key in {'n_estimators', 'max_depth', 'min_samples_split'}:
                setattr(self, key, value)
            else:
                raise ValueError(f"Invalid parameter {key} for estimator MyDecisionTreeClassifier.")
        return self

### Проверка классификации

In [13]:
df = pd.read_csv('classification.csv')
classification_X = df.drop('Bankrupt?', axis=1)
classification_y = df['Bankrupt?']

classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(
    classification_X, classification_y,
    test_size=0.2,
    random_state=42,
    stratify=classification_y
)

sk_clf = RandomForestClassifier()
sk_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_sk = sk_clf.predict(classification_X_test)

my_clf = MyRandomForestClassifier()
my_clf.fit(classification_X_train, classification_y_train)
classification_y_pred_my = my_clf.predict(classification_X_test)

print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}, f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}, f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

Sklearn accuracy=0.9721, f1=0.9643, recall=0.20454545454545456
Custom  accuracy=0.9699, f1=0.9648, recall=0.2727272727272727




### Проверка регрессии

In [14]:
df = pd.read_csv('regression.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

regression_X = df.drop(columns=['salary_in_usd', 'salary', 'salary_currency'], axis=1)
regression_y = df['salary_in_usd'].to_numpy()

regression_X = pd.get_dummies(regression_X, drop_first=True).to_numpy(dtype=np.float32)

regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(
    regression_X, regression_y, test_size=0.2, random_state=42
)


sk_reg = RandomForestRegressor()
sk_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_sk = sk_reg.predict(regression_X_test)

my_reg = MyRandomForestRegressor()
my_reg.fit(regression_X_train, regression_y_train)
regression_y_pred_my = my_reg.predict(regression_X_test)
print(f"SkLearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")
print(f"Custom  RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_my)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_my):.4f}")

SkLearn RMSE=42971.8116, R2=0.5182
Custom  RMSE=42945.5555, R2=0.5188


### Улучшение

### Классификация

### Подбор гиперпараметров

In [15]:
scorer = make_scorer(recall_score, pos_label=1)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'min_samples_split': [2, 4]
}

classification_sk_grid = GridSearchCV(
    RandomForestClassifier(),
    param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

classification_sk_grid.fit(classification_X_train, classification_y_train)
sk_best_clf = classification_sk_grid.best_estimator_

classification_my_grid = GridSearchCV(
    MyRandomForestClassifier(),
    param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

classification_my_grid.fit(classification_X_train, classification_y_train)
my_best_clf = classification_my_grid.best_estimator_



### Предсказание

In [16]:
classification_y_pred_sk = sk_best_clf.predict(classification_X_test)

classification_y_pred_my = my_best_clf.predict(classification_X_test)



### Метрики

In [17]:
print(f"Sklearn accuracy={accuracy_score(classification_y_test, classification_y_pred_sk):.4f}, f1={f1_score(classification_y_test, classification_y_pred_sk, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_sk, pos_label=1)}")
print(f"Custom  accuracy={accuracy_score(classification_y_test, classification_y_pred_my):.4f}, f1={f1_score(classification_y_test, classification_y_pred_my, average='weighted'):.4f}, recall={recall_score(classification_y_test, classification_y_pred_my, pos_label=1)}")

Sklearn accuracy=0.9699, f1=0.9602, recall=0.13636363636363635
Custom  accuracy=0.9685, f1=0.9609, recall=0.18181818181818182


### Регрессия

### Подбор гиперпараметров

In [18]:
scorer = make_scorer(r2_score, pos_label=1)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'min_samples_split': [2, 4]
}

regression_sk_grid = GridSearchCV(
    RandomForestRegressor(),
    param_grid,
    cv=5,
    scoring=scorer,
    n_jobs=-1
)

regression_sk_grid.fit(regression_X_train, regression_y_train)
sk_best_reg = regression_sk_grid.best_estimator_

Traceback (most recent call last):
  File "/Users/stdneprov/code/ml/4course/venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/stdneprov/code/ml/4course/venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 308, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/stdneprov/code/ml/4course/venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 408, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/stdneprov/code/ml/4course/venv/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 196, in wrapper
    params = func_sig.bind(*args, **kwargs)
  File "/usr/local/

### Предсказание

In [19]:
regression_y_pred_sk = sk_best_reg.predict(regression_X_test)

### Метрики

In [20]:
print(f"SkLearn RMSE={np.sqrt(mean_squared_error(regression_y_test, regression_y_pred_sk)):.4f}, R2={r2_score(regression_y_test, regression_y_pred_sk):.4f}")

SkLearn RMSE=45713.0557, R2=0.4548
