## Home assignment 05: Bagging and OOB score

Please fill in the missing lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please note that the `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide the [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as a model class, not as an instance: write the class name with no parentheses and no explicit parameters.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [None]:
import numpy as np

In [None]:
class SimplifiedBaggingRegressor:
    '''
    Simplified bagging ensemble for regression with an optional out-of-bag (OOB) error.

    Every bag is a bootstrap sample (drawn with replacement, same size as the dataset)
    and gets its own freshly constructed model. Predictions of the ensemble are the
    plain average of the per-model predictions.

    Parameters:
        num_bags: number of bootstrap samples, i.e. number of models in the ensemble.
        oob: when True, `fit` keeps references to the training data and target
            so that `OOB_score` can be computed afterwards.
    '''
    def __init__(self, num_bags, oob=False):
        self.num_bags = num_bags
        self.oob = oob

    def _generate_splits(self, data: np.ndarray):
        '''
        Create random subsets of indices for each bag and save them to self.indices_list
        '''
        data_size = len(data)
        # Bootstrap: every bag draws data_size indices uniformly with replacement.
        self.indices_list = [
            np.random.choice(data_size, size=data_size, replace=True)
            for _ in range(self.num_bags)
        ]

    def fit(self, model_constructor, data, target):
        '''
        Train a model on each bag.
        The model_constructor parameter expects a callable that returns an unfitted model instance.

        Parameters:
            model_constructor: callable with no arguments returning an object
                that provides `fit(X, y)` and `predict(X)`.
            data: array-like of training objects, indexed along the first axis.
            target: array-like of training targets, same length as `data`.

        Raises:
            ValueError: if `num_bags` is not positive or `data` and `target`
                have different lengths.

        Example usage:

        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)
        '''
        if self.num_bags < 1:
            raise ValueError('num_bags must be a positive integer!')
        # Index arrays are used to build the bags, so plain lists must be converted first.
        data = np.asarray(data)
        target = np.asarray(target)
        if len(data) != len(target):
            raise ValueError('data and target must contain the same number of objects!')

        self.data = None
        self.target = None
        self._generate_splits(data)
        assert len({len(bag) for bag in self.indices_list}) == 1, 'All bags must be of equal length!'
        assert len(self.indices_list[0]) == len(data), 'Each bag must contain the same number of elements as the dataset!'
        self.models_list = []
        for indices in self.indices_list:
            model = model_constructor()
            # Store the model object itself: not every estimator returns `self` from fit().
            model.fit(data[indices], target[indices])
            self.models_list.append(model)
        if self.oob:
            # The training set is only kept when the OOB estimate was requested.
            self.data = data
            self.target = target

    def predict(self, data):
        '''
        Calculate the average prediction for each object in the given dataset
        '''
        # One row of predictions per model; averaging over axis 0 averages over models.
        predictions = np.array([model.predict(data) for model in self.models_list])
        return predictions.mean(axis=0)

    def _get_oob_predictions_from_every_model(self):
        '''
        Collect predictions for each training object from models
        that did not use this object during their training.

        The result is saved to self.list_of_predictions_lists: a 1-D object array
        whose i-th element is the list of OOB predictions for the i-th training object.

        Raises:
            ValueError: if the training data was not stored (fit was not called
                or the regressor was created with oob=False).
        '''
        if getattr(self, 'data', None) is None or getattr(self, 'target', None) is None:
            raise ValueError('OOB predictions are unavailable: call fit() with oob=True first!')

        num_objects = len(self.data)
        predictions_for_each_object = [[] for _ in range(num_objects)]
        for train_indices, model in zip(self.indices_list, self.models_list):
            # Objects never drawn into this bag are out-of-bag for this model.
            is_oob = np.ones(num_objects, dtype=bool)
            is_oob[train_indices] = False
            oob_indices = np.flatnonzero(is_oob)
            if oob_indices.size == 0:
                continue
            # A single batched predict call per model instead of one call per object.
            bag_predictions = model.predict(self.data[oob_indices])
            for idx, pred in zip(oob_indices, bag_predictions):
                predictions_for_each_object[idx].append(pred)

        # Fill element by element: np.array(list_of_lists, dtype=object) would turn
        # equally long lists into a 2-D array and break the per-object list semantics.
        lists_per_object = np.empty(num_objects, dtype=object)
        for idx, preds in enumerate(predictions_for_each_object):
            lists_per_object[idx] = preds
        self.list_of_predictions_lists = lists_per_object

    def _get_averaged_oob_predictions(self):
        '''
        Calculate the average out-of-bag predictions for each training object.
        If an object was included in all bags, it will have no predictions
        and its entry in self.oob_predictions is None.
        '''
        self._get_oob_predictions_from_every_model()
        # Emptiness is tested via len(), so a legitimate prediction of 0.0 is never dropped.
        self.oob_predictions = np.array([
            np.mean(preds) if len(preds) > 0 else None
            for preds in self.list_of_predictions_lists
        ])

    def OOB_score(self):
        '''
        Compute the mean squared error for all training objects
        that have at least one out-of-bag prediction.

        Returns:
            The OOB mean squared error, or NaN when no training object has
            an out-of-bag prediction.

        Raises:
            ValueError: if the training data was not stored (fit was not called
                or the regressor was created with oob=False).
        '''
        self._get_averaged_oob_predictions()
        has_prediction = np.array([p is not None for p in self.oob_predictions], dtype=bool)
        if not has_prediction.any():
            # Nothing to average over; avoid the "mean of empty slice" warning.
            return np.nan
        actual_values = np.asarray(self.target)[has_prediction]
        # The array may have object dtype (it can hold None), so cast the kept part to float.
        predicted_values = self.oob_predictions[has_prediction].astype(float)
        return np.mean((actual_values - predicted_values) ** 2)


### Local tests:

In [None]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

#### Simple tests:

In [None]:
# Noise-free linear target: the ensemble of linear models must reproduce it
# almost exactly, both in-sample and on out-of-bag objects.
for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = X.mean(axis=1)
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    assert ((predictions - y) ** 2).mean() < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'
    oob_score = bagging_regressor.OOB_score()
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'
    # Average share of bags that missed an object, compared with 1/e.
    assert abs(
        np.mean([len(preds) for preds in bagging_regressor.list_of_predictions_lists])
        / bagging_regressor.num_bags
        - 1 / np.exp(1)
    ) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Simple tests done!')

  0%|          | 0/100 [00:00<?, ?it/s]

Simple tests done!


#### Medium tests

In [None]:
# Pure-noise target with 150 features for 200 objects: the test expects the
# OOB error to exceed the in-sample error.
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    average_train_error = ((predictions - y) ** 2).mean()
    assert bagging_regressor.oob, 'OOB feature must be turned on'
    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'
    # Average share of bags that missed an object, compared with 1/e.
    assert abs(
        np.mean([len(preds) for preds in bagging_regressor.list_of_predictions_lists])
        / bagging_regressor.num_bags
        - 1 / np.exp(1)
    ) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Medium tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

Medium tests done!


#### Complex tests:

In [None]:
# Larger ensemble (100 bags, 2000 objects): the share of bags missing an object
# is checked against 1/e with a tighter tolerance of 1e-2.
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    # OOB_score() also fills list_of_predictions_lists, which is inspected below.
    oob_score = bagging_regressor.OOB_score()
    assert abs(
        np.mean([len(preds) for preds in bagging_regressor.list_of_predictions_lists])
        / bagging_regressor.num_bags
        - 1 / np.exp(1)
    ) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'

print('Complex tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

Complex tests done!


In [None]:
# Deviation of the observed out-of-bag share (for the last fitted regressor) from 1/e.
np.mean(
    [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
) / bagging_regressor.num_bags - 1 / np.exp(1)

0.0004255588285576595

Great job! Please save `SimplifiedBaggingRegressor` to `bagging.py` and submit your solution to the grading system!