## Home assignment 05: Bagging and OOB score

Please fill in the missing lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please note that the `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and provide the [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as a model class, with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

In [2]:
class SimplifiedBaggingRegressor:
    '''
    Simplified bagging ensemble for regression with an optional out-of-bag (OOB) score.

    Every bag is a bootstrap sample (drawn with replacement) of the training set that has
    the same size as the training set. One fresh model is fitted per bag, and the ensemble
    prediction is the plain average of the per-model predictions.

    Parameters:
        num_bags: number of bootstrap samples (and therefore fitted models).
        oob: if True, `fit` keeps references to the training data and target so that
            `OOB_score` can be computed afterwards.
    '''
    def __init__(self, num_bags, oob=False):
        self.num_bags = num_bags
        self.oob = oob

    def _generate_splits(self, data: np.ndarray):
        '''
        Generate indices for every bag and store in self.indices_list list

        Each entry is an integer array of length `len(data)` sampled with replacement
        from `range(len(data))` using the global NumPy random state.
        '''
        data_length = len(data)
        self.indices_list = [
            np.random.choice(data_length, size=data_length, replace=True)
            for _ in range(self.num_bags)
        ]

    def fit(self, model_constructor, data, target):
        '''
        Fit model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.

        `data` and `target` must support NumPy integer-array indexing along the first axis.

        example:

        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)
        '''
        self.data = None
        self.target = None
        self._generate_splits(data)
        assert len(set(list(map(len, self.indices_list)))) == 1, 'All bags should be of the same length!'
        assert list(map(len, self.indices_list))[0] == len(data), 'All bags should contain `len(data)` number of elements!'
        self.models_list = []
        for indices_bag_i in self.indices_list:
            model = model_constructor()
            model.fit(data[indices_bag_i], target[indices_bag_i])
            # Store the estimator itself, not the return value of `fit`:
            # not every model's `fit` returns `self`.
            self.models_list.append(model)
        if self.oob:
            # The training set is needed later to evaluate out-of-bag predictions.
            self.data = data
            self.target = target

    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset

        Returns the mean over all fitted models of `model.predict(data)`.
        '''
        per_model_predictions = np.array(
            [model.predict(data) for model in self.models_list]
        )  # (n_models, n_data)
        return np.mean(per_model_predictions, axis=0)

    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase

        The result is stored in self.list_of_predictions_lists as a 1-D object array of
        Python lists; predictions inside every list follow the order of the bags.

        Raises:
            RuntimeError: if the training data was not stored (`oob=False` or `fit`
                has not been called).
            ValueError: if a model returns anything other than one value per object.
        '''
        if getattr(self, 'data', None) is None:
            raise RuntimeError(
                'OOB predictions require oob=True and a preceding call to fit().'
            )
        num_objects = len(self.data)
        predictions_per_object = [[] for _ in range(num_objects)]
        for bag_indices, model in zip(self.indices_list, self.models_list):
            # Mark the objects this model has never seen, then predict them in one call
            # instead of one `predict` call per (object, bag) pair.
            is_out_of_bag = np.ones(num_objects, dtype=bool)
            is_out_of_bag[bag_indices] = False
            oob_rows = np.flatnonzero(is_out_of_bag)
            if oob_rows.size == 0:
                continue
            oob_values = np.asarray(
                model.predict(self.data[oob_rows]), dtype=float
            ).reshape(oob_rows.size)
            for row, value in zip(oob_rows, oob_values):
                predictions_per_object[row].append(float(value))

        # Fill element by element: np.array(..., dtype=object) would silently build a
        # 2-D array whenever all inner lists happen to have the same length.
        as_object_array = np.empty(num_objects, dtype=object)
        for row, object_predictions in enumerate(predictions_per_object):
            as_object_array[row] = object_predictions
        self.list_of_predictions_lists = as_object_array

    def _get_averaged_oob_predictions(self):
        '''
        Compute average prediction for every object from training set.
        If object has been used in all bags on training phase, store NaN instead of prediction

        The result is stored in self.oob_predictions as a float array.
        '''
        self._get_oob_predictions_from_every_model()
        self.oob_predictions = np.array([
            np.mean(object_predictions) if len(object_predictions) != 0 else np.nan
            for object_predictions in self.list_of_predictions_lists
        ])

    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one prediction

        Objects without any out-of-bag prediction carry NaN and are skipped by `nanmean`.
        '''
        self._get_averaged_oob_predictions()
        # Flatten the target so a column vector of shape (n, 1) is not broadcast
        # against the (n,) predictions into an (n, n) matrix.
        residuals = np.ravel(self.target) - self.oob_predictions
        return np.nanmean(residuals ** 2)

In [5]:
# Scratch array with one finite value and one NaN, used to try out NaN filtering.
x = np.array([3.0, float("nan")])
x

array([ 3., nan])

In [21]:
# Keep only the entries of x that are not NaN.
x[np.logical_not(np.isnan(x))]

array([3.])

### Local tests:

#### Simple tests:

In [3]:
def count_nans(br):
    """Return how many entries of `br.oob_predictions` are NaN."""
    nan_mask = np.isnan(br.oob_predictions)
    return np.sum(nan_mask)

In [4]:
# Noise-free linear target: both the train error and the OOB error must be near zero.
for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = np.mean(X, axis=1)
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    train_mse = np.mean((predictions - y) ** 2)
    assert train_mse < 1e-6, "Linear dependency should be fitted with almost zero error!"
    assert bagging_regressor.oob, "OOB feature must be turned on"
    oob_score = bagging_regressor.OOB_score()
    print(count_nans(bagging_regressor))
    assert oob_score < 1e-6, "OOB error for linear dependency should be also close to zero!"
    # Average share of bags that did not contain an object, compared with 1/e.
    oob_counts = [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
    oob_share = np.mean(oob_counts) / bagging_regressor.num_bags
    assert (
        abs(oob_share - 1 / np.exp(1)) < 0.1
    ), "Probability of missing a bag should be close to theoretical value!"

print("Simple tests done!")

  0%|          | 0/100 [00:00<?, ?it/s]

24
17
13
17
20
22
24
19
21
26
25
16
22
18
24
22
15
16
15
18
28
20
18
23
24
21
23
18
17
20
18
24
22
20
24
18
18
18
24
24
21
20
30
24
28
21
22
21
18
17
20
20
25
25
28
17
17
23
19
24
22
23
24
21
22
16
18
29
19
25
19
19
23
23
19
19
26
26
20
16
21
18
22
21
27
27
19
28
19
16
21
20
15
24
21
20
31
22
21
20
Simple tests done!


#### Medium tests

In [22]:
# Pure-noise target with many features: the OOB error must exceed the train error.
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    squared_train_errors = (predictions - y) ** 2
    average_train_error = np.mean(squared_train_errors)
    assert bagging_regressor.oob, "OOB feature must be turned on"
    oob_score = bagging_regressor.OOB_score()
    print(count_nans(bagging_regressor))
    assert (
        oob_score > average_train_error
    ), "OOB error must be higher than train error due to overfitting!"
    # Average share of bags that did not contain an object, compared with 1/e.
    oob_counts = [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
    oob_share = np.mean(oob_counts) / bagging_regressor.num_bags
    assert (
        abs(oob_share - 1 / np.exp(1)) < 0.1
    ), "Probability of missing a bag should be close to theoretical value!"

print("Medium tests done!")

  0%|          | 0/10 [00:00<?, ?it/s]

0
0
0
0
0
0
0
0
0
0
Medium tests done!


#### Complex tests:

In [23]:
# Many bags on a larger dataset: the out-of-bag share must match 1/e within 1e-2.
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()
    oob_counts = [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
    oob_share = np.mean(oob_counts) / bagging_regressor.num_bags
    assert (
        abs(oob_share - 1 / np.exp(1)) < 1e-2
    ), "Probability of missing a bag should be close to theoretical value!"

print("Complex tests done!")

  0%|          | 0/10 [00:00<?, ?it/s]

Complex tests done!


In [24]:
# Deviation of the observed out-of-bag share (last fitted regressor) from 1/e.
np.mean(
    [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
) / bagging_regressor.num_bags - 1 / np.exp(1)

0.0008955588285576299

Great job! Please save `SimplifiedBaggingRegressor` to `bagging.py` and submit your solution to the grading system!