# Задание 1

Процедура генерация датасета:
Сгенерируйте датасет с 10000 наблюдений и 1000 колонок (сэмплируйте из разных распеределений) и сформируйте из него таргет на сонове 100 колонок + зашумление (общее или небольшое для каждой колонки - постарайтесь сделать так чтобы шум не сильно влиял на корреляции между предикторами и таргетам). Удостоверьтесь, что в датасете существуют колонки, которые не использовались для таргета, но при этом имеют высокую корреляцию с теми, что использовались (покажите это в коде).

Реализуйте forward stage wise регрессию стандартным образом и с помощью QR разложения наиболее быстрым образом (засекайте время для всех опробованных вариантов). Замерьте качество и процент колонок, которые были правильно найдены.

**Дополнительно**: 
Попробуйте генерировать данные таким образом, чтобы ошибка постепенно ухудшалась. Подсказка: увеличивайте шум, используйте нелинейные функции и комбинации предикторов. Попробуйте оценить bias и variance для forward stage-wise regression.

In [158]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression

In [128]:
PREDICTOR_CNT = 1_000
TARGET_PREDICTOR_CNT = 100
SAMPLE_CNT = 10_000

In [129]:
class DistributionGenerator:
    @staticmethod
    def generate_random_exponential(n: int = 10000, scale: float | None = None):
        """
        Generate random exponential distribution
        :param n: number of samples
        :param scale: scale parameter of the exponential distribution
        :return: np.array of random samples
        """

        if scale is None:
            scale = np.random.uniform(0.75, 2.0)
        return np.random.exponential(scale=scale, size=n)

    @staticmethod
    def generate_random_uniform(n: int = 10000, low: float | None = None, length: float | None = None):
        """
        Generate random uniform distribution
        :param n: number of samples
        :param low: lower bound of the uniform distribution
        :param length: length of the uniform distribution
        :return: np.array of random samples
        """

        # Kekw: using uniform to generate limits to uniform
        if low is None:
            low = np.random.uniform(-2.0, -1.0)
        if length is None:
            length = np.random.uniform(1.0, 3.0)
        return np.random.uniform(low=low, high=low + length, size=n)

    @staticmethod
    def generate_random_normal(n: int = 10000, loc: float | None = None, scale: float | None = None):
        """
        Generate random normal distribution
        :param n: number of samples
        :param loc: mean of the normal distribution
        :param scale: standard deviation of the normal distribution
        :return: np.array of random samples
        """

        if loc is None:
            loc = np.random.uniform(-1.0, 1.0)
        if scale is None:
            scale = np.random.uniform(0.2, 2.0)
        return np.random.normal(loc=loc, scale=scale, size=n)

    distribution_fn_list = [
        # generate_random_exponential,
        # generate_random_uniform,
        generate_random_normal,
    ]
    @staticmethod
    def generate_random_distribution(n: int = 10000):
        """
        Generate random distribution
        :param n: number of samples
        :return: np.array of random samples
        """

        # Choose a random distribution function
        distribution_fn = np.random.choice(DistributionGenerator.distribution_fn_list)
        return distribution_fn(n)


    @staticmethod
    def generate_random_noice(n: int = 10000, limit: float = 0.05):
        """
        Generate random noice based on uniform distribution
        :param n: number of samples
        :param limit: limit of the uniform distribution
        :return: np.array of random noice
        """
        return DistributionGenerator.generate_random_uniform(n, -limit, limit * 2)

In [None]:
def create_predictors_targets_table(noise_level: float = 0.05):
    """
    Create a table with predictors and target
    :param noise_level: noise level of the target
    :return: pandas data frame with predictors and target, selected predictors and their coefficients (in tuple form)
    """

    # Generate random data
    data = {
        f"predictor_{i}": DistributionGenerator.generate_random_distribution(SAMPLE_CNT)
        for i in range(PREDICTOR_CNT)
    }

    # Choose random predictors for the target and generate coefficients for them
    all_predictors = list(data.keys())
    selected_predictors = np.random.choice(a=all_predictors, size=TARGET_PREDICTOR_CNT, replace=False)
    predictors_koefs = DistributionGenerator.generate_random_uniform(TARGET_PREDICTOR_CNT, 1.0, 2.0)

    # Generate target based on the selected predictors
    target_predictors_matrix = np.array([
        data[predictor] for predictor in selected_predictors
    ])
    target = predictors_koefs @ target_predictors_matrix

    # Add noice to the target
    noice = DistributionGenerator.generate_random_noice(SAMPLE_CNT, limit=noise_level)
    data["target"] = target + noice

    # Create a pandas data frame
    data_frame = pd.DataFrame(data)
    return data_frame, selected_predictors, predictors_koefs

In [None]:
data_frame, selected_predictors_true, predictors_koefs_true = create_predictors_targets_table()

# Separate predictors and target
X = data_frame.loc[:, data_frame.columns != 'target'].to_numpy()
y = data_frame["target"].to_numpy()

data_frame.head()

Unnamed: 0,predictor_0,predictor_1,predictor_2,predictor_3,predictor_4,predictor_5,predictor_6,predictor_7,predictor_8,predictor_9,...,predictor_991,predictor_992,predictor_993,predictor_994,predictor_995,predictor_996,predictor_997,predictor_998,predictor_999,target
0,-0.002536,-0.28348,4.132056,-2.261734,0.314676,0.328311,-0.531619,-2.600318,-0.533277,-0.683464,...,0.139523,-0.933903,0.774611,0.034123,1.664838,2.84983,0.484816,-0.593783,-0.664478,-4.581401
1,-1.760988,-0.050266,-0.456003,2.051371,-0.085769,1.285934,-0.891529,-0.027588,-1.808646,-0.556348,...,-1.672874,-1.365366,-0.20399,-0.306436,0.510762,1.227865,-1.157598,0.044527,-0.356635,-35.077965
2,0.191722,-2.368395,0.455368,-0.321044,0.136216,0.631543,-0.734875,0.072611,-0.390101,-0.319276,...,-3.166634,-0.985328,-0.219437,-1.204799,0.477092,1.518172,-0.011196,-0.04493,-1.041703,14.53655
3,-0.71863,0.225351,0.861022,1.313876,0.856478,1.253883,0.289715,-2.198547,0.792668,-0.459761,...,-1.027207,-1.04007,-0.493549,0.408184,1.279196,3.148266,-0.42764,-1.328281,-0.39553,-8.19643
4,0.483937,0.244965,3.575081,2.517484,0.32769,1.010386,-0.072754,2.545971,0.133293,-0.369577,...,-1.061963,-1.003118,0.393518,-1.278046,2.244314,-2.31341,2.890069,-0.318256,-0.820135,-31.811734


In [132]:
def show_predictors_targets_table_correlation_info(data_frame: pd.DataFrame, selected_predictors_true: np.ndarray):
    """
    Show correlation information between predictors and target
    :param data_frame: pandas data frame with predictors and target
    :param selected_predictors_true: true selected predictors
    """

    # Calculate correlation table for all the predictors and target
    correlation_table = data_frame.corr()

    # Calculate correlation information for the selected predictors for the target
    selected_correlations = correlation_table["target"][selected_predictors_true].to_numpy()
    min_selected_correlations = selected_correlations.min()

    # Calculate correlation information for the non-selected predictors for the target
    non_selected_correlations = correlation_table["target"].drop(labels=selected_predictors_true).drop(labels="target").to_numpy()
    max_non_selected_correlations = non_selected_correlations.max()

    # Calculate correlation information for the non-selected predictors for the selected predictors
    predictors_correlation_table = correlation_table.drop(columns="target").drop(labels="target")
    numpy_correlation_table = predictors_correlation_table[selected_predictors_true].drop(labels=selected_predictors_true).to_numpy()
    np.fill_diagonal(numpy_correlation_table, 0.0)
    five_most_correlated = sorted(numpy_correlation_table.flatten(), reverse=True)[:5 * 2]

    print("Minimum correlation of selected predictors - target pairs is", min_selected_correlations)
    print("Maximum correlation of non-selected predictors - target pairs is", max_non_selected_correlations)
    print("At least five pairs of non-selected predictors - selected predictors have correlation not less than", five_most_correlated[-1])

show_predictors_targets_table_correlation_info(data_frame, selected_predictors_true)

Minimum correlation of selected predictors - target pairs is 0.011405842900809062
Maximum correlation of non-selected predictors - target pairs is 0.03311436720677513
At least five pairs of non-selected predictors - selected predictors have correlation not less than 0.03752976273208008


In [133]:
def QR_decomposition(X: np.ndarray): # ~3.4s on test (1001, 1003)
    """
    Perform QR decomposition of the matrix X (self-written)
    :param X: matrix to decompose
    :return: Q and R matrices
    """

    Q = np.zeros_like(X)
    R = np.zeros((X.shape[1], X.shape[1]))

    for i in range(X.shape[1]):
        Q[:, i] = X[:, i]
        for j in range(i):
            R[j, i] = Q[:, j] @ Q[:, i]
            Q[:, i] -= R[j, i] * Q[:, j]
        R[i, i] = np.linalg.norm(Q[:, i])
        Q[:, i] /= R[i, i]

    return Q, R

def QR_decomposition_fast(X: np.ndarray): # ~0.2s on test (1001, 1003)
    """
    Perform QR decomposition of the matrix X (numpy)
    :param X: matrix to decompose
    :return: Q and R matrices
    """

    return np.linalg.qr(X)

def test_QR_decomposition():
    """
    Test QR decomposition
    """

    A = np.random.rand(1001, 1003)
    Q, R = QR_decomposition(A)
    D = Q @ R - A
    if max(-D.min(), D.max()) < 1e-6:
        print("QR decomposition is correct")
    else:
        print("QR decomposition is incorrect")

test_QR_decomposition()

QR decomposition is correct


In [None]:
class IncrementalForwardStagewiseRegression:
    def __init__(self, max_iter: int = 1_500_000, tol: float = 1e-3, step: float = 1e-2, step_decay: float = 0.8):
        """
        Initialize the class
        :param max_iter: maximum number of iterations
        :param tol: tolerance for the correlation
        :param step: step size
        :param step_decay: step decay
        Default values are specially selected for fit v3
        For fit v1 and v2 use max_iter: int = 15_000, tol: float = 1e-3, step: float = 1e-3
        """

        self.max_iter = max_iter
        self.tol = tol
        self.step = step
        self.step_decay = step_decay

        self.residual = None
        self.beta = None

        self.X_mean = None
        self.X_std = None

        self.y_mean = None
        self.y_std = None

    def _fit_normalization(self, X: np.ndarray, y: np.ndarray):
        """
        Normalize data
        :param X: matrix of predictors
        :param y: target vector
        """

        self.X_mean = X.mean(axis=0)
        self.X_std = X.std(axis=0)
        self.y_mean = y.mean()
        self.y_std = y.std()

        X = (X - self.X_mean) / self.X_std
        y = (y - self.y_mean) / self.y_std

        return X, y
    
    def _normileze_X(self, X: np.ndarray):
        return (X - self.X_mean) / self.X_std
    
    def _unnormalize_y(self, y: np.ndarray):
        return y * self.y_std + self.y_mean


    def _fit_v1(self, X: np.ndarray, y: np.ndarray):
        """
        Perform incremental forward stagewise regression
        :param X: matrix of predictors
        :param y: target vector
        """

        # Normalize data
        X, y = self._fit_normalization(X, y)

        # Initialize variables
        residual = y.copy()
        self.beta = np.zeros(X.shape[1])

        for _ in range(self.max_iter):
            # Calculate correlations (slow)
            correlations = X.T @ residual
            correlations_magnitude = np.abs(correlations)

            # Find the best predictor (the one with the highest correlation)
            best_predictor = np.argmax(correlations_magnitude)
            best_correlation = correlations_magnitude[best_predictor]

            # Check if the best correlation is less than the tolerance. So the beta is predicting the target well enough
            if best_correlation < self.tol:
                print("Converged at iteration", _)
                break

            # Update beta and residual
            self.beta[best_predictor] += self.step * np.sign(correlations[best_predictor])
            residual -= self.step * np.sign(correlations[best_predictor]) * X[:, best_predictor]
        else:
            print("Warning: maximum number of iterations reached. Limit was", self.max_iter)

    def _fit_v2(self, X: np.ndarray, y: np.ndarray):
        X, y = self._fit_normalization(X, y)

        residual = y.copy()
        self.beta = np.zeros(X.shape[1])

        correlations = X.T @ residual

        for _ in range(self.max_iter):
            correlations_magnitude = np.abs(correlations)

            best_predictor = np.argmax(correlations_magnitude)
            best_correlation = correlations_magnitude[best_predictor]

            if best_correlation < self.tol:
                print("Converged at iteration", _)
                break

            # Update beta and correlations
            self.beta[best_predictor] += self.step * np.sign(correlations[best_predictor])
            correlations -= self.step * np.sign(correlations[best_predictor]) * (X.T @ X[:, best_predictor])
        else:
            print("Warning: maximum number of iterations reached. Limit was", self.max_iter)

    def _fit_v3(
            self,
            X: np.ndarray,
            y: np.ndarray,
        ):
        """
        Perform incremental forward stagewise regression
        (the fastest method without multiplication of the whole matrix)
        :param X: matrix of predictors
        :param y: target vector
        :param max_iter: maximum number of iterations
        :param tol: tolerance for the correlation
        :param step: step size
        :param step_decay: step decay
        """

        X, y = self._fit_normalization(X, y)

        step = self.step
        self.beta = np.zeros(X.shape[1])

        correlations = X.T @ y

        XTX = X.T @ X

        for _ in range(self.max_iter):
            correlations_magnitude = np.abs(correlations)

            best_predictor = np.argmax(correlations_magnitude)
            best_correlation = correlations_magnitude[best_predictor]

            if best_correlation < self.tol:
                print("Converged at iteration", _)
                break

            # Update beta and correlations
            self.beta[best_predictor] += step * np.sign(correlations[best_predictor])
            correlations -= step * np.sign(correlations[best_predictor]) * XTX[:, best_predictor]

            if _ != 0 and _ % 10_000 == 0:
                step *= self.step_decay
        else:
            print("Warning: maximum number of iterations reached. Limit was", self.max_iter)
    
    def fit(self, X: np.ndarray, y: np.ndarray):
        """
        Fit the model
        :param X: matrix of predictors
        :param y: target vector
        """

        # self._fit_v1(X, y) # ~19.7s on 15_000 iterations
        # self._fit_v2(X, y) # ~19.3s on 15_000 iterations
        self._fit_v3(X, y) # ~3.4s on 500_000 iterations (No matrix multiplication in the loop)

    def predict(self, X: np.ndarray):
        """
        Predict the target
        :param X: matrix of predictors
        :return: predicted target
        """

        return self._unnormalize_y(self._normileze_X(X) @ self.beta)
    
    def compare_coefficients_with_true(self, selected_predictors_true: np.ndarray):
        """
        Compare coefficients with the true ones
        :param selected_predictors_true: true selected predictors
        """

        # Find the coefficients that are not zero
        coeffs_found = (self.beta > self.tol).sum()
        print(f"Found {coeffs_found} coefficients out of {len(selected_predictors_true)}")

        matched_coeffs = 0

        for predictor in selected_predictors_true:
            predictor_id = int(predictor.split("_")[-1])
            
            # There should be a coefficient comparator (self.beta vs predictors_koefs_true),
            # but beta was trained on normalized data and predictors_koefs_true used on non-normalized data
            # So, we can't compare them =(
            if self.beta[predictor_id] > self.tol:
                matched_coeffs += 1

        print(f"Matched {matched_coeffs} coefficients out of {len(selected_predictors_true)}")

In [135]:
model = IncrementalForwardStagewiseRegression()
model.fit(X, y)
model.compare_coefficients_with_true(selected_predictors_true)

Converged at iteration 490243
Found 100 coefficients out of 100
Matched 100 coefficients out of 100


In [164]:
def test_on_real_data(model):
    """
    Test the model on real data
    :param model: model to test
    """

    data_california = fetch_california_housing()
    X_train, X_test, y_train, y_test = train_test_split(data_california.data, data_california.target, test_size=0.3, random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"MSE: {mse} | MAE: {mae}")

test_on_real_data(IncrementalForwardStagewiseRegression(max_iter=50_000))

MSE: 0.5299552900259948 | MAE: 0.52743384496775


In [165]:
test_on_real_data(LinearRegression())

MSE: 0.5305677824766757 | MAE: 0.5272474538306168
