In [1]:
import scipy.stats as sps
import numpy as np
import math
from tqdm import tqdm
from collections.abc import Iterable, Callable
from utils import generate_n_random_numbers

## Task 1 

Исследовать достаточной размера выборки для квантили

In [2]:
ANALYTICAL_EXPECTATION = 0
DISTROS = ["Uniform[-1, 1]", "Norm(0, 1)", "2xUniform[-1, 1]"]

In [3]:
class UniformSum:
    def __init__(self, loc=0, scale=1) -> None:
        self.r = sps.uniform(loc=loc, scale=scale)

    def rvs(self, size: int) -> np.ndarray:
        return np.add(self.r.rvs(size=size), self.r.rvs(size=size))

In [4]:
GENERATOR = {"Uniform[-1, 1]": sps.uniform(loc=-1, scale=2),
             "Norm(0, 1)": sps.norm(loc=0, scale=1),
             "2xUniform[-1, 1]": UniformSum(loc=-1, scale=2)}

In [5]:
from functools import lru_cache

@lru_cache
def norm_quantile(p, loc=0, scale=1):
    return sps.norm.ppf(p, loc=loc, scale=scale)

@lru_cache
def student_quantile(p, n):
    return sps.t.ppf(p, n)

Функция вычисления минимального достаточного размера выборки для вычисления доверительного интервала для $p\cdot100$%-ой квантили с доверительной вероятностью $Q$.

In [6]:
def sufficient_n_for_quantile(p: float, Q: float) -> int:
    def lower_bound_on_sqrt_n(p, Q):
        z = norm_quantile((Q + 1) / 2)
        return (math.sqrt(1-p) * z + math.sqrt((1-p) * z * z + 4)) / (2 * math.sqrt(p))
    return math.ceil(pow(max(lower_bound_on_sqrt_n(p, Q), lower_bound_on_sqrt_n(1 - p, Q)), 2))

Функция вычисления доверительного интервала для квантили с 4-ой практики (приближённая по локальной теореме Муавра-Лапласа)

In [7]:
def quantile_confidence_interval_approx(sample: list, confidence_probability: float, p=0.5) -> tuple:
    sample = sorted(sample)
    n = len(sample)
    term = math.sqrt(n * p * (1 - p)) * norm_quantile((confidence_probability + 1) / 2)
    m_1, m_2 = int(n * p - term), int(n * p + term)
    return (sample[m_1] if m_1 >= 0 else -np.inf, sample[m_2] if m_2 <= n - 1 else np.inf)

In [None]:
QS = [0.99, 0.95, 0.9]
QUANTILES = [0.01, 0.95]
for p in QUANTILES:
    for Q in QS:
        n = sufficient_n_for_quantile(p, Q)
        print(f"Sufficient n for {int(p*100)}%-quantile with Q={Q} is {n}")
        for d in DISTROS:
            sample = GENERATOR[d].rvs(size=n)
            a, b = quantile_confidence_interval_approx(sample, Q, p)
            x_1, x_n = min(sample), max(sample)
            if a <= x_1 <= b:
                print(f"ERROR. x_(1) = {x_1:.3f} is in confidence interval [{a:.3f}; {b:.3f}] for ditribution {d}")
            elif a <= x_n <= b:
                print(f"ERROR. x_(n) = {x_n:.3f} is in confidence interval [{a:.3f}; {b:.3f}] for ditribution {d}")
            else:
                print(f"OK! x_(1) = {x_1:.3f} and x_(n) = {x_n:.3f} are NOT in confidence interval [{a:.3f}; {b:.3f}] for ditribution {d}")

Sufficient n for 1%-quantile with Q=0.99 is 846
OK! x_(1) = -0.997 and x_(n) = 1.000 are NOT in confidence interval [-0.991; -0.952] for ditribution Uniform[-1, 1]
OK! x_(1) = -2.775 and x_(n) = 3.402 are NOT in confidence interval [-2.672; -2.034] for ditribution Norm(0, 1)
OK! x_(1) = -1.889 and x_(n) = 1.953 are NOT in confidence interval [-1.828; -1.577] for ditribution 2xUniform[-1, 1]
Sufficient n for 1%-quantile with Q=0.95 is 563
OK! x_(1) = -1.000 and x_(n) = 0.998 are NOT in confidence interval [-0.999; -0.981] for ditribution Uniform[-1, 1]
OK! x_(1) = -3.242 and x_(n) = 2.723 are NOT in confidence interval [-2.845; -2.105] for ditribution Norm(0, 1)
OK! x_(1) = -1.872 and x_(n) = 1.922 are NOT in confidence interval [-1.849; -1.489] for ditribution 2xUniform[-1, 1]
Sufficient n for 1%-quantile with Q=0.9 is 446
OK! x_(1) = -0.998 and x_(n) = 0.995 are NOT in confidence interval [-0.995; -0.965] for ditribution Uniform[-1, 1]
OK! x_(1) = -2.842 and x_(n) = 3.534 are NOT in c

## Task 2

Реализуем методы для оценки погрешности, наследуемой результатами статистической обработки данных от неопределенности исходных обрабатываемых данных.

Раннее рассмотренные функции подсчёта доверительного интервала для матождиания.

In [9]:
def expectation_confidence_interval_1(sample: Iterable,
                                      confidence_probability: float,
                                      sigma_squared: float) -> tuple[float, float]:
    n = len(sample)
    term = norm_quantile((1 + confidence_probability) / 2) * math.sqrt(sigma_squared / n)
    x_mean = np.mean(sample)
    return (x_mean - term, x_mean + term)


def estimate_sigma(sample: Iterable) -> float:
    x_mean = np.mean(sample)
    return math.sqrt(sum([pow(xi - x_mean, 2) for xi in sample]) / (len(sample) - 1))

def expectation_confidence_interval_2(sample: Iterable,
                                      confidence_probability: float) -> tuple[float, float]:
    n = len(sample)
    term = student_quantile((1 + confidence_probability) / 2, n - 1) * estimate_sigma(sample) / math.sqrt(n) 
    x_mean = np.mean(sample)
    return (x_mean - term, x_mean + term)


@lru_cache
def calculate_D_coef(n: int, p: float) -> float:
    return math.sqrt(-(math.log((1 - p) / 2) / (2 * n))) - (1 / (6 * n))

def expectation_confidence_interval_3(sample: Iterable,
                                      confidence_probability: float,
                                      shift: int = 0) -> tuple[float, float]:
    n = len(sample)
    sample = sorted(sample)
    a, b = sample[shift], sample[-shift - 1]
    term = (b - a) * calculate_D_coef(n, confidence_probability)
    x_mean = np.mean(sample)
    return (x_mean - term, x_mean + term)

Метод Монте-Карло

In [10]:
def monte_carlo_error_estimate(sample: Iterable,
                               deltas: Iterable,
                               iterations: int,
                               func: Callable, *args) -> float | np.ndarray[float]:
    y_0 = func(sample, *args)
    if len(np.array(y_0).shape) > 0:
        delta_y = [0.0] * len(y_0)
    else:
        delta_y = 0.0
    rs = [sps.uniform(loc=-delta, scale=2 * delta) for delta in deltas]
    for _ in tqdm(range(iterations)):
        new_sample = [x + r.rvs() for x, r in zip(sample, rs)]
        delta_y = np.maximum(delta_y, np.abs(np.subtract(func(new_sample, *args), y_0)))
    return delta_y

Метод линеаризации

In [11]:
def partial_derivatives(func: Callable, variables: Iterable, h: float, *params) -> float | np.ndarray[float]:
    partial_derivatives = []
    for i in range(len(variables)):
        vars_plus_h, vars_minus_h = np.copy(variables), np.copy(variables)
        vars_plus_h[i] += h
        vars_minus_h[i] -= h
        partial_derivatives.append(np.subtract(func(vars_plus_h, *params), func(vars_minus_h, *params)) / (2 * h))
    return partial_derivatives

In [12]:
def linearization_error_estimate(sample: Iterable,
                                 deltas: Iterable,
                                 func: Callable, *args) -> float | np.ndarray[float]:
    derivatives = partial_derivatives(func, sample, 1e-6, *args)
    A = np.matrix(np.abs(derivatives))
    if A.shape[0] == 1:
        return np.sum(np.multiply(derivatives, deltas))
    else:
        return np.array(A.T @ deltas)[0]

Оба метода были реализованы так, чтобы работать для любой функции $f: \mathbb{R}^n \rightarrow \mathbb{R}^m$ с любым количеством дополнительных параметров (в нашем случае это функции $\mathbb{R}^n \rightarrow \mathbb{R}^2$, так как мы получаем две границы интервала по выборке размера $n$, при этом некоторым функциям вычисления границ нужно знать, например, дисперсию распределния выборки, которая не имеет погрешности).

In [13]:
MONTE_CARLO_ITERATIONS = 10 ** 4
ERROR = 0.01
Q = 0.95
n = 100
sigma_squared = {"Uniform[-1, 1]": 1 / 3,
                 "Norm(0, 1)": 1,
                 "2xUniform[-1, 1]": 2 / 3}

In [14]:
for d in DISTROS:
    print(f"{d}:")
    sample = generate_n_random_numbers(100, d)
    deltas = np.abs(np.multiply(sample, ERROR))
    print("Monte-Carlo:")
    error_1 = monte_carlo_error_estimate(sample, deltas, MONTE_CARLO_ITERATIONS, expectation_confidence_interval_1, Q, sigma_squared[d])
    error_2 = monte_carlo_error_estimate(sample, deltas, MONTE_CARLO_ITERATIONS, expectation_confidence_interval_2, Q)
    error_3 = monte_carlo_error_estimate(sample, deltas, MONTE_CARLO_ITERATIONS, expectation_confidence_interval_3, Q, 0)
    print(error_1, error_2, error_3)

    print("Linearization:")
    error_1 = linearization_error_estimate(sample, deltas, expectation_confidence_interval_1, Q, sigma_squared[d])
    error_2 = linearization_error_estimate(sample, deltas, expectation_confidence_interval_2, Q)
    error_3 = linearization_error_estimate(sample, deltas, expectation_confidence_interval_3, Q, 0)
    print(error_1, error_2, error_3)

    print('\n')

Uniform[-1, 1]:
Monte-Carlo:


100%|██████████| 10000/10000 [00:27<00:00, 366.13it/s]
100%|██████████| 10000/10000 [00:27<00:00, 368.75it/s]
100%|██████████| 10000/10000 [00:26<00:00, 378.52it/s]


[0.00116094 0.00116094] [0.00134225 0.00133616] [0.00356786 0.00309058]
Linearization:
[0.00483925 0.00483925] [0.0048758 0.0048027] [0.00722035 0.00720896]


Norm(0, 1):
Monte-Carlo:


100%|██████████| 10000/10000 [00:25<00:00, 387.48it/s]
100%|██████████| 10000/10000 [00:27<00:00, 369.86it/s]
100%|██████████| 10000/10000 [00:26<00:00, 381.74it/s]


[0.00222193 0.00222193] [0.00217457 0.00237288] [0.00767585 0.00739322]
Linearization:
[0.00792337 0.00792337] [0.00786326 0.00798348] [0.01399866 0.0141309 ]


2xUniform[-1, 1]:
Monte-Carlo:


100%|██████████| 10000/10000 [00:25<00:00, 391.71it/s]
100%|██████████| 10000/10000 [00:27<00:00, 369.30it/s]
100%|██████████| 10000/10000 [00:26<00:00, 375.27it/s]

[0.0014785 0.0014785] [0.00156029 0.00188027] [0.00567697 0.00575179]
Linearization:
[0.00580088 0.00580088] [0.00566803 0.00593374] [0.01032603 0.01033735]





