# 11.10.2024

In [1]:
import scipy.stats as sps
from scipy.optimize import minimize, newton
import numpy as np
import math
from tqdm import tqdm
from collections.abc import Iterable, Callable
import os, sys
practice_dir = os.path.join(os.path.dirname(os.path.abspath('')), 'practice')
if not practice_dir in sys.path:
    sys.path.append(practice_dir)
from utils import generate_n_random_numbers

In [2]:
ANALYTICAL_EXPECTATION = 0
DISTROS = ["Uniform[-1, 1]", "Norm(0, 1)", "2xUniform[-1, 1]"]

In [3]:
from functools import lru_cache

@lru_cache
def norm_quantile(p: float, loc: float = 0, scale: float = 1) -> float:
    return sps.norm.ppf(p, loc=loc, scale=scale)

@lru_cache
def student_quantile(p: float, n: int) -> float:
    return sps.t.ppf(p, n)

@lru_cache
def chi_squared_quantile(p: float, n: int) -> float:
    return sps.chi2.ppf(p, n)

## Task 2

Реализуем методы для оценки погрешности, наследуемой результатами статистической обработки данных от неопределенности исходных обрабатываемых данных.

Раннее рассмотренные функции подсчёта доверительного интервала для матождиания.

In [4]:
def expectation_confidence_interval_1(sample: Iterable,
                                      confidence_probability: float,
                                      sigma_squared: float) -> tuple[float, float]:
    n = len(sample)
    term = norm_quantile((1 + confidence_probability) / 2) * math.sqrt(sigma_squared / n)
    x_mean = np.mean(sample)
    return (x_mean - term, x_mean + term)


def sigma_squared_estimate(sample: list) -> float:
    x_mean = np.mean(sample)
    return sum([pow(xi - x_mean, 2) for xi in sample]) / (len(sample) - 1)

def expectation_confidence_interval_2(sample: Iterable,
                                      confidence_probability: float) -> tuple[float, float]:
    n = len(sample)
    term = student_quantile((1 + confidence_probability) / 2, n - 1) * math.sqrt(sigma_squared_estimate(sample) / n) 
    x_mean = np.mean(sample)
    return (x_mean - term, x_mean + term)


@lru_cache
def calculate_D_coef(n: int, p: float) -> float:
    return math.sqrt(-(math.log((1 - p) / 2) / (2 * n))) - (1 / (6 * n))

def expectation_confidence_interval_3(sample: Iterable,
                                      confidence_probability: float,
                                      shift: int = 0) -> tuple[float, float]:
    n = len(sample)
    sample = sorted(sample)
    a, b = sample[shift], sample[-shift - 1]
    term = (b - a) * calculate_D_coef(n, confidence_probability)
    x_mean = np.mean(sample)
    return (x_mean - term, x_mean + term)

Раннее рассмотренная функция вычисления доверительного интервала для дисперсии

In [5]:
def sigma_squared_confidence_interval(sample: Iterable, confidence_probability: float) -> tuple[float, float]:
    deviation_estimate = sigma_squared_estimate(sample)
    n = len(sample)
    return (deviation_estimate * (n - 1) / chi_squared_quantile((1 + confidence_probability) / 2, n - 1), 
            deviation_estimate * (n - 1) / chi_squared_quantile((1 - confidence_probability) / 2, n - 1))

Метод Монте-Карло

In [6]:
def monte_carlo_error_estimate(sample: Iterable,
                               deltas: Iterable,
                               iterations: int,
                               func: Callable,
                               *args) -> float | np.ndarray[float]:
    y_0 = func(sample, *args)
    if len(np.array(y_0).shape) > 0:
        delta_y = [0.0] * len(y_0)
    else:
        delta_y = 0.0
    rs = [sps.uniform(loc=-delta, scale=2 * delta) for delta in deltas]
    for _ in tqdm(range(iterations)):
        new_sample = [x + r.rvs() for x, r in zip(sample, rs)]
        delta_y = np.maximum(delta_y, np.abs(np.subtract(func(new_sample, *args), y_0)))
    return delta_y

Метод линеаризации

In [7]:
def partial_derivatives(func: Callable, variables: Iterable, h: float, *params) -> float | np.ndarray[float]:
    partial_derivatives = []
    for i in range(len(variables)):
        vars_plus_h, vars_minus_h = np.copy(variables), np.copy(variables)
        vars_plus_h[i] += h
        vars_minus_h[i] -= h
        partial_derivatives.append(np.subtract(func(vars_plus_h, *params), func(vars_minus_h, *params)) / (2 * h))
    return partial_derivatives

def linearization_error_estimate(sample: Iterable,
                                 deltas: Iterable,
                                 func: Callable,
                                 *args) -> float | np.ndarray[float]:
    derivatives = partial_derivatives(func, sample, 1e-6, *args)
    A = np.matrix(np.abs(derivatives))
    if A.shape[0] == 1:
        return np.sum(np.multiply(derivatives, deltas))
    else:
        return np.array(A.T @ deltas)[0]

Метод Крейновича

In [8]:
def maximum_likelihood_parameters_estimation(sample: Iterable) -> tuple[float, float]:
    """Find MLE of Cauchy distribution parameters loc and scale."""
    def neglikelihood(params, data):
        return -sps.cauchy.logpdf(data, loc=params[0], scale=params[1]).sum()
    res = minimize(neglikelihood, [0, 0.00001], args=(sample))
    return tuple(res.x)

# without using the scipy.optimize.minimize method by solving the equation with the Newton method
def maximum_likelihood_parameters_estimation_2(delta_y: Iterable) -> float:
    def f(x: float, delta_y_squared: np.ndarray[float]) -> float:
        return np.divide(x * x, np.add(x * x, delta_y_squared)).sum() - len(delta_y_squared) / 2
        
    delta_y_squared = np.multiply(delta_y, delta_y)
    res = newton(f, 0.00001, args=(delta_y_squared,) )
    return res

def kreinovich_error_estimate(sample: Iterable,
                              deltas: Iterable,
                              N: int,
                              func: Callable,
                              *args) -> float | np.ndarray[float]:
    k = 10 ** -2
    y_0 = func(sample, *args)
    delta_y = np.zeros((N, len(y_0)) if len(np.array(y_0).shape) > 0 else (N,))
    rs = [sps.cauchy(loc=x_i, scale=k * delta_i) for x_i, delta_i in zip(sample, deltas)]
    
    for j in range(N):
        new_sample = [r.rvs() for r in rs]
        delta_y[j] = np.subtract(func(new_sample, *args), y_0)
    if len(np.array(y_0).shape) > 0:
        delta_y = np.array(np.matrix(delta_y).T)
    else:
        delta_y = [delta_y]

    d = [maximum_likelihood_parameters_estimation_2(delta_y_i) for delta_y_i in delta_y]
    return np.divide(d, k).astype(float) if len(d) > 1 else d[0] / k

Все методы были реализованы так, чтобы они работали для любой функции $f: \mathbb{R}^n \rightarrow \mathbb{R}^m$ с любым количеством дополнительных параметров (в нашем случае это функции $\mathbb{R}^n \rightarrow \mathbb{R}^2$, так как мы получаем две границы интервала по выборке размера $n$, при этом некоторым функциям вычисления границ нужно знать, например, дисперсию распределния выборки, которая не имеет погрешности).

In [9]:
MONTE_CARLO_ITERATIONS = 10 ** 4
KREINOVICH_ITERATIONS = 300
ERROR = 0.01
Q = 0.95
n = 100
sigma_squared = {"Uniform[-1, 1]": 1 / 3,
                 "Norm(0, 1)": 1,
                 "2xUniform[-1, 1]": 2 / 3}

In [10]:
for d in DISTROS:
    print(f"{d}:")
    sample = generate_n_random_numbers(n, d)
    deltas = np.abs(np.multiply(sample, ERROR))
    print("Monte-Carlo:")
    error_1 = monte_carlo_error_estimate(sample, deltas, MONTE_CARLO_ITERATIONS, expectation_confidence_interval_1, Q, sigma_squared[d])
    error_2 = monte_carlo_error_estimate(sample, deltas, MONTE_CARLO_ITERATIONS, expectation_confidence_interval_2, Q)
    error_3 = monte_carlo_error_estimate(sample, deltas, MONTE_CARLO_ITERATIONS, expectation_confidence_interval_3, Q, 0)
    error_4 = monte_carlo_error_estimate(sample, deltas, MONTE_CARLO_ITERATIONS, sigma_squared_confidence_interval, Q)
    print(error_1, error_2, error_3, error_4)

    print("Linearization:")
    error_1 = linearization_error_estimate(sample, deltas, expectation_confidence_interval_1, Q, sigma_squared[d])
    error_2 = linearization_error_estimate(sample, deltas, expectation_confidence_interval_2, Q)
    error_3 = linearization_error_estimate(sample, deltas, expectation_confidence_interval_3, Q, 0)
    error_4 = linearization_error_estimate(sample, deltas, sigma_squared_confidence_interval, Q)
    print(error_1, error_2, error_3, error_4)

    print("Kreinovich:")
    error_1 = kreinovich_error_estimate(sample, deltas, KREINOVICH_ITERATIONS, expectation_confidence_interval_1, Q, sigma_squared[d])
    error_2 = kreinovich_error_estimate(sample, deltas, KREINOVICH_ITERATIONS, expectation_confidence_interval_2, Q)
    error_3 = kreinovich_error_estimate(sample, deltas, KREINOVICH_ITERATIONS, expectation_confidence_interval_3, Q, 0)
    error_4 = kreinovich_error_estimate(sample, deltas, KREINOVICH_ITERATIONS, sigma_squared_confidence_interval, Q)
    print(error_1, error_2, error_3, error_4)

    print('\n')

Uniform[-1, 1]:
Monte-Carlo:


100%|██████████| 10000/10000 [00:28<00:00, 349.42it/s]
100%|██████████| 10000/10000 [00:33<00:00, 298.47it/s]
100%|██████████| 10000/10000 [00:28<00:00, 348.95it/s]
100%|██████████| 10000/10000 [00:28<00:00, 354.74it/s]


[0.00132566 0.00132566] [0.00125077 0.00130159] [0.00323788 0.00330852] [0.00179024 0.0031339 ]
Linearization:
[0.00503728 0.00503728] [0.00506795 0.0050066 ] [0.00750805 0.00750707] [0.0052802  0.00924323]
Kreinovich:
[0.00486151 0.00486151] [0.00507097 0.00498534] [0.00760246 0.00724767] [0.0053538  0.00937208]


Norm(0, 1):
Monte-Carlo:


100%|██████████| 10000/10000 [00:27<00:00, 365.23it/s]
100%|██████████| 10000/10000 [00:28<00:00, 354.76it/s]
100%|██████████| 10000/10000 [00:27<00:00, 361.61it/s]
100%|██████████| 10000/10000 [00:27<00:00, 368.25it/s]


[0.00191207 0.00191207] [0.00206041 0.00176784] [0.00637968 0.00605098] [0.00424878 0.00743769]
Linearization:
[0.00660383 0.00660383] [0.00689657 0.0063111 ] [0.01172147 0.01160905] [0.01099735 0.01925137]
Kreinovich:
[0.00666844 0.00666844] [0.00686994 0.00626807] [0.01390438 0.014493  ] [0.01092274 0.01912076]


2xUniform[-1, 1]:
Monte-Carlo:


100%|██████████| 10000/10000 [00:27<00:00, 367.87it/s]
100%|██████████| 10000/10000 [00:27<00:00, 367.85it/s]
100%|██████████| 10000/10000 [00:30<00:00, 332.54it/s]
100%|██████████| 10000/10000 [00:35<00:00, 278.51it/s]


[0.00225075 0.00225075] [0.00234944 0.00244394] [0.00590617 0.00574992] [0.00407164 0.0071276 ]
Linearization:
[0.00732272 0.00732272] [0.00716269 0.00748274] [0.01191343 0.01191777] [0.01199601 0.02099957]
Kreinovich:
[0.00763952 0.00763952] [0.00660919 0.00700813] [0.01148424 0.01120536] [0.01194381 0.0209082 ]


