-
Notifications
You must be signed in to change notification settings - Fork 1
/
ks_test.py
74 lines (56 loc) · 2.26 KB
/
ks_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Copyright (c) 2022. RISC Software GmbH.
# All rights reserved.
from typing import Optional
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from tqdm import tqdm
from catabra.ood.base import FeaturewiseOODDetector
class KSTest(FeaturewiseOODDetector):
    """
    Two sample Kolmogorov-Smirnov test [1].

    Hypothesis test for the following question:
    "How likely is it that we would see two sets of samples like this if they were drawn from the same (but unknown)
    probability distribution?"

    The test is run separately for every numeric column that has more than two distinct values in the data the
    detector was fitted on. Each of those columns of the fitted data is compared against the column at the same
    position in the data to be scored.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
    """

    def __init__(self, subset=1, p_val=0.05, random_state: Optional[int] = None, verbose=True):
        """
        Initialization of KS Test
        :param subset: passed on unchanged to the base-class constructor
        :param p_val: significance level; a feature is flagged if the p-value of its test is at most this value
        :param random_state: seed stored on the detector; if None, a random integer in [0, 1000) is drawn
        :param verbose: passed on unchanged to the base-class constructor
        """
        super().__init__(subset=subset, verbose=verbose)
        self._p_val = p_val
        self._random_state = np.random.randint(1000) if random_state is None else random_state
        self._subset_indices = None
        # Names of the columns the test is applied to; set by `_fit_transformer`.
        self._num_cols: Optional[np.ndarray] = None
        # Reference sample every later batch is compared against; set by `_fit_transformed`.
        self._train_data: Optional[pd.DataFrame] = None

    @property
    def p_val(self):
        """Significance level used to turn p-values into binary decisions."""
        return self._p_val

    @property
    def random_state(self):
        """Seed given at construction, or the randomly drawn replacement."""
        return self._random_state

    @property
    def num_cols(self) -> Optional[np.ndarray]:
        """Names of the columns selected for testing, or None before fitting."""
        return self._num_cols

    def _fit_transformer(self, X: pd.DataFrame):
        """
        Select the columns the test is applied to.
        :param X: data to derive the column selection from
        """
        distinct_counts = X.nunique()
        # Columns with at most two distinct values are excluded from the test.
        low_cardinality = list(distinct_counts[distinct_counts <= 2].index)
        candidates = X.drop(columns=low_cardinality)
        self._num_cols = candidates.select_dtypes(include=np.number).columns.values

    def _transform(self, X: pd.DataFrame):
        """
        Restrict `X` to the columns selected by `_fit_transformer`.
        :param X: data containing at least the selected columns
        :return: view of `X` with only the selected columns, in the selected order
        """
        return X[self._num_cols]

    def _fit_transformed(self, X: pd.DataFrame, y: pd.Series):
        """
        Remember `X` as the reference sample.
        :param X: transformed training data
        :param y: unused
        """
        self._train_data = X

    def _predict_transformed(self, X):
        """
        Binary decision per column.
        :param X: transformed data to compare against the reference sample
        :return: Series with 1 where the p-value is at most `p_val`, else 0
        """
        # `_predict_proba_transformed` returns 1 - p, so invert it again before comparing.
        p_values = 1 - self._predict_proba_transformed(X)
        return (p_values <= self._p_val).astype(int)

    def _predict_proba_transformed(self, X):
        """
        Score per column, computed as one minus the p-value of the two-sample KS test.
        :param X: transformed data to compare against the reference sample; columns are matched to the reference
        sample by position
        :return: Series indexed by the reference sample's column names
        """
        reference = self._train_data
        p_values = []
        # The context manager closes the progress bar even if a test raises.
        # NOTE(review): the bar is shown regardless of `verbose`; how the base class exposes that flag is not
        # visible from this file -- confirm and pass `disable=` accordingly.
        with tqdm(total=X.shape[1]) as progress:
            for position in range(reference.shape[1]):
                test_result = ks_2samp(reference.iloc[:, position], X.iloc[:, position])
                p_values.append(test_result.pvalue)
                progress.update()
        return 1 - pd.Series(p_values, index=reference.columns, dtype=float)