
Merge remote-tracking branch 'origin/master'
sronilsson committed Mar 27, 2024
2 parents 27fa9a5 + 9b0640b commit 500a031
Showing 2 changed files with 177 additions and 30 deletions.
14 changes: 12 additions & 2 deletions simba/SimBA.py
@@ -533,8 +533,18 @@ def __init__(self, config_path: str):
config_path=self.config_path
),
)
btn_agg_boolean_conditional_statistics = Button(processmovementdupLabel, text="AGGREGATE BOOLEAN CONDITIONAL STATISTICS", fg="grey", command=lambda: BooleanConditionalSlicerPopUp(config_path=self.config_path))
spontaneous_alternation_pop_up_btn = Button(processmovementdupLabel, text="SPONTANEOUS ALTERNATION", fg="navy", command=lambda: SpontaneousAlternationPopUp(config_path=self.config_path))
btn_agg_boolean_conditional_statistics = Button(
processmovementdupLabel,
text="AGGREGATE BOOLEAN CONDITIONAL STATISTICS",
fg="grey",
command=lambda: BooleanConditionalSlicerPopUp(config_path=self.config_path),
)
spontaneous_alternation_pop_up_btn = Button(
processmovementdupLabel,
text="SPONTANEOUS ALTERNATION",
fg="navy",
command=lambda: SpontaneousAlternationPopUp(config_path=self.config_path),
)

# organize
processmovementdupLabel.grid(row=0, column=3, sticky=NW)
193 changes: 165 additions & 28 deletions simba/mixins/statistics_mixin.py
@@ -50,7 +50,12 @@ def __init__(self):

@staticmethod
@jit(nopython=True)
def _hist_1d(data: np.ndarray, bin_count: int, range: np.ndarray, normalize: Optional[bool] = False) -> np.ndarray:
def _hist_1d(
data: np.ndarray,
bin_count: int,
range: np.ndarray,
normalize: Optional[bool] = False,
) -> np.ndarray:
"""
Jitted helper to compute 1D histograms with counts or ratios (if ``normalize`` is True).
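
For orientation, a minimal NumPy sketch of what this helper computes, assuming equal-width bins over the supplied range (``hist_1d_sketch`` is a hypothetical name; the jitted original presumably avoids ``np.histogram`` for Numba compatibility):

import numpy as np

def hist_1d_sketch(data, bin_count, hist_range, normalize=False):
    # Equal-width bins across [hist_range[0], hist_range[1]].
    counts, _ = np.histogram(data, bins=bin_count, range=(hist_range[0], hist_range[1]))
    if normalize:
        return counts / max(counts.sum(), 1)  # bin ratios instead of raw counts
    return counts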
@@ -435,10 +440,26 @@ def kullback_leibler_divergence(
:parameter Literal bucket_method: Estimator determining optimal bucket count and bucket width. Default: The maximum of the Sturges and Freedman-Diaconis estimators
:returns float: Kullback-Leibler divergence between ``sample_1`` and ``sample_2``
"""
check_valid_array(data=sample_1, source=Statistics.kullback_leibler_divergence.__name__, accepted_ndims=(1,), accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float))
check_valid_array(data=sample_2, source=Statistics.kullback_leibler_divergence.__name__, accepted_ndims=(1,), accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float))
check_str(name=f'{self.__class__.__name__} bucket_method', value=bucket_method, options=Options.BUCKET_METHODS.value)
check_int(name=f'{self.__class__.__name__} fill value', value=fill_value, min_value=1)
check_valid_array(
data=sample_1,
source=Statistics.kullback_leibler_divergence.__name__,
accepted_ndims=(1,),
accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float),
)
check_valid_array(
data=sample_2,
source=Statistics.kullback_leibler_divergence.__name__,
accepted_ndims=(1,),
accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float),
)
check_str(
name=f"{self.__class__.__name__} bucket_method",
value=bucket_method,
options=Options.BUCKET_METHODS.value,
)
check_int(
name=f"{self.__class__.__name__} fill value", value=fill_value, min_value=1
)
bin_width, bin_count = bucket_data(data=sample_1, method=bucket_method)
sample_1_hist = self._hist_1d(
data=sample_1,
@@ -557,9 +578,23 @@ def jensen_shannon_divergence(
>>> 0.30806541358219786
"""

check_valid_array(data=sample_1, source=Statistics.jensen_shannon_divergence.__name__, accepted_ndims=(1,), accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float))
check_valid_array(data=sample_2, source=Statistics.jensen_shannon_divergence.__name__, accepted_ndims=(1,), accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float))
check_str(name=f'{self.__class__.__name__} bucket_method', value=bucket_method, options=Options.BUCKET_METHODS.value)
check_valid_array(
data=sample_1,
source=Statistics.jensen_shannon_divergence.__name__,
accepted_ndims=(1,),
accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float),
)
check_valid_array(
data=sample_2,
source=Statistics.jensen_shannon_divergence.__name__,
accepted_ndims=(1,),
accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float),
)
check_str(
name=f"{self.__class__.__name__} bucket_method",
value=bucket_method,
options=Options.BUCKET_METHODS.value,
)
bin_width, bin_count = bucket_data(data=sample_1, method=bucket_method)
sample_1_hist = self._hist_1d(
data=sample_1,
@@ -666,9 +701,23 @@ def wasserstein_distance(
>>> Statistics().wasserstein_distance(sample_1=sample_1, sample_2=sample_2)
>>> 0.020833333333333332
"""
check_valid_array(data=sample_1, source=Statistics.wasserstein_distance.__name__, accepted_ndims=(1,), accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float))
check_valid_array(data=sample_2, source=Statistics.wasserstein_distance.__name__, accepted_ndims=(1,), accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float))
check_str(name=f'{self.__class__.__name__} bucket_method', value=bucket_method, options=Options.BUCKET_METHODS.value)
check_valid_array(
data=sample_1,
source=Statistics.wasserstein_distance.__name__,
accepted_ndims=(1,),
accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float),
)
check_valid_array(
data=sample_2,
source=Statistics.wasserstein_distance.__name__,
accepted_ndims=(1,),
accepted_dtypes=(np.float32, np.float64, np.int32, np.int64, int, float),
)
check_str(
name=f"{self.__class__.__name__} bucket_method",
value=bucket_method,
options=Options.BUCKET_METHODS.value,
)
bin_width, bin_count = bucket_data(data=sample_1, method=bucket_method)
sample_1_hist = self._hist_1d(
data=sample_1,
@@ -754,7 +803,13 @@ def rolling_wasserstein_distance(
return results
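
For comparison, SciPy computes the exact empirical 1-D Wasserstein distance; a hedged cross-check sketch (values will differ somewhat from the histogram-based estimates above, which bin the samples first):

import numpy as np
from scipy.stats import wasserstein_distance

sample_1 = np.random.normal(loc=10, scale=2, size=1000)
sample_2 = np.random.normal(loc=12, scale=2, size=1000)
d = wasserstein_distance(sample_1, sample_2)  # exact earth mover's distance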

@staticmethod
def total_variation_distance(x: np.ndarray, y: np.ndarray, bucket_method: Optional[Literal["fd", "doane", "auto", "scott", "stone", "rice", "sturges", "sqrt"]] = "auto"):
def total_variation_distance(
x: np.ndarray,
y: np.ndarray,
bucket_method: Optional[
Literal["fd", "doane", "auto", "scott", "stone", "rice", "sturges", "sqrt"]
] = "auto",
):
"""
Calculate the total variation distance between two probability distributions.
@@ -775,12 +830,52 @@ def total_variation_distance(x: np.ndarray, y: np.ndarray, bucket_method: Option
>>> 0.3999999761581421
"""

check_valid_array(data=x, source=Statistics.total_variation_distance.__name__, accepted_ndims=(1,), accepted_dtypes=(np.int64, np.int32, np.int8, np.float32, np.float64, int, float))
check_valid_array(data=y, source=Statistics.total_variation_distance.__name__, accepted_ndims=(1,), accepted_dtypes=(np.int64, np.int32, np.int8, np.float32, np.float64, int, float))
check_str(name=f"{Statistics.total_variation_distance.__name__} method", value=bucket_method, options=Options.BUCKET_METHODS.value)
check_valid_array(
data=x,
source=Statistics.total_variation_distance.__name__,
accepted_ndims=(1,),
accepted_dtypes=(
np.int64,
np.int32,
np.int8,
np.float32,
np.float64,
int,
float,
),
)
check_valid_array(
data=y,
source=Statistics.total_variation_distance.__name__,
accepted_ndims=(1,),
accepted_dtypes=(
np.int64,
np.int32,
np.int8,
np.float32,
np.float64,
int,
float,
),
)
check_str(
name=f"{Statistics.total_variation_distance.__name__} method",
value=bucket_method,
options=Options.BUCKET_METHODS.value,
)
bin_width, bin_count = bucket_data(data=x, method=bucket_method)
s1_h = Statistics._hist_1d(data=x, bin_count=bin_count, range=np.array([0, int(bin_width * bin_count)]), normalize=True)
s2_h = Statistics._hist_1d(data=y, bin_count=bin_count, range=np.array([0, int(bin_width * bin_count)]), normalize=True)
s1_h = Statistics._hist_1d(
data=x,
bin_count=bin_count,
range=np.array([0, int(bin_width * bin_count)]),
normalize=True,
)
s2_h = Statistics._hist_1d(
data=y,
bin_count=bin_count,
range=np.array([0, int(bin_width * bin_count)]),
normalize=True,
)
return 0.5 * np.sum(np.abs(s1_h - s2_h))
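
A self-contained sketch of the same quantity using plain NumPy, with an assumed bin count and range standing in for ``bucket_data``:

import numpy as np

x = np.random.normal(loc=10, scale=1, size=1000)
y = np.random.normal(loc=12, scale=1, size=1000)
p, _ = np.histogram(x, bins=10, range=(0, 20))
q, _ = np.histogram(y, bins=10, range=(0, 20))
p, q = p / p.sum(), q / q.sum()  # convert counts to probabilities
tvd = 0.5 * np.abs(p - q).sum()  # 0 = identical bins, 1 = disjoint support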

def population_stability_index(
@@ -1608,7 +1703,9 @@ def sliding_kendall_tau(
return results

@staticmethod
def local_outlier_factor(data: np.ndarray, k: Union[int, float] = 5, contamination: float = 1e-10) -> np.ndarray:
def local_outlier_factor(
data: np.ndarray, k: Union[int, float] = 5, contamination: float = 1e-10
) -> np.ndarray:
"""
Compute the local outlier factor of each observation.
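
scikit-learn's implementation can serve as a reference when validating this method (a hedged sketch; note that sklearn stores scores with a negative sign):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

data = np.random.normal(loc=0, scale=1, size=(100, 2))
lof = LocalOutlierFactor(n_neighbors=5)
lof.fit_predict(data)
scores = -lof.negative_outlier_factor_  # LOF >> 1 suggests an outlier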
@@ -1661,7 +1758,9 @@ def local_outlier_factor(data: np.ndarray, k: Union[int, float] = 5, contaminati

@staticmethod
@jit(nopython=True)
def _hbos_compute(data: np.ndarray, histograms: typed.Dict, histogram_edges: typed.Dict) -> np.ndarray:
def _hbos_compute(
data: np.ndarray, histograms: typed.Dict, histogram_edges: typed.Dict
) -> np.ndarray:
"""
Jitted helper to compute Histogram-based Outlier Score (HBOS) called by ``simba.mixins.statistics_mixin.Statistics.hbos``.
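
A hedged sketch of the per-observation score such a helper presumably produces: for each feature, look up the bin density of the value and accumulate the log of its inverse (all names below are hypothetical, not the jitted implementation):

import numpy as np

def hbos_score_sketch(row, histograms, histogram_edges):
    score = 0.0
    for i, value in enumerate(row):
        edges, counts = histogram_edges[i], histograms[i]
        idx = min(max(np.searchsorted(edges, value, side="right") - 1, 0), len(counts) - 1)
        density = max(counts[idx] / counts.sum(), 1e-10)  # avoid log(0) on empty bins
        score += np.log(1.0 / density)  # rarer bins contribute larger scores
    return score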
@@ -1726,13 +1825,25 @@ def hbos(
)
min_vals, max_vals = np.min(data, axis=0), np.max(data, axis=0)
data = (data - min_vals) / (max_vals - min_vals) * (1 - 0) + 0
histogram_edges = typed.Dict.empty(key_type=types.int64, value_type=types.float64[:])
histogram_edges = typed.Dict.empty(
key_type=types.int64, value_type=types.float64[:]
)
histograms = typed.Dict.empty(key_type=types.int64, value_type=types.int64[:])
for i in range(data.shape[1]):
bin_width, bin_count = bucket_data(data=data[:, i].flatten(), method=bucket_method)
histograms[i] = self._hist_1d(data=data[:, i].flatten(), bin_count=bin_count, range=np.array([0, int(bin_width * bin_count)])).astype(np.int64)
histogram_edges[i] = np.arange(0, 1 + bin_width, bin_width).astype(np.float64)
results = self._hbos_compute(data=data, histograms=histograms, histogram_edges=histogram_edges)
bin_width, bin_count = bucket_data(
data=data[:, i].flatten(), method=bucket_method
)
histograms[i] = self._hist_1d(
data=data[:, i].flatten(),
bin_count=bin_count,
range=np.array([0, int(bin_width * bin_count)]),
).astype(np.int64)
histogram_edges[i] = np.arange(0, 1 + bin_width, bin_width).astype(
np.float64
)
results = self._hbos_compute(
data=data, histograms=histograms, histogram_edges=histogram_edges
)
return results.astype(np.float32)
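
A hypothetical usage sketch (shapes assumed from the code above, and assuming ``bucket_method`` has a default):

import numpy as np

data = np.random.normal(loc=0, scale=1, size=(100, 2)).astype(np.float64)
scores = Statistics().hbos(data=data)  # one float32 HBOS score per observation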

def rolling_shapiro_wilks(
@@ -2292,8 +2403,34 @@ def hellinger_distance(
>>> Statistics().hellinger_distance(x=x, y=y, bucket_method='auto')
"""

check_valid_array(data=x, source=Statistics.hellinger_distance.__name__, accepted_ndims=(1,), accepted_dtypes=(np.int64, np.int32, np.int8, np.float32, np.float64, int, float))
check_valid_array(data=y, source=Statistics.hellinger_distance.__name__, accepted_ndims=(1,), accepted_dtypes=(np.int64, np.int32, np.int8, np.float32, np.float64, int, float))
check_valid_array(
data=x,
source=Statistics.hellinger_distance.__name__,
accepted_ndims=(1,),
accepted_dtypes=(
np.int64,
np.int32,
np.int8,
np.float32,
np.float64,
int,
float,
),
)
check_valid_array(
data=y,
source=Statistics.hellinger_distance.__name__,
accepted_ndims=(1,),
accepted_dtypes=(
np.int64,
np.int32,
np.int8,
np.float32,
np.float64,
int,
float,
),
)
check_str(
name=f"{Statistics.hellinger_distance.__name__} method",
value=bucket_method,
@@ -2687,8 +2824,8 @@ def sliding_mad_median_rule(
# data = np.vstack([sample_1, sample_2])
# Statistics().hbos(data=data)

#sample_1 = np.random.normal(loc=10, scale=2, size=1000).astype(np.float64)
#sample_2 = np.random.normal(loc=12, scale=2, size=10000).astype(np.float64)
# sample_1 = np.random.normal(loc=10, scale=2, size=1000).astype(np.float64)
# sample_2 = np.random.normal(loc=12, scale=2, size=10000).astype(np.float64)

# sample_1 = np.random.randint(0, 100, (100, )).astype(np.float64)
# sample_2 = np.random.randint(110, 200, (100, )).astype(np.float64)
