In [3]:
import numpy as np
import pandas as pd
from sklearn.mixture import BayesianGaussianMixture
from scipy.stats import chi2

In [229]:
class BGMMBDT:
    """Outlier flagger built from repeated two-component Bayesian GMM splits.

    The data are split into two clusters with ``BayesianGaussianMixture``;
    the larger cluster is kept and split again until it can no longer be
    separated.  Every split ("level") then votes on each sample, and the
    weighted votes are compared against ``criteria``.

    The procedure is univariate: inputs are expected as ``(n_samples, 1)``
    (a 1-D array is reshaped to that form).

    Attributes:
        list_model: fitted mixture models, one per level, from the last fit.
        list_cluster: one DataFrame per level, indexed by cluster label
            (index name ``"cluster"``) and sorted by size descending, with
            columns ``"value"`` (cluster size) and ``"means"`` (median of
            the data fitted at that level -- not the mixture means).
        criteria: weighted-vote threshold at or above which a sample is flagged.
        max_iter: iteration cap forwarded to every mixture fit.
        random_state: seed forwarded to every mixture fit (``None`` keeps the
            estimator's default behaviour).
    """

    # A level only votes for a sample whose chi-square p-value is below this.
    P_VALUE_THRESHOLD = 0.05
    # ...and whose posterior probability for the larger cluster is below this.
    MAJOR_PROB_THRESHOLD = 0.025
    _WEIGHT_TYPES = ("linear", "exp")

    def __init__(self, criteria: float = 0.5, max_iter: int = 1000, random_state=None):
        self.list_model = []
        self.list_cluster = []
        self.criteria = criteria
        self.max_iter = max_iter
        self.random_state = random_state

    @staticmethod
    def _as_2d(array_value) -> np.ndarray:
        """Return the input as a float array, turning 1-D input into a column."""
        values = np.asarray(array_value, dtype=float)
        return values.reshape(-1, 1) if values.ndim == 1 else values

    def recursive_BGMM(self, array_value: np.ndarray):
        """Split the data level by level, always descending into the larger cluster.

        Each successful split appends the fitted model to ``list_model`` and
        its cluster summary to ``list_cluster``.  Splitting stops when fewer
        than two samples remain or when the model assigns every sample to a
        single cluster.

        Implemented as a loop rather than recursion so that inputs needing
        many splits cannot exhaust the interpreter's recursion limit.

        Args:
            array_value: samples, shape ``(n_samples, 1)`` (1-D is accepted).

        Returns:
            The samples left in the larger cluster after the last split.
        """
        remaining = self._as_2d(array_value)
        while remaining.shape[0] >= 2:
            model = BayesianGaussianMixture(
                n_components=2,
                covariance_type="full",
                weight_concentration_prior_type="dirichlet_process",
                init_params="random",
                max_iter=self.max_iter,
                random_state=self.random_state,
            ).fit(remaining)
            labels = model.predict(remaining)

            cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
            if cluster_ids.size < 2:
                # Everything landed in one cluster: nothing left to separate.
                break

            df_cluster = pd.DataFrame(
                {"value": cluster_sizes, "means": float(np.median(remaining))},
                index=pd.Index(cluster_ids, name="cluster"),
            ).sort_values(by="value", ascending=False)

            self.list_model.append(model)
            self.list_cluster.append(df_cluster)

            # Both clusters are non-empty here, so the kept subset is strictly
            # smaller than `remaining` and the loop always terminates.
            remaining = remaining[labels == df_cluster.index[0]]
        return remaining

    def predict(self, array_value: np.ndarray, weight_type: str = "linear"):
        """Fit the split hierarchy on ``array_value`` and flag its outliers.

        For every level, a sample receives a vote when both hold:
        the chi-square (1 dof) survival value of
        ``2 * |loglik(median) - loglik(sample)|`` is below
        ``P_VALUE_THRESHOLD``, and its posterior probability of belonging to
        that level's larger cluster is below ``MAJOR_PROB_THRESHOLD``.
        Votes are combined with per-level weights (earliest level weighted
        most) divided by the depth, and compared with ``criteria``.

        Args:
            array_value: samples, shape ``(n_samples, 1)`` (1-D is accepted).
            weight_type: ``"linear"`` or ``"exp"`` (case-insensitive).

        Returns:
            Integer array of shape ``(n_samples,)``: 1 for flagged samples,
            0 otherwise.  All zeros when fewer than two levels were fitted.

        Raises:
            AssertionError: if ``weight_type`` is neither ``"linear"`` nor
                ``"exp"``.  The exception type is kept for backward
                compatibility; it is raised explicitly so the check survives
                ``python -O`` and happens before any model is fitted.
        """
        weight_key = weight_type.lower()
        if weight_key not in self._WEIGHT_TYPES:
            raise AssertionError(
                "weight_type is not specific!\n Choose between 'linear' or 'exp' please."
            )

        values = self._as_2d(array_value)
        n_samples = values.shape[0]

        # Start from a clean slate: levels left over from an earlier call
        # would otherwise be mixed with the ones fitted for this input.
        self.list_model = []
        self.list_cluster = []
        self.recursive_BGMM(values)

        depth = len(self.list_cluster)
        if depth < 2:
            # Same shape and dtype as the regular return value.
            return np.zeros(n_samples, dtype=np.int64)

        votes = np.zeros((n_samples, depth), dtype=np.int64)
        for level, (model, df_cluster) in enumerate(zip(self.list_model, self.list_cluster)):
            # "means" stores the same median in every row of the level's table.
            reference = np.array([[df_cluster["means"].iloc[0]]])
            ll_reference = model.score_samples(reference)
            ll_values = model.score_samples(values)
            likelihood_ratio = 2 * np.abs(ll_reference - ll_values)
            p_values = chi2.sf(likelihood_ratio, 1)

            # Column of predict_proba that belongs to the level's larger cluster.
            major_prob = model.predict_proba(values)[:, df_cluster.index[0]]

            votes[:, level] = (p_values < self.P_VALUE_THRESHOLD) & (
                major_prob < self.MAJOR_PROB_THRESHOLD
            )

        level_weights = np.linspace(1, depth, num=depth)
        if weight_key == "exp":
            level_weights = np.exp(level_weights)

        # Reversed so that the first (shallowest) level carries the largest weight.
        scores = votes @ (level_weights[::-1] / depth)
        return (scores >= self.criteria).astype(np.int64)



In [230]:
# Toy sample: 13 draws from N(3, 1) with two injected outliers at indices 10 and 12.
# Seed NumPy's global generator so a fresh Restart & Run All reproduces the same draw.
# NOTE: the array recorded beneath this cell in the saved notebook came from an
# unseeded run, so it will differ until the cell is re-executed.
np.random.seed(42)
example = np.random.normal(3, 1, 13)
example[10] = 159
example[12] = 17
example

array([  3.41032132,   3.65350052,   2.92570422,   2.79700845,
         3.3710318 ,   3.68635043,   3.17104684,   2.49863282,
         2.99616349,   1.71346827, 159.        ,   1.2601849 ,
        17.        ])

In [231]:
# Run the detector on the toy sample, passed as a single-column matrix,
# using linearly decaying level weights and a 0.5 vote threshold.
bgmm = BGMMBDT(criteria=0.5)
bgmm.predict(np.reshape(example, (-1, 1)), weight_type="linear")

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64)