# Example: Dirichlet Mixture Model

In [1]:
import os

# Move the working directory two levels up from where the kernel started.
# The starting directory is remembered the first time this cell runs, so
# re-executing the cell resolves "../.." from the same place instead of
# climbing two more levels on every run.
_NOTEBOOK_START_DIR = globals().get("_NOTEBOOK_START_DIR", os.getcwd())
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../.."))

In [2]:
# import necessary modules

import numpy as np

from fmvmm.mixtures.DMM_Soft import DMM_Soft
from fmvmm.mixtures.DMM_Hard import DMM_Hard
from fmvmm.utils.utils_mixture import sample_mixture_distribution
from fmvmm.utils.utils_mixture import clustering_metrics
from fmvmm.distributions import dirichlet
from fmvmm.utils.utils_dmm import wald_confidence_intervals_dmm

In [3]:
# Generate data from a three-component Dirichlet mixture model.
# The global NumPy seed is fixed first.
np.random.seed(5)

pis = [0.3, 0.5, 0.2]  # mixing proportions
a1, a2, a3 = [5, 5, 5], [7, 15, 225], [50, 10, 4]  # Dirichlet parameters, one vector per component
# Each component's parameter vector is wrapped in its own single-element list.
alphas = [[a] for a in (a1, a2, a3)]

data, label = sample_mixture_distribution(1000, dirichlet.rvs, pis, alphas)

In [10]:
# Start with the Soft DMM: three clusters, fitted to the simulated data.

model1 = DMM_Soft(n_clusters=3)
model1.fit(data)

Soft DMM Fitting Done Successfully


In [11]:
# Fitted parameters (the MLEs) of the Soft DMM: mixing proportions and alphas.

pi_soft, alpha_soft = model1.get_params()

for caption, estimate in (("pi vaues: ", pi_soft), ("alpha vaues: ", alpha_soft)):
    print(caption, estimate)

pi vaues:  [0.19865103 0.49999099 0.30135797]
alpha vaues:  [[46.48586852657687, 9.326472163419565, 3.5831104772858464], [7.174275983298732, 15.24567720274679, 227.31921065696392], [5.063675843350417, 4.985150567409754, 5.021060358640115]]


In [12]:
# Information matrix and standard errors from the "louis" method.
# `im` is reused by the confidence-interval cell that follows.

im, se = model1.get_info_mat(method="louis")

print("standard errors: ", se)

standard errors:  [ 0.0141898   0.02236069  0.01742507  3.62160822  0.7100066   0.26542584
  0.32325353  0.68776851 10.26276965  0.30369949  0.30603333  0.31169018]


In [13]:
# Wald confidence intervals at alpha=0.05, using the information matrix `im`
# returned by the most recent get_info_mat call.
ci = wald_confidence_intervals_dmm(model1.get_params(), im, alpha=0.05)
for idx, (lower, upper) in enumerate(ci, start=1):
    print(f"Param {idx}: ({lower:.4f}, {upper:.4f})")

Param 1: (0.1714, 0.2290)
Param 2: (0.4642, 0.5357)
Param 3: (0.2691, 0.3375)
Param 4: (39.9030, 54.1547)
Param 5: (8.0337, 10.8272)
Param 6: (3.0989, 4.1430)
Param 7: (6.5679, 7.8367)
Param 8: (13.9556, 16.6551)
Param 9: (208.0688, 248.3506)
Param 10: (4.5021, 5.6953)
Param 11: (4.4200, 5.6225)
Param 12: (4.4459, 5.6707)


In [14]:
# Same quantities again, this time from the "score" method.
# `im` is rebound here and reused by the confidence-interval cell that follows.

im, se = model1.get_info_mat(method="score")

print("standard errors: ", se)

standard errors:  [ 0.01422412  0.02236069  0.0174485   3.70923347  0.73825561  0.26801094
  0.35474044  0.72245549 11.08354264  0.30881938  0.30520742  0.2972184 ]


In [15]:
# Wald confidence intervals at alpha=0.05, using the information matrix `im`
# returned by the most recent get_info_mat call.
ci = wald_confidence_intervals_dmm(model1.get_params(), im, alpha=0.05)
for idx, (lower, upper) in enumerate(ci, start=1):
    print(f"Param {idx}: ({lower:.4f}, {upper:.4f})")

Param 1: (0.1713, 0.2292)
Param 2: (0.4642, 0.5358)
Param 3: (0.2690, 0.3376)
Param 4: (39.7559, 54.3551)
Param 5: (7.9862, 10.8917)
Param 6: (3.0945, 4.1489)
Param 7: (6.5116, 7.9044)
Param 8: (13.8935, 16.7295)
Param 9: (206.6016, 250.1144)
Param 10: (4.4932, 5.7066)
Param 11: (4.4215, 5.6207)
Param 12: (4.4710, 5.6387)


In [16]:
# Classification performance of the Soft DMM: true component labels versus
# predicted clusters, keeping label switching in mind.

clustering_metrics(label, model1.predict())

{'accuracy': 0.998, 'precision': 0.998, 'recall': 0.998, 'f_score': 0.998}

In [17]:
# Model-selection criteria for the Soft DMM, printed one per line.

for criterion, compute in (("AIC", model1.aic), ("BIC", model1.bic), ("ICL", model1.icl)):
    print(criterion, compute())

AIC -6414.132866285573
BIC -6360.1475582167695
ICL -6369.228712990898


In [18]:
# The Hard DMM is fitted the same way: three clusters, same data.

model2 = DMM_Hard(n_clusters=3)
model2.fit(data)

Hard DMM Fitting Done Successfully


In [19]:
# Fitted parameters (the MLEs) of the Hard DMM. They get their own names so
# the Soft DMM estimates held in pi_soft / alpha_soft are not overwritten.

pi_hard, alpha_hard = model2.get_params()

print("pi vaues: ", pi_hard)
print("alpha vaues: ", alpha_hard)

pi vaues:  [0.2 0.5 0.3]
alpha vaues:  [array([46.16704257,  9.29445097,  3.57438085]), array([  7.17364518,  15.24409011, 227.29315746]), array([5.13307   , 5.07683489, 5.13265107])]


In [20]:
# Information matrix and standard errors of the Hard DMM from the "louis" method.
# `im` is reused by the confidence-interval cell that follows.

im, se = model2.get_info_mat(method="louis")

print("standard errors: ", se)

standard errors:  [ 0.01422577  0.02236087  0.0173891   3.58281425  0.70486838  0.2638148
  0.32321474  0.68766865 10.26110342  0.30847479  0.31205762  0.31892394]


In [21]:
# Wald confidence intervals at alpha=0.05 for the Hard DMM, using the
# information matrix `im` returned by the most recent get_info_mat call.
ci = wald_confidence_intervals_dmm(model2.get_params(), im, alpha=0.05)
for idx, (lower, upper) in enumerate(ci, start=1):
    print(f"Param {idx}: ({lower:.4f}, {upper:.4f})")

Param 1: (0.1726, 0.2305)
Param 2: (0.4642, 0.5358)
Param 3: (0.2678, 0.3361)
Param 4: (39.6528, 53.7514)
Param 5: (8.0107, 10.7839)
Param 6: (3.0930, 4.1307)
Param 7: (6.5673, 7.8359)
Param 8: (13.9541, 16.6533)
Param 9: (208.0458, 248.3211)
Param 10: (4.5627, 5.7747)
Param 11: (4.5006, 5.7268)
Param 12: (4.5441, 5.7974)


In [22]:
# Same quantities for the Hard DMM, this time from the "score" method.
# `im` is rebound here and reused by the confidence-interval cell that follows.

im, se = model2.get_info_mat(method="score")

print("standard errors: ", se)

standard errors:  [ 0.01414214  0.02236068  0.01732051  3.42785225  0.70191528  0.25899917
  0.3545411   0.72193125 11.07409056  0.30833331  0.30256038  0.29484352]


In [23]:
# Wald confidence intervals at alpha=0.05 for the Hard DMM, using the
# information matrix `im` returned by the most recent get_info_mat call.
ci = wald_confidence_intervals_dmm(model2.get_params(), im, alpha=0.05)
for idx, (lower, upper) in enumerate(ci, start=1):
    print(f"Param {idx}: ({lower:.4f}, {upper:.4f})")

Param 1: (0.1729, 0.2302)
Param 2: (0.4643, 0.5357)
Param 3: (0.2679, 0.3359)
Param 4: (39.9146, 53.3990)
Param 5: (8.0157, 10.7772)
Param 6: (3.1012, 4.1198)
Param 7: (6.5114, 7.9033)
Param 8: (13.8928, 16.7268)
Param 9: (206.5924, 250.0681)
Param 10: (4.5630, 5.7744)
Param 11: (4.5172, 5.7059)
Param 12: (4.5861, 5.7443)


In [24]:
# Classification performance of the Hard DMM: true component labels versus
# predicted clusters, keeping label switching in mind.

clustering_metrics(label, model2.predict())

{'accuracy': 0.998, 'precision': 0.998, 'recall': 0.998, 'f_score': 0.998}

In [25]:
# Model-selection criteria for the Hard DMM, printed one per line.

for criterion, compute in (("AIC", model2.aic), ("BIC", model2.bic), ("ICL", model2.icl)):
    print(criterion, compute())

AIC -6413.908448703747
BIC -6359.923140634944
ICL -6368.641537267349


## High Dimensional Case

The high-dimensional method is currently implemented only for Soft DMM.

In [24]:
import numpy as np

# Mixing proportions of the three components.
pis = [0.4762, 0.2857, 0.2381]

# One length-1000 parameter vector per component, each drawn uniformly from
# its own range. The draw order (a1, a2, a3) is kept so the random stream
# is consumed exactly as before.
a1 = np.random.uniform(low=10, high=20, size=1000)
a2 = np.random.uniform(low=20, high=200, size=1000)
a3 = np.random.uniform(low=10, high=100, size=1000)

# Each component's parameter vector is wrapped in its own single-element list.
alphas = [[a] for a in (a1, a2, a3)]

# NOTE: this rebinds `data` and `label`, replacing the earlier simulated data.
data, label = sample_mixture_distribution(1000, dirichlet.rvs, pis, alphas)

In [25]:
# Fit the Soft DMM with method="highdimensional" to the data generated above.

model3 = DMM_Soft(n_clusters=3, method="highdimensional")
model3.fit(data)

Soft DMM Fitting Done Successfully


In [26]:
# Classification performance of the high-dimensional fit:
# true component labels versus predicted clusters.

clustering_metrics(label, model3.predict())

{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f_score': 1.0}

In [27]:
# Execution time in seconds, as recorded on model3
# (presumably the duration of the fit above -- TODO confirm).

model3.execution_time

5.802024841308594