# `hdp_py` Examples

Below are some examples of the implemented functionality for this package.  See the `README` for installation instructions.

In [22]:
from hdp_py import HDP, get_data
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the simulated data in the cells below is reproducible
np.random.seed(0)

# Sizes shared by the simulated-data examples
N = 200     # number of observations drawn in each example
L = 100     # number of columns in the multinomial / categorical data matrices
Jmax = 10   # group labels are drawn from 0 .. Jmax-1
n = 10      # multinomial entries are drawn from 0 .. n-1

In [23]:
# Poisson data model
# One group label in 0 .. Jmax-1 per observation, then one Poisson count per
# observation whose rate is that observation's group label.
j_pois = np.random.choice(Jmax, size=N)
x_pois = np.random.poisson(lam=j_pois, size=N).reshape(-1, 1)  # column vector: each group j is Poisson(j)
print(f"j = {j_pois[:10]}")
print(f"x = \n{x_pois[:10]}")

j = [5 0 3 3 7 9 3 5 2 4]
x = 
[[ 4]
 [ 0]
 [ 0]
 [ 3]
 [ 5]
 [11]
 [ 1]
 [ 8]
 [ 1]
 [ 4]]


In [24]:
# Multinomial data model
# Group labels as before; every entry of the N x L matrix is drawn
# uniformly from 0 .. n-1.
j_mult = np.random.choice(Jmax, size=N)
x_mult = np.random.choice(n, size=(N, L))
print(f"j = {j_mult[:10]}")
print(f"x = \n{x_mult}")

j = [3 4 9 3 0 0 6 4 3 3]
x = 
[[8 3 2 ... 8 6 6]
 [3 4 4 ... 3 5 8]
 [9 5 0 ... 5 7 3]
 ...
 [4 0 0 ... 3 8 8]
 [4 2 2 ... 3 0 3]
 [7 5 7 ... 9 8 1]]


In [25]:
# Categorical data model
# Each row of x_cat is a one-hot vector: all zeros except a single 1 in a
# randomly chosen column.
j_cat = np.random.choice(Jmax, size=N)
one_idx = np.random.choice(L, size=N)   # column holding the 1 in each row
x_cat = np.zeros((N, L), dtype=int)
x_cat[np.arange(N), one_idx] = 1
print(f"j = {j_cat[:10]}")
print(f"x = \n{x_cat}")

j = [5 2 3 5 3 1 2 1 0 9]
x = 
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [45]:
# Initializing models
# Every model uses the same concentration parameters; only the data
# distribution `f` and its hyperparameters differ. Each model is given its
# own freshly built hyperparameter array.
hdp_pois = HDP.HDP(
    gamma=0.5,
    alpha0=0.5,
    f='poisson',
    hypers=(1, 10),
)
hdp_mult = HDP.HDP(
    gamma=0.5,
    alpha0=0.5,
    f='multinomial',
    hypers=(L, np.full(L, 0.5)),
)
# Both the below are special cases of the multinomial class
hdp_cat = HDP.HDP(
    gamma=0.5,
    alpha0=0.5,
    f='categorical',
    hypers=(L, np.full(L, 0.5)),
)
hdp_cat2 = HDP.HDP(
    gamma=0.5,
    alpha0=0.5,
    f='categorical_fast',
    hypers=(L, np.full(L, 0.5)),
)

In [46]:
# CFR sampling - method returns the object itself, so each result is assigned
# back to the same name. Every call is wrapped in %time to report its wall time.
# NOTE(review): Tmax / Kmax presumably cap the number of tables / clusters the
# sampler tracks - confirm against the hdp_py documentation.
%time hdp_pois = hdp_pois.gibbs_cfr(x_pois, j_pois, iters=10)
%time hdp_mult = hdp_mult.gibbs_cfr(x_mult, j_mult, iters=10, Tmax=10)
%time hdp_cat = hdp_cat.gibbs_cfr(x_cat, j_cat, iters=10, Tmax=10, Kmax=10)
%time hdp_cat2 = hdp_cat2.gibbs_cfr(x_cat, j_cat, iters=10, Tmax=10, Kmax=10)

Wall time: 5.97 s
Wall time: 10.6 s
Wall time: 5.29 s
Wall time: 4.83 s


In [47]:
# Access the CFR samples like this (first column holds the t values, second column the k values)
cfr_first_iter = hdp_pois.cfr_samples[0,:10,:]  # initial iteration for first 10 observations
print(cfr_first_iter)
cfr_k_path = hdp_cat.cfr_samples[:,0,1]         # k value path for first observation
print(cfr_k_path)

[[ 4 72]
 [28 13]
 [14  9]
 [14  9]
 [26 97]
 [ 1 97]
 [15  4]
 [29 97]
 [17 44]
 [ 0 44]]
[5 7 9 9 3 0 2 7 2 5]


In [54]:
# Direct sampling - method returns the object itself, so each result is
# assigned back to the same name. Every call is wrapped in %time to report its
# wall time.
# NOTE(review): Kmax presumably caps the number of clusters the sampler
# tracks - confirm against the hdp_py documentation.
%time hdp_pois = hdp_pois.gibbs_direct(x_pois, j_pois, iters=100)
%time hdp_mult = hdp_mult.gibbs_direct(x_mult, j_mult, iters=100)
%time hdp_cat = hdp_cat.gibbs_direct(x_cat, j_cat, iters=100, Kmax=10)
%time hdp_cat2 = hdp_cat2.gibbs_direct(x_cat, j_cat, iters=100, Kmax=10)

Wall time: 10.5 s
Wall time: 38.3 s
Wall time: 6.08 s
Wall time: 1.5 s


In [51]:
# Access the direct samples like this (each row holds the k values of one iteration)
# k values of the final iteration (notice the clustering into a few distinct labels)
print(f"k =\n{hdp_pois.direct_samples[-1,:]}")
# corresponding beta weights for the final iteration, rounded to 3 decimals
print(f"beta =\n{hdp_pois.beta_samples[-1,:].round(3)}")

k =
[ 0 92 71 71  0  0 71  0 71  0  0  0  0  0 71  0  0  0  0 71  0  0  0  0
  0 71 92 71  0 92 71 71  0 71 71 71 71  0 92 71  0  0 92  0  0 71 71  0
 71 92 92 71  0  0  0  0 71 71 71  0  0 71 71  0  0  0 71  0  0 71 92 71
  0  0 71 71  0 71  0 71 71 71  0  0 71  0  0  0 92 71  0  0 71 92  0 92
 71  0  0 71 92 71 71 92  0  0  0 92 71  0 71  0 71 71  0 71 71  0 71 71
  0 71 71  0  0 71 71 92 92  0 92  0 71 71  0  0  0  0 71 71 92  0  0  0
  0 71  0  0 71 92 71  0  0  0  0 92  0  0  0 92  0  0  0 71 71  0 92 71
  0  0  0  0  0  0  0 71  0  0  0  0 71  0  0  0 71 71  0  0 71  0  0  0
 92  0  0  0  0  0 71 71]
beta =
[0.791 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.  

In [52]:
# Note that running the direct sampler does not remove the earlier CFR samples:
# the k values (second column) of the last CFR iteration are still available.
print(f"k =\n{hdp_pois.cfr_samples[-1,:,1]}")

k =
[37 26  9 10 37  4 10 37  9 14 37 21 37 37 15 21 37 16 37 15 37  4 37  4
 14  9 26  9 37 26 84  9 37 15  9 10 21 97 26 15  4  4 26 14 37 21  9 16
 84 26 26 37 37  4 21 37 36 15 36  4 37 15 15 16 16  4  9 21 37  9 26  9
  4  4 36 36 21 14 14  9 36 14 37 14 10 37  4 37 26 15 37  4 10 26 37 26
 15  9 14 84 26 10  9 26 37 37  4 26 59 16  9  4 84 10 21  9 10 14 15 84
  4 15 36 21 37  9 10 26 26 21 26 21 10 10 37 37 37 84  9  9 26 37 37 21
 37  9 37 14 10 26 36 10 21  4 37 26 37 37  4 26  4 21 37 10 15 37 26 36
  4 21 37 37 37 37  4 37 37 21 21  4 15 21 37 37  9 84 21 21 15 21  4 16
 26 37 14 21 37 37  9 21]


In [55]:
# Can also resume iterations where we left off, if using direct sampling
# NOTE: Must be using the same data and Kmax arguments
# Shape before resuming: one row per iteration already drawn
print(hdp_cat.direct_samples.shape)
hdp_cat.gibbs_direct(x_cat, j_cat, iters=50, Kmax=10, resume=True)
# Shape after resuming: the 50 new iterations are appended as extra rows
print(hdp_cat.direct_samples.shape)

(100, 200)
(150, 200)


In [58]:
# Can also toggle verbose output for the direct sampler with verbose=True
# (the printed rows are the beta weights)
hdp_cat = hdp_cat.gibbs_direct(x_cat, j_cat, iters=50, Kmax=10, verbose=True)

[0.12  0.109 0.029 0.049 0.084 0.032 0.065 0.278 0.062 0.164 0.008]
[0.212 0.119 0.042 0.022 0.065 0.05  0.055 0.209 0.168 0.044 0.015]
[0.253 0.098 0.    0.071 0.067 0.036 0.068 0.107 0.181 0.029 0.089]
[0.433 0.099 0.    0.133 0.015 0.025 0.02  0.13  0.113 0.022 0.01 ]
[0.371 0.144 0.    0.069 0.011 0.007 0.001 0.231 0.036 0.104 0.027]
[0.377 0.147 0.    0.138 0.009 0.    0.015 0.17  0.031 0.111 0.002]
[0.292 0.019 0.    0.098 0.008 0.    0.058 0.203 0.05  0.195 0.077]
[0.366 0.038 0.034 0.075 0.055 0.    0.111 0.122 0.085 0.019 0.096]
[0.485 0.024 0.166 0.146 0.07  0.    0.017 0.021 0.062 0.004 0.004]
[0.374 0.197 0.    0.028 0.214 0.    0.008 0.02  0.125 0.014 0.021]
[0.261 0.031 0.    0.01  0.309 0.    0.009 0.064 0.142 0.165 0.009]
[0.162 0.003 0.    0.011 0.321 0.    0.016 0.006 0.323 0.061 0.098]
[0.23  0.074 0.    0.092 0.143 0.    0.173 0.093 0.049 0.109 0.036]
[0.27  0.071 0.    0.041 0.063 0.    0.016 0.037 0.252 0.178 0.072]
[0.205 0.03  0.    0.02  0.017 0.251 0.012 0.216

In [69]:
# Accessing nematode data (from original paper)
# get_nematode returns the data matrix together with the matching group labels.
# NOTE(review): max_docs presumably limits how many documents are loaded - confirm.
x_nematode, j_nematode = get_data.get_nematode(max_docs=10)
print(x_nematode.shape)
# The column labels of the data matrix are the vocabulary words
print(f"\nVocabulary =\n{pd.Series(x_nematode.columns)}")

(670, 419)

Vocabulary =
0              oxygen
1             elegans
2               cells
3      caenorhabditis
4             tension
5            nematode
6         dauerlarvae
7              living
8             species
9               three
10            pharynx
11              stage
12           critical
13             rachis
14        consumption
15               free
16             adults
17            nuclear
18            classes
19           bacteria
20            complex
21           presence
22           neurones
23            thermal
24               cell
25                  c
26           electron
27             stress
28          nucleolus
29         conditions
            ...      
389       specialized
390           failure
391         apparatus
392          constant
393             least
394          produced
395           layered
396    interpretation
397      mitochondria
398               dna
399                 q
400          synaptic
401           neurone
402    

In [64]:
# Accessing Reuters data (manually downloaded and in package)
# get_reuters returns the data matrix together with the matching group labels.
# NOTE(review): max_docs presumably limits how many documents are loaded - confirm.
x_reut, j_reut = get_data.get_reuters(max_docs=10)
print(x_reut.shape)
# The column labels of the data matrix are the vocabulary words
print(f"\nVocabulary =\n{pd.Series(x_reut.columns)}")

(1028, 562)

Vocabulary =
0              dlrs
1             total
2       bankamerica
3               mln
4             would
5             stock
6             price
7               new
8              prev
9              debt
10             york
11              feb
12              nil
13         february
14              apr
15            sales
16          company
17            times
18              per
19              mar
20            cocoa
21         offering
22            smith
23         computer
24              may
25         analysts
26             also
27                v
28              oil
29             crop
           ...     
532            form
533            term
534     dificulties
535        mortgage
536           joint
537            ends
538        reaction
539             dry
540         appears
541          season
542           years
543      cumulative
544     outstanding
545    registration
546            even
547          ltbpgt
548            sale
549         sh

In [74]:
# Running direct sampler on the nematode data
L_nematode = x_nematode.shape[1]

hdp_nematode = HDP.HDP(f='categorical_fast', hypers=(L, np.full(L, 0.5)))
# Data returned is a pandas dataframe, so cast to a numpy array of ints
%time hdp_nematode = hdp_nematode.gibbs_direct(x_nematode.to_numpy(dtype='int'), j_nematode, iters=50, Kmax=10, verbose=True)

[0.081 0.094 0.073 0.136 0.151 0.071 0.075 0.113 0.092 0.108 0.006]
[0.125 0.118 0.018 0.093 0.199 0.055 0.103 0.082 0.095 0.108 0.004]
[0.143 0.118 0.058 0.037 0.14  0.115 0.114 0.034 0.041 0.2   0.   ]
[0.173 0.021 0.01  0.059 0.115 0.11  0.023 0.047 0.112 0.196 0.135]
[0.069 0.048 0.037 0.051 0.284 0.059 0.083 0.071 0.027 0.213 0.056]
[0.038 0.003 0.035 0.188 0.165 0.067 0.016 0.075 0.003 0.377 0.032]
[0.064 0.011 0.063 0.102 0.164 0.019 0.04  0.137 0.002 0.38  0.019]
[0.026 0.013 0.024 0.065 0.077 0.057 0.104 0.01  0.003 0.615 0.006]
[0.021 0.    0.037 0.111 0.021 0.035 0.078 0.032 0.    0.652 0.014]
[0.055 0.022 0.065 0.074 0.116 0.044 0.024 0.022 0.001 0.543 0.035]
[0.053 0.028 0.04  0.054 0.19  0.08  0.024 0.003 0.    0.519 0.011]
[0.005 0.004 0.022 0.11  0.039 0.041 0.08  0.042 0.    0.615 0.041]
[0.    0.006 0.011 0.026 0.036 0.032 0.038 0.098 0.    0.752 0.001]
[0.    0.011 0.027 0.029 0.034 0.022 0.002 0.014 0.    0.843 0.018]
[0.    0.047 0.005 0.    0.006 0.006 0.046 0.076