# Maximum Likelihood Estimation with Categorical Distribution

## Import modules

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import time
import glob

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from IPython import display

import tensorflow as tf
import warnings

os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Setting hyperparameters

In [2]:
# Training Flags (hyperparameter configuration)
max_epochs = 100
batch_size = 128
learning_rate = 1e-3

## Make a toy dataset (categorical distribution)

**categorical distribution**

$X$ is a random variable
$$\Pr(X=i|\mathbf{p})=p_i, where \; \mathbf{p}=(p_1,\cdots,p_k)$$

**Probability mass function**
$$f(\mathbf{x};\mathbf{p})=\prod_{i=1}^{k} p_i^{x_i}$$

In [3]:
N = 10000 # the number of samples
C = 5     # the number of categories
true_probs = [0.1, 0.2, 0.1, 0.4, 0.2]
train_data = np.random.choice(C, size=N, p=true_probs)
print(train_data)
train_data = train_data.astype(np.float32)
train_data = np.expand_dims(train_data, axis=1)
print(train_data.shape)

[3 2 3 ... 3 3 3]
(10000, 1)


## Set up dataset with `tf.data`

### create input pipeline with `tf.data.Dataset`

In [4]:
# for train
N = len(train_data)
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
train_dataset = train_dataset.shuffle(buffer_size=N)
train_dataset = train_dataset.batch(batch_size=batch_size, drop_remainder=True)
print(train_dataset)

<BatchDataset shapes: (128, 1), types: tf.float32>


## Create the parameters to learn

**Log-likelihood in Bernoulli distribution**
$$\log f(\mathbf{x}|\mathbf{p})=\sum_{i=1}^{k}\log p_i^{x_i}$$

**Variables**

* `logit`: The probabilities can be obtained by applying softmax to the logit values.

In [5]:
logit = tf.Variable(np.array([0., 0., 0., 0., 0.]), dtype=tf.float32) # initial value
print(logit)

probs = tf.nn.softmax(logit)
print(probs)

<tf.Variable 'Variable:0' shape=(5,) dtype=float32, numpy=array([0., 0., 0., 0., 0.], dtype=float32)>
tf.Tensor([0.2 0.2 0.2 0.2 0.2], shape=(5,), dtype=float32)


In [6]:
def log_pmf(sample, probs):
    log_likelihood = None
    for i in range(C):
        if log_likelihood is None:
            log_likelihood = tf.cast((data == i), tf.float32) * tf.math.log(probs[i])
        else:
            log_likelihood = log_likelihood + tf.cast((data == i), tf.float32) * tf.math.log(probs[i])

    return log_likelihood
  

## Define the loss functions and the optimizer

In [14]:
optimizer = tf.keras.optimizers.SGD(learning_rate)

## Training

### Define training one step function

In [15]:
# Notice the use of `tf.function`
# This annotation causes the function to be "compiled".
@tf.function
def train_step(data):
    with tf.GradientTape() as tape:
        negative_log_likelihood = -tf.reduce_mean(log_pmf(data, tf.nn.softmax(logit)))

    gradients = tape.gradient(negative_log_likelihood, [logit])
    optimizer.apply_gradients(zip(gradients, [logit]))

    return negative_log_likelihood

### Training full steps

In [16]:
print('Start Training.')
global_step = tf.Variable(0, trainable=False)

for epoch in range(max_epochs):
    for step, data in enumerate(train_dataset):

        negative_log_likelihood = train_step(data)
        global_step.assign_add(1)

    print('epoch :', epoch, 'NLL :', negative_log_likelihood.numpy())

print('Training Done.')

Start Training.
epoch : 0 NLL : 1.5092432
epoch : 1 NLL : 1.5075796
epoch : 2 NLL : 1.50599
epoch : 3 NLL : 1.504471
epoch : 4 NLL : 1.5030192
epoch : 5 NLL : 1.501632
epoch : 6 NLL : 1.5003058
epoch : 7 NLL : 1.4990379
epoch : 8 NLL : 1.4978256
epoch : 9 NLL : 1.4966663
epoch : 10 NLL : 1.4955578
epoch : 11 NLL : 1.4944972
epoch : 12 NLL : 1.4934825
epoch : 13 NLL : 1.4925119
epoch : 14 NLL : 1.4915829
epoch : 15 NLL : 1.4906938
epoch : 16 NLL : 1.4898427
epoch : 17 NLL : 1.4890277
epoch : 18 NLL : 1.4882476
epoch : 19 NLL : 1.4875004
epoch : 20 NLL : 1.4867848
epoch : 21 NLL : 1.486099
epoch : 22 NLL : 1.4854424
epoch : 23 NLL : 1.484813
epoch : 24 NLL : 1.4842098
epoch : 25 NLL : 1.4836316
epoch : 26 NLL : 1.4830773
epoch : 27 NLL : 1.4825456
epoch : 28 NLL : 1.4820359
epoch : 29 NLL : 1.481547
epoch : 30 NLL : 1.4810779
epoch : 31 NLL : 1.4806279
epoch : 32 NLL : 1.480196
epoch : 33 NLL : 1.4797815
epoch : 34 NLL : 1.4793835
epoch : 35 NLL : 1.4790016
epoch : 36 NLL : 1.4786346
epo

## Print the results

In [17]:
print('Results')
print('estimated probabilities :', tf.nn.softmax(logit).numpy())
print('true probabilites :', true_probs)

Results
estimated probabilities : [0.12301521 0.18276438 0.08774455 0.39012673 0.21634908]
true probabilites : [0.1, 0.2, 0.1, 0.4, 0.2]
