
Lab: `tensorflow` 範例
================

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import distributions as tfd

## Linear Regression

In [2]:
# define a function to generate x and y
def generate_linear_reg_data(
    n_sample, weight, intercept = 0, sd_residual = 1,
    dtype = tf.float64, seed = None):
    """Simulate data from a linear regression model.

    Features are drawn from a standard normal distribution and the response
    is ``y = intercept + x @ weight + e`` where ``e`` is normal noise with
    standard deviation ``sd_residual``.

    Args:
        n_sample: number of rows to generate.
        weight: sequence of regression weights; its length sets the number
            of features.
        intercept: constant added to every response.
        sd_residual: standard deviation of the residual term.
        dtype: floating dtype of the generated tensors.
        seed: operation-level seed forwarded to ``tf.random.normal``.

    Returns:
        Tuple ``(x, y)`` with shapes ``(n_sample, n_feature)`` and
        ``(n_sample, 1)``.
    """
    weight = tf.constant(weight, dtype = dtype)
    # Column vector so that ``x @ weight`` yields one response per row.
    weight = tf.reshape(weight, shape = (-1, 1))
    n_feature = weight.shape[0]
    x = tf.random.normal(shape = (n_sample, n_feature),
                         seed = seed, dtype = dtype)
    # The residual sd was previously ignored (noise was always unit-variance);
    # pass it through so the argument actually takes effect.
    e = tf.random.normal(shape = (n_sample, 1), stddev = sd_residual,
                         seed = seed, dtype = dtype)
    y = intercept + x @ weight + e
    return x, y

# run generate_data
# Simulation settings; the third true weight is 0, so that feature has no
# effect on y.
n_sample = 10000
weight_true = [-1, 2, 0]
dtype = tf.float64

# x holds the simulated features and y the simulated responses.
x, y = generate_linear_reg_data(
    n_sample = n_sample, weight = weight_true,
    intercept = 0, sd_residual = 1,
    dtype = dtype, seed = 48)

In [3]:
# Fit the linear regression by full-batch gradient descent on the mean
# squared error.
n_feature = len(weight_true)
learning_rate = .1
epochs = 500
tol = 10**(-4)  # stop once the largest mean absolute gradient is below this

optimizer = tf.optimizers.SGD(learning_rate = learning_rate)

# Both parameters start from zero.
intercept = tf.Variable(
    tf.zeros((), dtype = dtype), name = "intercept")
weight = tf.Variable(
    tf.zeros((n_feature, 1), dtype = dtype), name = "weight")

for epoch in tf.range(epochs):
    with tf.GradientTape() as tape:
        y_hat = intercept + x @ weight
        loss_value = tf.reduce_mean((y - y_hat)**2)
    gradients = tape.gradient(loss_value, [intercept, weight])
    optimizer.apply_gradients(zip(gradients, [intercept, weight]))
    # Convergence criterion: mean |gradient| per parameter tensor, then the
    # maximum over the parameter tensors.
    grad_sizes = [tf.reduce_mean(tf.math.abs(grad)) for grad in gradients]
    if tf.reduce_max(grad_sizes).numpy() < tol:
        print("{n} Optimizer Converges After {i} Iterations".format(
            n=optimizer.__class__.__name__, i=epoch))
        break

print("intercept", intercept.numpy())
print("weight", weight.numpy())

SGD Optimizer Converges After 45 Iterations
intercept -0.012025053092215557
weight [[-0.99697044]
 [ 1.99617999]
 [ 0.02433336]]


## Logistic Regression

In [4]:
# define a function to generate x and y
def generate_logistic_reg_data(
    n_sample, weight, intercept = 0,
    dtype = tf.float64, seed = None):
    """Simulate binary responses from a logistic regression model.

    Features are standard normal; each response is a Bernoulli draw whose
    logit is ``intercept + x @ weight``.

    Args:
        n_sample: number of rows to generate.
        weight: sequence of regression weights; its length sets the number
            of features.
        intercept: constant added to every logit.
        dtype: dtype of the features and of the sampled responses.
        seed: operation-level seed forwarded to ``tf.random.normal`` (the
            Bernoulli draw itself is not seeded here).

    Returns:
        Tuple ``(x, y)`` with shapes ``(n_sample, n_feature)`` and
        ``(n_sample, 1)``.
    """
    weight_column = tf.reshape(
        tf.constant(weight, dtype = dtype), shape = (-1, 1))
    n_feature = weight_column.shape[0]
    x = tf.random.normal(
        shape = (n_sample, n_feature), seed = seed, dtype = dtype)
    response_dist = tfd.Bernoulli(
        logits = intercept + x @ weight_column, dtype = dtype)
    return x, response_dist.sample()

# run generate_data
# Same simulation settings as the linear example; note that x and y are
# overwritten here with the logistic-regression data.
n_sample = 10000
weight_true = [-1, 2, 0]
dtype = tf.float64

x, y = generate_logistic_reg_data(
    n_sample = n_sample, 
    weight = weight_true,intercept = 0, 
    dtype = dtype, seed = 48)

In [5]:
# define a tf.Module to collect parameters
class LinearModel(tf.Module):
    """Linear predictor ``intercept + x @ weight`` with trainable parameters.

    Both parameters are ``tf.Variable`` objects initialised to zero:
    ``weight`` has shape ``(n_feature, 1)`` and ``intercept`` is a scalar.
    """

    def __init__(self, n_feature, dtype = tf.float64):
        """Create zero-initialised parameters for ``n_feature`` predictors."""
        super().__init__()
        zero_weight = tf.zeros((n_feature, 1), dtype = dtype)
        self.weight = tf.Variable(zero_weight, name = "weight")
        zero_intercept = tf.zeros((), dtype = dtype)
        self.intercept = tf.Variable(zero_intercept, name = "intercept")

    def __call__(self, x):
        """Return the linear predictor for the feature matrix ``x``."""
        return self.intercept + x @ self.weight

In [6]:
# Fit the logistic regression by full-batch gradient descent on the mean
# negative Bernoulli log-likelihood.
n_feature = len(weight_true)
learning_rate = .5
epochs = 500
tol = 10**(-4)  # stop once the largest mean absolute gradient is below this

linear_model = LinearModel(n_feature, dtype)
optimizer = tf.optimizers.SGD(learning_rate = learning_rate)

for epoch in tf.range(epochs):
    with tf.GradientTape() as tape:
        logits = linear_model(x)
        loss_value = - tf.reduce_mean(
            tfd.Bernoulli(logits=logits).log_prob(y))
    gradients = tape.gradient(
        loss_value, linear_model.trainable_variables)
    optimizer.apply_gradients(
        zip(gradients, linear_model.trainable_variables))
    # Convergence criterion: mean |gradient| per parameter tensor, then the
    # maximum over the parameter tensors.
    grad_sizes = [tf.reduce_mean(tf.math.abs(grad)) for grad in gradients]
    if tf.reduce_max(grad_sizes).numpy() < tol:
        print("{n} Optimizer Converges After {i} Iterations".format(
            n=optimizer.__class__.__name__, i=epoch))
        break

print("intercept", linear_model.intercept.numpy())
print("weight", linear_model.weight.numpy())

SGD Optimizer Converges After 215 Iterations
intercept 0.038359231456371566
weight [[-0.99361795]
 [ 2.00713225]
 [-0.0315113 ]]


## Factor Analysis

In [7]:
def generate_fa_data(n_sample, n_factor, n_item,
                     ld, psi = None, rho = None,
                     dtype = tf.float64):
    """Simulate multivariate-normal data from a simple-structure factor model.

    Items are split evenly over the factors; each item loads on exactly one
    factor with loading ``ld``. The model covariance is
    ``loading @ cor @ loading^T + diag(uniqueness)`` and the mean is zero.

    Args:
        n_sample: number of observations to draw.
        n_factor: number of factors.
        n_item: number of items; rounded down to a multiple of ``n_factor``.
        ld: value of every non-zero loading.
        psi: common uniqueness; when ``None`` the uniquenesses are chosen as
            one minus the diagonal of the common part of the covariance.
        rho: common correlation between factors; ``None`` means an identity
            factor correlation matrix.
        dtype: floating dtype of the generated tensors.

    Returns:
        Tensor of sampled data with one row per observation.
    """
    item_per_factor = n_item // n_factor
    # Drop surplus items so every factor gets the same number of items.
    n_item = n_factor * item_per_factor

    # Block ("simple structure") loading matrix built with NumPy slices.
    loading = np.zeros((n_item, n_factor))
    for factor in range(n_factor):
        first = factor * item_per_factor
        loading[first:first + item_per_factor, factor] = ld
    loading = tf.constant(loading, dtype = dtype)

    if rho is None:
        cor = tf.eye(n_factor, dtype = dtype)
    else:
        # Compound-symmetric correlation: 1 on the diagonal, rho elsewhere.
        unit = tf.ones((n_factor, 1), dtype = dtype)
        identity = tf.eye(n_factor, dtype = dtype)
        cor = rho * (unit @ tf.transpose(unit)) + (1 - rho) * identity

    common_cov = loading @ cor @ tf.transpose(loading)
    if psi is None:
        uniqueness = 1 - tf.linalg.diag_part(common_cov)
    else:
        uniqueness = psi * tf.ones((n_item, ), dtype = dtype)

    cov = common_cov + tf.linalg.diag(uniqueness)
    dist_x = tfd.MultivariateNormalTriL(
        loc = tf.zeros(n_item, dtype = dtype),
        scale_tril = tf.linalg.cholesky(cov))
    return dist_x.sample(n_sample)

# Simulation settings for the factor-analysis example: 12 items spread over
# 4 factors, every non-zero loading equal to .7.
n_sample = 10000
n_factor = 4
n_item = 12
ld = .7
dtype = tf.float64

# psi and rho are left at their defaults (None).
x = generate_fa_data(n_sample, n_factor, 
                     n_item, ld,
                     dtype = dtype)
# Sample moments; the covariance divides by n_sample (not n_sample - 1).
sample_mean = tf.reduce_mean(x, axis = 0)
sample_cov = tf.transpose(x - sample_mean) @ (x - sample_mean) / n_sample

In [8]:
# Display the per-item sample means of the simulated data.
sample_mean

<tf.Tensor: shape=(12,), dtype=float64, numpy=
array([-0.00753584, -0.00187876, -0.00568033,  0.00640062, -0.02027605,
        0.00150894,  0.0221699 ,  0.02965379,  0.01331654, -0.02272707,
       -0.01132167, -0.02347031])>

In [9]:
# define a tf.Module to collect parameters
class FactorModel(tf.Module):
    """Factor-analysis model holding intercepts, loadings and uniquenesses.

    The implied moments are ``mean = intercept`` and
    ``cov = loading @ loading^T + diag(uniqueness)``.
    """

    def __init__(self, n_item, n_factor,
                 dtype = tf.float64):
        """Create the trainable parameters.

        Args:
            n_item: number of observed items.
            n_factor: number of latent factors.
            dtype: floating dtype of all parameters.
        """
        super().__init__()
        self.intercept = tf.Variable(
            tf.zeros(n_item, dtype = dtype), name = "intercept")
        # Random starting loadings drawn from tf.random.uniform's default range.
        self.loading = tf.Variable(
            tf.random.uniform((n_item, n_factor), dtype = dtype),
            name = "loading")
        # tf.fill expects a 1-D shape; the bare integer previously passed
        # only worked through legacy scalar-dims handling.
        self.uniqueness = tf.Variable(
            tf.fill((n_item,), value = tf.constant(.2, dtype = dtype)),
            name = "uniqueness")

    def __call__(self):
        """Return the model-implied ``(mean, covariance)`` pair."""
        model_mean = self.intercept
        model_cov = (self.loading @ tf.transpose(self.loading)
                     + tf.linalg.diag(self.uniqueness))
        return model_mean, model_cov

In [10]:
# Fit the factor model by full-batch gradient descent on the mean negative
# multivariate-normal log-likelihood.
learning_rate = .5
epochs = 500
tol = 10**(-4)  # stop once the largest mean absolute gradient is below this

factor_model = FactorModel(n_item, n_factor, dtype)
optimizer = tf.optimizers.SGD(learning_rate = learning_rate)

for epoch in tf.range(epochs):
    with tf.GradientTape() as tape:
        model_mean, model_cov = factor_model()
        mvn = tfd.MultivariateNormalTriL(
            loc = model_mean,
            scale_tril = tf.linalg.cholesky(model_cov))
        loss_value = - tf.reduce_mean(mvn.log_prob(x))
    gradients = tape.gradient(
        loss_value, factor_model.trainable_variables)
    optimizer.apply_gradients(
        zip(gradients, factor_model.trainable_variables))
    # Convergence criterion: mean |gradient| per parameter tensor, then the
    # maximum over the parameter tensors.
    grad_sizes = [tf.reduce_mean(tf.math.abs(grad)) for grad in gradients]
    if tf.reduce_max(grad_sizes).numpy() < tol:
        print("{n} Optimizer Converges After {i} Iterations".format(
            n=optimizer.__class__.__name__, i=epoch))
        break

print("intercept", factor_model.intercept.numpy())
print("loading", factor_model.loading.numpy())
print("uniqueness", factor_model.uniqueness.numpy())

SGD Optimizer Converges After 112 Iterations
intercept [-0.00753584 -0.00187876 -0.00568033  0.00640062 -0.02027605  0.00150894
  0.0221699   0.02965379  0.01331654 -0.02272707 -0.01132167 -0.02347031]
loading [[ 0.31055331  0.61526413 -0.13135377  0.06137542]
 [ 0.33408213  0.61058939 -0.11385572  0.07724464]
 [ 0.31761933  0.60357108 -0.11599456  0.05329567]
 [ 0.12770225 -0.13202705 -0.16211331  0.66349195]
 [ 0.11347517 -0.14448353 -0.16478664  0.64915067]
 [ 0.1132218  -0.13675867 -0.16407347  0.64705477]
 [ 0.24230104 -0.0420092   0.65659548  0.10148097]
 [ 0.25207928 -0.02581754  0.64480493  0.11331565]
 [ 0.25448579 -0.03213921  0.64685363  0.10958368]
 [-0.57544496  0.30693552  0.19453592  0.20522389]
 [-0.5692919   0.31433119  0.19352999  0.20114726]
 [-0.5487563   0.30090974  0.17519249  0.20209436]]
uniqueness [0.50749634 0.48467768 0.52422564 0.50956861 0.50447733 0.53200612
 0.49749027 0.49267242 0.51261474 0.4969636  0.50349664 0.5473236 ]


## Two-Parameter Logistic Model

In [11]:
def generate_2pl_data(n_sample, n_factor, n_item,
                      alpha, beta, rho,
                      dtype = tf.float64):
    """Simulate binary item responses from a two-parameter logistic model.

    Latent factors ``eta`` are multivariate normal with zero mean; each item
    loads on exactly one factor and its logit is ``alpha + eta @ loading^T``.

    Args:
        n_sample: number of respondents to draw.
        n_factor: number of latent factors.
        n_item: number of items; rounded down to a multiple of ``n_factor``.
        alpha: intercept shared by every item.
        beta: value of every non-zero loading.
        rho: common correlation between factors; ``None`` means an identity
            factor correlation matrix.
        dtype: dtype of the latent variables and of the sampled responses.

    Returns:
        Tensor of Bernoulli draws with one row per respondent and one column
        per item.
    """
    if (n_item % n_factor) != 0:
        # Drop surplus items so every factor gets the same number of items.
        n_item = n_factor * (n_item // n_factor)
    item_per_factor = (n_item // n_factor)
    intercept = tf.fill((n_item,), value = tf.constant(alpha, dtype = dtype))
    loading = np.zeros((n_item, n_factor))
    for i in range(n_factor):
        # The loading must come from the ``beta`` argument; the previous code
        # read the unrelated global ``ld`` and silently ignored ``beta``.
        loading[i * item_per_factor:(i + 1) * item_per_factor, i] = beta
    loading = tf.constant(loading, dtype = dtype)
    if rho is None:
        cor = tf.eye(n_factor, dtype = dtype)
    else:
        # Compound-symmetric correlation: 1 on the diagonal, rho elsewhere.
        unit = tf.ones((n_factor, 1), dtype = dtype)
        identity = tf.eye(n_factor, dtype = dtype)
        cor = rho * (unit @ tf.transpose(unit)) + (1 - rho) * identity
    dist_eta = tfd.MultivariateNormalTriL(
        loc = tf.zeros(n_factor, dtype = dtype),
        scale_tril = tf.linalg.cholesky(cor))
    eta = dist_eta.sample(n_sample)
    logits = intercept + eta @ tf.transpose(loading)
    x = tfd.Bernoulli(logits=logits, dtype=dtype).sample()
    return x

In [12]:
# Simulation settings for the 2PL example: 25 items over 5 factors.
n_sample = 10000
n_factor = 5
n_item = 25
alpha = .2  # passed as the generator's alpha (item intercept)
beta = .7 
rho = 0  # with rho = 0 the factor correlation matrix reduces to the identity
dtype = tf.float64
# x is overwritten with the simulated binary responses.
x = generate_2pl_data(n_sample, n_factor, n_item, 
                      alpha, beta, rho, 
                      dtype = dtype)

In [13]:
class TwoPLModel(tf.Module):
    """Two-parameter logistic model with trainable intercepts and loadings.

    Calling the model on a data matrix builds the joint distribution of the
    latent factors ``eta`` (independent standard normals) and the Bernoulli
    responses with logits ``intercept + eta @ loading^T``.
    """

    def __init__(self, n_item, n_factor,
                 dtype = tf.float64):
        """Create the trainable parameters.

        Args:
            n_item: number of items.
            n_factor: number of latent factors.
            dtype: floating dtype of the parameters and latent variables.
        """
        super().__init__()
        # Keep the dimensions on the instance so __call__ does not depend on
        # same-named notebook globals.
        self.n_item = n_item
        self.n_factor = n_factor
        self.dtype = dtype
        self.intercept = tf.Variable(
            tf.zeros(n_item, dtype = self.dtype), name = "intercept")
        self.loading = tf.Variable(
            tf.random.uniform((n_item, n_factor), dtype = self.dtype),
            name = "loading")

    def __call__(self, x):
        """Return the joint distribution over ``(eta, responses)``.

        Only the number of rows of ``x`` is used, to size the latent
        variables; the data values themselves are not read here.
        """
        n_sample = len(x)
        joint_prob = tfd.JointDistributionSequential([
            tfd.Independent(
                tfd.Normal(
                    # Previously read the global ``n_factor``; use the value
                    # this model was constructed with.
                    loc = tf.zeros((n_sample, self.n_factor),
                                   dtype=self.dtype),
                    scale = 1.0),
                reinterpreted_batch_ndims=1),
            lambda eta: tfd.Independent(
                tfd.Bernoulli(
                    logits= self.intercept + eta @ tf.transpose(self.loading),
                    dtype=self.dtype),
                reinterpreted_batch_ndims=1)])
        # Attach the module to the distribution object.
        # NOTE(review): presumably so the variables are tracked through
        # joint_prob; confirm against TFP's variable-tracking behaviour.
        joint_prob._to_track=self
        return joint_prob

In [14]:
# Build the 2PL model and its joint distribution over (eta, x).
two_pl_model = TwoPLModel(n_item, n_factor)
joint_prob = two_pl_model(x)

# Joint log-density of a latent state together with the observed data x;
# this is the target density handed to HMC.
def target_log_prob_fn(*eta):
    return joint_prob.log_prob(eta + (x,))

# HMC kernel used to update the latent variables.
hmc=tfp.mcmc.HamiltonianMonteCarlo(
    target_log_prob_fn = target_log_prob_fn,
    step_size = .015,
    num_leapfrog_steps=3)
# Start the chain from a joint draw with its last component (the simulated
# responses) dropped, leaving only the latent part.
current_state = joint_prob.sample()[:-1]
kernel_results = hmc.bootstrap_results(current_state)

@tf.function(autograph=False, experimental_compile=True)
def one_e_step(current_state, kernel_results):
    """Advance the HMC chain by one transition.

    Args:
        current_state: current latent state of the chain.
        kernel_results: kernel results from the previous transition.

    Returns:
        Tuple ``(next_state, next_kernel_results)``.
    """
    state_after, results_after = hmc.one_step(
        current_state=current_state,
        previous_kernel_results=kernel_results)
    return state_after, results_after

# Optimizer for the model parameters (M-step).
optimizer=tf.optimizers.RMSprop(learning_rate=.01)

@tf.function(autograph=False, experimental_compile=True)
def one_m_step(current_state):
    """Take one gradient step on the 2PL parameters.

    The loss is the negative mean of the joint log-density evaluated at the
    current latent state.

    Args:
        current_state: current latent state produced by the E-step.

    Returns:
        The loss value computed before the parameter update.
    """
    params = two_pl_model.trainable_variables
    with tf.GradientTape() as tape:
        neg_mean_log_prob = -tf.reduce_mean(
            target_log_prob_fn(*current_state))
    grads = tape.gradient(neg_mean_log_prob, params)
    optimizer.apply_gradients(zip(grads, params))
    return neg_mean_log_prob

In [15]:
import time
# Iteration counts are set to 1 here, so each stage runs a single step.
num_warmup_start = 1  # HMC steps run before any parameter update
num_warmup_iter = 1   # HMC steps run before each parameter update
num_iters = 1         # number of parameter updates
num_accepted = 0
loss_history = np.zeros([num_iters])
tStart = time.time()
# Run warm-up stage.
for t in range(num_warmup_start):
    current_state, kernel_results = one_e_step(
        current_state, kernel_results)
    # prod() over is_accepted: the step counts as accepted only when every
    # entry was accepted.
    num_accepted += kernel_results.is_accepted.numpy().prod()
    if t % 500 == 0:
        print("Warm-Up Iteration: {:>3} Acceptance Rate: {:.3f}".format(
            t, num_accepted / (t + 1)))
num_accepted = 0  # reset acceptance rate counter

# Run training.
for t in range(num_iters):
    # E-step: refresh the latent state; M-step: one optimizer update.
    for _ in range(num_warmup_iter):
        current_state, kernel_results = one_e_step(current_state, kernel_results)
    loss_value = one_m_step(current_state)
    # Only the last E-step of this iteration contributes to the counter.
    num_accepted += kernel_results.is_accepted.numpy().prod()
    loss_history[t] = loss_value.numpy()
    if t % 50 == 0:
        print("Iteration: {:>4} Acceptance Rate: {:.3f} Loss: {:.3f}".format(
            t, num_accepted / (t + 1), loss_history[t]))
tEnd = time.time()

Warm-Up Iteration:   0 Acceptance Rate: 1.000


Iteration:    0 Acceptance Rate: 1.000 Loss: 28.280


In [16]:
# Elapsed wall-clock seconds for the warm-up and training loops.
print(tEnd - tStart)
# Model parameters rounded to two decimals, in trainable_variables order.
print(np.around(two_pl_model.trainable_variables[0].numpy(), decimals=2))
print(np.around(two_pl_model.trainable_variables[1].numpy(), decimals=2))

2.686702251434326
[0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03
 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03]
[[ 0.1   0.38  0.71  0.34  0.82]
 [ 0.6   0.18  0.33  0.78  0.04]
 [ 0.63  0.63  0.13  0.31  0.71]
 [ 0.24  0.46  0.32  0.33  0.52]
 [ 0.69  0.04  0.41  0.66  0.05]
 [ 0.36  0.24  0.09  0.29  0.88]
 [ 0.83  0.44  0.64  0.25  0.43]
 [ 0.69  0.5   0.49  0.26  0.28]
 [ 0.37  0.11  0.21  0.66  0.78]
 [ 0.63  0.24  0.7   0.84  0.14]
 [ 0.83  0.76  0.65  0.07  0.6 ]
 [ 0.71  0.78  0.15  0.85  0.88]
 [ 0.04  0.11  0.05  0.72  0.88]
 [ 0.11  0.44  0.24  0.46  0.41]
 [ 0.09  0.28  0.8   0.88  0.92]
 [ 0.13  0.87  0.79  0.06  0.03]
 [ 0.22  0.97  0.92  0.3   0.15]
 [ 0.3   0.4   0.27  0.71  0.56]
 [ 0.05  0.26  0.07  0.65  0.05]
 [ 0.65  0.4   0.42 -0.01  0.23]
 [ 0.78  0.29  0.47  0.64  0.62]
 [ 0.06  0.58  0.7   0.21  0.1 ]
 [ 0.13  0.52  0.03 -0.02  0.72]
 [ 0.94  0.63  0.16  0.09  0.95]
 [ 0.04  0.47  0.62  0.66  0.13]]


## Graded Response Model

In [17]:
def create_cd(n_category, dtype):
    """Build the constants that turn cumulative terms into category terms.

    Args:
        n_category: number of response categories (expected to be >= 2).
        dtype: dtype of the returned tensors.

    Returns:
        Tuple ``(c, d)``. ``c`` has shape ``(n_category - 1, n_category)``
        with +1 on the main diagonal and -1 on the first super-diagonal.
        ``d`` is a length-``n_category`` vector that is 0 everywhere except
        for a 1 in the last position.
    """
    # tf.fill documents a scalar fill value; the shape-[1] constant passed
    # before only worked through legacy handling. tf.ones gives the same
    # vector of ones.
    ones = tf.ones([n_category - 1], dtype = dtype)
    c1 = tf.linalg.diag(
        ones,
        k = 0, num_rows= n_category - 1, num_cols= n_category)
    c2 = tf.linalg.diag(
        ones,
        k = 1, num_rows= n_category - 1, num_cols= n_category)
    c = c1 - c2
    # A 1 x n_category matrix with a single 1 in the last column, squeezed
    # to a vector.
    d = tf.squeeze(tf.linalg.diag(
        tf.constant([1], dtype = dtype),
        k = n_category - 1, num_rows= 1, num_cols= n_category))
    return c, d

def grm_irf(eta, intercept, loading, c, d):
    """Item response function: category probabilities given the factors.

    Computes ``sigmoid(tau) @ c + d`` where ``tau`` is the factor part
    ``eta @ loading^T`` with a trailing axis added, plus ``intercept``.

    Args:
        eta: latent factor scores.
        intercept: item threshold parameters, broadcast against the
            expanded factor part.
        loading: item loadings.
        c, d: constants as returned by ``create_cd``.

    Returns:
        Tensor of category probabilities along the last axis.
    """
    factor_part = eta @ tf.transpose(loading)
    tau = tf.expand_dims(factor_part, axis = 2) + intercept
    cumulative = tf.math.sigmoid(tau)
    return cumulative @ c + d

In [18]:
def generate_grm_data(n_sample, n_factor, n_item,
                      nu, ld, rho,
                      dtype = tf.float64):
    """Simulate ordinal item responses from a graded response model.

    Latent factors are multivariate normal with zero mean; each item loads
    on exactly one factor with loading ``ld`` and every item shares the same
    thresholds ``nu``. Responses are categorical draws from ``grm_irf``.

    Args:
        n_sample: number of respondents to draw.
        n_factor: number of latent factors.
        n_item: number of items; rounded down to a multiple of ``n_factor``.
        nu: sequence of thresholds; the number of categories is
            ``len(nu) + 1``.
        ld: value of every non-zero loading.
        rho: common correlation between factors; ``None`` means an identity
            factor correlation matrix.
        dtype: dtype of the latent variables and of the sampled responses.

    Returns:
        Tensor of sampled category indices.
    """
    item_per_factor = n_item // n_factor
    # Drop surplus items so every factor gets the same number of items.
    n_item = n_factor * item_per_factor
    n_category = len(nu) + 1

    # Same threshold row repeated for every item.
    intercept = tf.tile(tf.constant([nu], dtype = dtype),
                        multiples = [n_item, 1])

    # Block ("simple structure") loading matrix built with NumPy slices.
    loading = np.zeros((n_item, n_factor))
    for factor in range(n_factor):
        first = factor * item_per_factor
        loading[first:first + item_per_factor, factor] = ld
    loading = tf.constant(loading, dtype = dtype)

    if rho is None:
        cor = tf.eye(n_factor, dtype = dtype)
    else:
        # Compound-symmetric correlation: 1 on the diagonal, rho elsewhere.
        unit = tf.ones((n_factor, 1), dtype = dtype)
        identity = tf.eye(n_factor, dtype = dtype)
        cor = rho * (unit @ tf.transpose(unit)) + (1 - rho) * identity

    dist_eta = tfd.MultivariateNormalTriL(
        loc = tf.zeros(n_factor, dtype = dtype),
        scale_tril = tf.linalg.cholesky(cor))
    eta = dist_eta.sample(n_sample)

    c, d = create_cd(n_category, dtype)
    probs = grm_irf(eta, intercept, loading, c, d)
    return tfd.Categorical(probs=probs, dtype=dtype).sample()

In [19]:
# Simulation settings for the GRM example: 15 items over 5 factors.
n_sample = 10000
n_factor = 5
n_item = 15
n_category = 3  # equals len(nu) + 1, the category count the generator derives
nu = [-.5, .5]
ld = .7
rho = 0  # with rho = 0 the factor correlation matrix reduces to the identity
dtype = tf.float64
# x is overwritten with the simulated ordinal responses.
x = generate_grm_data(n_sample, n_factor, n_item,
                      nu, ld, rho, dtype = dtype)

In [20]:
class GRM(tf.Module):
    """Graded response model with trainable thresholds and loadings.

    Calling the model on a data matrix builds the joint distribution of the
    latent factors ``eta`` (independent standard normals) and the categorical
    responses whose probabilities come from ``grm_irf``.
    """

    def __init__(self, n_item,
                 n_factor, n_category,
                 dtype = tf.float64):
        """Create the trainable parameters.

        Args:
            n_item: number of items.
            n_factor: number of latent factors.
            n_category: number of response categories.
            dtype: floating dtype of the parameters and latent variables.
        """
        super().__init__()
        self.n_item = n_item
        self.n_factor = n_factor
        self.n_category = n_category
        self.dtype = dtype
        # One sorted row of random thresholds, shared by every item at
        # initialisation.
        initial_thresholds = tf.sort(
            tf.random.uniform((1, self.n_category - 1),
                              minval = -1, maxval = 1,
                              dtype = self.dtype))
        self.intercept = tf.Variable(
            tf.tile(initial_thresholds, multiples = [self.n_item, 1]),
            name = "intercept")
        self.loading = tf.Variable(
            tf.random.uniform((self.n_item, self.n_factor),
                              dtype = self.dtype),
            name = "loading")

    def __call__(self, x):
        """Return the joint distribution over ``(eta, responses)``.

        Only the number of rows of ``x`` is used, to size the latent
        variables; the data values themselves are not read here.
        """
        n_sample = len(x)
        c, d = create_cd(self.n_category, self.dtype)
        joint_prob = tfd.JointDistributionSequential([
            tfd.Independent(
                tfd.Normal(
                    # Previously read the global ``n_factor``; use the value
                    # stored on this instance.
                    loc = tf.zeros((n_sample, self.n_factor),
                                   dtype=self.dtype),
                    scale = 1.0),
                reinterpreted_batch_ndims=1),
            lambda eta: tfd.Independent(
                tfd.Categorical(
                    probs = grm_irf(eta, self.intercept, self.loading, c, d),
                    dtype = self.dtype),
                reinterpreted_batch_ndims=1)])
        # Attach the module to the distribution object.
        # NOTE(review): presumably so the variables are tracked through
        # joint_prob; confirm against TFP's variable-tracking behaviour.
        joint_prob._to_track=self
        return joint_prob

In [21]:
# Build the GRM and its joint distribution over (eta, x); joint_prob is
# overwritten here.
grm = GRM(n_item, n_factor, n_category)
joint_prob = grm(x)

In [22]:
# Joint log-density of a latent state together with the observed data x;
# this is the target density handed to HMC.
def target_log_prob_fn(*eta):
    return joint_prob.log_prob(eta + (x,))

# HMC kernel used to update the latent variables.
hmc=tfp.mcmc.HamiltonianMonteCarlo(
    target_log_prob_fn = target_log_prob_fn,
    step_size = .015,
    num_leapfrog_steps=3)
# Start the chain from a joint draw with its last component (the simulated
# responses) dropped, leaving only the latent part.
current_state = joint_prob.sample()[:-1]
kernel_results = hmc.bootstrap_results(current_state)

def one_e_step(current_state, kernel_results):
    """Advance the HMC chain by one transition.

    Args:
        current_state: current latent state of the chain.
        kernel_results: kernel results from the previous transition.

    Returns:
        Tuple ``(next_state, next_kernel_results)``.
    """
    state_after, results_after = hmc.one_step(
        current_state=current_state,
        previous_kernel_results=kernel_results)
    return state_after, results_after

# Optimizer for the model parameters (M-step).
optimizer=tf.optimizers.RMSprop(learning_rate=.01)

def one_m_step(current_state):
    """Take one gradient step on the GRM parameters.

    The loss is the negative mean of the joint log-density evaluated at the
    current latent state.

    Args:
        current_state: current latent state produced by the E-step.

    Returns:
        The loss value computed before the parameter update.
    """
    params = grm.trainable_variables
    with tf.GradientTape() as tape:
        neg_mean_log_prob = -tf.reduce_mean(
            target_log_prob_fn(*current_state))
    grads = tape.gradient(neg_mean_log_prob, params)
    optimizer.apply_gradients(zip(grads, params))
    return neg_mean_log_prob

In [23]:
import time
# Iteration counts are set to 1 here, so each stage runs a single step.
num_warmup_start = 1  # HMC steps run before any parameter update
num_warmup_iter = 1   # HMC steps run before each parameter update
num_iters = 1         # number of parameter updates
num_accepted = 0
loss_history = np.zeros([num_iters])
tStart = time.time()
# Run warm-up stage.
for t in range(num_warmup_start):
    current_state, kernel_results = one_e_step(
        current_state, kernel_results)
    # prod() over is_accepted: the step counts as accepted only when every
    # entry was accepted.
    num_accepted += kernel_results.is_accepted.numpy().prod()
    if t % 500 == 0:
        print("Warm-Up Iteration: {:>3} Acceptance Rate: {:.3f}".format(
            t, num_accepted / (t + 1)))
num_accepted = 0  # reset acceptance rate counter

# Run training.
for t in range(num_iters):
    # E-step: refresh the latent state; M-step: one optimizer update.
    for _ in range(num_warmup_iter):
        current_state, kernel_results = one_e_step(current_state, kernel_results)
    loss_value = one_m_step(current_state)
    # Only the last E-step of this iteration contributes to the counter.
    num_accepted += kernel_results.is_accepted.numpy().prod()
    loss_history[t] = loss_value.numpy()
    if t % 50 == 0:
        print("Iteration: {:>4} Acceptance Rate: {:.3f} Loss: {:.3f}".format(
            t, num_accepted / (t + 1), loss_history[t]))
tEnd = time.time()

Warm-Up Iteration:   0 Acceptance Rate: 1.000


Iteration:    0 Acceptance Rate: 1.000 Loss: 29.825


In [24]:
# Elapsed wall-clock seconds for the warm-up and training loops.
print(tEnd - tStart)
# Model parameters rounded to two decimals, in trainable_variables order.
print(np.around(grm.trainable_variables[0].numpy(), decimals=2))
print(np.around(grm.trainable_variables[1].numpy(), decimals=2))





0.48525071144104004
[[-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]
 [-0.62 -0.38]]
[[0.76 0.92 0.11 0.27 0.08]
 [0.85 0.13 0.85 0.31 0.09]
 [0.26 0.39 0.24 0.64 0.07]
 [0.12 0.16 0.18 0.68 0.42]
 [0.33 0.01 0.72 0.43 0.46]
 [0.46 0.31 0.82 0.77 0.54]
 [0.31 0.67 0.37 0.55 0.67]
 [0.77 0.39 0.33 0.66 0.19]
 [0.37 0.35 0.77 0.52 0.83]
 [0.33 0.77 0.81 0.77 0.26]
 [0.22 0.67 0.55 0.53 0.85]
 [0.66 0.56 0.45 0.3  0.36]
 [0.24 0.52 0.11 0.78 0.59]
 [0.55 0.94 0.8  0.38 0.76]
 [0.84 0.95 0.4  0.38 0.42]]
