# BigTable MTLM - dataset perplexity testing

Aim of this notebook:
1. Generate student + question datasets
2. Generate N sets of encounters
3. Report on agreement between these sets

The error between the datasets (for large N) is the inherent probabilistic error in the model
- How does this translate to tolerances in the $\alpha$ and $\delta$ parameters

## Model perplexity
A model $q$ is used to predict the values of a set of samples, $\mathbf{x}$.  Perplexity is defined as:

\\[{perplex}_{q}(\mathbf{x}) = b^{-\frac{1}{N}\Sigma_{i=1}^{N}{log_{b}(q(x_i))}}\\]

Perplexity is a measure of `surprise' as a divergence from the predictions that are seen in the true values.


In [2]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict, Counter
from copy import copy
from math import exp, sqrt, log
from random import random, shuffle, choice, randint, uniform
import numpy
import math

from keras import Input, Model
from keras.callbacks import EarlyStopping
from keras.constraints import non_neg, max_norm
from numpy import array, mean, ones
from pandas import concat
from pandas import DataFrame
from keras.models import Sequential
from keras.layers import LSTM, multiply, subtract, add, Activation, Lambda, Flatten
from keras.layers import Dense, concatenate, MaxPooling1D, LocallyConnected1D, Reshape, Dropout
from keras.optimizers import Adam, SGD
from keras import backend as K
from keras import constraints

import tensorflow as tf

from utils import generate_student_name, create_qs, create_students, generate_attempts, calculate_pass_probability, attempt_q

import random

from matplotlib import pyplot as plt

# p = 1 / (1 + e^-z)
# -ln((1/ p) - 1) = z
-log((1/0.993) - 1)


Using TensorFlow backend.


4.954820514989862

In [3]:
# # %%capture
# from IPython.display import clear_output

# serieses = []
# min_errs = []
# n_items = 100
# n_students = 1000
# # desired confusion prop/n mx
# # .5  0 
# #  0 .5

# def gen_run(n_traits, minb, maxb, mu_th, sd_th, min_active_traits, max_active_traits):
#     qs = create_qs(n_items, n_traits, (min_active_traits, max_active_traits), minb, maxb)
# #     #     qs, q_table = create_qs_from_blobs(n_qs, 2, n_traits)
#     ss = create_students(n_students, n_traits, mu_th, sd_th)

#     x = []

#     for _ in range(1):
#         xa, _,_,_ = generate_attempts(qs,ss) # this is our x list of samples
#         x.extend(xa)
    
# #     gaussian_pts = numpy.random.uniform((mu_th-3.0*sd_th),(mu_th+3.0*sd_th), n_students)
# #     uniform_pts = numpy.random.uniform(minb, maxb, n_items)
# #     gaussian_pts = numpy.repeat(gaussian_pts, n_items)
# #     uniform_pts = numpy.tile(uniform_pts, n_students)     
    
#     tp,fp,tn,fn=0,0,0,0
#     base = 2
#     summa=0
#     N = len(x)
#     probs = []
#     for tup in x:
#         (psi_id, q_id, passed, passed) = tup
#         p = calculate_pass_probability(ss[psi_id].thetas, qs[q_id].betas)
#         summa += log((p if passed else (1-p)), base)
#         probs.append(p)
        
#         pp = uniform(0,1)
#         if pp <= p:
#             if passed:
#                 tp+=1
#             else:
#                 fp+=1
#         else:
#             if passed:
#                 fn+=1
#             else:
#                 tn+=1

#     acc = (tp+tn)/len(x)
#     print("model acc:",acc)
#     print(tp,fp)
#     print(fn,tn)

#     ppx = pow( base, (-summa/N))
#     print("perplexity is {}".format(ppx))
#     return ((fn + fp) + abs(tp-tn)), probs
    
# dims_scores = {}
# best_probs = {}
# param_freedom = 10
# random.seed()
# seen = set()
# mini = 1
# maxi = 15
# #dimslist = [1,2,3,5,10,25,100]:

# dims = 100

# inv_sigmoid = lambda pr : ( -log((1/pr) -1) )
# min_sprd = inv_sigmoid(0.02**(1/dims))
# mid_sprd = inv_sigmoid(0.5**(1/dims))
# max_sprd = inv_sigmoid(0.98**(1/dims))

# min_b = 1
# max_th = min_b + max_sprd


# best_run = -1

# minb_spd = 2
# maxb_spd = 6

# for maxi in [20]:
#     i=0
#     while i < 100:
#         rnge = uniform(minb_spd, maxb_spd)
#         minb = -rnge/2
#         maxb = rnge/2
        
#         mu_th = uniform(mid_sprd, max_sprd)
#         sd_th = uniform(1,2)
    
#         i+=1
#         print(">>>",i)
#         outz = gen_run(dims, minb, maxb, mu_th, sd_th, dims, dims)
#         balance, probz = outz[0], outz[1]

#         loss = numpy.mean([abs(p-0.5) for p in probz])
        
#         sd = numpy.std(numpy.array(probz))
        
#         print("?",loss,sd)
#         if (dims not in dims_scores) or (dims_scores[dims][0] >= loss and dims_scores[dims][2] < sd):
#             print("+++")
#             dims_scores[dims] = (loss, balance, sd, mu_th,sd_th,minb,maxb)
#             best_probs[dims] = probz
#             best_run = i-1
            

In [4]:
# %%capture
from IPython.display import clear_output

serieses = []
min_errs = []
n_items = 100
n_students = 1000
# desired confusion prop/n mx
# .5  0 
#  0 .5

def gen_run(n_traits, minb, maxb, mu_th, sd_th, min_active_traits, max_active_traits):
    qs = create_qs(n_items, n_traits, (min_active_traits, max_active_traits), minb, maxb)
#     #     qs, q_table = create_qs_from_blobs(n_qs, 2, n_traits)
    ss = create_students(n_students, n_traits, mu_th, sd_th)

    x = []

    for _ in range(1):
        xa, _,_,_ = generate_attempts(qs,ss) # this is our x list of samples
        x.extend(xa)
    
#     gaussian_pts = numpy.random.uniform((mu_th-3.0*sd_th),(mu_th+3.0*sd_th), n_students)
#     uniform_pts = numpy.random.uniform(minb, maxb, n_items)
#     gaussian_pts = numpy.repeat(gaussian_pts, n_items)
#     uniform_pts = numpy.tile(uniform_pts, n_students)     
    
    tp,fp,tn,fn=0,0,0,0
    base = 2
    summa=0
    N = len(x)
    probs = []
    for tup in x:
        (psi_id, q_id, passed, passed) = tup
        p = calculate_pass_probability(ss[psi_id].thetas, qs[q_id].betas)
        summa += log((p if passed else (1-p)), base)
        probs.append(p)
        
        pp = uniform(0,1)
        if pp <= p:
            if passed:
                tp+=1
            else:
                fp+=1
        else:
            if passed:
                fn+=1
            else:
                tn+=1

    acc = (tp+tn)/len(x)
    print("model acc:",acc)
    print(tp,fp)
    print(fn,tn)

    ppx = pow( base, (-summa/N))
    print("perplexity is {}".format(ppx))
    return ((fn + fp) + abs(tp-tn)), probs
    
dims_scores = {}
best_probs = {}
param_freedom = 10
random.seed()
seen = set()
mini = 1
maxi = 15
#dimslist = [1,2,3,5,10,25,100]:

dims = 100

inv_sigmoid = lambda pr : ( -log((1/pr) -1) )
min_sprd = inv_sigmoid(0.02**(1/dims))
mid_sprd = inv_sigmoid(0.5**(1/dims))
max_sprd = inv_sigmoid(0.98**(1/dims))

min_b = 1
max_th = min_b + max_sprd


best_run = -1

minb_spd = 2
maxb_spd = 6

for maxi in [20]:
    i=0
    while i < 100:
        rnge = uniform(minb_spd, maxb_spd)
        minb = -rnge/2
        maxb = rnge/2
        
        mu_th = uniform(mid_sprd, max_sprd)
        sd_th = uniform(1,2)
    
        i+=1
        print(">>>",i)
        
        use_uniform_for_students = True
        if use_uniform_for_students:
            student_0 = mu_th-3.0*sd_th
            student_1 = mu_th+3.0*sd_th
            student_pts = numpy.random.uniform(student_0, student_1, (n_students,dims))
        else:
            student_0 = mu_th
            student_1 = sd_th
            student_pts = numpy.random.normal(mu_th, sd_th, (n_students,dims))
            
        uniform_pts = numpy.random.uniform(minb, maxb, (n_items,dims))
#         student_pts = student_pts.reshape(n_students, dims)
#         uniform_pts = uniform_pts.reshape(n_items, dims )
        student_pts = numpy.repeat(student_pts, n_items, axis=0 )
        uniform_pts = numpy.tile(uniform_pts, (n_students,1) )
        diffs = 1.0 / (1.0 + numpy.exp(-(student_pts - uniform_pts)))
        probz = numpy.prod(diffs, axis=1)

        loss = numpy.abs(numpy.mean([(p-0.5) for p in probz]))
        
        sd = numpy.std(numpy.array(probz))
        
        print("?",loss,sd)
        if (dims not in dims_scores) or (dims_scores[dims][0] >= loss and dims_scores[dims][1]*.66 < sd):
            print("+++")
            dims_scores[dims] = (loss, sd, student_0, student_1,minb,maxb)
            best_probs[dims] = probz
            best_run = i-1
            

>>> 1
? 0.247208539901 0.0572729505148
+++
>>> 2
? 0.322355493047 0.0984122235313
>>> 3
? 0.480964345258 0.0163271269316
>>> 4
? 0.499951278162 0.000371541714793
>>> 5
? 0.416459204538 0.0491047869127
>>> 6
? 0.449982376221 0.0336117305072
>>> 7
? 0.387886523446 0.0801339335003
>>> 8
? 0.134201646069 0.0732322097142
+++
>>> 9
? 0.0375483453513 0.0782483395124
+++
>>> 10
? 0.347887947553 0.0287116607152
>>> 11
? 0.466762811568 0.0245767190481
>>> 12
? 0.499465451808 0.00249474344661
>>> 13
? 0.314367770625 0.0720249731627
>>> 14
? 0.351604026674 0.0577283454374
>>> 15
? 0.322766130908 0.0745420230274
>>> 16
? 0.0935611345081 0.0789404567425
>>> 17
? 0.215259694552 0.0745226612423
>>> 18
? 0.0145097429241 0.0745585228034
+++
>>> 19
? 0.243778589506 0.0832370633066
>>> 20
? 0.374202523597 0.0662192258914
>>> 21
? 0.473351731782 0.0271154965416
>>> 22
? 0.40279396225 0.0630082471811
>>> 23
? 0.493192172324 0.00894768628497
>>> 24
? 0.425777437081 0.0538069118354
>>> 25
? 0.499379768841 0.0

In [1]:
clear_output()
for dim in dims_scores:
    print(best_run)
    tup = dims_scores[dim]
    print(dim, tup)
    
#     print(best_probs[dim])
    
    probs = best_probs[dim]
    plt.hist(probs)
    plt.show()
    
    
    (loss, sd, st0 , st1 ,minb,maxb) = tup
    offset = 1 - minb
    #print((offset+minb, offset+maxb, offset+mu_th-3.0*sd_th, offset+mu_th+3.0*sd_th))
    print((offset+minb, offset+maxb, offset+st0, offset+st1))

NameError: name 'clear_output' is not defined

In [None]:
underlying = [inv_sigmoid(p) for p in probs]
plt.hist(underlying)
plt.show()

In [None]:
# # gaussian_pts = numpy.random.normal(5.15,1.51,1000)
# n_students = 100
# n_items = 100
# n_traits = 100
# gaussian_pts = numpy.random.normal(mu_th, sd_th, (n_students*n_traits))
# uniform_pts = numpy.random.uniform(minb, maxb, (n_items*n_traits))
# gaussian_pts = gaussian_pts.reshape(n_students, n_traits)
# uniform_pts = uniform_pts.reshape(n_items, n_traits )
# gaussian_pts = numpy.repeat(gaussian_pts, n_items, axis=0 )
# uniform_pts = numpy.tile(uniform_pts, (n_students,1) )

# print(gaussian_pts)
# print(uniform_pts)

# diffs = 1.0 / (1.0 + numpy.exp(-(gaussian_pts - uniform_pts)))
# pps = numpy.prod(diffs, axis=1)

# print(pps.shape)
# print(pps)

# # print((pt, u) for pt, u in zip(gaussian_pts, uniform_pts))
# # transformed_pts = [ numpy.prod(1.0 / (1.0 + exp(-(pt - u)))) for pt,u in zip(gaussian_pts,uniform_pts)]
# f,axes = plt.subplots(1,2)
# axes[0].hist(pps)
# plt.show()