In [61]:
import numpy as np
import math
import collections
import random
from scipy.optimize import linprog
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
def hist(samp_list, min_s, max_s):
    h = [0] * (max_s - min_s + 1)
    for i in samp_list:
        h[i-min_s] += 1
    return h

def makeFingerPrint(samp_list):
    max_s = max(samp_list)
    min_s = min(samp_list)
    h1 = hist(samp_list, min_s, max_s)
    f = hist(h1, 0, max(h1))
    f = f[1:]
    return f

In [140]:
#FOR UNSEEN
def poisspdf(x, l):
    v1 = math.pow(l, x)
    v2 = math.factorial(x)
    v3 = math.pow(math.e, -l)
    final = (v1 / v2) * v3
    return final

def getSampleSize(f):
    k = 0
    for i,entry in enumerate(f):
        k = k + entry * (i+1)
    return k

def getFirstNonZeroIndex(f):
    for i,entry in enumerate(f):
        if(entry != 0):
            return i+1

def getLastNonZeroIndex(f):
    for i, entry in reversed(list(enumerate(f))):
        if(entry != 0):
            return i+1
        return -1

def getProbabilityMass(x, histx):
    pmass = 0
    for i, x_entry in enumerate(x):
        pmass = pmass + x_entry * histx[i]
    return pmass

def init_XLP(xLPmax, xLPmin, gridFactor):
    max_pow = math.ceil(math.log(xLPmax/xLPmin) / math.log(gridFactor))
    xLP = []
    for i in range(0, max_pow+1):
        xLP.append(xLPmin * math.pow(gridFactor, i))
    return xLP

def init_objf(szLPx, szLPf, fLP):
    objf = [0] * (szLPx + 2*szLPf)
    j = 0
    for i in range(szLPx, len(objf), 2):
        objf[i] = 1 / math.sqrt(fLP[j] + 1)
        j = j + 1
    j = 0
    for i in range(szLPx+1, len(objf), 2):
        objf[i] = 1 / math.sqrt(fLP[j] + 1)
        j = j + 1
    return objf

def init_A_b(szLPf, szLPx, xLP, fLP, k):
    #A = [[0] * (szLPx + 2 * szLPf)] * 2*szLPf
    A = []
    for i in range(2*szLPf):
        tmp = []
        for j in range(szLPx + 2 * szLPf):
            tmp.append(0)
        A.append(tmp)
    
    b = [0] * (2*szLPf)
    for i in range (0, szLPf):
        for j in range(0, szLPx):
            A[2 * i][j] = poisspdf(i+1, k * xLP[j])
        for k in range(0, szLPx):
            A[2*i+1][k] = -1 * A[2*i][k]
        A[2 * i][szLPx + 2 * i] = -1
        A[(2 * i)  + 1][szLPx + (2 * i)  + 1] = -1
        b[2 * i] = fLP[i]
        b[(2 * i)  + 1] = -fLP[i]
    
    return A,b

def rescale_cond(A, Aeq, xLP, szLPx):
    for i in range(0, szLPx):
        for j in range(0, len(A)):
            A[j][i] = A[j][i]/xLP[i]
        Aeq[i] = Aeq[i]/xLP[i]
    return A, Aeq

    
def unseen(f):
    k = getSampleSize(f)
    
    gridFactor = 1.05
    alpha = 0.5
    
    xLPmin = 1 / (k * max(10,k))
    print(xLPmin)
    
    min_i = getFirstNonZeroIndex(f) #Returning index + 1
    print("min_i", min_i)
    
    if min_i > 1:
        xLPmin = min_i/k
    print("xLPmin", xLPmin)
    
    maxLPIters = 100
    x = [0]
    histx = [0]
    
    fLP = [0] * len(f)
    for i in range(0, len(f)):
        i_m = i+1
        if(f[i] > 0):
            wind = [max(1, i_m - math.ceil(math.sqrt(i_m))), 
                    min(i_m + math.ceil(math.sqrt(i_m)), len(f))]
            
            sum_f = sum(f[wind[0] : wind[1]+1])
            if( sum_f < math.sqrt(i_m)):
                x.append(i_m/k)
                histx.append(f[i])
                fLP[i] = 0
            else:
                fLP[i] = f[i]
    print("fLP", fLP)
    print("histx", histx)
    print("x", x)
    
    #If no LP portion, retun the empirical histogram
    fmax = getLastNonZeroIndex(f)
    if(fmax == -1):
        x.pop(0)
        histx.pop(0)
        return [x, histx]
    
    #Setting up first LP
    LPmass = 1 - getProbabilityMass(x, histx)
    print("LPMASS", LPmass)
    
    z_flp = [0] * fmax
    fLP.extend(z_flp)
    print("New FLP", fLP)
    
    szLPf = len(fLP)
    print("szLFf",szLPf)
    
    xLPmax = fmax/k
    print("xLPmax", xLPmax)
    
    xLP = init_XLP(xLPmax, xLPmin, gridFactor)
    
    szLPx = len(xLP)
    print("szLPx", szLPx)
    
    objf = init_objf(szLPx, szLPf, fLP)
    
    A, b = init_A_b(szLPf, szLPx, xLP, fLP, k)
    print(b)
    
    Aeq = [0] * (szLPx+2*szLPf)
    for i in range(0,szLPx):
        Aeq[i] = xLP[i]
    beq = LPmass
    
    A, Aeq = rescale_cond(A, Aeq, xLP, szLPx)
    #print("Aeq",Aeq)
    
    options = {'maxiter':1000, 'disp':False}
    lb = [0] * (szLPx+2*szLPf)
    ub = [float('Inf')] * (szLPx+2*szLPf)
    #print(len(ub))
    #print(len(A),len(A[0]))
    print("objf", len(objf))
    res = linprog(objf, A, b, [Aeq], beq, list(zip(lb, ub)), options={'disp':False}, method='interior-point')
    
    print("\n\n\n")
    print(res)

In [141]:
f = [18, 1]
unseen(f)

0.0025
min_i 1
xLPmin 0.0025
fLP [18, 0]
histx [0, 1]
x [0, 0.1]
LPMASS 0.9
New FLP [18, 0, 0, 0]
szLFf 4
xLPmax 0.1
szLPx 77
[18, -18, 0, 0, 0, 0, 0, 0]
objf 85




     con: array([-1.37013734e-11])
     fun: 4.5743076968785426
 message: 'Optimization terminated successfully.'
     nit: 11
   slack: array([ 1.56564761e+01,  1.85242044e-10, -1.15227133e-10,  1.14454956e-01,
       -4.49076315e-11,  2.95565459e-01, -2.18430829e-11,  5.72445332e-01])
  status: 0
 success: True
       x: array([2.33824449e-13, 8.62479926e-13, 8.59914896e-13, 7.50216358e-13,
       6.46013269e-13, 5.62372832e-13, 4.96235283e-13, 4.42705964e-13,
       3.97962557e-13, 3.59416866e-13, 3.25409690e-13, 2.94910317e-13,
       2.67294267e-13, 2.42190908e-13, 2.19381641e-13, 1.98732794e-13,
       1.80152077e-13, 1.63561220e-13, 1.48879829e-13, 1.36017125e-13,
       1.24869209e-13, 1.15320115e-13, 1.07245265e-13, 1.00516187e-13,
       9.50054806e-14, 9.05911523e-14, 8.71595243e-14, 8.46060995e-14,
       8.283

In [138]:
n = 100
k = 20
samp = np.random.randint(1, n, k)
f = makeFingerPrint(samp)
print(f)

[18, 1]
