In [61]:
import numpy as np
import math
import collections
import random
from scipy.optimize import linprog
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
def hist(samp_list, min_s, max_s):
    h = [0] * (max_s - min_s + 1)
    for i in samp_list:
        h[i-min_s] += 1
    return h

def makeFingerPrint(samp_list):
    max_s = max(samp_list)
    min_s = min(samp_list)
    h1 = hist(samp_list, min_s, max_s)
    f = hist(h1, 0, max(h1))
    f = f[1:]
    return f

In [173]:
#FOR UNSEEN
def poisspdf(x, l):
    v1 = math.pow(l, x)
    v2 = math.factorial(x)
    v3 = math.pow(math.e, -l)
    final = (v1 / v2) * v3
    return final

def getSampleSize(f):
    k = 0
    for i,entry in enumerate(f):
        k = k + entry * (i+1)
    return k

def getFirstNonZeroIndex(f):
    for i,entry in enumerate(f):
        if(entry != 0):
            return i+1

def getLastNonZeroIndex(f):
    for i, entry in reversed(list(enumerate(f))):
        if(entry != 0):
            return i+1
        return -1

def getProbabilityMass(x, histx):
    pmass = 0
    for i, x_entry in enumerate(x):
        pmass = pmass + x_entry * histx[i]
    return pmass

def init_XLP(xLPmax, xLPmin, gridFactor):
    max_pow = math.ceil(math.log(xLPmax/xLPmin) / math.log(gridFactor))
    xLP = []
    for i in range(0, max_pow+1):
        xLP.append(xLPmin * math.pow(gridFactor, i))
    return xLP

def init_objf(szLPx, szLPf, fLP):
    objf = [0] * (szLPx + 2*szLPf)
    j = 0
    for i in range(szLPx, len(objf), 2):
        objf[i] = 1 / math.sqrt(fLP[j] + 1)
        j = j + 1
    j = 0
    for i in range(szLPx+1, len(objf), 2):
        objf[i] = 1 / math.sqrt(fLP[j] + 1)
        j = j + 1
    return objf

def init_A_b(szLPf, szLPx, xLP, fLP, k):
    #A = [[0] * (szLPx + 2 * szLPf)] * 2*szLPf
    A = []
    for i in range(2*szLPf):
        tmp = []
        for j in range(szLPx + 2 * szLPf):
            tmp.append(0)
        A.append(tmp)
    
    b = [0] * (2*szLPf)
    for i in range (0, szLPf):
        for j in range(0, szLPx):
            A[2 * i][j] = poisspdf(i+1, k * xLP[j])
        for k in range(0, szLPx):
            A[2*i+1][k] = -1 * A[2*i][k]
        A[2 * i][szLPx + 2 * i] = -1
        A[(2 * i)  + 1][szLPx + (2 * i)  + 1] = -1
        b[2 * i] = fLP[i]
        b[(2 * i)  + 1] = -fLP[i]
    
    return A,b

def rescale_cond(A, Aeq, xLP, szLPx):
    for i in range(0, szLPx):
        for j in range(0, len(A)):
            A[j][i] = A[j][i]/xLP[i]
        Aeq[i] = Aeq[i]/xLP[i]
    return A, Aeq

    
def unseen(f):
    k = getSampleSize(f)
    
    gridFactor = 1.05
    alpha = 0.5
    
    xLPmin = 1 / (k * max(10,k))
    print(xLPmin)
    
    min_i = getFirstNonZeroIndex(f) #Returning index + 1
    print("min_i", min_i)
    
    if min_i > 1:
        xLPmin = min_i/k
    print("xLPmin", xLPmin)
    
    maxLPIters = 1000
    x = [0]
    histx = [0]
    
    fLP = [0] * len(f)
    for i in range(0, len(f)):
        i_m = i+1
        if(f[i] > 0):
            wind = [max(1, i_m - math.ceil(math.sqrt(i_m))), 
                    min(i_m + math.ceil(math.sqrt(i_m)), len(f))]
            
            sum_f = sum(f[wind[0] : wind[1]+1])
            if( sum_f < math.sqrt(i_m)):
                x.append(i_m/k)
                histx.append(f[i])
                fLP[i] = 0
            else:
                fLP[i] = f[i]
    print("fLP", fLP)
    print("histx", histx)
    print("x", x)
    
    #If no LP portion, retun the empirical histogram
    fmax = getLastNonZeroIndex(f)
    if(fmax == -1):
        x.pop(0)
        histx.pop(0)
        return [x, histx]
    
    #Setting up first LP
    LPmass = 1 - getProbabilityMass(x, histx)
    print("LPMASS", LPmass)
    
    z_flp = [0] * fmax
    fLP.extend(z_flp)
    print("New FLP", fLP)
    
    szLPf = len(fLP)
    print("szLFf",szLPf)
    
    xLPmax = fmax/k
    print("xLPmax", xLPmax)
    
    xLP = init_XLP(xLPmax, xLPmin, gridFactor)
    
    szLPx = len(xLP)
    print("szLPx", szLPx)
    
    objf = init_objf(szLPx, szLPf, fLP)
    
    A, b = init_A_b(szLPf, szLPx, xLP, fLP, k)
    print(b)
    
    Aeq = [0] * (szLPx+2*szLPf)
    for i in range(0,szLPx):
        Aeq[i] = xLP[i]
    beq = LPmass
    
    A, Aeq = rescale_cond(A, Aeq, xLP, szLPx)
    #print("Aeq",Aeq)
    
    options = {'maxiter':1000, 'disp':False}
    lb = [0] * (szLPx+2*szLPf)
    ub = [float('Inf')] * (szLPx+2*szLPf)
    #print(len(ub))
    #print(len(A),len(A[0]))
    print("objf", len(objf))
    res = linprog(objf, A, b, [Aeq], beq, list(zip(lb, ub)), options={'disp':False, 'presolve':True}, method='interior-point')
    print(res)
    
    #Second LP
    objf2 = [0] * len(objf)
    for i in range(0, szLPx):
        objf2[i] = 1
    
    A2 = []
    for row in A:
        A2.append(row)
    A2.append(objf)
    
    b.append(res['fun'])
    b2 = b
    for i in range(0, szLPx):
        objf2[i] = objf2[i]/xLP[i]
    
    res = linprog(objf2, A2, b2, [Aeq], beq, list(zip(lb, ub)), options={'disp':False}, method='interior-point')
    print(res)

In [174]:
f = [4,1]
unseen(f)

0.016666666666666666
min_i 1
xLPmin 0.016666666666666666
fLP [4, 0]
histx [0, 1]
x [0, 0.3333333333333333]
LPMASS 0.6666666666666667
New FLP [4, 0, 0, 0]
szLFf 4
xLPmax 0.3333333333333333
szLPx 63
[4, -4, 0, 0, 0, 0, 0, 0]
objf 71
     con: array([-1.16658783e-09])
     fun: 1.224235450503509
 message: 'Optimization terminated successfully.'
     nit: 12
   slack: array([ 2.51458078e+00,  4.00507716e-09, -4.09876783e-09,  7.58394452e-03,
       -2.74158818e-09,  2.58768460e-02, -1.69502284e-09,  6.62199563e-02])
  status: 0
 success: True
       x: array([6.61608348e-12, 2.20118541e-11, 1.88347728e-11, 1.39798398e-11,
       1.06639528e-11, 8.72728731e-12, 7.66538607e-12, 7.12045270e-12,
       6.88982430e-12, 6.85880508e-12, 6.94312221e-12, 7.06204769e-12,
       7.14206091e-12, 7.13633520e-12, 7.03931100e-12, 6.88347654e-12,
       6.72087298e-12, 6.60162773e-12, 6.56024314e-12, 6.61260238e-12,
       6.76041031e-12, 6.99813588e-12, 7.31903407e-12, 7.71901166e-12,
       8.19856345e-


Solving system with option 'sym_pos':True failed. It is normal for this to happen occasionally, especially as the solution is approached. However, if you see this frequently, consider setting option 'sym_pos' to False.


Solving system with option 'sym_pos':False failed. This may happen occasionally, especially as the solution is approached. However, if you see this frequently, your problem may be numerically challenging. If you cannot improve the formulation, consider setting 'lstsq' to True. Consider also setting `presolve` to True, if it is not already.



In [153]:
n = 10000
k = 1000
samp = np.random.randint(1, n, k)
f = makeFingerPrint(samp)
print(f)

[906, 47]
