In [1]:
import numpy as np
import scipy as sp
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=2.0, rc={"lines.linewidth": 4.0})
sns.set_style('ticks')

import pywsl.ssl.pnu_mr as pnu
import pywsl.cpe.cpe_ene as cpe

In [2]:
def calc_err(f_dec, x_tp, x_tn, prior):
    """Return the prior-weighted misclassification rate of ``f_dec``.

    ``f_dec`` is applied to the positive test points ``x_tp`` and to the
    negative test points ``x_tn``.  A positive point counts as an error when
    its score is <= 0 and a negative point when its score is >= 0, so a score
    of exactly zero is an error for either class.  The two error fractions
    are combined as ``prior * err_pos + (1 - prior) * err_neg``.
    """
    scores_pos = f_dec(x_tp)
    scores_neg = f_dec(x_tn)
    err_pos = np.mean(scores_pos <= 0)
    err_neg = np.mean(scores_neg >= 0)
    return prior*err_pos + (1-prior)*err_neg


def gendata(n_l, prior_l, n_u, prior_u, n_t, d=2):
    """Generate a synthetic two-Gaussian data set with labeled, unlabeled and test parts.

    Positive points are drawn as standard-normal noise plus a mean of all
    ones, negative points as standard-normal noise plus a mean of all minus
    ones.

    Parameters
    ----------
    n_l : int
        Number of labeled points; the positive count is Binomial(n_l, prior_l).
    prior_l : float
        Probability that a labeled point is positive.
    n_u : int
        Number of unlabeled points; the positive count is Binomial(n_u, prior_u).
    prior_u : float
        Probability that an unlabeled point is positive.
    n_t : int
        Number of test points generated for EACH class.
    d : int, optional
        Input dimension (default 2, the value previously hard-coded).

    Returns
    -------
    x : ndarray, shape (n_l + n_u, d)
        Labeled positives, labeled negatives, unlabeled positives and
        unlabeled negatives, stacked in that order.
    y : ndarray, shape (n_l + n_u,)
        +1 for labeled positives, -1 for labeled negatives, 0 for unlabeled.
    x_tp, x_tn : ndarray, shape (n_t, d)
        Positive and negative test points.
    """
    mu_p, mu_n = np.ones(d), -np.ones(d)

    # The order of the random draws below is kept fixed so that results for a
    # given seed are unchanged.
    n_p = np.random.binomial(n_l, prior_l)
    n_n = n_l - n_p
    x_p = np.random.randn(n_p, d) + mu_p
    x_n = np.random.randn(n_n, d) + mu_n

    n_up = np.random.binomial(n_u, prior_u)
    n_un = n_u - n_up
    x_up = np.random.randn(n_up, d) + mu_p
    x_un = np.random.randn(n_un, d) + mu_n

    x = np.r_[x_p, x_n, x_up, x_un]
    y = np.r_[np.ones(n_p), -np.ones(n_n), np.zeros(n_u)]

    x_tp = np.random.randn(n_t, d) + mu_p
    x_tn = np.random.randn(n_t, d) + mu_n

    return x, y, x_tp, x_tn

In [17]:
if __name__ == "__main__":
    # Seed NumPy's global RNG so the synthetic draws in gendata() repeat.
    np.random.seed(1)

    # Sample sizes: labeled points, unlabeled points, test points per class.
    n_l, n_u, n_t = 10, 300, 1000

    # Positive-class probabilities for the labeled and the unlabeled samples.
    prior_l, prior_u = .5, .3

    # Candidate eta values handed to PNU_SL_FastCV.
    eta_list = np.arange(-.9, 1, .1)

    n_trial = 20

    best_err = np.inf
    errs1 = np.empty(n_trial)                    # error of the returned classifier, per trial
    errs2 = np.empty((n_trial, len(eta_list)))   # error of each per-eta classifier, per trial
    priors = np.empty(n_trial)                   # estimated class-prior, per trial

    for ite in range(n_trial):
        x, y, x_tp, x_tn = gendata(n_l, prior_l, n_u, prior_u, n_t)

        # y is 0 for unlabeled points and +1/-1 for labeled ones.
        labeled = y != 0
        unlabeled = y == 0
        priorh = cpe.cpe(x[labeled, :], y[labeled], x[unlabeled, :])

        f_dec, outs, funcs = pnu.PNU_SL_FastCV(
            x, y, priorh, eta_list,
            lambda_list=[.1], model='lm', nargout=3)

        # Test error in percent, weighted by the unlabeled-sample prior.
        errs1[ite] = 100*calc_err(f_dec, x_tp, x_tn, prior_u)

        # Remember the trial with the lowest test error seen so far.
        if errs1[ite] < best_err:
            best_err = errs1[ite]
            best_w = outs['w']
            best_x, best_y = x, y

        for ite_eta, _ in enumerate(eta_list):
            errs2[ite, ite_eta] = 100*calc_err(funcs[ite_eta],
                                               x_tp, x_tn, prior_u)

        priors[ite] = priorh

    # Report mean and (std / sqrt(n_trial)) over the trials.
    root_n = np.sqrt(n_trial)
    print("Average of misclassification rates: {:.1f} ({:.2f})".format(
        np.mean(errs1), np.std(errs1)/root_n))
    print("Average of estimated class-priors: {:.2f} ({:.2f})".format(
        np.mean(priors), np.std(priors)/root_n))


array([[ 0.19782716,  0.55112219],
       [-0.10593508, -0.65451545],
       [-1.3634686 ,  2.13534535],
       [-0.01701414,  1.63736181],
       [ 0.14009339,  2.77260763],
       [-2.11036305, -0.81878573],
       [-0.43565513, -1.56651023],
       [-0.2700244 , -0.62700621],
       [-0.46618909, -1.0919733 ],
       [ 0.91382039, -0.66920287],
       [ 0.14994762,  1.96082   ],
       [ 0.78258182,  1.15851488],
       [ 1.87341823,  0.88861663],
       [-0.03803876, -0.00947983],
       [-0.05825656,  1.65628408],
       [ 0.93750841, -0.73865429],
       [ 1.103163  ,  0.37833315],
       [ 1.27571804, -0.09067489],
       [ 0.39001475,  1.30641238],
       [ 2.69182613,  0.25204626],
       [ 0.41920278,  0.88924603],
       [ 3.04202875,  1.44752069],
       [ 1.68338423,  1.02288597],
       [ 1.85723427,  1.18393058],
       [ 0.58388842,  2.25005005],
       [ 2.24829979,  0.24232586],
       [ 1.58829416,  1.34685933],
       [ 2.3670327 ,  1.67371607],
       [-0.2915627 ,

In [16]:
# Inspection only: show the number of trials set in the experiment cell.
n_trial

20

## Data preprocessing

In [21]:
import os

# Directory that holds the SUSY data file.  The original absolute location is
# kept as the default; set the SUSY_DATA_DIR environment variable to point
# elsewhere without editing the notebook.
# os.path.join(..., '') guarantees the trailing separator that the later
# `path+'SUSY.bz2'` concatenation relies on.
path = os.path.join(
    os.environ.get('SUSY_DATA_DIR', '/home/sengpei/semi-supervision/data/'), '')

In [22]:
from sklearn.datasets import load_svmlight_file

In [23]:
# Load the bz2-compressed svmlight-format SUSY file from `path`:
# X receives the features and y the labels.
X, y = load_svmlight_file(path+'SUSY.bz2')

In [25]:
import random

In [26]:
# Number of rows to draw from the full data set for the experiments below.
ntotal = 5000

In [29]:
# Draw `ntotal` distinct row indices uniformly at random.  random.sample
# accepts any sequence, so the range is sampled directly instead of first
# building a list with one entry per row.
# NOTE(review): no random.seed(...) call is visible in this notebook, so `ind`
# presumably differs from run to run -- confirm whether that is intended.
ind = random.sample(range(len(y)), ntotal)

In [70]:
# Spot-check: print the stored entries of a single row of X.
# NOTE(review): 4960035 looks like an arbitrary row index -- confirm.
print(X[4960035])

  (0, 0)	0.6810314059257507
  (0, 1)	-1.9932888746261597
  (0, 2)	1.5844615697860718
  (0, 3)	0.8059409260749817
  (0, 4)	-0.10363026708364487
  (0, 5)	1.4425417184829712
  (0, 6)	0.3280912935733795
  (0, 7)	-0.7577651739120483
  (0, 8)	0.4615054726600647
  (0, 9)	0.14023545384407043
  (0, 10)	1.0199768543243408
  (0, 11)	0.6071593761444092
  (0, 12)	0.5282389521598816
  (0, 13)	1.3285313844680786
  (0, 14)	0.8983045220375061
  (0, 15)	1.1674401760101318
  (0, 16)	0.2827155590057373
  (0, 17)	0.03520910069346428


In [76]:
# Labels of the randomly sampled rows.
pey = y[ind]

In [40]:
# Inspection only: number of rows in the full feature matrix.
X.shape[0]

5000000

In [80]:
xpe = X.todense()
%time

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


In [65]:
# Dense feature rows of the sampled subset (same row order as `pey`).
pex = xpe[ind,:]

In [None]:
# Positions (within the subsample) of the rows labelled 1 and labelled 0.
# Each result is a one-element tuple of index arrays, hence the later `[0]`.
# NOTE(review): assumes the negative class is encoded as 0 in this file -- confirm.
p_ind = np.nonzero(pey == 1)
n_ind = np.nonzero(pey == 0)

In [None]:
# Split the sampled feature rows by class.  Both subsets are taken from
# `pex`; the negative subset previously referenced an undefined name `nex`,
# which raised a NameError and left `xxn` undefined.
xxp = pex[p_ind[0], :]
xxn = pex[n_ind[0], :]

## P, N, U labelling (positive, negative, unlabeled)

In [None]:
 # Scratch copy of the sampling steps from gendata(), made runnable as a cell.
# The statements are dedented to one consistent level (the pasted version
# mixed indentation and could not be parsed), and mu_p / mu_n -- which are
# local to gendata() -- are defined here.
# Requires n_l, prior_l, n_u, prior_u and n_t from the experiment cell.
# NOTE(review): this rebinds the module-level names x and y.
mu_p, mu_n = np.array([1, 1]), np.array([-1, -1])

n_p = np.random.binomial(n_l, prior_l)
n_n = n_l - n_p
x_p = np.random.randn(n_p, 2) + mu_p
x_n = np.random.randn(n_n, 2) + mu_n

n_up = np.random.binomial(n_u, prior_u)
n_un = n_u - n_up
x_up = np.random.randn(n_up, 2) + mu_p
x_un = np.random.randn(n_un, 2) + mu_n

x = np.r_[x_p, x_n, x_up, x_un]
y = np.r_[np.ones(n_p), -np.ones(n_n), np.zeros(n_u)]

x_tp = np.random.randn(n_t, 2) + mu_p
x_tn = np.random.randn(n_t, 2) + mu_n

In [None]:
# Draw only the class counts (no features): the number of positives among
# the n_l labeled points and among the n_u unlabeled points is binomial;
# the negatives are the remainder in each group.
n_p = np.random.binomial(n_l, prior_l)
n_n = n_l - n_p
n_up = np.random.binomial(n_u, prior_u)
n_un = n_u - n_up

In [97]:
# Positions (within the subsample) of the rows labelled 1, returned as a
# one-element tuple of index arrays.
p_ind = np.nonzero(pey == 1)

In [98]:
# Inspection only: number of positive rows in the subsample.
len(p_ind[0])

2274

In [99]:
# Feature rows of the subsample whose label is 1.
xxp = pex[p_ind[0],:]

In [104]:
# Carve the labeled and unlabeled sets out of the per-class row pools:
# the first n_p / n_n rows become the labeled positives / negatives, and the
# next n_up / n_un rows become the unlabeled positives / negatives.
# Row SLICES (a[i:j]) are required here; the previous a[i, j] form picked a
# single (row, column) element, and `n_np` was a typo for `n_un`.
# `xxn` must hold the negative rows of `pex`, built the same way as `xxp`.
x_p = xxp[0:n_p]
x_n = xxn[0:n_n]
x_up = xxp[n_p:n_p+n_up]
x_un = xxn[n_n:n_n+n_un]

NameError: name 'xxn' is not defined

## Linear model

In [109]:
# Number of feature columns in the positive subset.
d = xxp.shape[1]

18

In [132]:
# Random linear scorer f(x) = <x, w> + w0 with parameters uniform in [-1, 1).
# w has one weight per feature column.  The intercept w0 is a single scalar
# shared by all samples; it was previously drawn once per row of xxp, which
# added independent noise to every score and tied w0 to this one data set.
w = np.random.uniform(-1, 1, xxp.shape[1])
w0 = np.random.uniform(-1, 1)

In [136]:
# Predicted labels for the rows of xxp under the linear scorer (w, w0).
# The inner sign maps scores to -1 / 0 / +1; adding 0.01 before the outer
# sign turns the 0 case into +1, so q contains only -1 and +1.
q = np.sign(0.01 + np.sign( np.dot(xxp,w)+w0))

In [140]:
# Reference labels: every row of xxp is a positive, so the target is all ones.
r = np.ones(xxp.shape[0])

In [142]:
# Inspection only: element-wise agreement between the reference labels and the predictions.
np.equal(r,q)

matrix([[ True,  True,  True, ..., False, False, False]])

In [143]:
# Number of rows of xxp predicted as +1; `[0]` takes the first row of the
# 2-D comparison result before counting.
np.count_nonzero(np.equal(r,q)[0])

1395