In [4]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [5]:
import os
import scipy.sparse

# Load Data

In [7]:
#**************
# with NaNs
# tX
#tX_path=os.path.join( os.getcwd(), "..", "data", "training-data-pca-with-dummy-vars-with-NaNs.csv")
#tX_path=os.path.join( os.getcwd(), "..", "data", "training-data-whitened-with-dummy-vars-with-NaNs.csv")
#tX_path=os.path.join( os.getcwd(), "..", "data", "training-data-standardized-with-dummy-vars-with-NaNs.csv")
#tX_path=os.path.join( os.getcwd(), "..", "data", "training-data-standardized-with-dummy-vars-with-NaNs-removed-corr.csv")
#tX_path=os.path.join( os.getcwd(), "..", "data", "training-data-whitened-using-PCA-with-dummy-vars-with-NaNs.csv")
tX_path=os.path.join( os.getcwd(), "..", "data", "training-data-pca-with-dummy-vars-med-imput-NaNs.csv")
# y
y_path=os.path.join( os.getcwd(), "..", "data", "y-labels.csv")


#**************
# no NaNs
# tX
#tX_path=os.path.join( os.getcwd(), "..", "data", "training-data-standardized-with-dummy-vars-no-NaNs-removed-corr.csv")
# y
#y_path=os.path.join( os.getcwd(), "..", "data", "y-labels-no-NaNs.csv")

# Actually read the arrays: every later cell uses `tX` and `y`, so with these
# two calls commented out a fresh "Restart Kernel -> Run All" stops with a
# NameError in the very next cell.
# NOTE(review): np.loadtxt's default (whitespace) delimiter is kept, exactly as
# in the previously commented-out calls -- pass delimiter="," if the files are
# comma-separated.
tX = np.loadtxt(tX_path)
y = np.loadtxt(y_path)

In [8]:
# Record the number of rows of the feature matrix and print, one per line:
# that row count, the number of labels, and the full feature-matrix shape.
N_SAMPLES = len(tX)
print(N_SAMPLES, y.shape[0], tX.shape, sep="\n")

250000
250000
(250000, 4)


In [9]:
# One plot colour per event: start with every event blue, then repaint the
# events whose label is 0 in red.
# (Original author's note: labels equal to 1 are believed to be the boson
# events, i.e. bosons end up blue.)
y_colors = np.repeat('b', N_SAMPLES)
y_colors[y == 0] = 'r'
print("Found {b} boson events out of {N} total events".format(N=N_SAMPLES, b=y.sum()))

Found 85667.0 boson events out of 250000 total events


## Compute Polynomial Basis

In [141]:
def multivar_poly_basis(x, degree):
    """Expand every row of `x` into products of `degree` of its features.

    Starting from the raw features, each pass multiplies the current terms by
    the original features (one outer product per sample) and keeps only the
    entries on or above the main diagonal of that product, read row by row.
    The whole computation is done for all samples at once with fancy indexing,
    so the cost is linear in the number of samples.

    Parameters
    ----------
    x : array_like, shape (n_samples, n_features)
        Input features, one sample per row.
    degree : int
        Number of factors in each product. Values <= 1 perform no expansion
        and return a copy of `x`.

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_terms)
        Expanded features, same dtype as `x`.

    Notes
    -----
    * Only products of exactly `degree` factors are produced: no constant
      column and no lower-degree terms are included.
    * For ``degree == 2`` the columns are the n_features*(n_features+1)/2
      products ``x_i * x_j`` with ``i <= j``.
    * For ``degree >= 3`` the upper-triangle selection is applied to a
      rectangular matrix, so some monomials are kept more than once (e.g. three
      features at degree 3 give 15 columns, several with identical values).
      This column layout is kept unchanged for backward compatibility.
    """
    x = np.asarray(x)
    n_feat = x.shape[1]

    # Degree-1 "expansion" is just the features themselves; copy so the caller's
    # array is never aliased by the result.
    basis = x.copy()

    for _ in range(1, degree):
        # Index pairs (r, c) with c >= r of the (n_feat x n_terms) outer product
        # between the raw features and the current terms, in row-major order.
        rows, cols = np.triu_indices(n=n_feat, m=basis.shape[1])
        # outer(x_s, basis_s)[r, c] == x_s[r] * basis_s[c], for every sample s.
        basis = x[:, rows] * basis[:, cols]

    return basis
        

In [144]:
## test
# Smoke check on a single sample with three features: echo the input, a
# separator, and let the degree-3 expansion be displayed as the cell result.
a = np.array([[2, 5, 11]])
print(a, "----", sep="\n")

multivar_poly_basis(a, 3)

[[ 2  5 11]]
----


array([[   8,   20,   44,   50,  110,  242,   50,  110,  125,  275,  605,
         242,  275,  605, 1331]])

# Logistic Regression

In [10]:
def sigmoid(t):
    """apply sigmoid function on t.

    Computes ``1 / (1 + exp(-t))`` element-wise without overflowing.

    Parameters
    ----------
    t : scalar or array_like
        Input value(s).

    Returns
    -------
    numpy.ndarray
        Sigmoid of `t`, same shape as `t` (0-d for a scalar input).
    """
    t = np.asarray(t)
    # exp(-|t|) always lies in (0, 1], so it cannot overflow, whereas the naive
    # exp(-t) overflows for large negative t.
    decay = np.exp(-np.abs(t))
    # t >= 0: 1 / (1 + e^-t)          (the textbook form, safe here)
    # t <  0: e^t / (1 + e^t)         (same value, written with e^-|t|)
    return np.where(t >= 0, 1 / (1 + decay), decay / (1 + decay))

In [11]:
def calculate_loss(y, tx, w):
    """compute the cost by negative log likelihood.

    Evaluates ``sum_n [ log(1 + exp(x_n^T w)) - y_n * x_n^T w ]``.

    Parameters
    ----------
    y : numpy.ndarray
        Labels, one per row of `tx` (any shape holding that many values).
    tx : numpy.ndarray, shape (n_samples, n_features)
        Design matrix.
    w : numpy.ndarray, shape (n_features, 1) or (n_features,)
        Weights.

    Returns
    -------
    float
        The summed loss over all samples.
    """
    Xw = tx.dot(w)
    # Align the labels with Xw using the data's own shape, so the function also
    # works on subsets and does not depend on the notebook-level N_SAMPLES.
    y_aligned = np.reshape(y, Xw.shape)
    # log(1 + exp(z)) == logaddexp(0, z); this form stays finite for large z
    # where exp(z) alone would overflow to inf.
    log_part = np.logaddexp(0, Xw)
    return np.sum(log_part - y_aligned * Xw)

In [12]:
def calculate_gradient(y, tx, w,lambda_=0.):
    """Gradient of the logistic loss at `w`, with `lambda_ * w` added on top.

    `y` is reshaped to a column with one entry per row of `tx` before it is
    subtracted from the sigmoid of `tx.dot(w)`.
    """
    n_rows = tx.shape[0]
    residual = sigmoid(tx.dot(w)) - y.reshape((n_rows, 1))
    data_term = tx.T.dot(residual)
    return data_term + lambda_ * w

## Gradient Descent

In [None]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descent using logistic regression.
    Return the loss and the updated w.

    The loss is evaluated at the incoming `w` (before the step). The update is
    written to a new array, so the `w` passed in by the caller is left
    untouched.
    """
    loss = calculate_loss(y, tx, w)
    grad = calculate_gradient(y, tx, w)
    # Out-of-place update: `w -= ...` would silently overwrite the caller's
    # array (and fails outright for integer-typed weights).
    w = w - gamma * grad

    return loss, w

In [None]:
from helpers import de_standardize
from plots import visualization

def logistic_regression_gradient_descent_demo(y, x):
    """Fit logistic regression on (x, y) by plain gradient descent.

    A column of ones is prepended to `x`, the weights start at zero, and
    `learning_by_gradient_descent` is applied until two consecutive losses
    differ by less than the threshold or the iteration budget is exhausted.
    Progress is printed every 100 iterations.

    Returns
    -------
    losses : list
        Loss recorded at every iteration.
    w : numpy.ndarray, shape (n_features + 1, 1)
        Final weights; the first entry multiplies the column of ones.
    """
    # init parameters
    max_iter = 10000
    threshold = 1e-8
    gamma = 0.001
    losses = []

    # build tx (was `.ones(...)`, which is a syntax error)
    tx = np.c_[np.ones((y.shape[0], 1)), x]
    w = np.zeros((tx.shape[1], 1))

    # start the logistic regression
    for n_iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_gradient_descent(y, tx, w, gamma)
        # log info
        if n_iter % 100 == 0:
            print("Current iteration={i}, the loss={l}".format(i=n_iter, l=loss))
        # converge criteria
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    return losses, w

In [None]:
# `whit_tX` is not defined by any visible cell of this notebook, so this call
# failed with a NameError; fit on the `tX` matrix loaded in the data cell.
# NOTE(review): confirm that `tX` is the intended input here.
losses, w_star = logistic_regression_gradient_descent_demo(y, tX)

In [None]:
# Top panel: events coloured by their true label.
# Bottom panel: the same events coloured by the label predicted from `w_star`.
# `tX` replaces `whit_tX`, which no visible cell defines.
# NOTE(review): confirm that `tX` is the matrix `w_star` was fitted on.
fig, (ax_true, ax_pred) = plt.subplots(2, 1)
ax_true.scatter(tX[:, 0], tX[:, 1], c=y_colors, alpha=0.1)
ax_true.set_title("True labels")

# Score with the full model (column of ones + every feature) and label an event
# 1 when its probability exceeds 0.5. The previous `< 0.5` test produced
# inverted labels, which also inverted the error computed in the next cell.
tx_with_bias = np.c_[np.ones((tX.shape[0], 1)), tX]
prediction = (sigmoid(tx_with_bias.dot(w_star)) > 0.5).astype(int)

# Same colour convention as `y_colors`: blue for label 1, red for label 0.
ax_pred.scatter(tX[:, 0], tX[:, 1],
                c=np.where(prediction.ravel() == 0, 'r', 'b'), alpha=0.1)
ax_pred.set_title("Predicted labels (gradient descent)")
fig.tight_layout()



In [None]:
# Sum of the absolute differences between `prediction` and the labels (viewed
# as a column), printed together with that sum divided by the sample count.
error = np.abs(prediction - y.reshape(N_SAMPLES, 1)).sum()
print(error, error / N_SAMPLES, sep="\n")

## Newton's Method

In [13]:
def calculate_hessian(y, tx, w):
    """return the hessian of the loss function.

    Computes ``tx.T @ diag(s * (1 - s)) @ tx`` with ``s = sigmoid(tx @ w)``.

    Parameters
    ----------
    y : numpy.ndarray
        Labels. Not used by the computation; kept so the signature matches the
        loss and gradient helpers.
    tx : numpy.ndarray, shape (n_samples, n_features)
        Design matrix.
    w : numpy.ndarray, shape (n_features, 1) or (n_features,)
        Weights.

    Returns
    -------
    numpy.ndarray, shape (n_features, n_features)
    """
    # Evaluate the sigmoid once and flatten it, so the result does not depend
    # on the notebook-level N_SAMPLES nor on whether `w` is a column or 1-D.
    prob = sigmoid(tx.dot(w)).reshape(-1)
    curvature = prob * (1.0 - prob)
    # Scaling row n of tx by curvature[n] equals diag(curvature) @ tx, without
    # building an n_samples x n_samples diagonal matrix.
    return tx.T.dot(curvature[:, np.newaxis] * tx)

In [14]:
def logistic_regression(y, tx, w,lambda_):
    """return the loss, gradient, and hessian.

    The gradient carries the `lambda_ * w` term added by `calculate_gradient`,
    so the Hessian gets the matching `lambda_ * I` term; otherwise the Newton
    step would combine derivatives of two different objectives.
    The returned loss is the value of `calculate_loss` only (no penalty term).
    """
    loss = calculate_loss(y, tx, w)
    grad = calculate_gradient(y, tx, w, lambda_)
    # d/dw (lambda_ * w) = lambda_ * I
    hess = calculate_hessian(y, tx, w) + lambda_ * np.eye(tx.shape[1])
    return loss, grad, hess

In [15]:
def learning_by_newton_method(y, tx, w, gamma, lambda_):
    """
    Do one step on Newton's method.
    return the loss and updated w.

    The loss is evaluated at the incoming `w` (before the step). The step is
    `gamma` times the solution of ``hess @ step = grad``. The update is written
    to a new array, so the caller's `w` is left untouched.
    """
    loss, grad, hess = logistic_regression(y, tx, w, lambda_)
    # Solve the linear system instead of forming the explicit inverse: same
    # step, cheaper and numerically better behaved.
    step = np.linalg.solve(hess, grad)
    # Out-of-place update: `w -= ...` would overwrite the caller's array.
    w = w - gamma * step
    return loss, w

In [16]:
def logistic_regression_newton_method_demo(y, x, w_initial, max_iter=1000,
                                           gamma=0.25, threshold=1e-8, lambda_=0.9):
    """Fit logistic regression on (x, y) with repeated Newton steps.

    A column of ones is prepended to `x` and `learning_by_newton_method` is
    applied until the relative change between two consecutive losses drops
    below `threshold`, or `max_iter` iterations have run. Progress is printed
    every 5 iterations and when the exit condition is met.

    Parameters
    ----------
    y : numpy.ndarray
        Labels, one per row of `x`.
    x : numpy.ndarray, shape (n_samples, n_features)
        Features, without the column of ones.
    w_initial : array_like, shape (n_features + 1, 1)
        Starting weights. Copied, so the caller's array is not modified.
    max_iter : int, optional
        Maximum number of Newton steps.
    gamma : float, optional
        Step size passed to `learning_by_newton_method`.
    threshold : float, optional
        Relative loss change below which the iteration stops.
    lambda_ : float, optional
        Regularisation strength passed to `learning_by_newton_method`.

    Returns
    -------
    losses : list
        Loss recorded at every iteration.
    w : numpy.ndarray
        Final weights.
    """
    losses = []

    # build tx
    tx = np.c_[np.ones((y.shape[0], 1)), x]
    # Work on a private float copy: a plain `w = w_initial` aliases the caller's
    # array, which an in-place update step would then overwrite.
    w = np.array(w_initial, dtype=float)

    # start the logistic regression
    for n_iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_newton_method(y, tx, w, gamma, lambda_)
        # log info
        if n_iter % 5 == 0:
            print("Current iteration={i}, the loss={l}".format(i=n_iter, l=loss))
        # converge criteria
        losses.append(loss)
        # Relative-change test written as a product, so a zero loss cannot
        # trigger a division by zero.
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold * np.abs(losses[-1]):
            print("[Exit Condition Met]: Current iteration={i}, the loss={l}".format(i=n_iter, l=loss)) 
            break
    return losses, w

In [17]:
# Draw the initial weights (one per feature plus one for the column of ones)
# from a seeded generator, so that re-running the notebook reproduces the same
# optimisation trace.
SEED = 0
w0 = np.random.RandomState(SEED).rand(tX.shape[1] + 1, 1)
losses, w_star = logistic_regression_newton_method_demo(y, tX, w0)

Current iteration=0, the loss=190558.58276489575
Current iteration=5, the loss=155689.22516863496
Current iteration=10, the loss=154096.73553226635
Current iteration=15, the loss=154003.1957477467
Current iteration=20, the loss=153997.84546961935
Current iteration=25, the loss=153997.54200611537
[Exit Condition Met]: Current iteration=29, the loss=153997.52548001002


## Compute Predictions

In [18]:
# Score every event with the fitted weights (column of ones + features) and
# label it 1 when the predicted probability exceeds 0.5.
tX_pred = np.c_[np.ones((y.shape[0], 1)), tX]
# Vectorised thresholding gives the same flat 0/1 integer array as the former
# per-row `int(...)` loop, without converting 1-element arrays to scalars one
# at a time (slow, and deprecated in recent NumPy releases).
prediction = (sigmoid(tX_pred.dot(w_star)) > 0.5).astype(int).ravel()

## Plot predictions

In [None]:
# Colour convention shared with `y_colors`: blue for label 1, red for label 0.
prediction_colors = np.array(['b'] * N_SAMPLES)
prediction_colors[prediction == 0] = 'r'

# Top row: true labels; bottom row: predicted labels.
# Left column: features 0 vs 1; right column: features 2 vs 3.
fig, axes = plt.subplots(2, 2)
panels = [
    (axes[0, 0], 0, 1, y_colors, "True labels"),
    (axes[0, 1], 2, 3, y_colors, "True labels"),
    (axes[1, 0], 0, 1, prediction_colors, "Predicted labels"),
    (axes[1, 1], 2, 3, prediction_colors, "Predicted labels"),
]
for ax, col_x, col_y, colors, title in panels:
    ax.scatter(tX[:, col_x], tX[:, col_y], c=colors, alpha=0.1)
    ax.set_title(title)
    ax.set_xlabel("feature {}".format(col_x))
    ax.set_ylabel("feature {}".format(col_y))
fig.tight_layout()

<matplotlib.collections.PathCollection at 0x10c8e4160>

## Compute training error

In [19]:
# Sum of the absolute differences between predicted and true labels; print the
# two arrays, that sum, and the sum divided by the sample count.
error = np.abs(prediction - y).sum()
print(prediction, y, error, error / N_SAMPLES, sep="\n")

[0 0 0 ..., 0 0 0]
[ 1.  0.  0. ...,  1.  0.  0.]
79688.0
0.318752
