In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn as skl
import sklearn.datasets
from sklearn import metrics
import scipy.spatial 
%matplotlib notebook

## import and process data

In [2]:
# Load the raw table; every field is read as a Python object so that the text
# entries can be replaced by numeric codes below.
# (The original unpack=True followed by .T was a double transpose, i.e. a no-op.)
votes = np.loadtxt('house-votes-84.data', delimiter=',', dtype=object)

# Column 0 holds the party name; encode it as -1 (democrat) / +1 (republican).
# Work on a copy so the substitutions do not write back into the raw table.
labels = votes[:, 0].copy()
labels[labels == 'democrat'] = -1
labels[labels == 'republican'] = 1
labels = labels.astype(int)

# Remaining columns are the votes: yes -> 1, no -> -1, missing ('?') -> 0.
votes = votes[:, 1:].copy()
votes[votes == 'y'] = 1
votes[votes == 'n'] = -1
votes[votes == '?'] = 0
# Cast to a real numeric dtype; leaving the array as dtype=object makes every
# later arithmetic step fall back to slow per-element Python operations.
votes = votes.astype(float)

## create Laplacian matrix

In [3]:
def eta(tt, rr):
    """Gaussian kernel weight ``exp(-tt**2 / (2 * rr**2))``.

    Parameters
    ----------
    tt : array_like or scalar
        Distance(s) at which to evaluate the kernel. Any input that can be
        converted to a float array is accepted (including lists and
        object-dtype arrays holding numbers).
    rr : float
        Kernel width; appears as ``2 * rr**2`` in the denominator.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Kernel values as floats, with the same shape as ``tt``.
    """
    # Convert up front: np.exp has no loop for object arrays and a plain list
    # does not support ``** 2``, so both would otherwise raise.
    tt = np.asarray(tt, dtype=float)
    return np.exp(-(tt ** 2) / (2 * rr ** 2))

In [4]:
# Pairwise Euclidean (p = 2, the default) distances between all voting records.
dist = scipy.spatial.distance_matrix(votes, votes)

# Kernel width passed to eta, and the resulting weight matrix.
r = 1.47
W = eta(dist, r)


In [5]:
# Build the graph Laplacians from the weight matrix W.

d = W.sum(axis=1)  # degree vector: row sums of W

D = np.diag(d)                  # degree matrix
Ds = np.diag(1 / np.sqrt(d))    # diag(d ** -1/2)
Di = np.diag(1 / d)             # diag(1 / d)

# Unnormalized Laplacian.
L = D - W

# Normalized Laplacian: Ds (D - W) Ds, evaluated right-to-left.
Ln = Ds.dot((D - W).dot(Ds))

In [6]:
# Eigendecompose both Laplacians and order the eigenpairs by increasing eigenvalue.

# unnormalized Laplacian
ll, VV = np.linalg.eigh(L)
idx = np.argsort(ll)
l, V = ll[idx], VV[:, idx]

# normalized Laplacian
lln, VVn = np.linalg.eigh(Ln)
idxn = np.argsort(lln)
ln, Vn = lln[idxn], VVn[:, idxn]

# Show the ten smallest eigenvalues of each spectrum.
for spectrum in (l, ln):
    print(spectrum[:10])

[8.55621835e-15 2.98407096e-01 7.67640672e-01 1.11556639e+00
 1.23996052e+00 1.32183227e+00 1.35833389e+00 1.37583021e+00
 1.46424318e+00 1.48165670e+00]
[-4.57226242e-17  1.51450796e-02  3.48761972e-01  4.28767571e-01
  4.34213036e-01  4.54007755e-01  4.93587540e-01  5.04138962e-01
  5.13289174e-01  5.22794357e-01]


# plot the (log of the) eigenvalues of the unnormalized Laplacian

fig, ax = plt.subplots(figsize=(4,4))

# l[0] is skipped: it is zero up to round-off, so its logarithm is meaningless.
ax.plot(np.log(l[1:]))
ax.set_xlabel('index')
# Raw string: '\l' is not a valid escape sequence in a normal string literal.
# The label now states that the plotted quantity is log(lambda).
ax.set_ylabel(r'$\log\lambda$')

# 3D scatter of the samples in the coordinates given by eigenvectors 1, 2 and 3,
# each divided by its largest absolute entry.

fig1 = plt.figure(figsize=(4,4))
ax1 = fig1.add_subplot(projection='3d')

coords = [V[:, k] / np.abs(V[:, k]).max() for k in (1, 2, 3)]
ax1.scatter(*coords)

ax1.set_xlabel('$q_1$')
ax1.set_ylabel('$q_2$')
ax1.set_zlabel('$q_3$')

## determine optimal value of sigma

In [7]:
# Label each sample by the sign of its entry in the second eigenvector:
# positive -> +1, otherwise -> -1.
# np.where builds a new array. Slicing V[:,1] and assigning into the slice
# (as before) writes through the view and overwrites that column of V.
q1 = np.where(V[:, 1] > 0, 1, -1).astype(int)

labels = labels.astype(int)

In [8]:
# Sanity check: shape of the second column of V.
V[:,1].shape

(435,)

In [9]:
# Candidate kernel widths to sweep over.
delta = np.linspace(.01, 4, 400, endpoint=True)

# q1_acc_sig[i]: accuracy of the sign of the second eigenvector as a
# classifier when the kernel width is delta[i].
q1_acc_sig = np.zeros(len(delta))
for i, sigma in enumerate(delta):

    # NOTE: the former `r = i` was dropped; it overwrote the global kernel
    # width with the loop index and was never read.
    W = eta(dist, sigma)

    # compute the unnormalized Laplacian

    d = np.sum(W, axis=1)  # degree vector

    D = np.diag(d)
    L = D - W

    # compute the eigendecomposition, sorted by increasing eigenvalue

    ll, VV = np.linalg.eigh(L)

    idx = ll.argsort()
    l = ll[idx]
    V = VV[:, idx]

    # Sign of the second eigenvector as a +/-1 prediction. np.where returns a
    # fresh array, so V itself is not modified (assigning into the slice
    # V[:,1] would overwrite that column).
    q1 = np.where(V[:, 1] > 0, 1, -1)
    q1_acc_sig[i] = metrics.accuracy_score(labels, q1)

In [10]:
# An accuracy below 0.5 is replaced by 1 - accuracy, i.e. the score of the
# mirrored labelling (presumably because the sign of an eigenvector is
# arbitrary). np.maximum does this for the whole array at once.
q1_acc_sig = np.maximum(q1_acc_sig, 1 - q1_acc_sig)

best = q1_acc_sig.argmax()  # position of the best-scoring kernel width

fig2, ax2 = plt.subplots()

ax2.scatter(delta, q1_acc_sig, s=2)
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
ax2.plot(delta[best], q1_acc_sig[best], 'r*', markersize=10,
         label=r'$\sigma^*$')
ax2.set_xlabel(r'$\sigma$')
ax2.set_ylabel('accuracy')
ax2.legend(loc='right')

In [11]:
# Kernel width with the highest accuracy in the sweep.
delta[np.argmax(q1_acc_sig)]

1.47

In [12]:
# Highest accuracy reached in the sweep.
q1_acc_sig.max()

0.8804597701149426

## Semi-supervised learning

In [13]:
# Rebuild the weight matrix with the kernel width that scored best in the sweep.
r = delta[np.argmax(q1_acc_sig)]

# Pairwise Euclidean (p = 2, the default) distances between voting records.
dist = scipy.spatial.distance_matrix(votes, votes)

W = eta(dist, r)


In [14]:
# Unnormalized graph Laplacian for the current weight matrix W.

d = W.sum(axis=1)  # degree vector: row sums of W
D = np.diag(d)     # degree matrix

L = D - W


In [15]:
# Eigendecompose L and order the eigenpairs by increasing eigenvalue.

ll, VV = np.linalg.eigh(L)

idx = np.argsort(ll)
l, V = ll[idx], VV[:, idx]

# ten smallest eigenvalues
print(l[:10])


[8.55621835e-15 2.98407096e-01 7.67640672e-01 1.11556639e+00
 1.23996052e+00 1.32183227e+00 1.35833389e+00 1.37583021e+00
 1.46424318e+00 1.48165670e+00]


## compute accuracy for varying J and M

In [16]:
from sklearn.linear_model import Ridge, LinearRegression

In [17]:
# Grid values for the sweep: M = 2..6 and J = 5, 10, 20, 40.
M = list(range(2, 7))
J = [5, 10, 20, 40]

In [18]:
print(delta[q1_acc_sig.argmax()])

labels = labels.astype(int)

# accuracy[j, m]: accuracy on all samples when the regression is fitted on the
# first J[j] rows using the first M[m] columns of V.
accuracy = np.zeros((len(J), len(M)))

for m, n_vec in enumerate(M):
    for j, n_lab in enumerate(J):
        # Training problem: first n_lab rows, first n_vec eigenvector columns.
        A = V[:n_lab, :n_vec]
        b = labels[:n_lab]

        SSLRidge = Ridge(alpha=1e-8, fit_intercept=False)
        SSLRidge.fit(A, b)

        # Predict for every sample and keep only the sign.
        F = V[:, :n_vec]
        pred = np.sign(SSLRidge.predict(F))

        accuracy[j, m] = metrics.accuracy_score(labels, pred)

        print(f'M:{n_vec!s} J: {n_lab!s}')
        print(f'Accuracy: {accuracy[j, m]!s}')

1.47
M:2 J: 5
Accuracy: 0.8758620689655172
M:2 J: 10
Accuracy: 0.8781609195402299
M:2 J: 20
Accuracy: 0.8804597701149425
M:2 J: 40
Accuracy: 0.8827586206896552
M:3 J: 5
Accuracy: 0.8896551724137931
M:3 J: 10
Accuracy: 0.7701149425287356
M:3 J: 20
Accuracy: 0.8436781609195402
M:3 J: 40
Accuracy: 0.8735632183908046
M:4 J: 5
Accuracy: 0.4160919540229885
M:4 J: 10
Accuracy: 0.8528735632183908
M:4 J: 20
Accuracy: 0.8758620689655172
M:4 J: 40
Accuracy: 0.864367816091954
M:5 J: 5
Accuracy: 0.8597701149425288
M:5 J: 10
Accuracy: 0.774712643678161
M:5 J: 20
Accuracy: 0.825287356321839
M:5 J: 40
Accuracy: 0.8804597701149425
M:6 J: 5
Accuracy: 0.8781609195402299
M:6 J: 10
Accuracy: 0.696551724137931
M:6 J: 20
Accuracy: 0.8091954022988506
M:6 J: 40
Accuracy: 0.8827586206896552


In [19]:
# Best accuracy over the whole (J, M) grid.
np.max(accuracy)

0.8896551724137931

## plot q1

In [20]:
# Order the samples by label so every -1 (democrat) row precedes every
# +1 (republican) row.
index = labels.argsort()
votes_sort = votes[index]
labels_sort = labels[index]

# Split position = number of democrats, counted from the labels instead of
# being hard-coded (it was the literal 267).
n_dem = np.count_nonzero(labels == -1)

xd = votes_sort[:n_dem, :]
xr = votes_sort[n_dem:, :]
yd = labels_sort[:n_dem]
yr = labels_sort[n_dem:]

# 3D plot of Laplacian embedding with first three eigenvectors, split by party

fig3 = plt.figure(figsize=(3,3))
ax3 = fig3.add_subplot(projection='3d')

# 1-D index arrays. Indexing with the tuple returned by np.where gave
# (1, n)-shaped results, which forced .shape[1] and sum(sum(...)) below.
dem = np.flatnonzero(labels == -1)
rep = np.flatnonzero(labels == 1)

# Eigenvectors 1-3, each divided by its largest absolute entry.
q1_scaled, q2_scaled, q3_scaled = (
    V[:, k] / np.abs(V[:, k]).max() for k in (1, 2, 3)
)

ax3.scatter(q1_scaled[dem], q2_scaled[dem], q3_scaled[dem],
            label='democrat', s=5)
ax3.set_xlabel('$q_1$')
ax3.set_ylabel('$q_2$')
ax3.set_zlabel('$q_3$')

# No positional style string here: the fourth positional parameter of
# Axes3D.scatter is zdir, so the former 'r0' was never a colour/marker spec.
ax3.scatter(q1_scaled[rep], q2_scaled[rep], q3_scaled[rep],
            label='republican', s=5)
ax3.legend()

fig4 = plt.figure(figsize=(5,3))
ax4 = fig4.add_subplot()

# Entries of the (scaled) second eigenvector, one series per party.
ax4.scatter(np.arange(dem.size), q1_scaled[dem],
            label='Democrat', s=5)
ax4.scatter(np.arange(rep.size), q1_scaled[rep],
            label='Republican', s=5)
ax4.set_xlabel('index within party')
ax4.set_ylabel('$q_1$')
ax4.legend(loc='lower center', bbox_to_anchor=(.5, 1.05))
fig4.tight_layout()

# Fraction of each party on "its" side of zero in the second eigenvector.
# The mean of a boolean mask replaces the hard-coded denominators 267 and 435-267.
dem_frac = np.mean(V[dem, 1] <= 0)
rep_frac = np.mean(V[rep, 1] > 0)

# Show both fractions; previously the first value was computed and discarded.
dem_frac, rep_frac

In [None]:
#fig.savefig('eigen.png')
#fig1.savefig('lap_q3.png')
#fig2.savefig('sigma.png')
#fig4.savefig('q1.png')

In [22]:
# Sanity check: shape of the most recent prediction vector.
pred.shape

(435,)

In [23]:
# Containers for the per-row (J) and per-column (M) statistics of `accuracy`.
# Sizes are taken from accuracy.shape instead of the literals 4 and 5, so they
# stay correct if the J / M grids change.
n_j, n_m = accuracy.shape

std_j = np.zeros(n_j)
std_m = np.zeros(n_m)
mean_m = np.zeros(n_m)
mean_j = np.zeros(n_j)

In [24]:
# Standard deviation and mean of the accuracies:
#   *_j: across each row of `accuracy`    (one value per row)
#   *_m: across each column of `accuracy` (one value per column)
# Axis reductions replace the loops over the hard-coded range(4) / range(5),
# so the result always matches the actual shape of `accuracy`.
std_j = accuracy.std(axis=1)
mean_j = accuracy.mean(axis=1)
std_m = accuracy.std(axis=0)
mean_m = accuracy.mean(axis=0)

In [25]:
# Print each summary array on its own line, prefixed by its name.
for tag, stats in (('std j', std_j), ('std m', std_m),
                   ('mean j', mean_j), ('mean m', mean_m)):
    print(f'{tag}: {stats!s}')

std j: [0.18415501 0.06478681 0.02779994 0.00706312]
std m: [0.00257019 0.04587273 0.19427923 0.04002823 0.07522152]
mean j: [0.78390805 0.79448276 0.84689655 0.87678161]
mean m: [0.87931034 0.84425287 0.75229885 0.83505747 0.81666667]


In [26]:
# Kernel width with the highest accuracy in the sweep.
delta[np.argmax(q1_acc_sig)]

1.47