# Part 2 - Laplacian Interpolation

## Libraries

In [304]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from tabulate import tabulate

## Import Data

In [521]:
df_50 = pd.read_csv('dtrain13_50.csv')
df_100 = pd.read_csv('dtrain13_100.csv')
df_200 = pd.read_csv('dtrain13_200.csv')
df_400 = pd.read_csv('dtrain13_400.csv')

## Data Generator

In [522]:
def shuffle_data(X, y, seed=None):
    '''
    Function to shuffle the data.
    X: features
    y: labels
    '''
    if seed:                     
        np.random.seed(seed)
    idx = np.arange(len(X))          
    np.random.shuffle(idx)           
    return X[idx], y[idx]

def generate_data(data, n_label):
    '''
    Function to sample n_label labelled samples uniformly at random from the original dataset.
    data: the original dataset
    '''
    #split the data into positive and negative classes
    data_pos = data[data['label']==1].copy()
    data_neg = data[data['label']==-1].copy()
    #split the data into features X and target y
    X_pos = data_pos.iloc[:,1:].values.copy()
    y_pos = data_pos['label'].values.copy()
    X_neg = data_neg.iloc[:,1:].values.copy()
    y_neg = data_neg['label'].values.copy()    
    #shuffle the data
    X_pos, y_pos = shuffle_data(X_pos, y_pos)
    X_neg, y_neg = shuffle_data(X_neg, y_neg)
    #select the first n_label samples as labelled samples
    X_pos_train, y_pos_train = X_pos[:n_label], y_pos[:n_label]
    X_pos_test, y_pos_test = X_pos[n_label:], y_pos[n_label:]
    X_neg_train, y_neg_train = X_neg[:n_label], y_neg[:n_label]
    X_neg_test, y_neg_test = X_neg[n_label:], y_neg[n_label:]
    #merge the two classes
    X_train = np.append(X_pos_train, X_neg_train, axis=0)
    y_train = np.append(y_pos_train, y_neg_train, axis=0)
    X_test = np.append(X_pos_test, X_neg_test, axis=0)
    y_test = np.append(y_pos_test, y_neg_test, axis=0)
    return X_train, X_test, y_train, y_test

## Weight Matrix & Degree Matrix

In [523]:
def compute_weight_KNN(X_train, X_test, k=3):
    '''
    Function to compute the weight matrix using the KNN method.
    X_train: training data
    X_test: test data  
    k: number of nearest neighbours used to make predictions (default=3)
    '''
    X = np.append(X_train, X_test, axis=0)          #merge the training and test sets
    m, n = X.shape[0], X.shape[1]                   #get the sizes of the data matrix X
    #distance between a and b: (a-b)^2 = a∙a + b∙b - 2a∙b
    A_dots = (X*X).sum(axis=1).reshape((m,1))*np.ones(shape=(1,m))
    B_dots = (X*X).sum(axis=1)*np.ones(shape=(m,1))
    D = A_dots + B_dots - 2*X.dot(X.T)
    idx = np.argsort(D, axis=1)                     #sort the index by the distance
    W = np.zeros((m,m))                             #initialize the weight matrix
    for i in range(m):                              #for each sample
        W[i, idx[i][1:4]] = 1                       #set the nearest 3 neighbours of the ith sample to 1
        W[idx[i][1:4], i] = 1                       #do the same for the 3 neigbours 
    return W

def compute_D(W):
    '''
    Function to compute the Degree Matrix D from the Weight Matrix W.
    W: the weight matrix
    '''
    m = W.shape[0]                  #size of the weight matrix
    D = np.zeros((m,m))             #initialize the degree matrix
    for i in range(m): 
        D[i][i] = np.sum(W[i])      #sum of all edges for the ith sample
    return D

## Laplacian Interpolation

In [524]:
def LI(X_train, X_test, y_train, y_test):
    '''
    Function to train a Laplacian Interpolation algorithm and make predictions on the test set.
    X_train: training data
    y_train: training labels
    X_test: test data
    y_test: test labels
    '''
    W = compute_weight_KNN(X_train, X_test)                   #compute the weight matrix W
    D = compute_D(W)                                          #compute the degree matrix D
    L = D - W                                                 #compute the graph laplacian L
    l, m = len(y_train), len(y_train)+len(y_test)             #get the sizes of labelled and unlabelled data
    v = np.zeros(m)                                           #initialize the solution vector
    v[:l] = y_train                                           #same labels for the training set
    for i in range(l, m):
        v[i] = np.sum(W[i]*v) / D[i][i]                       #the harmonic solution
    y_pred = np.sign(v)                                       #make predictions based on the sign of the vector v
    y_pred_test = y_pred[l:]                                  #obtain the predictions on the test set
    error_rate = np.sum(y_pred_test!=y_test)/len(y_test)      #compute the generalization error on the test set
    return error_rate

## Laplacian Kernel Interpolation

In [525]:
def LKI(X_train, X_test, y_train, y_test):
    '''
    Function to train a Laplacian Kernel Interpolation algorithm and make predictions on the test set.
    X_train: training data
    y_train: training labels
    X_test: test data
    y_test: test labels
    '''
    W = compute_weight_KNN(X_train, X_test)                    #compute the weight matrix
    D = compute_D(W)                                           #compute the degree matrix
    L = D - W                                                  #compute the graph laplacian
    l, m = len(y_train), len(y_train)+len(y_test)              #get the sizes of labelled and unlabelled data
    L_plus = np.linalg.pinv(L)                                 #compute the pseudoinverse of matrix L
    K = L_plus[:l,:l]                                          #compute the kernel matrix K
    K_plus = np.linalg.pinv(K)                                 #compute the pseudoinverse of matrix K
    y_L = y_train                                              #obtain the labels of the labelled samples
    alpha = K_plus.dot(y_L)                                    #compute the alpha vector
    E = np.zeros((m,m))                                        #initialize the edge matrix E
    for i in range(m):
        E[i][i+1:] = W[i][i+1:]                                #edge exists when W[i][j]>0 and i  <j
    v = np.zeros(m)                                            #initialize the solution vector v
    for i in range(l):
        v += alpha[i] * E[i].T.dot(L_plus)                     #compute the solution vector v with the labelled data
    y_pred = np.sign(v)                                        #make predictions based on the sign of the vector v
    y_pred_test = y_pred[l:]                                   #obtain the predictions on the test set
    error_rate = np.sum(y_pred_test!=y_test)/len(y_test)       #compute the generalization error on the test set
    return error_rate

## Evaluation

In [526]:
def evaluate(data, n_label, LI, LKI, runs=20):
    '''
    Function to compute the mean empirical generalization error of an algorithm.
    data: the entire dataset
    n_label: number of labelled samples
    algorithm: the algorithm to be evaluated
    runs: the number of repeated experiements
    '''
    errors_LI, errors_LKI = [], []
    for run in range(runs):                 
        X_train, X_test, y_train, y_test = generate_data(data, n_label)     #randomly sample the training and test sets
        error_LI = LI(X_train, X_test, y_train, y_test)                     #make predictions with the LI algorithm
        errors_LI.append(error_LI)
        error_LKI = LKI(X_train, X_test, y_train, y_test)                   #make predictions with the LKI algorithm
        errors_LKI.append(error_LKI)
    #compute the mean generalization errors and the standard deviations
    LI_mean, LI_std = np.mean(errors_LI), np.std(errors_LI)
    LKI_mean, LKI_std = np.mean(errors_LKI), np.std(errors_LKI)
    return LI_mean, LI_std, LKI_mean, LKI_std

## Experiments

In [483]:
np.random.seed(0)
label_nums = [1, 2, 4, 8, 16]
data_sizes = [50, 100, 200, 400]
data_list = [df_50, df_100, df_200, df_400]

results_LI, results_LKI = [], []
LI_means, LI_stds, LKI_means, LKI_stds = np.zeros((4,5)), np.zeros((4,5)), np.zeros((4,5)), np.zeros((4,5))

for i in range(len(data_list)):                    #for each dataset
    
    result_LI, result_LKI = [int(data_sizes[i])], [int(data_sizes[i])]
    
    for j in tqdm(range(len(label_nums))):               #for different numbers of labelled samples
        
        LI_mean, LI_std, LKI_mean, LKI_std = evaluate(data_list[i], label_nums[j], LI, LKI, runs=20)
        LI_means[i][j] = LI_mean
        LI_stds[i][j] = LI_std
        LKI_means[i][j] = LKI_mean
        LKI_stds[i][j] = LKI_std
        
        result_LI.append(f"{'{:.4f}'.format(LI_mean)}±{'{:.4f}'.format(LI_std)}")
        result_LKI.append(f"{'{:.4f}'.format(LKI_mean)}±{'{:.4f}'.format(LKI_std)}")
    
    results_LI.append(result_LI)
    results_LKI.append(result_LKI)

#tabulate the results
print(tabulate(results_LI, 
               headers = ["Samples per label", 
                          "L per class = 1", 
                          "L per class = 2",
                          "L per class = 4",
                          "L per class = 8",
                          "L per class = 16"], 
               tablefmt = "simple_outline",
               stralign = "center",
               numalign = "center"))
print(tabulate(results_LKI, 
               headers = ["Samples per label", 
                          "L per class = 1", 
                          "L per class = 2",
                          "L per class = 4",
                          "L per class = 8",
                          "L per class = 16"], 
               tablefmt = "simple_outline",
               stralign = "center",
               numalign = "center"))

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.95it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.07s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:39<00:00, 19.88s/it]

┌─────────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬────────────────────┐
│  Samples per label  │  L per class = 1  │  L per class = 2  │  L per class = 4  │  L per class = 8  │  L per class = 16  │
├─────────────────────┼───────────────────┼───────────────────┼───────────────────┼───────────────────┼────────────────────┤
│         50          │   0.8077±0.0534   │   0.6198±0.0903   │   0.4739±0.0863   │   0.2768±0.0770   │   0.1110±0.0356    │
│         100         │   0.8500±0.0478   │   0.7342±0.0895   │   0.5880±0.0628   │   0.3663±0.0481   │   0.2140±0.0434    │
│         200         │   0.9095±0.0372   │   0.8448±0.0390   │   0.6954±0.0404   │   0.5508±0.0508   │   0.3660±0.0467    │
│         400         │   0.9604±0.0133   │   0.8939±0.0296   │   0.8172±0.0318   │   0.6724±0.0486   │   0.5318±0.0393    │
└─────────────────────┴───────────────────┴───────────────────┴───────────────────┴───────────────────┴────────────────────┘





## Cases when LI outperforms LKI

In [539]:
label_nums = [30, 35,  40, 45, 48]
data_sizes = [50]
data_list = [df_50]

results_LI, results_LKI = [], []
LI_means, LI_stds, LKI_means, LKI_stds = np.zeros((4,5)), np.zeros((4,5)), np.zeros((4,5)), np.zeros((4,5))

for i in range(len(data_list)):                    #for each dataset
    
    result_LI, result_LKI = [int(data_sizes[i])], [int(data_sizes[i])]
    
    for j in tqdm(range(len(label_nums))):               #for different numbers of labelled samples
        
        LI_mean, LI_std, LKI_mean, LKI_std = evaluate(data_list[i], label_nums[j], LI, LKI, runs=20)
        LI_means[i][j] = LI_mean
        LI_stds[i][j] = LI_std
        LKI_means[i][j] = LKI_mean
        LKI_stds[i][j] = LKI_std
        
        result_LI.append(f"{'{:.4f}'.format(LI_mean)}±{'{:.4f}'.format(LI_std)}")
        result_LKI.append(f"{'{:.4f}'.format(LKI_mean)}±{'{:.4f}'.format(LKI_std)}")
    
    results_LI.append(result_LI)
    results_LKI.append(result_LKI)

#tabulate the results
print(tabulate(results_LI, 
               headers = ["Samples per label", 
                          "L = 30", 
                          "L = 35",
                          "L = 40",
                          "L = 45",
                          "L = 48"], 
               tablefmt = "simple_outline",
               stralign = "center",
               numalign = "center"))
print(tabulate(results_LKI, 
               headers = ["Samples per label", 
                          "L = 30", 
                          "L = 35",
                          "L = 40",
                          "L = 45",
                          "L = 48"], 
               tablefmt = "simple_outline",
               stralign = "center",
               numalign = "center"))

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.55it/s]

┌─────────────────────┬───────────────┬───────────────┬───────────────┬───────────────┬───────────────┐
│  Samples per label  │    L = 30     │    L = 35     │    L = 40     │    L = 45     │    L = 48     │
├─────────────────────┼───────────────┼───────────────┼───────────────┼───────────────┼───────────────┤
│         50          │ 0.0563±0.0386 │ 0.0500±0.0289 │ 0.0275±0.0295 │ 0.0400±0.0663 │ 0.0250±0.0750 │
└─────────────────────┴───────────────┴───────────────┴───────────────┴───────────────┴───────────────┘
┌─────────────────────┬───────────────┬───────────────┬───────────────┬───────────────┬───────────────┐
│  Samples per label  │    L = 30     │    L = 35     │    L = 40     │    L = 45     │    L = 48     │
├─────────────────────┼───────────────┼───────────────┼───────────────┼───────────────┼───────────────┤
│         50          │ 0.0512±0.0311 │ 0.0567±0.0410 │ 0.0325±0.0327 │ 0.0500±0.0742 │ 0.0375±0.0893 │
└─────────────────────┴───────────────┴───────────────┴─────────


