In [1]:
import numpy as np
import matplotlib 
matplotlib.use('nbagg')
import matplotlib.pyplot as plt
import random

array([442, 843, 589,  93, 930, 382, 918, 549, 290, 962])

In [33]:
# Function to generate the Dataset for a non-linear target of the form sign(x1^2 + x2^2 - 0.6)
# Assumptions: Range of values on the x and y axis are the same
#              Code is only for classifying data in 2D
def gen_data_non_lnr_trnsfrm(llim,ulim,N):
    """
    Generate N uniformly distributed 2D points labelled by sign(x1^2 + x2^2 - 0.6),
    together with a copy of the labels in which N//10 distinct points are flipped.

    llim: a scalar defining the lower limit on the x and y axis
    ulim: a scalar defining the upper limit on the x and y axis
    N: Total number of data points (integer)

    Output: Returns a single numpy array of Nx5 with the columns
        index        element
        0 -          all 1s (bias term)
        1 -          x1, drawn uniformly between llim and ulim
        2 -          x2, drawn uniformly between llim and ulim
        3 -          labels (+1/-1) with noise: N//10 distinct rows have their sign flipped
        4 -          noise-free labels (+1/-1); +1 where x1^2 + x2^2 - 0.6 > 0, otherwise -1
    """
    x1 = np.random.uniform(llim, ulim, N)
    x2 = np.random.uniform(llim, ulim, N)

    # Noise-free target: points strictly outside the circle get +1, the rest -1.
    labels_clean = np.where(x1**2 + x2**2 - 0.6 > 0, 1.0, -1.0)

    # Flip the sign on a tenth of the points. A permutation guarantees the chosen
    # indices are distinct, so exactly N//10 labels differ from the clean ones
    # (sampling with replacement could pick the same row more than once).
    n_flip = N // 10
    flip_idx = np.random.permutation(N)[:n_flip]
    labels_noisy = labels_clean.copy()
    labels_noisy[flip_idx] = -labels_clean[flip_idx]

    data_complete = np.column_stack((np.ones(N), x1, x2, labels_noisy, labels_clean))

    return data_complete

def lin_reg(data_complete):
    """
    Fit linear-regression weights for the features (1, x1, x2) by least squares.

    data_complete: A numpy array with at least 4 columns; the first three columns are the
                features (a column of 1s followed by the two coordinates) and the fourth
                column (index 3) contains the categories to be learnt. Any further
                columns are ignored.

    Output: Returns a 3x1 column vector of final weights minimising the squared error
    between the features times the weights and the categories.
    """
    features = data_complete[:, 0:3]
    labels = data_complete[:, 3].reshape(-1, 1)
    # Solve the least-squares problem directly instead of forming and inverting
    # X^T X: this is better conditioned and still returns a (minimum-norm)
    # solution when the feature columns are linearly dependent.
    final_weight = np.linalg.lstsq(features, labels, rcond=None)[0]
    return final_weight

# Function to extract the misclassified values 
def ident_misclass_pts(data_complete, weight_3D):
    """
    Classify every row with sign(features . weights) and collect the rows whose
    prediction disagrees with the (noisy) category stored in column 3.

    data_complete: A numpy array of Nx5 with the first column being all 1s, second and third column 
                consists of N points generated uniformly between llim and ulim, the fourth column
                contains the categories to be learnt with added noise and the last column contains
                the true uncontaminated categories
    weight_3D: The 3 weights for the columns (1, x1, x2), given either as a 3x1 column
                vector or as a flat vector of length 3
    
    Output: Returns a tuple with the following elements at the respective indices
     index        element
        0 - a numpy array containing only the missclassified rows, with the same columns as data_complete
        1 - a scalar giving the proportion of missclassified points for the specified weight vector
    
    """
    # Normalise the weights to a column so flat vectors do not broadcast into an NxN mask.
    weight_col = np.asarray(weight_3D).reshape(-1, 1)
    scores = np.dot(data_complete[:, 0:3], weight_col)[:, 0]
    # A score of exactly 0 is classified as -1 (strict comparison).
    predicted = np.where(scores > 0, 1.0, -1.0)
    is_misclassified = predicted != data_complete[:, 3]
    # A 1-D boolean mask selects whole rows, whatever the number of columns.
    missclass_data_pts = data_complete[is_misclassified]
    error_frac = missclass_data_pts.shape[0]/data_complete.shape[0]
    return (missclass_data_pts,error_frac)

In [91]:
# Lower and upper sampling limits passed to gen_data_non_lnr_trnsfrm for both axes.
llim = -1
ulim = 1
# Number of points in each generated dataset.
N = 1000
# Number of independently generated datasets each experiment loops over and averages across.
nsims = 10000
# Running total of the in-sample error; the linear-regression experiment adds to it
# once per simulation and divides by nsims when printing.
# NOTE(review): because the reset lives in this cell, this cell has to be re-run
# before re-running that experiment, otherwise the total keeps growing.
Ein = 0

In [68]:
# Computing the Ein for multiple linear regression involving just the terms (1,x1,x2)
# The accumulator is reset here so that this cell is idempotent: re-running it
# on its own always reports the average over exactly nsims fresh datasets.
Ein = 0
for i in range(nsims):
    data_complete = gen_data_non_lnr_trnsfrm(llim,ulim,N)
    final_reg_weights = lin_reg(data_complete)
    # Element 1 of the returned tuple is the fraction of misclassified points.
    misclassified_fraction = ident_misclass_pts(data_complete, final_reg_weights)[1]
    Ein += misclassified_fraction
print("The average in sample error is: ", Ein/nsims)

The average in sample error is:  0.5049529999999994


In [92]:
# Average, over nsims freshly generated datasets, the linear-regression weights
# fitted on the transformed features (1,x1,x2,x1*x2,x1^2,x2^2)
weights = np.zeros((6,1))
for i in range(nsims):
    data_complete = gen_data_non_lnr_trnsfrm(llim,ulim,N)
    x1 = data_complete[:,1]
    x2 = data_complete[:,2]
    # Columns 0-5 are the transformed features, the remaining columns are the
    # label columns carried over unchanged from data_complete.
    full_feature_dataset = np.column_stack(
        (data_complete[:,0:3], x1*x2, x1**2, x2**2, data_complete[:,3:])
    )
    design = full_feature_dataset[:,0:6]
    labels = full_feature_dataset[:,6].reshape(-1,1)
    # Normal equations: w = (X^T X)^-1 X^T y
    gram_inverse = np.linalg.inv(design.T @ design)
    moment = design.T @ labels
    final_reg_weight = (gram_inverse @ moment).reshape(6,1)
    weights += final_reg_weight
weights /= nsims
print("The average learned weight vector is: ", weights)

The average learned weight vector is:  [[ -1.00430845e+00]
 [  1.12178328e-03]
 [ -7.09418645e-04]
 [ -8.20211456e-04]
 [  1.57604623e+00]
 [  1.57702279e+00]]


In [93]:
# Computing Eout for the average hypothesis calculated in the previous cell for the features
# (1,x1,x2,x1*x2,x1^2,x2^2)
# Each iteration draws a fresh dataset, classifies it with the averaged weights and
# measures the fraction of points whose prediction differs from the noisy labels
# (column 3 of the generated data).
Eout = 0.0
for i in range(nsims):
    data_complete = gen_data_non_lnr_trnsfrm(llim,ulim,N)
    x1 = data_complete[:,1]
    x2 = data_complete[:,2]
    features = np.column_stack((data_complete[:,0:3], x1*x2, x1**2, x2**2))
    # ravel() makes the scores 1-D whether weights is a 6x1 column or a flat vector.
    scores = np.dot(features, weights).ravel()
    predicted = np.where(scores > 0, 1.0, -1.0)
    # np.mean of the boolean mask is the misclassified fraction as a plain scalar;
    # the builtin sum() over an Nx1 array would yield a 1-element array instead.
    error_frac = float(np.mean(predicted != data_complete[:,3]))
    Eout += error_frac
print("The average Eout is: ",Eout/nsims)

The average Eout is:  [ 0.1188002]
