# Philip Carr
# CS/CNS/EE 156a Homework 7 Code Part 1 (Jupyter Notebook)

Code for the Linear Regression Algorithm with Validation (Problems 1 - 5)

In [29]:
import numpy as np

In [30]:
def sign(x):
    """
    Return the sign of a number (1 if positive, 0 if 0, or -1 if
    negative).

    The positive test is strict, so an input of exactly zero yields 0
    rather than being folded into the positive case.

    Return type: int
    """
    if x > 0:
        return 1
    elif x == 0:
        return 0
    else:
        return -1

In [31]:
def get_data_from_file(filename):
    """
    Read labelled 2D points from the text file named filename.

    Each non-blank line is split on whitespace; its first two fields
    are parsed as the coordinates x1 and x2 and its third field as the
    corresponding value. A constant coordinate x0 = 1.0 is prepended
    to every point. Blank lines are skipped.

    Return type: tuple (points, values) of np.float64 arrays, where
    points has one row [1.0, x1, x2] per data line and values has one
    entry per data line.
    """
    points = []
    values = []
    # The context manager closes the file even if a line fails to parse.
    with open(filename, "r") as data_file:
        for line in data_file:
            data = line.split()
            if not data:
                # A blank (or whitespace-only) line carries no point.
                continue
            points.append([1.0, float(data[0]), float(data[1])])
            values.append(float(data[2]))
    return (np.array(points, dtype=np.float64),
            np.array(values, dtype=np.float64))

In [32]:
def get_transformed_points(points):
    """
    Apply the nonlinear feature map
    Φ(1, x1, x2) = (1, x1, x2, x1^2, x2^2, x1 x2, |x1 - x2|, |x1 + x2|)
    to every row of points and return the results as a np.float64
    array with one transformed row per input row.
    """
    rows = []
    for idx in range(len(points)):
        # Columns 0, 1, 2 of the input hold the constant term, x1 and x2.
        bias = points[idx, 0]
        x1 = points[idx, 1]
        x2 = points[idx, 2]
        rows.append([
            bias,
            x1,
            x2,
            x1 * x1,
            x2 * x2,
            x1 * x2,
            abs(x1 - x2),
            abs(x1 + x2),
        ])
    return np.array(rows, dtype=np.float64)

In [33]:
class LinReg:
    """
    This class represents the LinReg (Linear Regression Algorithm
    with regularization). This class contains the weights, as
    well as methods for running the Linear Regression Algorithm
    with regularization.

    Points may be given either as a 2D array (one row per point) or
    as a 1D array (one scalar feature per point). In the 1D case the
    optimized weights are a single scalar instead of a vector.
    """
    def __init__(self, n=7):
        """
        Initialize the weights of the LinReg to n + 1 zeros, using
        the given dimension n of the points to work with in R^n
        space.

        Return type: class (LinReg)
        """
        self.weights = np.zeros(n + 1, dtype=np.float64)

    def __repr__(self):
        """
        Return a string showing the weights of the LinReg.

        Return type: string
        """
        # __repr__ must return a string; printing here and returning
        # None makes repr()/print() on an instance raise TypeError.
        return "LinReg weights: {}".format(self.weights)

    def get_weights(self):
        """
        Return the weights of the LinReg.

        Return type: np.float64 array (or a np.float64 scalar after
        optimizing on 1D points)
        """
        return self.weights

    def evaluate(self, point):
        """
        Return the sign (as computed by sign()) of the point's value
        under the Linear Regression Algorithm's weights.

        point is either a sequence with one entry per weight or a
        single scalar feature; in the scalar case the weights must
        also be a scalar.
        """
        real_value = 0
        if np.ndim(point) == 0:
            # Scalar feature: only valid once the weights have been
            # optimized on 1D points and collapsed to a scalar.
            assert np.ndim(self.weights) == 0
            real_value += self.weights * point
        else:
            assert len(self.weights) == len(point)
            # Accumulate term by term, in index order.
            for weight, coordinate in zip(self.weights, point):
                real_value += weight * coordinate

        return sign(real_value)

    def optimize_weights(self, points, values,
                         regularization_k=None):
        """
        Optimize the LinReg weights using linear regression, i.e.
        weights = (Z^T Z + lambda I)^-1 Z^T y with Z = points and
        y = values. Regularization is used when regularization_k is
        not equal to None, in which case lambda = 10^regularization_k;
        otherwise lambda = 0.

        For 1D points the same formula is applied with scalars, so
        the resulting weights are a single scalar.

        Return type: None
        """
        points = np.asarray(points)

        if regularization_k is not None:
            lmda = np.power(float(10), regularization_k)
        else:
            lmda = 0

        ZTZ = np.dot(np.transpose(points), points)
        ZT_times_y = np.dot(np.transpose(points), values)

        if points.ndim > 1:
            # One regularization term per feature column.
            lmda_I = lmda * np.identity(points.shape[1])
            ZTZ_plus_lmda_I_inv = np.linalg.inv(ZTZ + lmda_I)
            self.weights = np.dot(ZTZ_plus_lmda_I_inv, ZT_times_y)
        else:
            # Z^T Z is a scalar here, so the inverse is a reciprocal.
            ZTZ_plus_lmda_I_inv = 1.0 / (ZTZ + lmda)
            self.weights = ZTZ_plus_lmda_I_inv * ZT_times_y

    def get_classification_error(self, points, values):
        """
        Return the classification error of the Linear Regression
        algorithm given the points and values: the fraction of
        points whose evaluate() result differs from the
        corresponding value.

        Return type: np.float64
        """
        misclassified_list = []
        for i in range(len(points)):
            if self.evaluate(points[i]) == values[i]:
                misclassified_list.append(0)
            else:
                misclassified_list.append(1)

        return np.mean(np.array(misclassified_list,
                                dtype=np.float64))

    def run(self, in_sample_points, in_sample_values,
            out_sample_points, out_sample_values,
            regularization_k=None):
        """
        Return the in-sample error and out-of-sample error of
        the Linear Regression algorithm after optimizing the
        weights of the algorithm using the given in-sample
        data (in_sample_points and in_sample_values).

        Return type: tuple of np.float64 values
        """
        self.optimize_weights(in_sample_points, in_sample_values,
                              regularization_k=regularization_k)
        in_sample_error = self.get_classification_error(
            in_sample_points, in_sample_values)
        out_sample_error = self.get_classification_error(
            out_sample_points, out_sample_values)
        return (in_sample_error, out_sample_error)

In [34]:
# Load the in-sample and out-of-sample data sets from their files and
# apply the nonlinear feature transformation to each set of points.
in_sample_points_from_file, in_sample_values = get_data_from_file("in.dta")
in_sample_points = get_transformed_points(in_sample_points_from_file)

out_sample_points_from_file, out_sample_values = get_data_from_file("out.dta")
out_sample_points = get_transformed_points(out_sample_points_from_file)

For Problem 1

In [35]:
# Split the in-sample data: the first n_train points are used for
# training and the last n_validation points for validation.
n_train = 25
n_validation = len(in_sample_points) - n_train

train_points = in_sample_points[:n_train]
train_values = in_sample_values[:n_train]

validation_points = in_sample_points[-n_validation:]
validation_values = in_sample_values[-n_validation:]

# NOTE(review): each "k" model below is fit on the single transformed
# feature column k (a 1D slice), not on columns 0 through k -- confirm
# this matches the intended definition of the k-th model.
LR3 = LinReg(n=1)
train_error3, validation_error3 = LR3.run(
    train_points[:, 3], train_values,
    validation_points[:, 3], validation_values)

LR4 = LinReg(n=1)
train_error4, validation_error4 = LR4.run(
    train_points[:, 4], train_values,
    validation_points[:, 4], validation_values)

LR5 = LinReg(n=1)
train_error5, validation_error5 = LR5.run(
    train_points[:, 5], train_values,
    validation_points[:, 5], validation_values)

LR6 = LinReg(n=1)
train_error6, validation_error6 = LR6.run(
    train_points[:, 6], train_values,
    validation_points[:, 6], validation_values)

LR7 = LinReg(n=1)
train_error7, validation_error7 = LR7.run(
    train_points[:, 7], train_values,
    validation_points[:, 7], validation_values)

# Report the validation error of every model, one per paragraph.
for _label, _error in (
        ("k = 3 model validation set classification error:",
         validation_error3),
        ("k = 4 model validation set classification error:",
         validation_error4),
        ("k = 5 model validation set classification error:",
         validation_error5),
        ("k = 6 model validation set classification error:",
         validation_error6),
        ("k = 7 model validation set classification error:",
         validation_error7)):
    print(_label, _error, "\n")

k = 3 model validation set classification error: 0.4 

k = 4 model validation set classification error: 0.4 

k = 5 model validation set classification error: 0.1 

k = 6 model validation set classification error: 0.4 

k = 7 model validation set classification error: 0.6 



For Problem 2

In [36]:
# Score each model fitted in the Problem 1 cell on the out-of-sample
# data, using the same feature column the model was trained on.
out_sample_error3 = LR3.get_classification_error(
    out_sample_points[:, 3], out_sample_values)
out_sample_error4 = LR4.get_classification_error(
    out_sample_points[:, 4], out_sample_values)
out_sample_error5 = LR5.get_classification_error(
    out_sample_points[:, 5], out_sample_values)
out_sample_error6 = LR6.get_classification_error(
    out_sample_points[:, 6], out_sample_values)
out_sample_error7 = LR7.get_classification_error(
    out_sample_points[:, 7], out_sample_values)

# Report the out-of-sample error of every model, one per paragraph.
for _label, _error in (
        ("k = 3 model out-of-sample classification error:",
         out_sample_error3),
        ("k = 4 model out-of-sample classification error:",
         out_sample_error4),
        ("k = 5 model out-of-sample classification error:",
         out_sample_error5),
        ("k = 6 model out-of-sample classification error:",
         out_sample_error6),
        ("k = 7 model out-of-sample classification error:",
         out_sample_error7)):
    print(_label, _error, "\n")

k = 3 model out-of-sample classification error: 0.472 

k = 4 model out-of-sample classification error: 0.472 

k = 5 model out-of-sample classification error: 0.168 

k = 6 model out-of-sample classification error: 0.472 

k = 7 model out-of-sample classification error: 0.528 



For Problem 3

In [37]:
# Reverse the roles of the two subsets: the last n_train_2 in-sample
# points are used for training and the first n_validation_2 points for
# validation.
n_train_2 = 10
n_validation_2 = len(in_sample_points) - n_train_2

train_points_2 = in_sample_points[-n_train_2:]
train_values_2 = in_sample_values[-n_train_2:]

validation_points_2 = in_sample_points[:n_validation_2]
validation_values_2 = in_sample_values[:n_validation_2]

# Sanity check: the new training set lines up with the earlier
# validation set, and the earlier training set with the new
# validation set (compared on the x1 coordinate).
for i in range(len(train_points_2)):
    assert train_points_2[i][1] == validation_points[i][1]
for i in range(len(train_points)):
    assert train_points[i][1] == validation_points_2[i][1]

LR3_2 = LinReg(n=1)
train_error3_2, validation_error3_2 = LR3_2.run(
    train_points_2[:, 3], train_values_2,
    validation_points_2[:, 3], validation_values_2)

LR4_2 = LinReg(n=1)
train_error4_2, validation_error4_2 = LR4_2.run(
    train_points_2[:, 4], train_values_2,
    validation_points_2[:, 4], validation_values_2)

LR5_2 = LinReg(n=1)
train_error5_2, validation_error5_2 = LR5_2.run(
    train_points_2[:, 5], train_values_2,
    validation_points_2[:, 5], validation_values_2)

LR6_2 = LinReg(n=1)
train_error6_2, validation_error6_2 = LR6_2.run(
    train_points_2[:, 6], train_values_2,
    validation_points_2[:, 6], validation_values_2)

LR7_2 = LinReg(n=1)
train_error7_2, validation_error7_2 = LR7_2.run(
    train_points_2[:, 7], train_values_2,
    validation_points_2[:, 7], validation_values_2)

# Report the validation error of every model, one per paragraph.
for _label, _error in (
        ("k = 3 model validation set classification error:",
         validation_error3_2),
        ("k = 4 model validation set classification error:",
         validation_error4_2),
        ("k = 5 model validation set classification error:",
         validation_error5_2),
        ("k = 6 model validation set classification error:",
         validation_error6_2),
        ("k = 7 model validation set classification error:",
         validation_error7_2)):
    print(_label, _error, "\n")

k = 3 model validation set classification error: 0.44 

k = 4 model validation set classification error: 0.44 

k = 5 model validation set classification error: 0.2 

k = 6 model validation set classification error: 0.44 

k = 7 model validation set classification error: 0.56 



For Problem 4

In [38]:
# Score each model fitted in the Problem 3 cell on the out-of-sample
# data, using the same feature column the model was trained on.
out_sample_error3_2 = LR3_2.get_classification_error(
    out_sample_points[:, 3], out_sample_values)
out_sample_error4_2 = LR4_2.get_classification_error(
    out_sample_points[:, 4], out_sample_values)
out_sample_error5_2 = LR5_2.get_classification_error(
    out_sample_points[:, 5], out_sample_values)
out_sample_error6_2 = LR6_2.get_classification_error(
    out_sample_points[:, 6], out_sample_values)
out_sample_error7_2 = LR7_2.get_classification_error(
    out_sample_points[:, 7], out_sample_values)

# Report the out-of-sample error of every model, one per paragraph.
for _label, _error in (
        ("k = 3 model out-of-sample classification error:",
         out_sample_error3_2),
        ("k = 4 model out-of-sample classification error:",
         out_sample_error4_2),
        ("k = 5 model out-of-sample classification error:",
         out_sample_error5_2),
        ("k = 6 model out-of-sample classification error:",
         out_sample_error6_2),
        ("k = 7 model out-of-sample classification error:",
         out_sample_error7_2)):
    print(_label, _error, "\n")

k = 3 model out-of-sample classification error: 0.472 

k = 4 model out-of-sample classification error: 0.472 

k = 5 model out-of-sample classification error: 0.168 

k = 6 model out-of-sample classification error: 0.472 

k = 7 model out-of-sample classification error: 0.528 



For Problem 5

In [39]:
# Summarize the model chosen in Problems 1 and 3 and restate its
# out-of-sample error under each of the two training splits.
print("Models chosen in Problems 1 and 3: k = 5 model for both",
      "problems.")
for _label, _error in (
        ("Problem 1 (k = 5 model) out-of-sample classification error:",
         out_sample_error5),
        ("Problem 3 (k = 5 model) out-of-sample classification error:",
         out_sample_error5_2)):
    print(_label, _error)

Models chosen in Problems 1 and 3: k = 5 model for both problems.
Problem 1 (k = 5 model) out-of-sample classification error: 0.168
Problem 3 (k = 5 model) out-of-sample classification error: 0.168
