# Philip Carr
# CS/CNS/EE 156a Homework 6 Code (Jupyter Notebook)

Code for The Logistic Regression Algorithm (Problems 2 - 6)

In [1]:
import numpy as np

In [2]:
def sign(x):
    """
    Return the sign of a number (1 if positive, 0 if 0, or -1 if
    negative).
    
    Return type: int
    """
    if x >= 0:
        return 1
    elif x == 0:
        return 0
    else:
        return -1

In [3]:
def get_data_from_file(filename):
    """
    Return an array of points within the region [-1, 1] x [-1, 1]
    and an array of corresponding values together as a tuple obtained
    from the file with the given name filename.
    
    Return type: 2D array of points
    """
    file1 = open(filename, "r")
    points = []
    values = []
    for line in file1:
        data = line.split()
        x0 = 1.0
        x1 = float(data[0])
        x2 = float(data[1])
        points.append([x0, x1, x2])
        values.append(float(data[2]))
    file1.close()
    return (np.array(points, dtype=np.float64),
            np.array(values, dtype=np.float64))

In [4]:
def get_transformed_points(points):
    """
    Return an array of points transformed using the nonlinear
    transformation Φ(1, x1, x2) = (1, x1, x2, x1^2, x2^2, x1 x2,
    |x1 - x2|, |x1 + x2|).
    """
    transformed_points = []
    for i in range(len(points)):
        point = []
        point.append(points[i,0])
        point.append(points[i,1])
        point.append(points[i,2])
        point.append(points[i,1] * points[i,1])
        point.append(points[i,2] * points[i,2])
        point.append(points[i,1] * points[i,2])
        point.append(abs(points[i,1] - points[i,2]))
        point.append(abs(points[i,1] + points[i,2]))
        transformed_points.append(point)
    return np.array(transformed_points, dtype=np.float64)

In [5]:
class LinReg:
    """
    This class represents the LinReg (Linear Regression Algorithm
    with regularization).
    This class contains the weights, as well as methods for running
    the Linear Regression Algorithm with regularization.
    """
    def __init__(self, n=7):
        """
        Initialize the weights of the LinReg using the given
        dimension n of the points to work with in R^n space.
        
        Return type: class (LinReg)
        """
        self.weights = np.array([0] * (n + 1), dtype=np.float64)
    
    def __repr__(self):
        """
        Print the weights of the LinReg.
        
        Return type: string
        """
        print("LogReg weights:", self.weights)
    
    def get_weights(self):
        """
        Return the weights of the LinReg.
        
        Return type: list of floats
        """
        return self.weights
    
    def evaluate(self, point):
        """
        Return a point's value using the Linear Regression
        Algorithm's weights.
        """
        real_value = 0
        assert(len(self.weights) == len(point))
        for i in range(len(self.weights)):
            real_value += self.weights[i] * point[i]
            
        return sign(real_value)
    
    def optimize_weights(self, points, values,
                         regularization_k=None):
        """
        Optimize the LinReg weights using linear regression.
        Regularization is used when regularization_k is not
        equal to None.
        
        Return type: None
        """
        ZTZ = np.dot(np.transpose(points), points)
        
        dimension = len(points[0])
        I = np.identity(dimension)
        
        if regularization_k != None:
            lmda = np.power(float(10), regularization_k)
        else:
            lmda = 0
        
        lmda_I = lmda * I
        
        ZTZ_plus_lmda_I_inv = np.linalg.inv(ZTZ + lmda_I)
        
        ZT_times_y = np.dot(np.transpose(points), values)
        
        self.weights = np.dot(ZTZ_plus_lmda_I_inv, ZT_times_y)

    def get_classification_error(self, points, values):
        """
        Return the classification error of the Linear Regression
        algorithm given the points and values.
        """
        misclassified_list = []
        for i in range(len(points)):
            point = points[i]
            if self.evaluate(point) == values[i]:
                misclassified_list.append(0)
            else:
                misclassified_list.append(1)
        
        return np.mean(np.array(misclassified_list,
                                dtype=np.float64))
        
    def run(self, in_sample_points, in_sample_values,
            out_sample_points, out_sample_values,
            regularization_k=None):
        """
        Return the in-sample error and out-of-sample error of
        the Linear Regression algorithm after optimizing the
        weights of the algorithm using the given in-sample
        data (in_sample_points and in_sample_values).
        
        Return type: tuple of floats
        """
        self.optimize_weights(in_sample_points, in_sample_values,
                              regularization_k=regularization_k)
        in_sample_error = \
            self.get_classification_error(in_sample_points,
                                          in_sample_values)
        out_sample_error = \
            self.get_classification_error(out_sample_points,
                                          out_sample_values)
        return (in_sample_error, out_sample_error)

In [6]:
in_sample_points_from_file, in_sample_values = \
    get_data_from_file("in.dta")
in_sample_points = \
    get_transformed_points(in_sample_points_from_file)

out_sample_points_from_file, out_sample_values = \
    get_data_from_file("out.dta")
out_sample_points = \
    get_transformed_points(out_sample_points_from_file)

For Problem 2

In [7]:
LR2 = LinReg()
in_sample_error, out_sample_error = LR2.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values)
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)

In-sample Classification Error: 0.02857142857142857 
Out-of-sample Classification Error: 0.084


For Problem 3

In [8]:
LR3 = LinReg()
in_sample_error, out_sample_error = LR3.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values,
                                            regularization_k=-3)
print("k = -3")
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)

k = -3
In-sample Classification Error: 0.02857142857142857 
Out-of-sample Classification Error: 0.08


For Problem 4

In [9]:
LR3 = LinReg()
in_sample_error, out_sample_error = LR3.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values,
                                            regularization_k=3)
print("k = 3")
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)

k = 3
In-sample Classification Error: 0.37142857142857144 
Out-of-sample Classification Error: 0.436


For Problem 5

In [10]:
LR5_2 = LinReg()
in_sample_error, out_sample_error = LR3.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values,
                                            regularization_k=2)
print("k = 2")
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)
print()

LR5_2 = LinReg()
in_sample_error, out_sample_error = LR3.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values,
                                            regularization_k=2)
print("k = 1")
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)
print()

LR5_1 = LinReg()
in_sample_error, out_sample_error = LR3.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values,
                                            regularization_k=1)
print("k = 0")
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)
print()

LR5_neg1 = LinReg()
in_sample_error, out_sample_error = LR3.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values,
                                            regularization_k=-1)
print("k = -1")
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)
print()

LR5_neg2 = LinReg()
in_sample_error, out_sample_error = LR3.run(in_sample_points,
                                            in_sample_values,
                                            out_sample_points,
                                            out_sample_values,
                                            regularization_k=-2)
print("k = -2")
print("In-sample Classification Error:", in_sample_error,
      "\nOut-of-sample Classification Error:",
      out_sample_error)

k = 2
In-sample Classification Error: 0.2 
Out-of-sample Classification Error: 0.228

k = 1
In-sample Classification Error: 0.2 
Out-of-sample Classification Error: 0.228

k = 0
In-sample Classification Error: 0.05714285714285714 
Out-of-sample Classification Error: 0.124

k = -1
In-sample Classification Error: 0.02857142857142857 
Out-of-sample Classification Error: 0.056

k = -2
In-sample Classification Error: 0.02857142857142857 
Out-of-sample Classification Error: 0.084


For Problem 6

In [11]:
k_values = np.arange(-10,11)
min_index = -1
min_out_sample_error = float("inf")
for i in range(len(k_values)):
    LR6 = LinReg()
    in_sample_error, out_sample_error = \
        LR6.run(in_sample_points, in_sample_values,
                out_sample_points, out_sample_values,
                regularization_k=k_values[i])
    if min_out_sample_error > out_sample_error:
        min_out_sample_error = out_sample_error
        min_index = i

print("k that yields minimum out-of-sample classification error",
      "over range of k from", k_values[0], "to", k_values[-1],
      "(inclusive):", k_values[min_index])
print("Minimum out-of-sample classification error over range of k",
      "from", k_values[0], "to", k_values[-1], "(inclusive):",
      min_out_sample_error)

k that yields minimum out-of-sample classification error over range of k from -10 to 10 (inclusive): -1
Minimum out-of-sample classification error over range of k from -10 to 10 (inclusive): 0.056
