# Regression

## Description
This Python script performs linear or logistic regression, using either gradient descent or the normal equation.

In [60]:
#imports
import numpy as np
import pandas as pd
import math
import time
import enum

In [61]:
# Regression Type
class RegressionType(enum.Enum):
    """Kind of regression model a ``Trainer`` is asked to fit.

    ``Trainer`` compares its ``regression_type`` argument against these
    members to decide how the weights are initialised.
    """
    # Linear regression.
    linear = 1
    # Logistic regression.
    logistic = 2

In [62]:
# Feature Preprocessing Type
class FeaturePreProcessingType(enum.Enum):
    """Preprocessing option for input features.

    NOTE(review): no code in this part of the file reads this enum yet;
    presumably ``feature_scaling`` is meant to select ``DataScalar`` --
    confirm against the caller.
    """
    # Use the features as given.
    no_preprocessing = 0
    # Scale the features before training.
    feature_scaling = 1

In [63]:
# Linear Regression Algorithm Class
class LinearRegressionAlgorithmType(enum.Enum):
    """Algorithm used to fit the regression weights.

    Note that Enum members are truthy regardless of their value, so
    ``unspecified`` (value 0) must be detected with an explicit
    comparison rather than a truthiness test.
    """
    # Default argument of ``Trainer.train``; no algorithm chosen by the caller.
    unspecified = 0
    # Gradient descent.
    gradient_descent = 1
    # Normal equation.
    normal_equation = 2

In [64]:
class DataProcessor():
    """Stateless helpers for shaping a data set before training."""

    @staticmethod
    def add_x0_column(A):
        """Return a copy of 2-D array ``A`` with a column of ones prepended.

        The extra column is inserted at index 0 along axis 1.
        """
        return np.insert(A, obj=0, values=1, axis=1)

    @staticmethod
    def augmented_to_coefficient_and_b(A):
        """Split an augmented matrix into ``(coefficients, b)``.

        Returns:
            A tuple of every column of ``A`` except the last, and the last
            column of ``A`` as a 1-D slice.
        """
        return (A[:, :-1], A[:, -1])

    @staticmethod
    def partition(A, atInd):
        """Split ``A`` along its first axis into ``(A[:atInd], A[atInd:])``."""
        return (A[:atInd], A[atInd:])

    @staticmethod
    def get_category_to_number_and_vise_versa_dict(categories, case_sensitive=True):
        """Build lookup tables between category labels and integer ids.

        Args:
            categories: Iterable of hashable category labels; duplicates are
                allowed and collapsed.
            case_sensitive: When False, string labels are lower-cased before
                being numbered, so ``'Cat'`` and ``'CAT'`` share one id.
                Non-string labels are left untouched.

        Returns:
            A tuple ``(cat_to_num, num_to_cat)`` of dicts that are inverses
            of each other. Ids are assigned from 0 in order of first
            appearance, which keeps the mapping reproducible between runs.
        """
        if not case_sensitive:
            categories = [
                x.lower() if isinstance(x, str) else x for x in categories
            ]
        cat_to_num = {}
        num_to_cat = {}
        # dict.fromkeys de-duplicates while preserving first-seen order; a
        # plain set would number string labels differently from run to run.
        for i, category in enumerate(dict.fromkeys(categories)):
            cat_to_num[category] = i
            num_to_cat[i] = category
        return (cat_to_num, num_to_cat)

In [65]:
class DataScalar():
    """Standardises features using the mean and deviation of a reference set.

    The per-feature average and standard deviation are computed once from the
    data passed to the constructor and then reused for any later data, so
    training and new samples are scaled consistently.
    """

    def __init__(self, data, data_has_x0_column=False):
        """Compute the scaling statistics for ``data``.

        Args:
            data: 2-D array-like with one row per sample.
            data_has_x0_column: True when column 0 is the constant bias
                column; that column is excluded from the statistics and is
                passed through unchanged by ``scaled_data``.
        """
        self.__data = np.asarray(data)
        self.__data_has_x0_column = data_has_x0_column
        self.__calculate_scalars()

    def scaled_data(self):
        """Return the constructor's data scaled by its own statistics."""
        return self.scale_new_data(self.__data, self.__data_has_x0_column)

    def scale_new_data(self, data, input_has_x0_column=False):
        """Scale ``data`` with the statistics learned at construction.

        Args:
            data: Array whose feature columns match the reference data.
            input_has_x0_column: True when ``data`` carries a leading bias
                column, which is then left unchanged.

        Returns:
            ``(data - average) / deviation`` computed per column.
        """
        if input_has_x0_column:
            # Offset 0 and divisor 1 leave the bias column as it is.
            avg = np.insert(self.__avg, obj=0, values=0)
            std = np.insert(self.__std, obj=0, values=1)
        else:
            avg = self.__avg
            std = self.__std
        return (data - avg) / std

    def __calculate_scalars(self):
        """Store the per-feature average and standard deviation."""
        if self.__data_has_x0_column:
            features = self.__data[:, 1:]
        else:
            features = self.__data
        self.__avg = np.average(features, axis=0)
        std = np.std(features, axis=0)
        # A constant feature has zero deviation; dividing by it would yield
        # NaN/inf, so use 1 and let such a column scale to all zeros.
        self.__std = np.where(std == 0, 1.0, std)

In [66]:
class Trainer():
    """Prepares the weights for a linear or logistic regression model.

    The optimisation step itself is not implemented yet (see the TODO in
    ``train``); calling ``train`` currently initialises the weights to zeros
    and resolves which algorithm would be used.
    """

    def __init__(self,
                coefficient_matrix,
                outputs,
                regression_type):
        """Store the training set.

        Args:
            coefficient_matrix: 2-D array with one row per sample and one
                column per feature.
            outputs: Target values, one per sample.
            regression_type: A ``RegressionType`` member.
        """
        self.__x = coefficient_matrix
        self.__y = outputs
        self.__regression_type = regression_type

    def train(self,
              training_algorithm=LinearRegressionAlgorithmType.unspecified,
              learning_rate=0.01,
              regularization_lambda=0.0):
        """Initialise the weights and choose the training algorithm.

        Args:
            training_algorithm: A ``LinearRegressionAlgorithmType`` member.
                ``unspecified`` (or None) selects one automatically from the
                regression type and the number of features.
            learning_rate: Not used yet; reserved for the training loop.
            regularization_lambda: Not used yet; reserved for the training
                loop.

        Raises:
            ValueError: If the regression type is not recognised, or if
                logistic regression is requested with fewer than two
                distinct output values.
        """
        self.__setup_training()

        print('Started training......')
        # Not read yet; kept for timing the training loop once it exists.
        start_time = time.time()
        # Enum members are always truthy, so ``not training_algorithm`` never
        # detects ``unspecified``; compare explicitly instead.
        if (training_algorithm is None
                or training_algorithm == LinearRegressionAlgorithmType.unspecified):
            training_algorithm = self.__get_training_alg_from_num_features()
        #TODO

    def get_weights(self):
        """Return the weights; only available after ``train`` has been called."""
        return self.__theta

    def __setup_training(self):
        """Initialise the weights according to the regression type."""
        if self.__regression_type == RegressionType.linear:
            self.__setup_linear_regression()
        elif self.__regression_type == RegressionType.logistic:
            self.__setup_logistic_regression()
        else:
            raise ValueError('Cannot start training, regression type not specified.')

    def __setup_linear_regression(self):
        """Start linear regression from one zero weight per feature column."""
        self.__theta = np.zeros(np.size(self.__x, axis=1))

    def __setup_logistic_regression(self):
        """Collect the output categories and allocate zero weights.

        Two categories get a single weight vector; more than two get one
        row of weights per category.
        """
        # np.unique yields a sorted 1-D array of the distinct outputs, which
        # can be measured along axis 0 (np.array(set(...)) would be 0-d).
        self.__categories = np.unique(self.__y)
        feature_count = np.size(self.__x, axis=1)
        cat_count = np.size(self.__categories, axis=0)
        if cat_count < 2:
            raise ValueError('Cannot do logistic regression, there is only one kind of output.')
        elif cat_count == 2:
            self.__binary_classification = True
            self.__theta = np.zeros(feature_count)
        else:
            self.__binary_classification = False
            theta_shape = (cat_count, feature_count)
            self.__theta = np.zeros(shape=theta_shape)

    def __get_training_alg_from_num_features(self):
        """Pick a default algorithm for the stored training set.

        Returns ``normal_equation`` for linear regression with fewer than
        10000 feature columns, and ``gradient_descent`` in every other case.
        """
        num_features_threshold = 10000
        training_set_num_features = np.size(self.__x, axis=1)
        linear_reg = self.__regression_type == RegressionType.linear
        feature_count_small = training_set_num_features < num_features_threshold
        if linear_reg and feature_count_small:
            return LinearRegressionAlgorithmType.normal_equation
        else:
            return LinearRegressionAlgorithmType.gradient_descent
        

IndentationError: expected an indented block (<ipython-input-66-db50fb9fc77e>, line 6)