# Linear Regression for Housing Data in Boston

In [38]:
# Imports
import numpy as np
import pandas as pd
import math
import time
import enum

In [39]:
# Feature Pre-Processing Type Enum
class FeaturePreProcessingType(enum.Enum):
    """
    Selects how the trainer compensates for features of different magnitudes.

    none                  -- features and learning rate are used as given.
    feature_scaling       -- features are rescaled before training and predicting.
    learning_rate_scaling -- features stay as given; a per-weight learning rate is used.
    """
    # Members are numbered 0, 1, 2 in declaration order.
    none, feature_scaling, learning_rate_scaling = range(3)

In [40]:
# Trainer Class
class Trainer:
    """
    Linear regression fitted with batch gradient descent.

    The data is an augmented matrix: every row is one sample, the last column
    is the output value and all preceding columns are features. The leading
    rows are used for training, the trailing rows are held out for testing.
    """
    def __init__(self,
                 data,
                 test_sample_ratio=0.0,
                 learning_rate=0.01,
                 features_pre_processing_type=FeaturePreProcessingType.feature_scaling):
        """
        Initialize a trainer with a training set data, which is an augmented matrix.

        data: 2-D array; it is sliced as data[rows, :-1] (features) and
            data[rows, -1] (outputs).
        test_sample_ratio: fraction (0 <= ratio < 1) of the rows, taken from the
            end of data, that is held out as test samples; the rest are
            training samples.
        learning_rate: step size of the gradient descent update.
        features_pre_processing_type: a FeaturePreProcessingType member.

        Raises ValueError when test_sample_ratio is outside [0, 1).
        """
        if not 0 <= test_sample_ratio < 1:
            raise ValueError('Test sample ratio has to be between greater than or equal to 0, and less than 1.')

        self.__data = data
        self.__learning_rate = learning_rate
        self.__test_sample_ratio = test_sample_ratio
        self.__features_pre_processing_type = features_pre_processing_type

        self.__setup_training_and_testing_sets()
        self.__setup_weights()

    def train(self, print_cost_while_training=False, max_iterations=None, tolerance=0.0):
        """
        Run gradient descent on the training samples and print a summary.

        print_cost_while_training: print the training cost after every update.
        max_iterations: optional upper bound on the number of weight updates;
            None (the default) means no bound.
        tolerance: two consecutive costs count as "unchanged" when they differ
            by at most this amount. The default 0.0 demands exact equality.

        Training stops once the cost has been unchanged more than ten times
        (the counter is not reset when the cost changes again), or when
        max_iterations is reached.

        Raises ValueError for a negative tolerance or max_iterations, and
        FloatingPointError when the cost becomes NaN or infinite.
        """
        if tolerance < 0:
            raise ValueError('Tolerance has to be greater than or equal to 0.')
        if max_iterations is not None and max_iterations < 0:
            raise ValueError('Max iterations has to be greater than or equal to 0.')

        print('Started training...')
        start_time = time.time()
        last_cost = self.__cost_of_training_set()
        cost_not_change_count = 0
        iteration_count = 0
        while cost_not_change_count <= 10:
            if max_iterations is not None and iteration_count >= max_iterations:
                print('Stopped after {0} iterations without converging.'.format(iteration_count))
                break
            change = self.__derivative_of_cost()
            # The learning rate is either a scalar or a 1-D vector with one
            # entry per weight; the element-wise product covers both cases.
            self.__weights = self.__weights - change * self.__learning_rate
            current_cost = self.__cost_of_training_set()
            if print_cost_while_training:
                print('cost: {0:.2f}'.format(current_cost))
            # A NaN cost never compares equal to anything, so without this
            # check a diverged run would loop forever.
            if not math.isfinite(current_cost):
                raise FloatingPointError(
                    'Training diverged: the cost is no longer finite. Try a smaller learning rate.')
            if abs(current_cost - last_cost) <= tolerance:
                cost_not_change_count += 1
            last_cost = current_cost
            iteration_count += 1

        end_time = time.time()
        print('Used {0:.2f} seconds to train model.'.format(end_time - start_time))
        print('Weights are: {0}'.format(self.__weights))
        try:
            self.__report_testing_performance()
        finally:
            print('Training finished.')

    def predict(self, features):
        """
        Return the model output for raw (unscaled, no x0 column) features.

        features holds one sample per row; a single 1-D sample is accepted too.
        The features are scaled first when the trainer uses feature scaling.
        """
        if self.__features_pre_processing_type == FeaturePreProcessingType.feature_scaling:
            features = self.__scale_features(features)

        features = self.__add_x0_column(features)
        return features @ self.__weights.transpose()

    #
    # Helper Methods
    #

    def __predict_scaled_with_x0_column_features(self, features):
        """Return the model output for features that are already prepared (scaled, x0 added)."""
        return features @ self.__weights.transpose()

    def __report_testing_performance(self):
        """Print cost and error rate of the held-out test samples, if there are any."""
        testing_sample_count = np.size(self.__testing_set_features, axis=0)
        if testing_sample_count == 0:
            # Averaging over zero samples would only produce NaN.
            print('No testing samples were held out, skipping the testing cost.')
            return

        cost_of_testing, error_rate_of_testing = self.__cost_and_error_rate(
            self.__testing_set_features, self.__testing_set_outputs)
        # Overflow is reported by NumPy as a warning, not as an exception, so
        # the result has to be inspected explicitly.
        if not math.isfinite(cost_of_testing):
            print('Cost for testing samples is too large, can\'t be printed.')
            return

        print('Cost for {0} testing samples is {1:.2f}'.format(testing_sample_count, cost_of_testing))
        print('Error rate for {0} testing samples is ±{1:.2f}%.'.format(testing_sample_count, error_rate_of_testing * 100))

    def __setup_training_and_testing_sets(self):
        """Split the data into training/testing features and outputs and pre-process them."""
        num_training_sample, _ = self.__get_training_and_testing_samples_counts()
        self.__training_set_features = self.__data[:num_training_sample, :-1]
        self.__training_set_outputs = self.__data[:num_training_sample, -1]
        self.__testing_set_features = self.__data[num_training_sample:, :-1]
        self.__testing_set_outputs = self.__data[num_training_sample:, -1]
        if self.__features_pre_processing_type != FeaturePreProcessingType.none:
            # Scaling parameters come from the training samples only.
            self.__update_feature_scaling_parameters()
            if self.__features_pre_processing_type == FeaturePreProcessingType.feature_scaling:
                self.__training_set_features = self.__scale_features(self.__training_set_features)
                self.__testing_set_features = self.__scale_features(self.__testing_set_features)
            elif self.__features_pre_processing_type == FeaturePreProcessingType.learning_rate_scaling:
                self.__scale_learning_rate_if_enabled()
        # The constant x0 = 1 column pairs with the intercept weight.
        self.__training_set_features = self.__add_x0_column(self.__training_set_features)
        self.__testing_set_features = self.__add_x0_column(self.__testing_set_features)

    def __get_training_and_testing_samples_counts(self):
        """Return (training sample count, testing sample count); the training count is rounded up."""
        total_sample_count = np.size(self.__data, axis=0)
        training_set_count = math.ceil((1.0 - self.__test_sample_ratio) * total_sample_count)
        testing_set_count = total_sample_count - training_set_count
        return (training_set_count, testing_set_count)

    def __update_feature_scaling_parameters(self):
        """Compute the per-column standard deviation and value range of the training features."""
        self.__feature_scaling_std = np.std(self.__training_set_features, axis=0)
        value_range = np.max(self.__training_set_features, axis=0) - np.min(self.__training_set_features, axis=0)
        # A constant column has a range of 0; dividing by it would turn the
        # whole feature into NaN/inf, so such a column is left unscaled.
        value_range[value_range == 0] = 1
        self.__feature_scaling_range = value_range

    def __scale_features(self, features):
        """Shift features by the training std and divide by the training range, column by column."""
        # NOTE(review): conventional mean normalisation subtracts the mean, not
        # the standard deviation. The offset is kept as is because the same
        # transform is applied to training, testing and prediction inputs.
        return (features - self.__feature_scaling_std) / self.__feature_scaling_range

    def __add_x0_column(self, A):
        """Return a copy of A with a leading 1 per sample (a column for 2-D input, one element for 1-D input)."""
        if np.ndim(A) < 2:
            # A single sample has no axis 1 to insert a column along.
            return np.insert(A, obj=0, values=1)
        return np.insert(A, obj=0, values=1, axis=1)

    def __setup_weights(self):
        """Start with one zero weight per column of the prepared training features."""
        self.__weights = np.zeros(np.size(self.__training_set_features, axis=1))

    def __cost_of_training_set(self):
        """Return the cost of the current weights on the training samples."""
        result, _ = self.__cost_and_error_rate(self.__training_set_features, self.__training_set_outputs)
        return result

    def __cost_of_testing_set(self):
        """Return the cost of the current weights on the testing samples."""
        result, _ = self.__cost_and_error_rate(self.__testing_set_features, self.__testing_set_outputs)
        return result

    def __cost_and_error_rate(self, features, outputs):
        """
        Return (cost, error rate) for prepared features and their expected outputs.

        The cost is half the mean squared difference; the error rate is the
        mean of |difference| / prediction.
        """
        predictions = self.__predict_scaled_with_x0_column_features(features)
        diff = np.array(outputs - predictions)
        diff_squared = np.power(diff, 2)
        result_cost = np.average(diff_squared) / 2.0
        # NOTE(review): the ratio is taken against the predictions, so a zero or
        # negative prediction distorts the rate - confirm this is intended.
        result_error_rate = np.average(np.abs(diff) / predictions)
        return (result_cost, result_error_rate)

    def __derivative_of_cost(self):
        """Return the gradient of the training cost with respect to every weight."""
        predictions = self.__predict_scaled_with_x0_column_features(self.__training_set_features)
        diff = predictions - self.__training_set_outputs
        # Multiply every sample row by its own prediction error, then average
        # over the samples.
        features_scaled_with_diff = (self.__training_set_features.transpose() * diff).transpose()
        return np.average(features_scaled_with_diff, axis=0)

    def __scale_learning_rate_if_enabled(self):
        """Replace the scalar learning rate by one rate per weight when learning-rate scaling is selected."""
        if self.__features_pre_processing_type != FeaturePreProcessingType.learning_rate_scaling:
            return
        current_flat_rate = self.__learning_rate
        # NOTE(review): the rate is multiplied by the feature's standard
        # deviation, as before; confirm the direction of this scaling.
        per_feature_rates = current_flat_rate * self.__feature_scaling_std
        # The rates form a 1-D vector, so the flat rate for the x0 weight is
        # prepended without an axis argument (a 1-D array has no axis 1).
        self.__learning_rate = np.insert(per_feature_rates, obj=0, values=current_flat_rate)

In [41]:
# Get Data
# The file has no header row and separates its columns with runs of
# whitespace. sep=r'\s+' replaces the deprecated delim_whitespace=True.
df = pd.read_csv('housing/housing.data', header=None, sep=r'\s+')
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() returns the same
# 2-D array of values.
data = df.to_numpy()

In [None]:
# Setup Trainer
# Hold out 5% of the rows (taken from the end of the data) as test samples and
# rescale the features before running gradient descent.
trainer = Trainer(
    data,
    features_pre_processing_type=FeaturePreProcessingType.feature_scaling,
    learning_rate=0.01,
    test_sample_ratio=0.05,
)

In [None]:
# Start Training
# Fit the weights on the training samples; the per-iteration cost output is
# switched off, so only the start message and the final summary are printed.
trainer.train(print_cost_while_training=False)

Started training...


In [None]:
# Predicting
# Samples to score: rows are separated by ';' and every row holds 13
# whitespace-separated feature values (there is no output column).
sample_features = np.matrix(
'6.39312   0.00  18.100  0  0.5840  6.1620  97.40  2.2060  24  666.0  20.20 302.76  24.10;\
 4.87141   0.00  18.100  0  0.6140  6.4840  93.60  2.3053  24  666.0  20.20 396.21  18.68;\
15.02340   0.00  18.100  0  0.6140  5.3040  97.30  2.1007  24  666.0  20.20 349.48  24.91;\
10.23300   0.00  18.100  0  0.6140  6.1850  96.70  2.1705  24  666.0  20.20 379.70  18.03;\
14.33370   0.00  18.100  0  0.6140  6.2290  88.00  1.9512  24  666.0  20.20 383.32  13.11;\
 5.82401   0.00  18.100  0  0.5320  6.2420  64.70  3.4242  24  666.0  20.20 396.90  10.74;\
 5.70818   0.00  18.100  0  0.5320  6.7500  74.90  3.3317  24  666.0  20.20 393.07   7.74;\
 5.73116   0.00  18.100  0  0.5320  7.0610  77.00  3.4106  24  666.0  20.20 395.28   7.01;\
 2.81838   0.00  18.100  0  0.5320  5.7620  40.30  4.0983  24  666.0  20.20 392.92  10.42;\
 2.37857   0.00  18.100  0  0.5830  5.8710  41.90  3.7240  24  666.0  20.20 370.73  13.34;\
 3.67367   0.00  18.100  0  0.5830  6.3120  51.90  3.9917  24  666.0  20.20 388.62  10.58;\
 5.69175   0.00  18.100  0  0.5830  6.1140  79.80  3.5459  24  666.0  20.20 392.68  14.98;\
 4.83567   0.00  18.100  0  0.5830  5.9050  53.20  3.1523  24  666.0  20.20 388.22  11.45;\
 0.15086   0.00  27.740  0  0.6090  5.4540  92.70  1.8209   4  711.0  20.10 395.09  18.06;\
 0.18337   0.00  27.740  0  0.6090  5.4140  98.30  1.7554   4  711.0  20.10 344.05  23.97;\
 0.20746   0.00  27.740  0  0.6090  5.0930  98.00  1.8226   4  711.0  20.10 318.43  29.68;\
 0.10574   0.00  27.740  0  0.6090  5.9830  98.80  1.8681   4  711.0  20.10 390.11  18.07;\
 0.11132   0.00  27.740  0  0.6090  5.9830  83.50  2.1099   4  711.0  20.10 396.90  13.35;\
 0.17331   0.00   9.690  0  0.5850  5.7070  54.00  2.3817   6  391.0  19.20 396.90  12.01;\
 0.27957   0.00   9.690  0  0.5850  5.9260  42.60  2.3817   6  391.0  19.20 396.90  13.59;\
 0.17899   0.00   9.690  0  0.5850  5.6700  28.80  2.7986   6  391.0  19.20 393.29  17.60;\
 0.28960   0.00   9.690  0  0.5850  5.3900  72.90  2.7986   6  391.0  19.20 396.90  21.14;\
 0.26838   0.00   9.690  0  0.5850  5.7940  70.60  2.8927   6  391.0  19.20 396.90  14.10;\
 0.23912   0.00   9.690  0  0.5850  6.0190  65.30  2.4091   6  391.0  19.20 396.90  12.92;\
 0.17783   0.00   9.690  0  0.5850  5.5690  73.50  2.3999   6  391.0  19.20 395.77  15.10;\
 0.22438   0.00   9.690  0  0.5850  6.0270  79.70  2.4982   6  391.0  19.20 396.90  14.33;\
 0.06263   0.00  11.930  0  0.5730  6.5930  69.10  2.4786   1  273.0  21.00 391.99   9.67;\
 0.04527   0.00  11.930  0  0.5730  6.1200  76.70  2.2875   1  273.0  21.00 396.90   9.08;\
 0.06076   0.00  11.930  0  0.5730  6.9760  91.00  2.1675   1  273.0  21.00 396.90   5.64;\
 0.10959   0.00  11.930  0  0.5730  6.7940  89.30  2.3889   1  273.0  21.00 393.45   6.48;\
 0.04741   0.00  11.930  0  0.5730  6.0300  80.80  2.5050   1  273.0  21.00 396.90   7.88'
)
predictions = trainer.predict(sample_features)
# Flatten the result to a plain 1-D array so it can be printed one value per
# line, right-aligned in a field of six characters.
for predicted_value in np.asarray(predictions).squeeze():
    print('{0:2.2f}'.format(predicted_value).rjust(6))