In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import csv

# Seed every random number generator this notebook imports so that
# Restart Kernel -> Run All reproduces the same numbers. `random` is
# imported above, so it is seeded alongside NumPy's legacy global generator.
np.random.seed(0)
random.seed(0)

## Linear Ridge Regression in Multiple Dimensions [Medium]

In [2]:
# utilities for creating a dataset

### data structure to represent dataset
class Dataset:
    """In-memory supervised dataset pairing a feature container with labels.

    Parameters
    ----------
    x : np.ndarray
        Feature container; ``x[i]`` is the i-th sample.
    y : np.ndarray
        Label container; ``y[i]`` is the label of the i-th sample.
    n : int, optional
        Number of samples exposed through ``len()`` and iteration.
        Defaults to ``len(x)``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, or if ``n`` lies outside
        ``[0, len(x)]``.

    Notes
    -----
    There is deliberately no ``__del__``: the attributes are released by the
    garbage collector, and a finalizer that deletes them by hand fails on
    objects whose ``__init__`` did not complete.
    """

    # Sentinel telling __setitem__ that no separate label argument was given.
    _UNSET = object()

    def __init__(self, x: np.ndarray, y: np.ndarray, n = None):
        if len(x) != len(y):
            raise ValueError(
                f'x and y must have the same number of samples, got {len(x)} and {len(y)}'
            )
        # `n is None` (rather than truthiness) so an explicit n=0 is honoured.
        if n is None:
            n = len(x)
        elif not 0 <= n <= len(x):
            raise ValueError(f'n must be between 0 and {len(x)}, got {n}')
        self.__x = x
        self.__y = y
        self.__n = n
        # Cursor used only by the legacy __next__ protocol.
        self.__index = 0

    @property
    def x(self):
        """The feature container."""
        return self.__x

    @property
    def y(self):
        """The label container."""
        return self.__y

    # NOTE: the setters replace the container only; the stored length `n`
    # is not recomputed.
    @x.setter
    def x(self, value):
        self.__x = value

    @y.setter
    def y(self, value):
        self.__y = value

    def __getitem__(self, index: int):
        """Return the ``(x[index], y[index])`` pair.

        Indexing is delegated to the underlying containers, so whatever they
        accept (integers, slices, ...) works here, and they are the ones that
        reject an out-of-range index.
        """
        return self.__x[index], self.__y[index]

    def __setitem__(self, index: int, x_, y_ = _UNSET):
        """Overwrite the sample stored at ``index``.

        ``dataset[index] = (features, label)`` is the normal form: Python
        always passes exactly one value for subscript assignment, which
        arrives in ``x_`` and is unpacked into features and label.
        The explicit call ``dataset.__setitem__(index, features, label)``
        is still accepted.
        """
        if y_ is self._UNSET:
            x_, y_ = x_
        self.__x[index] = x_
        self.__y[index] = y_

    def __len__(self):
        """Number of samples exposed by the dataset."""
        return self.__n

    def __iter__(self):
        """Iterate over ``(x, y)`` pairs for the first ``len(self)`` samples.

        Every call returns an independent generator, so nested or overlapping
        loops over the same dataset do not disturb each other. The legacy
        cursor is reset as well, for callers that follow ``iter(ds)`` with
        manual ``next(ds)`` calls.
        """
        self.__index = 0
        return (self[i] for i in range(self.__n))

    def __next__(self):
        """Advance the legacy shared cursor and return the sample it pointed at.

        Kept for callers that drive the dataset with ``next(ds)``; ``for``
        loops use the generator returned by ``__iter__`` instead.
        """
        if (self.__index < self.__n):
            self.__index += 1
            return self[self.__index - 1]
        raise StopIteration

In [3]:
# read data and create a dataset out of it
# NOTE: The only label field must be the last column

def read_raw(path: str):
    """Read a comma-separated file into a list of rows.

    Every field that ``float()`` can parse is converted to ``float``; all
    other fields are kept as the original strings. No row is treated
    specially, so a header line (if any) is returned as the first row.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    list[list]
        One list per CSV row, holding floats and/or strings.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    data = []
    # `with` closes the file even if parsing fails part-way through;
    # newline='' lets the csv module do its own newline handling.
    with open(path, "r", newline="") as file:
        for row in csv.reader(file, delimiter = ','):
            for (i, value) in enumerate(row):
                try:
                    row[i] = float(value)
                except ValueError:
                    # not a number: keep the field as a string
                    pass
            data.append(row)

    return data

def get_field_info(data):
    """Classify every feature column as numeric or non-numeric.

    The last column is the label and is skipped. A column is numeric when
    its value in the first row is exactly of type ``float``. For each
    non-numeric column a record ``{'count': ..., 'values': {...}}`` is built
    where ``values`` maps every distinct value to its rank in order of first
    appearance (0, 1, 2, ...) and ``count`` ends up as the largest rank,
    i.e. the number of distinct values minus one.

    Returns
    -------
    (dict, set)
        ``non_numeric_fields`` keyed by column index, and the set of
        numeric column indices.
    """
    first_row = data[0]
    feature_count = len(first_row) - 1

    numeric_fields = set()
    non_numeric_fields = {}
    for col in range(feature_count):
        if type(first_row[col]) is float:
            numeric_fields.add(col)
        else:
            # count starts at -1 so that the first distinct value gets rank 0
            non_numeric_fields[col] = {'count': -1, 'values': {}}

    # assign a rank to every distinct value of every non-numeric column
    for record in data:
        for col, info in non_numeric_fields.items():
            ranks = info['values']
            label = record[col]
            if label in ranks:
                continue
            info['count'] += 1
            ranks[label] = info['count']

    return non_numeric_fields, numeric_fields

def construct_design_matrix(data, non_numeric_fields, numeric_fields):
    """Build the design matrix and the label vector from parsed rows.

    The last column of every row is the label. Numeric feature columns are
    copied unchanged. Non-numeric columns are one-hot encoded with
    ``field['count']`` slots: the value of rank 0 is encoded as all zeros and
    a value of rank ``r > 0`` sets slot ``r - 1``. Dropping one slot per
    original field reduces the correlation between the generated columns.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The design matrix (one row per input row) and the label vector.
    """
    label_col = len(data[0]) - 1

    def encode(col, value):
        # numeric features pass through untouched
        if col in numeric_fields:
            return [value]
        info = non_numeric_fields[col]
        slots = [0] * info['count']
        rank = info['values'][value]
        if rank:
            slots[rank - 1] = 1
        return slots

    features, labels = [], []
    for record in data:
        labels.append(record[label_col])
        encoded = []
        for col in range(label_col):
            encoded.extend(encode(col, record[col]))
        features.append(encoded)

    return np.array(features), np.array(labels)

def get_dataset(path: str, hasHeader = True):
    """Load a CSV file and turn it into a ``Dataset``.

    Parameters
    ----------
    path : str
        Location of the CSV file; the label must be its last column.
    hasHeader : bool, optional
        When true (the default) the first row is discarded as a header.

    Returns
    -------
    Dataset
        Design matrix and label vector wrapped in a ``Dataset``.

    Raises
    ------
    IndexError
        If no rows remain once the optional header has been removed.
    """
    rows = read_raw(path)

    # skip the header line when the file has one
    records = rows[1:] if hasHeader else rows

    # nothing left to build a dataset from
    if not records:
        raise IndexError('No data in the given file')

    # work out which columns need one-hot encoding, then encode
    categorical, numeric = get_field_info(records)
    return Dataset(*construct_design_matrix(records, categorical, numeric))

In [4]:
# Relative path of the .csv file to load (the label must be its last column).
path = 'dataset/insurance.csv'

# Parse the file into a Dataset; get_dataset defaults to hasHeader=True,
# so the first row is dropped as a header.
dataset = get_dataset(path)

#### Feature Normalization

In [5]:
# normalize a 2-d matrix

def get_mean_variance(matrix):
    """Return the per-column mean and population variance of a 2-d matrix.

    Both results keep the reduced axis, so they have shape
    ``(1, num_cols)`` and broadcast against ``matrix``. The variance divides
    by the number of rows (no Bessel correction).
    """
    n_samples, _ = matrix.shape
    column_totals = np.sum(matrix, axis=0, keepdims=True)
    mean = column_totals / n_samples
    squared_deviation = np.square(matrix - mean)
    variance = np.sum(squared_deviation, axis=0, keepdims=True) / n_samples
    return mean, variance

def normalize(matrix, mean, variance):
    """Standardize ``matrix`` column-wise: ``(matrix - mean) / sqrt(variance)``.

    Parameters
    ----------
    matrix : np.ndarray
        Values to standardize.
    mean, variance : np.ndarray
        Per-column statistics that broadcast against ``matrix``.

    Returns
    -------
    np.ndarray
        The standardized values. A column whose variance is zero (a constant
        feature) is only centred, so it becomes all zeros instead of NaN.
    """
    std = variance ** 0.5
    # Dividing by a zero std would yield 0/0 = NaN for constant columns;
    # use 1 there so those columns are centred and left unscaled.
    safe_std = np.where(std == 0, 1.0, std)
    return (matrix - mean) / safe_std

# Standardize every feature column: the statistics are computed on the current
# design matrix and the normalized matrix then replaces dataset.x.
dataset.x = normalize(dataset.x, *get_mean_variance(dataset.x))
# Recompute the statistics on the normalized matrix as a sanity check; they
# are printed below, rounded to 4 decimals.
mean, variance = get_mean_variance(dataset.x)
print(f'Verifying Mean and Variance for Normalized Data (for each feature):\nMean\t\t{np.around(mean, 4)}\nVariance\t{np.around(variance, 4)}')

Verifying Mean and Variance for Normalized Data (for each feature):
Mean		[[-0. -0.  0.  0.  0.  0.  0. -0.]]
Variance	[[1. 1. 1. 1. 1. 1. 1. 1.]]


#### Partition dataset into training, validation and test sets