In [42]:
# Import libraries and Load the dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt

def load_csv(filename, delimiter=None):
    """Read a delimited text file into a list of rows of strings.

    Args:
        filename: Path of the file to read.
        delimiter: Field separator. With the default ``None`` every line is
            split on runs of whitespace. When a one-character string is
            given (for example ``','``) the file is parsed with
            ``csv.reader`` so quoted fields are handled.

    Returns:
        A list of rows, each a list of field strings. Blank lines are
        skipped. No type conversion is performed.
    """
    dataset = []
    if delimiter is None:
        with open(filename, 'r') as handle:
            for line in handle:
                fields = line.split()
                if fields:
                    dataset.append(fields)
        return dataset

    # csv.reader expects the file to be opened with newline='' so that it can
    # handle line endings embedded in quoted fields itself.
    with open(filename, 'r', newline='') as handle:
        for fields in reader(handle, delimiter=delimiter):
            # Skip rows that are empty or contain only whitespace.
            if any(field.strip() for field in fields):
                dataset.append(fields)
    return dataset

In [43]:
# Convert string column to float 
def str_column_to_float(dataset, column):
    """Convert one column of ``dataset`` to floats, in place.

    Args:
        dataset: List of rows, each a mutable list. The rows are modified
            in place; nothing is returned.
        column: Index of the column to convert.

    Behaviour for problematic values:
        * A value that cannot be parsed as a float is replaced by NaN and a
          warning is printed.
        * A row too short to contain ``column`` is reported and padded with
          NaN up to and including ``column``. A negative index that is out
          of range is only reported.
        * A value that is not a string is passed straight to ``float()``,
          so calling the function again on an already converted column is
          harmless.
    """
    nan = float('nan')
    for i, row in enumerate(dataset):
        try:
            value = row[column]
        except IndexError:
            print(f"Index error in row {row} and column {column}")
            # Assigning to the missing index would raise IndexError again,
            # so grow the row until the column exists.
            missing = column - len(row) + 1
            if missing > 0:
                row.extend([nan] * missing)
            continue

        # Only strings have surrounding whitespace to remove; numbers that
        # were converted earlier are left as they are.
        if isinstance(value, str):
            value = value.strip()
        try:
            dataset[i][column] = float(value)
        except ValueError:
            print(f"Warning: couldn't convert {row[column]} in row {i}, column {column}")
            dataset[i][column] = nan

# Split the dataset into train and test
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into a training list and a test list.

    Args:
        dataset: Sequence of rows. The sequence itself is not modified; a
            shallow copy is drawn from instead.
        split: Fraction of the rows to place in the training list. The
            resulting count is truncated to a whole number.

    Returns:
        A ``(train, test)`` tuple. ``train`` holds the rows in the order
        they were drawn with ``randrange``; ``test`` holds the rows that
        were not drawn, in their original order.
    """
    remaining = list(dataset)
    n_train = int(split * len(dataset))
    selected = []
    for _ in range(n_train):
        # Draw without replacement: the chosen row leaves the pool.
        pick = randrange(len(remaining))
        selected.append(remaining.pop(pick))
    return selected, remaining

In [44]:
# Find mean and variance
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    Raises ZeroDivisionError when ``values`` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Note that the sum is not divided by the number of values, so this is
    the un-normalised quantity rather than the statistical variance.
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)

In [45]:
# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of the deviations of ``x`` and ``y``.

    Each ``x[i]`` is paired with ``y[i]``; the deviations from ``mean_x``
    and ``mean_y`` are multiplied and accumulated. The total is not divided
    by the number of observations. The length of ``x`` decides how many
    pairs are used.
    """
    total = 0.0
    for position, x_value in enumerate(x):
        x_dev = x_value - mean_x
        y_dev = y[position] - mean_y
        total += x_dev * y_dev
    return total

In [46]:
# Estimate value of both coefficients in Linear regression
def coefficients(dataset):
    """Estimate the intercept and slope of a simple linear regression.

    Column 0 of every row is taken as the input ``x`` and column 1 as the
    target ``y``. The slope is the covariance of x and y divided by the
    variance of x; the intercept makes the line pass through the point of
    means.

    Returns:
        ``[b0, b1]`` where ``b0`` is the intercept and ``b1`` the slope.
    """
    xs = [record[0] for record in dataset]
    ys = [record[1] for record in dataset]
    mean_of_x = mean(xs)
    mean_of_y = mean(ys)
    slope = covariance(xs, mean_of_x, ys, mean_of_y) / variance(xs, mean_of_x)
    intercept = mean_of_y - slope * mean_of_x
    return [intercept, slope]

In [47]:
# Define linear regression model to make prediction
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict a value for every row of ``test``.

    The coefficients are estimated from ``train``; each prediction is
    ``b0 + b1 * x`` with ``x`` taken from column 0 of the test row.

    Returns:
        A list with one prediction per test row, in the same order.
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * record[0] for record in test]

In [48]:
# Evaluate performance using root mean squared error
def rmse_metric(actual, predicted):
    """Return the root mean squared error between two sequences.

    ``predicted[i]`` is compared with ``actual[i]`` for every position of
    ``actual``. Raises ZeroDivisionError when ``actual`` is empty.
    """
    squared_total = 0.0
    for position, truth in enumerate(actual):
        residual = predicted[position] - truth
        squared_total += residual ** 2
    n_observations = float(len(actual))
    return sqrt(squared_total / n_observations)

In [49]:
# Run the whole pipeline: load the data, split it, fit on the training rows
# and report the RMSE on the held-out rows.

# Seed the random number generator so the train/test split, and therefore
# the reported RMSE, is the same on every run.
seed(1)

filename = 'housing.csv'
dataset = load_csv(filename)

# Convert every column; the column count is taken from the first row.
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

split = 0.6  # fraction of the rows used for training
train, test = train_test_split(dataset, split)

predictions = simple_linear_regression(train, test)
actual = [row[1] for row in test]  # column 1 is the value the model predicts
rmse = rmse_metric(actual, predictions)
print('RMSE: %.3f' % rmse)

RMSE: 19.195
