In [728]:
import pandas
import matplotlib
import pandas as pd
import numpy as np

matplotlib.use("TkAgg")
pandas.set_option('display.max_columns', None)

In [729]:
path = 'pp-2021.csv'
data = pandas.read_csv(path, sep=',', skiprows=1)

In [730]:
data = data.drop(data.columns[[0, 2, 7, 8, 9, 10, 12, 13, 14, 15]], axis=1)
data.columns = ["Price", "Postcode", "PropertyType", "OldNew", "Duration", "City"]

In [731]:
# Remove rows that describe properties that are not typically Houses
data = data[~data["PropertyType"].isin(["F", "O"])]

In [732]:
# For cities that appear in the data less than 600 times, join into "Other" category
city_stats = data.groupby('City')['City'].agg('count').sort_values(ascending=False)
city_stats_less_than_600 = city_stats[city_stats <= 600]
data["City"] = data["City"].apply(lambda x: 'Other' if x in city_stats_less_than_600 else x)
# Print city count distribution
city_stats

City
LONDON           26311
MANCHESTER       13315
BRISTOL          12349
NOTTINGHAM       12239
BIRMINGHAM       11630
                 ...  
LYNMOUTH             3
RHOSGOCH             2
KELSO                1
LLANSANFFRAID        1
NEWCASTLETON         1
Name: City, Length: 1147, dtype: int64

In [733]:
# Remove rows with prices lower than £40,000 and higher than £5,000,000
data = data[data["Price"] > 40000]
data = data[data["Price"] < 5000000]
data.Price.describe()

count    8.658430e+05
mean     3.432514e+05
std      2.911646e+05
min      4.010000e+04
25%      1.785000e+05
50%      2.729950e+05
75%      4.125000e+05
max      4.999999e+06
Name: Price, dtype: float64

In [734]:
# Convert PropertyType and Duration from string to int
data['PropertyType'] = [ord(x) - 64 for x in data.PropertyType]
data['Duration'] = [ord(x) - 64 for x in data.Duration]
# Convert OldNew from Y/N to 1/0
data['OldNew'] = data['OldNew'].map({'Y': 1, 'N': 0})

In [735]:
# Import/Install pgeocode lib to get longitude and latitude from a postcode string
import sys
try:
    import pgeocode
except ImportError as e:
    !conda install --channel conda-forge --yes --prefix {sys.prefix} pgeocode

In [736]:
# Remove all rows with NaN
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

# Format postcode data in main df so that it matches the format in the pgeocode lib
codes = [str(x) for x in data.Postcode]
codes = [x[0:x.index(" ")] for x in codes if " " in x]
data

Unnamed: 0,Price,Postcode,PropertyType,OldNew,Duration,City
0,137000,B98 7BE,20,0,6,REDDITCH
1,337000,B61 8NJ,19,0,6,BROMSGROVE
2,178500,HR2 7RU,19,0,6,HEREFORD
3,178000,B97 6NJ,19,0,6,REDDITCH
4,212500,B96 6AU,19,0,6,REDDITCH
...,...,...,...,...,...,...
865627,155000,WS15 2AU,20,0,6,RUGELEY
865628,167200,B77 2JF,20,0,6,TAMWORTH
865629,90000,DE14 3PH,20,0,6,BURTON-ON-TRENT
865630,255000,WV10 7TU,4,0,6,WOLVERHAMPTON


In [780]:
# Get longitude and latitude values for every valid postcode
nomi = pgeocode.Nominatim("GB")
post_code_info = nomi.query_postal_code(codes)
latitude = post_code_info.latitude
longitude = post_code_info.longitude
post_code_info.longitude.describe()

count    864151.000000
mean         -1.391892
std           1.332416
min          -6.311400
25%          -2.277712
50%          -1.432300
75%          -0.353920
max           1.721625
Name: longitude, dtype: float64

In [738]:
# Store long and lat in main df
data['Longitude'] = longitude
data['Latitude'] = latitude

# Drop Postcode col now we have location data in long + lat
data.drop(['Postcode'], inplace=True, axis=1)

data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [739]:
# Convert each city into a column using one-hot encoding, remove old City col
dummies = pandas.get_dummies(data.City)
dummies.drop(('Other'), inplace=True, axis=1)
data = pd.concat([data, dummies], axis=1)
data.drop(['City'], inplace=True, axis=1)

In [778]:
# Normalize Price between -1 and 1
#min_params = data['Price'].min()
#max_params = data['Price'].max()
#data['Price'] =  2 * ((data['Price'] - min_params) / (max_params - min_params)) - 1
data['Price'] =  data['Price'] / 1000000

# Standardization
#data['Price'] = (data['Price'] - data['Price'].mean()) / data['Price'].std()


min_params = data['PropertyType'].min()
max_params = data['PropertyType'].max()
data['PropertyType'] =  ((data['PropertyType'] - min_params) / (max_params - min_params))


min_params = data['Duration'].min()
max_params = data['Duration'].max()
data['Duration'] =  ((data['Duration'] - min_params) / (max_params - min_params))


min_params = data['Longitude'].min()
max_params = data['Longitude'].max()
data['Longitude'] =  ((data['Longitude'] - min_params) / (max_params - min_params))

# min_params = data['Latitude'].min()
# max_params = data['Latitude'].max()
# data['Latitude'] =  ((data['Latitude'] - min_params) / (max_params - min_params))
data['Latitude'] = data['Latitude'] / 100

min_params, max_params

(0.0, 1.0)

In [792]:
data.columns.get_loc("DURHAM")

118

In [742]:
# Training, Validation, Test split = 80%, 10%, 10%
train, val, test = np.split(data.sample(frac=1, random_state=54), [int(.8 * len(data)), int(.9 * len(data))])

In [743]:
# y = column vector, true price value (labels) (len(data[0],)
y_train, y_val, y_test = train.Price, val.Price, test.Price
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()
y_test = y_test.to_numpy()

In [744]:
# X = matrix, excludes price feature
train.drop(['Price'], axis=1, inplace=True)
X_train = train.to_numpy()
val.drop(['Price'], axis=1, inplace=True)
X_val = val.to_numpy()
test.drop(['Price'], axis=1, inplace=True)
X_test = test.to_numpy()

In [745]:
# Shuffle order of indexes for training data
np.set_printoptions(threshold=0)
keys = np.array(range(len(y_train)))
np.random.shuffle(keys)
X_train = X_train[keys]
y_train = y_train[keys]

In [746]:
import numpy as np

class Dense_Layer:
    def __init__(self, n_inputs, n_neurons,
                 weight_regularizer_l1=0., weight_regularizer_l2=0.,
                 bias_regularizer_l1=0., bias_regularizer_l2=0.):

        self.weights = 0.1 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

    def forward(self, inputs):
        self.output = np.dot(inputs, self.weights) + self.biases
        self.inputs = inputs

    def backward(self, dvalues):
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regularizer_l1 * dL1

        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        if self.bias_regularizer_l1 > 0:
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1

        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

        self.dinputs = np.dot(dvalues, self.weights.T)


In [747]:
class Dropout_Layer:
    def __init__(self, rate):
        self.rate = 1 - rate

    def forward(self, inputs):
        self.inputs = inputs

        self.binary_mask = np.random.binomial(1, self.rate, size=inputs.shape) / self.rate
        self.output = inputs * self.binary_mask

    def backward(self, dvalues):
        self.dinputs = dvalues * self.binary_mask

In [748]:
class Linear_Activation:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = inputs
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()

In [749]:
class ReLU_Activation:
    def forward(self, inputs):
        self.inputs = inputs
        self.output = np.maximum(0, inputs)
    def backward(self, dvalues):
        self.dinputs = dvalues.copy()
        self.dinputs[self.inputs <= 0] = 0

In [750]:
class Loss:
    def calculate(self, output, y, layers, include_regularization=False):
        sample_losses = self.forward(output, y, layers)
        data_loss = np.mean(sample_losses)


        self.accumulated_sum += np.sum(sample_losses)
        self.accumulated_count += len(sample_losses)

        if not include_regularization:
            return data_loss

        return data_loss, self.regularization_loss(layers)

    def calculate_accumulated(self, include_regularization=False):
        data_loss = self.accumulated_sum / self.accumulated_count

        if not include_regularization:
            return data_loss


        return data_loss, self.regularization_loss(layers)

    def regularization_loss(self, layers_list):

        regularization_loss = 0

        for layer in layers_list:
            if layer.weight_regularizer_l1 > 0:
                regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))

            if layer.weight_regularizer_l2 > 0:
                regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)

            if layer.bias_regularizer_l1 > 0:
                regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))

            if layer.bias_regularizer_l2 > 0:
                regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)

        return regularization_loss

    def new_pass(self):
        self.accumulated_sum = 0
        self.accumulated_count = 0

In [751]:
class MSE_Loss(Loss):
    def forward(self, y, y_hat, layers):
        sample_losses = np.mean((y_hat - y)**2, axis=-1)
        return sample_losses
    def backward(self, dvalues, y_hat):

        samples = len(dvalues)
        for batch in y_hat:
            self.dinputs = (-2 * (batch - dvalues)) / samples


In [752]:
class SGD_Optimzer:
    def __init__(self, learning_rate = 0.01, decay=1e-6, momentum=0):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1 / (1 + self.decay * self.iterations))
    def update_params(self, layer):

        if not hasattr(layer, 'weight_momentums'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)

            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates

        else:
            weight_updates = -self.learning_rate * layer.dweights
            bias_updates = -self.learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        self.iterations += 1

In [753]:
class Optimizer_Adam:
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases

        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))

        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2

        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))

        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        self.iterations += 1

In [754]:
class Accuracy:
    def __init__(self):
        self.precision = None

    def init(self, y, reinit=False):
        if self.precision is None or reinit:
            self.precision = np.std(y) / 250

    def calculate(self, predictions, y):
        # Get comparison results
        comparisons = self.compare(predictions, y)
        # Calculate an accuracy
        accuracy = np.mean(comparisons)
        # Add accumulated sum of matching values and sample count
        self.accumulated_sum += np.sum(comparisons)
        self.accumulated_count += len(comparisons)
        # Return accuracy
        return accuracy

    def calculate_accumulated(self):
        # Calculate an accuracy
        accuracy = self.accumulated_sum / self.accumulated_count
        # Return the data and regularization losses
        return accuracy
        # Reset variables for accumulated accuracy

    def compare(self, predictions, y):
        return np.absolute(predictions - y)

    def new_pass(self):
        self.accumulated_sum = 0
        self.accumulated_count = 0


In [755]:
dense1 = Dense_Layer(X_train.shape[1], 128, weight_regularizer_l1=1e-4, bias_regularizer_l1=1e-4)
activation1 = ReLU_Activation()
dropout1 = Dropout_Layer(0.1)
dense2 = Dense_Layer(128, 128)
activation2 = ReLU_Activation()
dropout2 = Dropout_Layer(0.1)
dense3 = Dense_Layer(128, 1)
activation3 = Linear_Activation()
loss_function = MSE_Loss()
#optimizer = SGD_Optimzer(learning_rate=0.01, decay=4e-5)
optimizer = Optimizer_Adam(learning_rate=0.001, decay=1e-3)
accuracy = Accuracy()



layers = [dense1, dense2, dense3]

In [756]:
batch_size = 128
train_steps = X_train.shape[0] // batch_size
val_steps = X_val.shape[0] // batch_size
test_steps = X_val.shape[0] // batch_size

if train_steps * batch_size < X_train.shape[0]:
    train_steps += 1

if val_steps * batch_size < X_val.shape[0]:
    val_steps += 1

if test_steps * batch_size < X_test.shape[0]:
    test_steps += 1

pairs = []

In [757]:
# Begin training model
for epoch in range(10):
    print(f'epoch: {epoch}')
    accuracy.init(y_train)
    np.random.shuffle(keys)
    X_train = X_train[keys]
    y_train = y_train[keys]

    loss_function.new_pass()
    accuracy.new_pass()

    # Train model in mini batches
    for step in range(train_steps):
        batch_X = X_train[step * batch_size:(step+1)*batch_size]
        batch_y = y_train[step * batch_size:(step+1)*batch_size]

        # Forward Pass
        dense1.forward(batch_X)
        activation1.forward(dense1.output)
        dropout1.forward(activation1.output)
        dense2.forward(dropout1.output)
        activation2.forward(dense2.output)
        dropout2.forward(activation2.output)
        dense3.forward(dropout2.output)
        activation3.forward(dense3.output)

        # Calculate loss of current batch
        data_loss, reg_loss = loss_function.calculate(activation3.output, batch_y, layers, include_regularization=True)
        loss = data_loss + reg_loss

        # Predicted Prices of current batch
        predictions = activation3.output

        # Calculate accuracy of current batch
        accuracy.calculate(predictions, batch_y)

        # Backward Pass
        loss_function.backward(activation3.output, batch_y)
        activation3.backward(loss_function.dinputs)
        dense3.backward(activation3.dinputs)
        dropout2.backward(dense3.dinputs)
        activation2.backward(dropout2.dinputs)
        dense2.backward(activation2.dinputs)
        dropout1.backward(dense2.dinputs)
        activation1.backward(dropout1.dinputs)
        dense1.backward(activation1.dinputs)

        # Update weights
        optimizer.pre_update_params()
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.update_params(dense3)
        optimizer.post_update_params()

        # Print loss every 1000 batches
        if not step % 1000:
            print(f'batch: {step} out of {train_steps}, loss: {loss:.3f}')

    # Calculate mean loss after epoch
    epoch_data_loss, epoch_reg_loss = \
        loss_function.calculate_accumulated(include_regularization=True)
    epoch_loss = epoch_data_loss + epoch_reg_loss

    # Calculate mean accuracy after epoch
    epoch_accuracy = accuracy.calculate_accumulated()

    # Print training progress after epoch
    print(f'training, ' +
    f'acc: {epoch_accuracy}, ' +
    f'loss: {epoch_loss:.3f} (' +
    f'data_loss: {epoch_data_loss:.3f}, ' +
    f'reg_loss: {epoch_reg_loss:.3f}, ' +
    f'lr: {optimizer.current_learning_rate}')

    # Reset mean loss before using validation set
    loss_function.new_pass()
    accuracy.new_pass()

    # Pass through validation set for evaluation during training
    for step in range(val_steps):

        batch_X = X_val[step * batch_size:(step+1)*batch_size]
        batch_y = y_val[step * batch_size:(step+1)*batch_size]

        dense1.forward(batch_X)
        activation1.forward(dense1.output)
        dense2.forward(activation1.output)
        activation2.forward(dense2.output)
        dense3.forward(activation2.output)
        activation3.forward(dense3.output)

        loss_function.calculate(activation3.output, batch_y, layers)

        predictions = activation3.output
        accuracy.calculate(predictions, batch_y)

    # Calculate mean loss for validation set on current model parameters
    val_loss = loss_function.calculate_accumulated()
    val_acc = accuracy.calculate_accumulated()

    # Print mean validation loss per epoch
    print(f'validation, ' +
          f'accuracy: {val_acc}, ' +
          f'loss: {val_loss:.3f}, ')

# Training finished

# Test model
print("Model Evaluation on Test Set")

loss_function.new_pass()
accuracy.new_pass()

# Pass through testing set
for step in range(test_steps):

    batch_X = X_test[step * batch_size:(step+1)*batch_size]
    batch_y = y_test[step * batch_size:(step+1)*batch_size]

    dense1.forward(batch_X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)
    dense3.forward(activation2.output)
    activation3.forward(dense3.output)

    loss_function.calculate(activation3.output, batch_y, layers)

    predictions = activation3.output
    accuracy.calculate(predictions, batch_y)



    pairs += [batch_X, batch_y, predictions, accuracy.calculate(predictions, batch_y)]


# Calculate mean loss for validation set on current model parameters
test_loss = loss_function.calculate_accumulated()
test_acc = accuracy.calculate_accumulated()

# Print mean validation loss per epoch
print(f'test set, ' +
      f'test acc: {test_acc}',
      f'test loss: {test_loss:.3f}, ')

epoch: 0
batch: 0 out of 5401, loss: 0.731
batch: 1000 out of 5401, loss: 0.119
batch: 2000 out of 5401, loss: 0.101
batch: 3000 out of 5401, loss: 0.057
batch: 4000 out of 5401, loss: 0.030
batch: 5000 out of 5401, loss: 0.121
training, acc: 22.972496901583636, loss: 0.089 (data_loss: 0.087, reg_loss: 0.002, lr: 0.00015625
validation, accuracy: 21.65069198494838, loss: 0.088, 
epoch: 1
batch: 0 out of 5401, loss: 0.198
batch: 1000 out of 5401, loss: 0.222
batch: 2000 out of 5401, loss: 0.082
batch: 3000 out of 5401, loss: 0.057
batch: 4000 out of 5401, loss: 0.066
batch: 5000 out of 5401, loss: 0.066
training, acc: 22.987105904592198, loss: 0.087 (data_loss: 0.086, reg_loss: 0.001, lr: 8.473858147614608e-05
validation, accuracy: 22.34172769650363, loss: 0.086, 
epoch: 2
batch: 0 out of 5401, loss: 0.039
batch: 1000 out of 5401, loss: 0.286
batch: 2000 out of 5401, loss: 0.087
batch: 3000 out of 5401, loss: 0.148
batch: 4000 out of 5401, loss: 0.141
batch: 5000 out of 5401, loss: 0.083

In [786]:
# Add accuracy
# Make some predictions, if predictions are good, test accuracy on external dataset
y_test.tolist(), predictions.tolist()
y_test.shape, predictions.shape
len(pairs), len(pairs[0]), test_steps
"x", pairs[0][0], "y", pairs[1][0],  "pred", pairs[2][0], "acc", pairs[3], "x", pairs[0][1], "y", pairs[1][1],  "pred", pairs[2][1], "acc", pairs[3]

('x',
 array([1., 0., 0., ..., 0., 0., 0.]),
 'y',
 0.23,
 'pred',
 array([0.33519864]),
 'acc',
 0.1424812012089159,
 'x',
 array([0.9375, 0.    , 0.    , ..., 0.    , 0.    , 0.    ]),
 'y',
 0.32,
 'pred',
 array([0.32947181]),
 'acc',
 0.1424812012089159)

In [799]:
cols = [0.0] * 373
#test_batch = [19.0, 0.0, 6.0, 0.012487, 0.533333]




test_batch = [0.0, 0.0, 0.0,  0.5797850, 0.546301]

test_batch.extend(cols)
test_batch[118] = 1.0


dense1.forward(test_batch)
activation1.forward(dense1.output)
dense2.forward(activation1.output)
activation2.forward(dense2.output)
dense3.forward(activation2.output)
activation3.forward(dense3.output)



predictions = activation3.output
predictions = predictions.flatten()
price = predictions[0]
x = ("£{:.2f}".format(price*1000000))
x

'£341590.40'