In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import random
import tensorflow as tf
import csv

import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.applications import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.initializers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split


## Preprocessing

In [125]:
# Import Housing Data
new_housing_csv = pd.read_csv('../data/new_housing.csv')

# Select houses with a Sale Price over $100,000
mask = (new_housing_csv["SALE PRICE"]) > 100000.0
new_housing_csv = new_housing_csv[mask]

# Only select homes sold between 2011 and 2015
new_housing_csv['SALE DATE'] = pd.to_datetime(new_housing_csv['SALE DATE'])
mask = (new_housing_csv["SALE DATE"]).dt.year > 2011 
new_housing_csv = new_housing_csv[mask]
mask = (new_housing_csv["SALE DATE"]).dt.year < 2015
new_housing_csv = new_housing_csv[mask]


# Only select important rows
new_housing_csv = new_housing_csv[["LAND VAL", "PARCEL VAL", "IMPROVE VAL", "SALE PRICE", "SQFT", "ROOMS", "BEDROOM", "BATH", "LIVING AREA", "GROSS AREA", "YEAR", "PERCENT GOOD", "STORIES"]]

#Remove Nulls and 0s in Data
mask = new_housing_csv.notnull().all(axis=1)
new_housing_csv = new_housing_csv.loc[mask, :]
mask = (new_housing_csv != 0).all(axis=1)
new_housing_csv = new_housing_csv.loc[mask, :]

# Seperate into Data and Labels
housing_data = new_housing_csv[["LAND VAL", "PARCEL VAL", "SQFT", "ROOMS", "BEDROOM", "BATH", "LIVING AREA", "GROSS AREA", "YEAR","PERCENT GOOD", "STORIES"]]
housing_label = new_housing_csv[["SALE PRICE"]]

  new_housing_csv = pd.read_csv('../data/new_housing.csv')


In [126]:
X = housing_data
Y = housing_label

def normalize(X):
    """
    Normalizes the data X using the mean and standard deviation.  
    
    Parameters:
    X (np.ndarray): An array of unnormalized data.
    
    Returns:
    np.ndarray: The normalized version of the data.
    """    
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    X = (X - mu) / sigma
    return X

# Normalize Data
X = normalize(X)
Y = normalize(Y)

# Convert to Numpy Array
X = pd.DataFrame.to_numpy(X)
Y = pd.DataFrame.to_numpy(Y)

## Model

In [127]:
class LinearRegression(tf.keras.Model):
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.dense1 = tf.keras.layers.Dense(units=128, activation='relu')
        self.dense2 = tf.keras.layers.Dense(units=64, activation='relu')
        self.dense3 = tf.keras.layers.Dense(units=32, activation='relu')
        self.dense4 = tf.keras.layers.Dense(units=1, kernel_regularizer=tf.keras.regularizers.l2(0.01))
        
    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dense4(x)

        return x

In [128]:
model = LinearRegression()

# Loss function and Optimizer
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

In [129]:
# Convert data to tensors
X = tf.convert_to_tensor(X, dtype=tf.float32)
y = tf.convert_to_tensor(Y, dtype=tf.float32)

# Percent of Data used for training
train_split = 0.7

# Split data into train and test
x_train = x[:int(train_split * len(X))]
y_train = y[:int(train_split * len(X))]
x_test = x[int(train_split * len(X) + 1):(len(X))]
y_test = y[int(train_split * len(X) + 1):(len(X))]


In [130]:
num_epochs = 10000
for epoch in range(num_epochs):
    # Forward pass
    with tf.GradientTape() as tape:
        y_pred = model(x_train)
        loss = loss_fn(y_train, y_pred)
    
    # Backward and optimize
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
    # Print progress
    if (epoch+1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.numpy():.4f}")

Epoch [100/10000], Loss: 0.4040
Epoch [200/10000], Loss: 0.3726
Epoch [300/10000], Loss: 0.3519
Epoch [400/10000], Loss: 0.3334
Epoch [500/10000], Loss: 0.3162
Epoch [600/10000], Loss: 0.3012
Epoch [700/10000], Loss: 0.2867
Epoch [800/10000], Loss: 0.2729
Epoch [900/10000], Loss: 0.2595
Epoch [1000/10000], Loss: 0.2470
Epoch [1100/10000], Loss: 0.2354
Epoch [1200/10000], Loss: 0.2243
Epoch [1300/10000], Loss: 0.2139
Epoch [1400/10000], Loss: 0.2041
Epoch [1500/10000], Loss: 0.1947
Epoch [1600/10000], Loss: 0.1855
Epoch [1700/10000], Loss: 0.1768
Epoch [1800/10000], Loss: 0.1684
Epoch [1900/10000], Loss: 0.1605
Epoch [2000/10000], Loss: 0.1530
Epoch [2100/10000], Loss: 0.1464
Epoch [2200/10000], Loss: 0.1459
Epoch [2300/10000], Loss: 0.1441
Epoch [2400/10000], Loss: 0.1361
Epoch [2500/10000], Loss: 0.1372
Epoch [2600/10000], Loss: 0.1359
Epoch [2700/10000], Loss: 0.1267
Epoch [2800/10000], Loss: 0.1212
Epoch [2900/10000], Loss: 0.1253
Epoch [3000/10000], Loss: 0.1085
Epoch [3100/10000],

In [133]:
def accuracy(y_pred, y_true, sigma, mu, threshold=0.1):
    """
    Computes the accuracy of the predictions given the true labels, based on a threshold value.
    
    Parameters:
    y_pred (np.ndarray): An array of predicted labels.
    y_true (np.ndarray): An array of true labels.
    threshold (float): The threshold value to use for measuring accuracy.
    
    Returns:
    float: The accuracy of the predictions as a percentage.
    """
    # Compute the absolute difference between predicted and true values
    
    
    y_pred = (y_pred * sigma) + mu
    y_true = (y_true * sigma) + mu

    diff = np.abs(y_pred - y_true)
    
    # Compute accuracy
    acc = np.mean(diff <= threshold * y_pred)
    
    # Convert to percentage
    acc_pct = acc * 100
    
    return acc_pct

In [134]:
pred = model(x_train)
loss = loss_fn(y_train, pred)
print("Training Accuracy:", accuracy(pred, y_train, sigma, mu))

pred = model(x_test)
loss = loss_fn(y_test, pred)
print("Testing Accuracy:", accuracy(pred, y_test, sigma, mu))


Training Accuracy: 58.53174603174603
Testing Accuracy: 46.85185185185185
