# Testing Real Estate Predictions
The purpose of this notebook is to load a trained AI model for NJ real estate and use it to generate predictions against a pre-processed data file. The predictions are then compared against known property values to assess the accuracy of the model.

## 1 Load and Scale Sample Data
First we'll load a sample pre-processed property file from our workspace and use the same scaler from our model creation to produce a consistent normalized view of the input data.

In [None]:
import json
import numpy as np
import pandas as pd
import os

# Report the kernel's working directory; the relative data paths used below resolve against it.
current_directory = os.getcwd()
print(f"{current_directory}")

In [None]:
# Show every column when a DataFrame is displayed (None removes the column cap)
pd.set_option('display.max_columns', None)

# Workspace layout used below: <file_dir>/{processed,scaler,model}/<name>.<ext>
file_dir = 'data/sample'
model_name = 'all-county'
processed_file = '0204'

# Fail fast when the pre-processed pickle is missing; otherwise load and shuffle it
file_path = os.path.join(file_dir, 'processed', f"{processed_file}.pkl")
if not os.path.exists(file_path):
    raise ValueError(f"{file_path} does not exist")
df = (
    pd.read_pickle(file_path)
    .sample(frac=1, random_state=42)  # fixed seed keeps the shuffle reproducible
    .reset_index(drop=True)
)
print(df.shape)
df

In [None]:
from sklearn.preprocessing import MinMaxScaler
import joblib

# Separate the sale-price target from the remaining feature columns
y = df['Sale_Price'].values
X = df.drop(columns='Sale_Price').values

# Scaler saved under the processed file's name -> produces X_specific
file_path = os.path.join(file_dir, 'scaler', f"{processed_file}.save")
if not os.path.exists(file_path):
    raise ValueError(f"scaler for the model does not exist at {file_path}")
scaler = joblib.load(file_path)
X_specific = scaler.transform(X)

# Scaler saved under the all-county model's name -> X is replaced by its scaled form
file_path = os.path.join(file_dir, 'scaler', f"{model_name}.save")
if not os.path.exists(file_path):
    raise ValueError(f"scaler for the model does not exist at {file_path}")
scaler = joblib.load(file_path)
X = scaler.transform(X)


## 2 Generate Predictions
Now we can load the pre-trained models (the file-specific model and the all-county model) and use the scaled input data to generate predictions.

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError

# Both .h5 files are loaded with the same custom_objects mapping. The original
# cell only supplied it for the file-specific model; resolving the 'mse' name
# explicitly for the all-county model as well keeps the two loads consistent
# and is harmless when the saved model does not reference 'mse'.

# Model saved under the processed file's name
file_path = os.path.join(file_dir, 'model', processed_file + ".h5")
if os.path.exists(file_path):
    model_specific = load_model(file_path, custom_objects={'mse': MeanSquaredError()})
else:
    raise ValueError(f"model does not exist at {file_path}")

# Model saved under the all-county name
file_path = os.path.join(file_dir, 'model', model_name + ".h5")
if os.path.exists(file_path):
    model = load_model(file_path, custom_objects={'mse': MeanSquaredError()})
else:
    raise ValueError(f"model does not exist at {file_path}")


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

# Sanity-check the scaled feature matrix that is fed to the all-county model
print(np.shape(X))
X

In [None]:
# Each model predicts from the input matrix produced by its own scaler
predictions_specific = model_specific.predict(X_specific)
predictions = model.predict(X)

# Mean absolute error of each model against the known sale prices
print("Specific  : {}".format(mean_absolute_error(y, predictions_specific)))
print("All County: {}".format(mean_absolute_error(y, predictions)))

In [None]:
# Root mean squared error: the square root must be taken of the mean *squared*
# error. The original took the square root of the mean absolute error, which is
# not a meaningful metric (mean_squared_error is imported in the metrics cell).
print(f"Specific  : {np.sqrt(mean_squared_error(y, predictions_specific))}")
print(f"All County: {np.sqrt(mean_squared_error(y, predictions))}")

In [None]:
# Explained variance score of each model's predictions against the known sale prices
print("Specific  : {}".format(explained_variance_score(y, predictions_specific)))
print("All County: {}".format(explained_variance_score(y, predictions)))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Known sale prices; plotted against themselves to draw the red y = x reference line
y_df = pd.DataFrame({'value': y})
# One frame per model pairing each prediction with the known sale price
specific_df = pd.DataFrame(predictions_specific, columns=['prediction']).assign(value=y)
predictions_df = pd.DataFrame(predictions, columns=['prediction']).assign(value=y)

# Left panel: file-specific model, right panel: all-county model
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = ((specific_df, processed_file), (predictions_df, model_name))
for ax, (frame, title) in zip(axes, panels):
    sns.scatterplot(data=frame, x='value', y='prediction', ax=ax)
    sns.lineplot(data=y_df, x='value', y='value', color='r', ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Residuals (actual - predicted); y is reshaped to a column so it lines up with
# the prediction arrays row by row
errors_specific = y.reshape(len(X_specific), 1) - predictions_specific
errors = y.reshape(len(X), 1) - predictions

In [None]:
# Residual distributions side by side, both clipped to the same +/- 300000 window
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, residuals, title in zip(axes, (errors_specific, errors), (processed_file, model_name)):
    sns.histplot(residuals, kde=True, stat='density', alpha=0.4, edgecolor=(1,1,1,0.4), ax=ax)
    ax.set_title(title)
    ax.set_xlim(-300000, 300000)
plt.tight_layout()
plt.show()

## 3 Use Similarity Scores
Blind predictions have a lot of variability, but what if we find similar properties and impute information from them to augment our input before generating the prediction?

In [None]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    Computed as dot(a, b) / (norm(a) * norm(b)). When either vector has zero
    norm the ratio is undefined (0/0 -> NaN); 0.0 is returned instead so that
    callers which sort by similarity never see NaN.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return dot(a, b) / denominator

# Quick sanity check on two small vectors that differ in one component
array1 = np.array([1, 1, 1, 1])
array2 = np.array([1, 1, 1, 0])
similarity = cosine_similarity(array1, array2)
print(f"Cosine similarity: {similarity}")

In [None]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    This definition shadows the earlier one of the same name and spells out the
    numpy calls explicitly. When either vector has zero norm the ratio is
    undefined (0/0 -> NaN); 0.0 is returned instead, because a NaN score would
    be ranked first by a descending argsort.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
def top_similar_vectors(query_vector, vector_list, top_n=5):
    """Return the `top_n` rows of `vector_list` most cosine-similar to `query_vector`.

    `vector_list` is indexed with an integer index array, so it must support
    numpy-style fancy indexing. Returns a pair `(rows, scores)`, both ordered
    from most to least similar. If `query_vector` is itself a row of
    `vector_list`, that row is part of the ranking like any other.
    """
    scores = []
    for candidate in vector_list:
        scores.append(cosine_similarity(query_vector, candidate))
    similarity_scores = np.array(scores)

    # argsort is ascending; reverse it to get best-first, then keep the head
    ranked = np.argsort(similarity_scores)[::-1]
    best = ranked[:top_n]
    return vector_list[best], similarity_scores[best]

In [None]:
import random
random.seed(101)
# randrange excludes its upper bound. random.randint(0, len(df)) is inclusive
# on both ends and could return len(df), one past the last valid row.
rand_idx = random.randrange(len(df))

# Prediction for the chosen row on its own
example = X[rand_idx]
example_pred = model.predict(example.reshape(1, -1))

# NOTE(review): `example` is itself a row of X, so it is expected to appear
# among its own top matches -- confirm whether it should be excluded.
top_vectors, top_similarity_scores = top_similar_vectors(example, X)

# Feature names in the same order as the columns of X (everything but the target)
column_names = df.columns.values
column_names = column_names[column_names != 'Sale_Price']

# Average the most similar rows feature-by-feature and predict from that aggregate
similar_agg = pd.DataFrame(top_vectors, columns=column_names).mean().values
similar_pred = model.predict(similar_agg.reshape(1, -1))

In [None]:
# Known sale price next to the single-row prediction and the similar-rows prediction
print("actual: {}, example: {}, similar: {}".format(
    df.iloc[rand_idx]['Sale_Price'],
    example_pred[0][0],
    similar_pred[0][0],
))

## 4.1 Calculated Taxes and Year
This information is embedded in the recorded taxes and is not needed, therefore we will drop the columns from our data set.

In [None]:
# NOTE(review): at this point `df` is the frame loaded from the processed pickle
# at the top of the notebook -- confirm these columns are still present in it.
df = df.drop(columns=['Calculated_Taxes', 'Calculated_Taxes_Year'])

## 4.2 County
We'll convert this into a category and then transpose the data into dummy columns, essentially a bitmap.

In [None]:
# Number of records per raw county code
df.loc[:, 'County'].value_counts()

In [None]:
# County codes run 1..21 in the order listed here, so the code -> name lookup is
# built by enumerating the names from 1.
# NOTE: 'Burlignton' is misspelled, but it has to stay identical to the entry in
# the `counties` list used when the dummy columns are created.
df['County'] = df['County'].map(dict(enumerate([
    'Atlantic', 'Bergen', 'Burlignton', 'Camden', 'Cape May',
    'Cumberland', 'Essex', 'Gloucester', 'Hudson', 'Hunterdon',
    'Mercer', 'Middlesex', 'Monmouth', 'Morris', 'Ocean',
    'Passaic', 'Salem', 'Somerset', 'Sussex', 'Union', 'Warren',
], start=1)))
df['County'].value_counts()

In [None]:
# Atlantic is the reference category and deliberately gets no dummy column.
# NOTE: 'Burlignton' is misspelled, but it must match the label produced by the
# County mapping above.
counties = ['Bergen', 'Burlignton', 'Camden', 'Cape May',
            'Cumberland', 'Essex', 'Gloucester', 'Hudson',
            'Hunterdon', 'Mercer', 'Middlesex', 'Monmouth',
            'Morris', 'Ocean', 'Passaic', 'Salem', 'Somerset',
            'Sussex', 'Union', 'Warren']

# Encode every county present in the sample, then keep exactly the expected
# columns (missing counties become all-False columns, Atlantic is discarded).
# drop_first=True is intentionally NOT used: it drops whichever county sorts
# first in the sample, so a sample without Atlantic would lose a real county
# (e.g. Bergen) and have it back-filled with False for every row.
dummies = pd.get_dummies(df['County']).reindex(columns=counties, fill_value=False)
df = df.drop('County', axis=1)
for county in counties:
    df[county] = dummies[county]

## 4.3 NU Code
There's too much uncertainty around property values where an NU Code is applied, so we'll drop those records and remove this column from our data set.

In [None]:
# Keep only rows whose NU_Code is 99 (labelled 'None' in the commented-out
# mapping below), then discard the column itself
df = df.loc[df['NU_Code'].eq(99)].drop(columns='NU_Code')

In [None]:
# df['NU_Code'].value_counts()

In [None]:
# df['NU_Code'] = df['NU_Code'].map({
#     99: 'None',
#     -1: 'Unknown',
#     0: 'Unknown',
#     1: 'immediate family',
#     2: 'love and affection',
#     3: 'corporation',
#     4: 'convenience',
#     5: 'transfer',
#     6: 'apportionment',
#     7: 'subsequent to assessment',
#     8: 'undivided interest',
#     9: 'governmental lien',
#     10: 'trustees',
#     11: 'judicial',
#     12: 'sheriff',
#     13: 'benefit of creditors',
#     14: 'doubtful title',
#     15: 'political',
#     16: 'more than one taxing district',
#     17: 'charitable',
#     18: 'foreclosure',
#     19: 'physical damage',
#     20: 'right-of-way',
#     21: 'affordable housing',
#     22: 'exchange',
#     23: 'industrial',
#     24: 'influenced',
#     25: 'realty transfer fee act',
#     26: 'not compelled',
#     27: 'reassessment',
#     28: 'leaseback',
#     29: 'freeze act',
#     30: 'package deal',
#     31: 'federal or state',
#     32: 'building omitted',
#     33: 'exempt property'
# })
# df['NU_Code'].value_counts()

In [None]:
# dummies = pd.get_dummies(df['NU_Code'], drop_first=True)
# df = df.drop('NU_Code', axis=1)
# df = pd.concat([df, dummies], axis=1)

## 4.4 Property Class
Since our model is only for residential properties we can remove this column.

In [None]:
# The column is dropped because the model only covers residential properties
df = df.drop(columns='Property_Class')

## 4.5 Ratio Year / Recorded Taxes Year
This information doesn't vary between records and doesn't have a very strong correlation with sales price, therefore we can drop the column.

In [None]:
# Both year columns are removed together
df = df.drop(columns=['RatioYear', 'Recorded_Taxes_Year'])

## 4.6 Total Units
This data seems to have very little impact on sales price for residential homes and therefore we will drop the column.

In [None]:
# Remove the unit-count column from the feature set
df = df.drop(columns='TotalUnits')

## 4.7 Year 1 / Year 2
These should really be int values.

In [None]:
# Cast both assessment-year columns to int in a single pass
df = df.astype({'Year_1': int, 'Year_2': int})

## 4.8 Year 2 Assessments
This information is closely related to year 1 assessments and therefore we will ignore it for training purposes.

In [None]:
# Drop every year-2 column; only the year-1 assessment figures are kept
df = df.drop(columns=['Year_2', 'Land_Assmnt_2', 'Building_Assmnt_2', 'Total_Assmnt_2'])

## 4.9 Review and Save the Dataframe
Now we can store our data set that will be used to train and test our model.

In [None]:
# Final (rows, columns) of the cleaned frame, followed by the frame itself
print((len(df.index), len(df.columns)))
df

In [None]:
# Persist the cleaned frame so it can be reloaded later
pd.to_pickle(df, 'data/sample/processed.pkl')

In [None]:
# Read the pickle back to confirm the round trip produced the expected frame
test = pd.read_pickle('data/sample/processed.pkl')
print((len(test.index), len(test.columns)))
test