In [7]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [8]:
# Gather Data
# NOTE(review): load_boston emits a deprecation warning (see the cell output) and
# was removed in scikit-learn 1.2, so this cell needs an older scikit-learn to run.
boston_data = load_boston()
data = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)

# INDUS and AGE are left out of the feature set the model is trained on.
features = data.drop(columns=["INDUS", "AGE"])

# The model is fitted against the natural log of the target, not the raw values.
log_prices = np.log(boston_data.target)
target = pd.DataFrame({"PRICE": log_prices})
features



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,0.0,0.573,6.593,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,0.0,0.573,6.120,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,0.0,0.573,6.976,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,0.0,0.573,6.794,2.3889,1.0,273.0,21.0,393.45,6.48


In [9]:
# One-row template holding the mean of every feature column. Using -1 lets NumPy
# infer the column count, so this keeps working if the feature list changes
# (with the current 11 feature columns the shape is still (1, 11)).
property_stats = features.mean().values.reshape(1, -1)

In [10]:
# Fit a linear regression of the log prices on all feature columns.
regr = LinearRegression()
regr.fit(features, target)
fitted_vals = regr.predict(features)

# The error is measured on the same rows the model was fitted on (in-sample);
# rmse is later used as the half-width unit of the estimate range.
mse = mean_squared_error(target, fitted_vals)
rmse = np.sqrt(mse)

In [11]:
from typing import Tuple
def get_log_estimate(
    nr_rooms: float,
    students_per_class_room: float,
    next_to_river: bool = False,
    high_confidence: bool = True,
    ) -> Tuple[float, float, float, int]:
    """
    Predict a log price for a property that is average in every feature except
    the ones supplied here.

    The module-level ``property_stats`` row is used as a template; it is copied
    before being modified, so repeated calls never alter the template.

    :param nr_rooms: Number of rooms (written to the RM column)
    :param students_per_class_room: Number of students in each class room
        (written to the PTRATIO column)
    :param next_to_river: A bool representing if the property is next to the river
        (written to the CHAS column as 1 or 0)
    :param high_confidence: True for a range of +/- 2 RMSE (labelled 95),
        False for a range of +/- 1 RMSE (labelled 68)
    :return: A log price, an upper bound, a lower bound and an interval value
    """
    # Column positions inside the feature row (INDUS and AGE are not present).
    chas_idx, rm_idx, ptratio_idx = 2, 4, 8

    # Configure the property on a private copy so the shared template stays intact
    stats = property_stats.copy()
    stats[0][rm_idx] = nr_rooms
    stats[0][ptratio_idx] = students_per_class_room
    stats[0][chas_idx] = 1 if next_to_river else 0

    # Make prediction (predict returns a 2-D result, hence the double index)
    log_estimate = regr.predict(stats)[0][0]

    # Calc Range
    if high_confidence:
        mult, interval = 2, 95
    else:
        mult, interval = 1, 68

    upper_bound = log_estimate + (mult * rmse)
    lower_bound = log_estimate - (mult * rmse)
    return log_estimate, upper_bound, lower_bound, interval

In [12]:
# Example call: 3 rooms, 20 students per class room, next to the river,
# using the narrower (68) range instead of the default 95 one.
get_log_estimate(3, 20, next_to_river=True, high_confidence=False)



(2.7767581914803987, 2.964270326677529, 2.5892460562832684, 68)

In [32]:
# Present-day median price used to rescale the dataset's prices.
# NOTE(review): the name suggests a Zillow figure, presumably in thousands of
# dollars to match the dataset's target — confirm source, date and units.
ZILLOW_MEDIAN_PRICE = 583.3
# Ratio of the present-day median to the median of the dataset's target values;
# multiplying a dataset price by this factor rescales it to the present-day level.
SCALE_FACTOR = ZILLOW_MEDIAN_PRICE/np.median(boston_data.target)


# Convert to today's dollars
def from_log_to_today(value: float) -> float:
    """
    Undoes the log transform and rescales the result to today's prices
    value: The log-scale value to convert

    The result is e**value * 1000 * SCALE_FACTOR, rounded to the nearest thousand.
    """
    unlogged = np.e ** value
    scaled = unlogged * 1000 * SCALE_FACTOR
    return np.round(scaled, decimals=-3)

def get_dollar_estimate(rm: float, ptratio: float, chas: bool = False, large_range: bool = True):
    """
    Prints the estimated present-day value of a house together with its range

    rm: The number of rooms
    ptratio: The pupil to teacher ratio
    chas: A boolean representing if the house is next to the Charles River
    large_range: True for a 95% prediction, false for a 68% prediction

    Nothing is returned. If rm or ptratio is below 1, a message is printed
    and no estimate is made.
    """
    # Guard clauses: reject inputs below 1 before touching the model.
    if rm < 1:
        print("Unrealistic number of rooms")
        return None
    if ptratio < 1:
        print("Unrealistic pupil teacher ratio")
        return None

    log_est, upper, lower, conf = get_log_estimate(
        rm,
        students_per_class_room=ptratio,
        next_to_river=chas,
        high_confidence=large_range,
    )

    # Convert the log-scale estimate and both bounds to present-day values.
    dollar_est, upper_est, lower_est = map(from_log_to_today, (log_est, upper, lower))

    print(
        f"The estimated property value is {dollar_est}.",
        f"At {conf}% the valuation range is",
        f"USD {lower_est} to {upper_est}",
        sep="\n",
    )

In [34]:
# Validation demo: rm=1 passes the room check, ptratio=-1 fails the
# pupil-teacher check, so only the warning is printed and no estimate is made.
get_dollar_estimate(rm=1, ptratio=-1, chas=True)

Unrealistic pupil teacher ratio
