In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

In [8]:
# Fetch the California housing dataset and wrap its feature matrix in a
# DataFrame whose columns carry the dataset's own feature names.
california_dataset = fetch_california_housing()
data = pd.DataFrame(california_dataset.data, columns=california_dataset.feature_names)
features = data  # second name for the same DataFrame object, not a copy

# The regression target is the natural log of the dataset's target values,
# held in a single-column DataFrame named "PRICE".
log_prices = np.log(california_dataset.target)
target = pd.DataFrame({"PRICE": log_prices})
features

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [19]:
# Positions of each feature inside a row of `features`. They are looked up by
# column name instead of being typed in by hand, so they cannot drift out of
# sync with the DataFrame's actual column order (a missing column raises
# KeyError here rather than silently mis-assigning values later).
MEDINC_IDX = features.columns.get_loc("MedInc")
HOUSEAGE_IDX = features.columns.get_loc("HouseAge")
AVEROOMS_IDX = features.columns.get_loc("AveRooms")
AVEBEDRMS_IDX = features.columns.get_loc("AveBedrms")
POPULATION_IDX = features.columns.get_loc("Population")
AVEOCCUP_IDX = features.columns.get_loc("AveOccup")
LATITUDE_IDX = features.columns.get_loc("Latitude")
LONGITUDE_IDX = features.columns.get_loc("Longitude")

# One-row 2-D array holding the mean of every feature column.
# reshape(1, -1) infers the width from the number of columns, replacing the
# hard-coded 8.
property_stats = features.mean().values.reshape(1, -1)
property_stats

array([[ 3.87067100e+00,  2.86394864e+01,  5.42899974e+00,
         1.09667515e+00,  1.42547674e+03,  3.07065516e+00,
         3.56318614e+01, -1.19569704e+02]])

In [20]:
# Fit the linear model on the full feature set, then predict on the same rows
# it was trained on.
regr = LinearRegression()
regr.fit(features, target)
fitted_vals = regr.predict(features)

# RMSE is the square root of the in-sample mean squared error.
MSE = mean_squared_error(y_true=target, y_pred=fitted_vals)
RMSE = np.sqrt(MSE)

In [25]:
def get_log_estimate(income=property_stats[0][MEDINC_IDX],
                    houseAge=property_stats[0][HOUSEAGE_IDX],
                    avgRooms=property_stats[0][AVEROOMS_IDX],
                    avgBedrooms=property_stats[0][AVEBEDRMS_IDX],
                    population=property_stats[0][POPULATION_IDX],
                    avgOccup=property_stats[0][AVEOCCUP_IDX],
                    lat=property_stats[0][LATITUDE_IDX],
                    long=property_stats[0][LONGITUDE_IDX],
                    high_confidence=True):
    """Predict the log price for one property and a range around it.

    Every feature argument defaults to the corresponding entry of
    ``property_stats`` as it was when this function was defined.

    Parameters
    ----------
    income, houseAge, avgRooms, avgBedrooms, population, avgOccup, lat, long
        Feature values for the property, written into the row at the
        positions given by the ``*_IDX`` constants.
    high_confidence : bool
        If true, the range is the estimate +/- 2*RMSE and is labelled 95;
        otherwise it is the estimate +/- RMSE and is labelled 68.

    Returns
    -------
    tuple
        ``(log_estimate, upper_bound, lower_bound, interval)``. Note that the
        upper bound comes before the lower bound.
    """
    # Work on a private copy. Writing the query into the module-level
    # ``property_stats`` would overwrite the baseline row, and re-running the
    # defining cell afterwards would capture the last query as the defaults.
    query = property_stats.copy()

    # Configure property
    query[0][MEDINC_IDX] = income
    query[0][HOUSEAGE_IDX] = houseAge
    query[0][AVEROOMS_IDX] = avgRooms
    query[0][AVEBEDRMS_IDX] = avgBedrooms
    query[0][POPULATION_IDX] = population
    query[0][AVEOCCUP_IDX] = avgOccup
    query[0][LATITUDE_IDX] = lat
    query[0][LONGITUDE_IDX] = long

    # Make prediction: predict() returns a 2-D result here, so take the
    # single value at row 0, column 0.
    log_estimate = regr.predict(query)[0][0]

    # Calc range: two RMSEs either side for the wide band, one for the narrow.
    if high_confidence:
        spread = 2*RMSE
        interval = 95
    else:
        spread = RMSE
        interval = 68

    upper_bound = log_estimate + spread
    lower_bound = log_estimate - spread

    return log_estimate, upper_bound, lower_bound, interval
    

In [26]:
# Example query with every feature named explicitly; the default
# high_confidence=True is left in place.
get_log_estimate(income=8.3, houseAge=41, avgRooms=6, avgBedrooms=2,
                 population=300, avgOccup=2.5, lat=37, long=-122)



(1.9172925743045823, 2.624132398583221, 1.2104527500259434, 95)

In [28]:
# Get the log-space estimate and its range for the example property.
log_est, upper, lower, conf = get_log_estimate(8.3, 41, 6, 2, 300, 2.5, 37, -122)

# Convert to dollars: undo the log with e**x, then scale by 100,000
# (presumably the target is expressed in units of 100,000 USD -- confirm
# against the dataset description).
dollar_est = 100_000 * np.e**log_est
dollar_high = 100_000 * np.e**upper
dollar_low = 100_000 * np.e**lower

# decimals=-3 rounds to the nearest thousand.
rounded_est = np.round(dollar_est, decimals=-3)
rounded_high = np.round(dollar_high, decimals=-3)
rounded_low = np.round(dollar_low, decimals=-3)

print(f"The estimated property value is {rounded_est}")
print(f"At {conf}% confidence, the valuation range is")
print(f"USD {rounded_low} at the lower end to USD {rounded_high} at the high end")

The estimated property value is 680000.0
At 95% confidence, the valuation range is
USD 336000.0 at the lower end to USD 1379000.0 at the high end


