In [9]:
%matplotlib inline

In [10]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import random
import patsy
from sklearn import neighbors
from copy import deepcopy
import requests
import json
from sklearn.preprocessing import StandardScaler

ZIP_API = 'https://public.opendatasoft.com/api/records/1.0/search/?dataset=us-zip-code-latitude-and-longitude&q=zip:{}&facet=state&facet=timezone&facet=dst'

sns.set(style="whitegrid")

In [11]:
def get_location(zip_string):
    response=requests.get(ZIP_API.format(zip_string.replace('WA', '')))
    record = response.json().get('records')[0].get('fields')
    return record.get('latitude'), record.get('longitude')

def mse(errors):
    n = len(errors)
    squared_error = np.sum([e**2 for e in errors])
    return np.sqrt((1.0/n) * squared_error)

def chunk(xs, n):
    k, m = divmod(len(xs), n)
    return [xs[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

def cross_validation(formula, data, fold_count=10, repetitions=3):
    indices = list(range(len(data)))
    metrics = []
    for _ in range(repetitions):
        random.shuffle(indices)
        folds = chunk(indices, fold_count)
        for fold in folds:
            test_data = data.iloc[fold]
            train_indices = [idx not in fold for idx in indices]
            train_data = data.iloc[train_indices]
            y, X = patsy.dmatrices(formula, train_data, return_type='matrix')
            model = neighbors.KNeighborsRegressor(5).fit(X, y)
            y, X = patsy.dmatrices(formula, test_data, return_type='matrix')
            y_hat = model.predict(X)
            test_mse = mse(y - y_hat)
            metrics.append(mse(y - y_hat))
    return metrics

## Linear Regression Model on House Price DataSet

### DataSet:  https://www.kaggle.com/shree1992/housedata/version/2#

### EDA Variables: 
1. bathrooms: categorical
2. view: categorical
3. yr_renovated: float
4. sqft_living: float
5. location: statezip, categorical

In [12]:
# Load Data
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [13]:
with open('statezip.json', 'r') as f:
    statezip_dict = json.load(f)
data['zip_lat'] = data.apply(lambda x: statezip_dict.get(x['statezip'])[0], axis=1)
data['zip_long'] = data.apply(lambda x: statezip_dict.get(x['statezip'])[1], axis=1)

In [15]:
# Normalize data
price_range = stats.mstats.mquantiles(data.price, [0.005, 0.995])
criteria_1 = data.price>= price_range[0]
criteria_2 = data.price<=price_range[1]
data=data[criteria_1 & criteria_2]
scaler = StandardScaler()
features =  [
    'bedrooms', 
    'bathrooms', 
    'sqft_living', 
    'sqft_lot',
    'floors',
    'sqft_above',
    'sqft_basement',
    'yr_built', 
    'yr_renovated',
    'zip_lat',
    'zip_long'
]  
data[features] = scaler.fit_transform(data[features])
data = pd.concat([
    data,
    pd.get_dummies(data.view, prefix='view'),
    pd.get_dummies(data.condition,prefix='condition')],
    axis=1
)
data=data.drop(['date', 'street', 'city', 'statezip', 'country', 'view', 'condition'], axis=1)
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,sqft_above,sqft_basement,yr_built,...,view_0,view_1,view_2,view_3,view_4,condition_1,condition_2,condition_3,condition_4,condition_5
0,313000.0,-0.433612,-0.851635,-0.861246,-0.191417,-0.016936,0,-0.565865,-0.680394,-0.531637,...,1,0,0,0,0,0,0,1,0,0
2,342000.0,-0.433612,-0.191594,-0.202279,-0.078499,-0.946599,0,0.149647,-0.680394,-0.161156,...,1,0,0,0,0,0,0,0,1,0
3,420000.0,-0.433612,0.138426,-0.124097,-0.188115,-0.946599,0,-0.978194,1.554031,-0.262196,...,1,0,0,0,0,0,0,0,1,0
4,550000.0,0.674372,0.468446,-0.19111,-0.118993,-0.946599,0,-0.808411,1.107146,0.175645,...,1,0,0,0,0,0,0,0,1,0
5,490000.0,-1.541596,-1.511675,-1.375018,-0.234289,-0.946599,0,-1.123721,-0.680394,-1.104199,...,1,0,0,0,0,0,0,1,0,0


In [16]:
# cross_validation
random.seed(10)
model = 'price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + waterfront + sqft_above + sqft_basement + yr_built + yr_renovated + zip_lat + zip_long + view_0 + view_1 + view_2 + view_3 + view_4 + condition_1 + condition_2 + condition_3 + condition_4 + condition_5'
mses = cross_validation(model, data, repetitions=10)
mse_bci = stats.mstats.mquantiles(mses, [0.05, 0.95])
print('Mean MSE is {}'.format(np.mean(mses)))
print('95% BCI of MSE s {}'.format(mse_bci))

Mean MSE is 158274.5325490488
95% BCI of MSE s [135201.74418323 175211.77391403]


In [17]:
scalers = [0.001 * 2 ** i for i in range(20)]
features =  [
    'bedrooms', 
    'bathrooms', 
    'sqft_living', 
    'sqft_lot',
    'floors',
    'waterfront', 
    'sqft_above',
    'sqft_basement',
    'yr_built', 
    'yr_renovated',
    'zip_lat',
    'zip_long',
    'view_0',
    'view_1',
    'view_2',
    'view_3',
    'view_4',
    'condition_1',
    'condition_2',
    'condition_3',
    'condition_4',
    'condition_5'    
]
best_scalers = {}
for index, feature in enumerate(features, 1):
    print('Optimizing {}'.format(feature))
    dfs = []
    for scaler in scalers:
        temp_df = data.copy()
        temp_df[feature] = temp_df[feature]*scaler
        dfs.append(temp_df)
    mean_mses = []
    for df in dfs:
        mean_mses.append(np.mean(cross_validation(model, df, repetitions=3)))
    best_scalers[feature] = scalers[mean_mses.index(min(mean_mses))]

Optimizing bedrooms
Optimizing bathrooms
Optimizing sqft_living
Optimizing sqft_lot
Optimizing floors
Optimizing waterfront
Optimizing sqft_above
Optimizing sqft_basement
Optimizing yr_built
Optimizing yr_renovated
Optimizing zip_lat
Optimizing zip_long
Optimizing view_0
Optimizing view_1
Optimizing view_2
Optimizing view_3
Optimizing view_4
Optimizing condition_1
Optimizing condition_2
Optimizing condition_3
Optimizing condition_4
Optimizing condition_5


In [21]:
best_scalers

{'bedrooms': 0.032,
 'bathrooms': 0.001,
 'sqft_living': 1.024,
 'sqft_lot': 0.016,
 'floors': 0.008,
 'waterfront': 0.128,
 'sqft_above': 1.024,
 'sqft_basement': 0.001,
 'yr_built': 0.128,
 'yr_renovated': 32.768,
 'zip_lat': 65.536,
 'zip_long': 524.288,
 'view_0': 0.128,
 'view_1': 0.008,
 'view_2': 16.384,
 'view_3': 65.536,
 'view_4': 1.024,
 'condition_1': 0.512,
 'condition_2': 0.016,
 'condition_3': 0.008,
 'condition_4': 0.004,
 'condition_5': 0.002}

In [22]:
new_data = data.copy()
for key, value in best_scalers.items():
    new_data[key] = value*new_data[key]

model = 'price ~ sqft_living + waterfront + sqft_above + yr_built + yr_renovated + zip_lat + zip_long + view_2 + view_3 + view_4 + condition_1'

# cross_validation
random.seed(10)
mses = cross_validation(model, new_data, repetitions=10)
mse_bci = stats.mstats.mquantiles(mses, [0.05, 0.95])
print('Mean MSE is {}'.format(np.mean(mses)))
print('95% BCI of MSE s {}'.format(mse_bci))

Mean MSE is 145130.55844942442
95% BCI of MSE s [124258.98807098 171032.7866833 ]
