# House Sale Price Analysis

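This notebook loads the house sales data in `data/kc_house_data.csv`, inspects it (removing one implausible 33-bedroom record), and fits two linear regression models to predict sale price: a first simple model on the raw price, and a second model on the log-transformed price. Both models are scored on a train/test split with R2, RMSE and MAE.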


### Imports and Functions

In [15]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [34]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Added after the session - a better metrics function!
def print_metrics(y_tr, tr_pred, y_te, te_pred, log=False):
    '''
    Prints the R2 Score, Root Mean Squared Error and Mean Absolute Error,
    first for the train set and then for the test set.

    R2 is always computed on the values exactly as they are passed in (so on
    the log scale when log=True). RMSE and MAE are reported in terms of the
    original target: when log=True every input is un-logged with np.expm1
    before those two metrics are computed.

    Inputs:
        y_tr: array-like or pandas series
            Actual target values for the train set
        tr_pred: array-like or pandas series
            Predicted target values for the train set
        y_te: array-like or pandas series
            Actual target values for the test set
        te_pred: array-like or pandas series
            Predicted target values for the test set
        log: boolean
            Toggles whether the target values have been logged or not
            If True, assumes all other arguments passed into the function have
            been logged with np.log1p (the inverse applied here is np.expm1)

    Outputs:
        None, just prints the metrics
    '''
    sections = (
        ("Training Scores", y_tr, tr_pred),
        ("Testing Scores", y_te, te_pred),
    )

    for position, (title, y_true, y_pred) in enumerate(sections):
        if position > 0:
            # Separator between the train and test sections
            print("\n"+"*"*10)

        if log:
            # Please note - if you used log to log the variables, change this to exp
            y_err, pred_err = np.expm1(y_true), np.expm1(y_pred)
        else:
            y_err, pred_err = y_true, y_pred

        print(title)
        print("-"*10)
        print(f"R2: {r2_score(y_true, y_pred):.4f}") # R2 should not be done on unlogged values
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` keyword was deprecated in scikit-learn 1.4 and removed in
        # 1.6, whereas this form works on every version.
        print(f"RMSE: {np.sqrt(mean_squared_error(y_err, pred_err)):.4f}")
        print(f"MAE: {mean_absolute_error(y_err, pred_err):.4f}")

### Initial Data Understanding

In [2]:
# Load the house sales data; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("data/kc_house_data.csv")

In [3]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   view           21534 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

In [5]:
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,19221.0,21534.0,21597.0,21597.0,21597.0,21597.0,17755.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,4580474000.0,540296.6,3.3732,2.115826,2080.32185,15099.41,1.494096,0.007596,0.233863,3.409825,7.657915,1788.596842,1970.999676,83.636778,98077.951845,47.560093,-122.213982,1986.620318,12758.283512
std,2876736000.0,367368.1,0.926299,0.768984,918.106125,41412.64,0.539683,0.086825,0.765686,0.650546,1.1732,827.759761,29.375234,399.946414,53.513072,0.138552,0.140724,685.230472,27274.44195
min,1000102.0,78000.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,370.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,322000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,1951.0,0.0,98033.0,47.4711,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,1975.0,0.0,98065.0,47.5718,-122.231,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10685.0,2.0,0.0,0.0,4.0,8.0,2210.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [6]:
df.loc[df['bedrooms'] == 33]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
15856,2402100895,6/25/2014,640000.0,33,1.75,1620,6000,1.0,0.0,0.0,...,7,1040,580.0,1947,0.0,98103,47.6878,-122.331,1330,4700


In [7]:
# Drop the single 33-bedroom record found above (1,620 sqft of living space, 1.75 bathrooms).
# NOTE(review): presumably a data-entry error - confirm. This rebinds `df`, so later cells see the filtered frame.
df = df.loc[df['bedrooms'] < 33]

In [8]:
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21596.0,21596.0,21596.0,21596.0,21596.0,21596.0,21596.0,19220.0,21533.0,21596.0,21596.0,21596.0,21596.0,17754.0,21596.0,21596.0,21596.0,21596.0,21596.0
mean,4580575000.0,540292.0,3.371828,2.115843,2080.343165,15099.83,1.494119,0.007596,0.233874,3.409752,7.657946,1788.631506,1971.000787,83.641489,98077.950685,47.560087,-122.213977,1986.650722,12758.656649
std,2876764000.0,367376.0,0.904114,0.768998,918.122038,41413.55,0.539685,0.086827,0.765702,0.650471,1.173218,827.763251,29.37546,399.957185,53.51404,0.138552,0.140725,685.231768,27275.018316
min,1000102.0,78000.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,370.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,322000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,1951.0,0.0,98033.0,47.4711,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7619.0,1.5,0.0,0.0,3.0,7.0,1560.0,1975.0,0.0,98065.0,47.5718,-122.231,1840.0,7620.0
75%,7308950000.0,645000.0,4.0,2.5,2550.0,10685.5,2.0,0.0,0.0,4.0,8.0,2210.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,11.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


### FSM (First Simple Model)

In [17]:
# Predictors for the first model
used_cols = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'condition', 'grade', 'zipcode',
]
X = df[used_cols]

# Target: the sale price
y = df['price']

In [18]:
# Hold out a test set. No test_size is passed, so scikit-learn's default applies
# (5,399 of the 21,596 rows end up in the test set); random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [19]:
# Learn the scaling parameters from the training features only, then apply
# the same fitted scaler to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# First model: linear regression of price on the scaled training features
lr = LinearRegression()

# Left as the cell's last expression, so the fitted estimator's repr is shown as the cell output
lr.fit(X_train_scaled, y_train)

LinearRegression()

In [21]:
# Predict on both splits so the train and test scores can be compared
train_preds = lr.predict(X_train_scaled)
test_preds = lr.predict(X_test_scaled)

In [54]:
# Sketch of a metrics function written during the session; superseded by print_metrics, so it is left commented out.

# def metrics(y_train, train_preds, y_test, test_preds):
#     print('Train Scores:')
#     print(f'R2: {r2_score(y_train, train_preds)}')
#     print(f'RMSE: {mean_squared_error(y_train, train_preds, squared=False)}')
#     print('Test Scores:')
#     print(f'R2: {r2_score(y_test, test_preds)}')
#     print(f'RMSE: {mean_squared_error(y_test, test_preds, squared=False)}') 

In [24]:
print_metrics(y_train, train_preds, y_test, test_preds)

Training Scores
----------
R2: 0.5688
RMSE: 240314.8716
MAE: 156284.5511

**********
Testing Scores
----------
R2: 0.5605
RMSE: 246349.4286
MAE: 156143.2444


In [25]:
test_preds.min()

-268547.18845424545

In [26]:
test_preds.max()

2591692.432412645

In [27]:
y_test

3686      132500.0
10247     415000.0
4037      494000.0
3437      355000.0
19291     606000.0
           ...    
8476      343000.0
18276     397000.0
15003     615000.0
13478    1330000.0
18399     784500.0
Name: price, Length: 5399, dtype: float64

In [28]:
test_preds

array([ 142257.84027487,  333516.42000677,  319072.47964895, ...,
        557501.73372277, 1052604.12379231, 1608624.92486253])

In [29]:
# Turn y_test into a DataFrame: the original row labels move into an 'index' column next to 'price'
results = y_test.reset_index()

In [30]:
# test_preds is a NumPy array, so it is assigned by position; it was predicted from X_test_scaled, which keeps y_test's row order
results['predicted_price'] = test_preds

In [31]:
results.head()

Unnamed: 0,index,price,predicted_price
0,3686,132500.0,142257.840275
1,10247,415000.0,333516.420007
2,4037,494000.0,319072.479649
3,3437,355000.0,323176.808432
4,19291,606000.0,365319.501065


In [32]:
results.sort_values(by='predicted_price').drop('price', axis=1)

Unnamed: 0,index,predicted_price
3640,15698,-2.685472e+05
4718,8588,-1.738350e+05
196,12063,-1.620613e+05
336,7369,-1.565677e+05
1369,8614,-1.547766e+05
...,...,...
3123,1162,2.136859e+06
1944,13398,2.325111e+06
3426,3910,2.465116e+06
4415,14542,2.469748e+06


In [33]:
X_test

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,condition,grade,zipcode
3686,3,0.75,850,8573,3,6,98146
10247,3,1.00,1510,6083,4,6,98115
4037,4,2.25,1790,42000,3,7,98045
3437,2,1.50,1140,2500,3,7,98106
19291,3,1.00,1500,3920,3,7,98107
...,...,...,...,...,...,...,...
8476,3,1.00,1410,18600,5,7,98059
18276,5,1.00,1170,6757,4,6,98125
15003,3,1.75,1670,5100,5,7,98144
13478,4,2.25,3260,4640,5,9,98112


### Log Price

In [35]:
# Model log(1 + price) instead of price; print_metrics(log=True) reverses this with np.expm1.
# NOTE(review): presumably motivated by the first model predicting negative prices - confirm.
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [36]:
# Same scaled features as the first model; only the target changes to the logged price
lr_log = LinearRegression()

# Left as the cell's last expression, so the fitted estimator's repr is shown as the cell output
lr_log.fit(X_train_scaled, y_train_log)

LinearRegression()

In [37]:
# These predictions are on the log1p scale; apply np.expm1 to read them as prices
train_preds_log = lr_log.predict(X_train_scaled)
test_preds_log = lr_log.predict(X_test_scaled)

In [38]:
test_preds_log

array([12.46444604, 12.6889219 , 12.75209644, ..., 13.07734308,
       13.78730668, 14.50915745])

In [39]:
print_metrics(y_train_log, train_preds_log, y_test_log, test_preds_log, log=True)

Training Scores
----------
R2: 0.5880
RMSE: 249466.3568
MAE: 145527.2459

**********
Testing Scores
----------
R2: 0.5792
RMSE: 231938.1411
MAE: 145044.8212


In [40]:
np.expm1(y_test_log)

3686      132500.0
10247     415000.0
4037      494000.0
3437      355000.0
19291     606000.0
           ...    
8476      343000.0
18276     397000.0
15003     615000.0
13478    1330000.0
18399     784500.0
Name: price, Length: 5399, dtype: float64

In [41]:
y_test

3686      132500.0
10247     415000.0
4037      494000.0
3437      355000.0
19291     606000.0
           ...    
8476      343000.0
18276     397000.0
15003     615000.0
13478    1330000.0
18399     784500.0
Name: price, Length: 5399, dtype: float64

In [42]:
np.expm1(test_preds_log)

array([ 258963.44192959,  324136.11441022,  345273.98758973, ...,
        477988.03824939,  972189.13750526, 2000998.66283722])