### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.api import OLS

#SKLearn stuff
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
# Load the house-sales CSV and preview the first five rows.
df = pd.read_csv('../../data/kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,NONE,...,7 Average,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,NO,NONE,...,7 Average,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,NO,NONE,...,6 Low Average,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,NO,NONE,...,7 Average,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,NO,NONE,...,8 Good,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


### Initial Cleaning

In [3]:
# Column dtypes and non-null counts, to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  object 
 9   view           21534 non-null  object 
 10  condition      21597 non-null  object 
 11  grade          21597 non-null  object 
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

The column sqft_basement has 454 '?' values. Since the basement area is the living area minus the above-ground area, the column is recomputed as sqft_living - sqft_above.

In [4]:
# Recompute the basement area for every row (living area minus above-ground
# area); this replaces the original object-typed column and its '?' placeholders.
df['sqft_basement'] = df['sqft_living'] - df['sqft_above']

In [5]:
# Re-check the schema: sqft_basement should now be a fully populated integer column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  object 
 9   view           21534 non-null  object 
 10  condition      21597 non-null  object 
 11  grade          21597 non-null  object 
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  int64  
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

#### Drops

Dropping 'id', 'yr_renovated', and 'date'. We have no use for them as features, and there is not enough data to use them (for example, 'yr_renovated' is missing for roughly 18% of the rows).

In [6]:
# Remove the identifier, the sale date and the renovation year from the frame.
df = df.drop(columns=['id', 'date', 'yr_renovated'])

#### Outliers?

### Split

Using price as the dependent variable. We assume our stakeholder is a real estate/brokerage company (TBD) that earns a commission on each sale or purchase.

In [7]:
# Separate the target (price) from the features, then hold out 33% of the
# rows for testing with a fixed seed so the split is reproducible.
X = df.drop(columns="price")
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=42
)

### Categorical Columns

The column 'waterfront' is a binary categorical variable. The columns 'bathrooms', 'view', 'zipcode', 'condition', and 'grade' are categorical variables. They will need to be converted into a format that can be modeled.

Need to drop columns for each. Should make a function to do encoding.

#### Waterfront

The column contains Yes and No values. I'm assuming NAs are Nos so I will be filling them in as such.

In [8]:
# Treat a missing waterfront flag as 'NO'.
# The filled Series is assigned back instead of calling fillna(inplace=True)
# on a column selection: the in-place form acts on a temporary object and is
# not guaranteed to update the frame (it does nothing under pandas
# Copy-on-Write).
X_train['waterfront'] = X_train['waterfront'].fillna('NO')
X_test['waterfront'] = X_test['waterfront'].fillna('NO')

In [9]:
# Turn the training waterfront labels into numeric codes. The encoder is
# fitted on the training rows and kept in `encoder_waterfront`.
encoder_waterfront = OrdinalEncoder()
waterfront_train = X_train[['waterfront']]
waterfront_encoded_train = encoder_waterfront.fit_transform(waterfront_train).ravel()
X_train["waterfront"] = waterfront_encoded_train

In [10]:
# Encode the test waterfront labels with the encoder that was fitted on the
# training rows. Fitting a second encoder on the test split could assign
# different codes if that split contained a different set of categories, and
# it would let test data influence the preprocessing.
waterfront_test = X_test[['waterfront']]
waterfront_encoded_test = encoder_waterfront.transform(waterfront_test)
waterfront_encoded_test = waterfront_encoded_test.flatten()
X_test["waterfront"] = waterfront_encoded_test

#### View 

The column 'view' has 5 categories: NONE, FAIR, AVERAGE, GOOD, and EXCELLENT. I'm assuming NAs are NONE, so I will fill them in as such.

Created a OneHotEncoder function for the categorical variables. 

In [11]:
# Treat a missing view rating as "NONE".
# The filled Series is assigned back instead of calling fillna(inplace=True)
# on a column selection: the in-place form acts on a temporary object and is
# not guaranteed to update the frame (it does nothing under pandas
# Copy-on-Write).
X_train['view'] = X_train['view'].fillna("NONE")
X_test['view'] = X_test['view'].fillna("NONE")

In [12]:
# Show the training features after the waterfront and view clean-up.
X_train

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15
19709,5,3.75,3330,5042,2.0,0.0,AVERAGE,Average,9 Better,2470,860,2014,98103,47.6497,-122.339,1780,3990
4800,3,2.00,1678,13862,1.0,0.0,NONE,Average,7 Average,1678,0,1994,98030,47.3744,-122.190,1550,11753
13225,4,1.00,1550,15239,1.5,1.0,EXCELLENT,Average,6 Low Average,1370,180,1930,98166,47.4502,-122.378,1790,22047
16111,3,2.50,2760,9471,1.0,0.0,AVERAGE,Average,8 Good,1760,1000,1956,98115,47.6760,-122.272,3040,6765
17711,2,1.50,1400,5810,2.0,0.0,NONE,Average,7 Average,1400,0,1940,98103,47.6843,-122.341,1470,3920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,3,2.50,2230,5800,2.0,0.0,NONE,Average,7 Average,2230,0,2004,98065,47.5308,-121.847,2230,6088
21575,4,2.75,2770,3852,2.0,0.0,NONE,Average,8 Good,2770,0,2014,98178,47.5001,-122.232,1810,5641
5390,4,1.50,1530,9000,1.0,0.0,NONE,Good,6 Low Average,1530,0,1976,98014,47.6492,-121.908,1520,8500
860,1,0.75,380,15000,1.0,0.0,NONE,Average,5 Fair,380,0,1963,98168,47.4810,-122.323,1170,15000


In [13]:
#ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

In [20]:
def ohe(df, df2, column):
    """One-hot encode categorical columns of a train/test pair of DataFrames.

    For each name in ``column`` a ``OneHotEncoder`` is fitted on ``df`` (the
    training frame) only and then applied to both frames, so both receive the
    same dummy columns. The first category of each column is dropped as the
    reference level. The source columns themselves are left in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features; the encoders are fitted on this frame.
    df2 : pandas.DataFrame
        Test features, encoded with the encoders fitted on ``df``.
    column : list of str
        Names of the columns to encode (an iterable of names, not a single
        string).

    Returns
    -------
    tuple
        ``(df, df2, names)``: both frames with the dummy columns appended,
        and the list of dummy column names added for all requested columns
        (an empty list when ``column`` is empty).

    Raises
    ------
    ValueError
        Raised by the encoder when ``df2`` contains a category that was not
        present in ``df`` (``handle_unknown="error"``).
    """
    added_columns = []
    for col in column:
        # The encoder is left at its default sparse output and densified with
        # .toarray(); the `sparse=False` keyword was renamed in newer
        # scikit-learn releases, so this form works on old and new versions.
        encoder = OneHotEncoder(drop="first", handle_unknown="error")
        encoder.fit(df[[col]])

        # Build the names from the fitted categories, skipping the dropped
        # first level. The "x0" infix reproduces the names the legacy
        # get_feature_names() call generated, so column names stay unchanged.
        col_names = [
            f"{col}_x0_{category}" for category in encoder.categories_[0][1:]
        ]

        encoded_train = pd.DataFrame(
            encoder.transform(df[[col]]).toarray(),
            columns=col_names,
            index=df.index,
        )
        encoded_test = pd.DataFrame(
            encoder.transform(df2[[col]]).toarray(),
            columns=col_names,
            index=df2.index,
        )

        df = pd.concat([df, encoded_train], axis=1)
        df2 = pd.concat([df2, encoded_test], axis=1)
        # Accumulate the names so a multi-column call reports every new column,
        # not just those of the last one processed.
        added_columns.extend(col_names)
    return df, df2, added_columns

In [21]:
# One-hot encode 'view' in both splits; encoded_view lists the dummy columns that were added.
X_train, X_test, encoded_view = ohe(X_train, X_test, ['view'])

In [22]:
# Replace the training view labels with an ordinal 1-5 score for comparative purposes.
X_train['view'] = X_train['view'].map({
    'NONE': 1,
    'FAIR': 2,
    'AVERAGE': 3,
    'GOOD': 4,
    'EXCELLENT': 5,
})

In [23]:
# Apply the same 1-5 view score to the test split.
X_test['view'] = X_test['view'].map({
    'NONE': 1,
    'FAIR': 2,
    'AVERAGE': 3,
    'GOOD': 4,
    'EXCELLENT': 5,
})

#### Condition 

In [24]:
# One-hot encode 'condition' in both splits; encoded_condition lists the dummy columns that were added.
X_train, X_test, encoded_condition = ohe(X_train, X_test, ['condition'])

In [25]:
# Replace the training condition labels with an ordinal 1-5 score for comparative purposes.
X_train['condition'] = X_train['condition'].map({
    'Poor': 1,
    'Fair': 2,
    'Average': 3,
    'Good': 4,
    'Very Good': 5,
})

In [26]:
# Apply the same 1-5 condition score to the test split.
X_test['condition'] = X_test['condition'].map({
    'Poor': 1,
    'Fair': 2,
    'Average': 3,
    'Good': 4,
    'Very Good': 5,
})

#### Grade 

In [27]:
# One-hot encode 'grade' in both splits; encoded_grade lists the dummy columns that were added.
X_train, X_test, encoded_grade = ohe(X_train, X_test, ['grade'])

#### Zipcode 

In [28]:
# One-hot encode 'zipcode' in both splits; encoded_zipcode lists the dummy columns that were added.
X_train, X_test, encoded_zipcode = ohe(X_train, X_test, ['zipcode'])

### Models

In [42]:
# Absolute correlation of every numeric column with price, strongest first.
# The numeric columns are selected explicitly: `df` still holds text columns
# (waterfront, view, condition, grade), and recent pandas versions raise on
# DataFrame.corr() when non-numeric columns are present.
df.select_dtypes(include='number').corr()['price'].abs().sort_values(ascending=False)

price            1.000000
sqft_living      0.701917
sqft_above       0.605368
sqft_living15    0.585241
bathrooms        0.525906
sqft_basement    0.323799
bedrooms         0.308787
lat              0.306692
floors           0.256804
sqft_lot         0.089876
sqft_lot15       0.082845
yr_built         0.053953
zipcode          0.053402
long             0.022036
Name: price, dtype: float64

#### Model 1

Using columns sqft_living, bedrooms, sqft_living15, and the columns created from encoding 'grade'.

In [43]:
# Model 1 feature set: three numeric columns plus the grade dummies.
cols = ['sqft_living', 'bedrooms', 'sqft_living15'] + encoded_grade

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test

In [44]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.5899636309664652
Test Score: 0.5694433258427583


#### Model 2

Using columns sqft_living, 'sqft_living15', 'bathrooms', and the columns created from encoding 'zipcode'.

Note the difference between using the one-hot encoding of 'zipcode' and using the raw 'zipcode' column. The encoded version gives a much higher R2 (0.733 vs. 0.507 on the test set) and also a lower RMSE (about 187k vs. 254k on the test set).

In [45]:
# Model 2 feature set: three numeric columns plus the zipcode dummies.
cols = ['sqft_living', 'bathrooms', 'sqft_living15'] + encoded_zipcode

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test


In [46]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.7361599030747257
Test Score: 0.7327111247438015


In [47]:
# Training-split metrics for the model fitted in the preceding cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
train_mse = mean_squared_error(y_train, train_preds)
print("Training Metrics:")
print(f"R2: {r2_score(y_train, train_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_train, train_preds):.3f}")
print(f"Mean Squared Error: {train_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(train_mse):.3f}")

Training Metrics:
R2: 0.736
Mean Absolute Error: 112073.633
Mean Squared Error: 36151449061.468
Root Mean Squared Error: 190135.344


In [48]:
# Test-split metrics for the model fitted in the preceding fit cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Metrics:")
print(f"R2: {r2_score(y_test, test_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, test_preds):.3f}")
print(f"Mean Squared Error: {test_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.3f}")

Testing Metrics:
R2: 0.733
Mean Absolute Error: 109660.014
Mean Squared Error: 34944610690.996
Root Mean Squared Error: 186934.777


Compared to using unencoded zipcode 

In [49]:
# Comparison feature set: same numeric columns, with the raw zipcode column
# used in place of its dummies.
cols = ['sqft_living', 'bathrooms', 'sqft_living15', 'zipcode']

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test

In [50]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.5125951324010141
Test Score: 0.5073895822722052


In [51]:
# Training-split metrics for the model fitted in the preceding cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
train_mse = mean_squared_error(y_train, train_preds)
print("Training Metrics:")
print(f"R2: {r2_score(y_train, train_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_train, train_preds):.3f}")
print(f"Mean Squared Error: {train_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(train_mse):.3f}")

Training Metrics:
R2: 0.513
Mean Absolute Error: 170407.192
Mean Squared Error: 66784360863.493
Root Mean Squared Error: 258426.703


In [52]:
# Test-split metrics for the model fitted in the preceding fit cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Metrics:")
print(f"R2: {r2_score(y_test, test_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, test_preds):.3f}")
print(f"Mean Squared Error: {test_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.3f}")

Testing Metrics:
R2: 0.507
Mean Absolute Error: 167299.129
Mean Squared Error: 64402527988.966
Root Mean Squared Error: 253776.532


#### Model 3

Using columns  'sqft_living', 'bedrooms', 'sqft_living15', and the columns created from encoding 'zipcode'.

In [53]:
# Model 3 feature set: three numeric columns plus the zipcode dummies.
cols = ['sqft_living', 'bedrooms', 'sqft_living15'] + encoded_zipcode

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test

In [54]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.742883945094359
Test Score: 0.7410110160941886


In [55]:
# Training-split metrics for the model fitted in the preceding cell.
# `y_train` is used directly (the original used the alias `y`, which holds the
# same Series) so this cell matches the other metric cells.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
train_mse = mean_squared_error(y_train, train_preds)
print("Training Metrics:")
print(f"R2: {r2_score(y_train, train_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_train, train_preds):.3f}")
print(f"Mean Squared Error: {train_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(train_mse):.3f}")

Training Metrics:
R2: 0.743
Mean Absolute Error: 111294.639
Mean Squared Error: 35230118811.090
Root Mean Squared Error: 187696.880


In [56]:
# Test-split metrics for the model fitted in the preceding fit cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Metrics:")
print(f"R2: {r2_score(y_test, test_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, test_preds):.3f}")
print(f"Mean Squared Error: {test_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.3f}")

Testing Metrics:
R2: 0.741
Mean Absolute Error: 108795.114
Mean Squared Error: 33859505776.926
Root Mean Squared Error: 184009.526


#### Model 4

Using columns 'sqft_living', 'bedrooms', and the columns created from encoding 'zipcode' and 'condition'.

In [57]:
# Model 4 feature set: two numeric columns plus the zipcode and condition dummies.
cols = ['sqft_living', 'bedrooms'] + encoded_zipcode + encoded_condition

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test

In [58]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.7392180814648154
Test Score: 0.7409647556468758


In [59]:
# Training-split metrics for the model fitted in the preceding cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
train_mse = mean_squared_error(y_train, train_preds)
print("Training Metrics:")
print(f"R2: {r2_score(y_train, train_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_train, train_preds):.3f}")
print(f"Mean Squared Error: {train_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(train_mse):.3f}")

Training Metrics:
R2: 0.739
Mean Absolute Error: 111371.347
Mean Squared Error: 35732416542.990
Root Mean Squared Error: 189030.200


In [60]:
# Test-split metrics for the model fitted in the preceding fit cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Metrics:")
print(f"R2: {r2_score(y_test, test_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, test_preds):.3f}")
print(f"Mean Squared Error: {test_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.3f}")

Testing Metrics:
R2: 0.741
Mean Absolute Error: 108173.505
Mean Squared Error: 33865553740.277
Root Mean Squared Error: 184025.959


#### Model 5

Using columns 'sqft_living', 'bedrooms', 'bathrooms', 'condition', 'view', and the columns created from encoding 'zipcode'. 

In [61]:
# Model 5 feature set: numeric columns, the ordinal condition/view scores and
# the zipcode dummies.
cols = ['sqft_living', 'bedrooms', 'bathrooms', 'condition', 'view'] + encoded_zipcode

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test

In [62]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.7703318967332109
Test Score: 0.7628512788986122


In [63]:
# Training-split metrics for the model fitted in the preceding cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
train_mse = mean_squared_error(y_train, train_preds)
print("Training Metrics:")
print(f"R2: {r2_score(y_train, train_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_train, train_preds):.3f}")
print(f"Mean Squared Error: {train_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(train_mse):.3f}")

Training Metrics:
R2: 0.770
Mean Absolute Error: 105420.161
Mean Squared Error: 31469192260.966
Root Mean Squared Error: 177395.581


In [64]:
# Test-split metrics for the model fitted in the preceding fit cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Metrics:")
print(f"R2: {r2_score(y_test, test_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, test_preds):.3f}")
print(f"Mean Squared Error: {test_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.3f}")

Testing Metrics:
R2: 0.763
Mean Absolute Error: 103148.620
Mean Squared Error: 31004170026.951
Root Mean Squared Error: 176080.010


##### model 5.0

Compared to encoded view and condition.

In [69]:
# Model 5.0 feature set: numeric columns plus the condition, view and zipcode
# dummies (instead of the ordinal condition/view scores).
cols = ['sqft_living', 'bedrooms', 'bathrooms'] + encoded_condition + encoded_view + encoded_zipcode

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test

In [70]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.7801453890155033
Test Score: 0.7717335916453081


In [71]:
# Training-split metrics for the model fitted in the preceding cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
train_mse = mean_squared_error(y_train, train_preds)
print("Training Metrics:")
print(f"R2: {r2_score(y_train, train_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_train, train_preds):.3f}")
print(f"Mean Squared Error: {train_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(train_mse):.3f}")

Training Metrics:
R2: 0.780
Mean Absolute Error: 103387.581
Mean Squared Error: 30124544610.769
Root Mean Squared Error: 173564.238


In [72]:
# Test-split metrics for the model fitted in the preceding fit cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Metrics:")
print(f"R2: {r2_score(y_test, test_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, test_preds):.3f}")
print(f"Mean Squared Error: {test_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.3f}")

Testing Metrics:
R2: 0.772
Mean Absolute Error: 101516.559
Mean Squared Error: 29842920945.142
Root Mean Squared Error: 172751.037


#### Model 6

Using columns 'sqft_living', 'bedrooms', 'bathrooms', 'sqft_living15', 'yr_built' and the columns created from encoding 'condition', 'view', and 'zipcode'. 

In [87]:
# Model 6 feature set: five numeric columns plus the condition, view and
# zipcode dummies.
cols = ['sqft_living', 'bedrooms', 'bathrooms', 'sqft_living15', 'yr_built'] + encoded_condition + encoded_view + encoded_zipcode

# Keep only the selected columns in each split; the targets are passed through.
x, y = X_train.filter(items=cols, axis=1), y_train
x2, y2 = X_test.filter(items=cols, axis=1), y_test

In [88]:
# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(x)
X_train_scaled, X_test_scaled = scaler.transform(x), scaler.transform(x2)

# Fit a linear regression on the scaled training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

train_preds, test_preds = lr.predict(X_train_scaled), lr.predict(X_test_scaled)

# R2 on both splits.
print('Training Score:',r2_score(y_train,train_preds))
print('Test Score:',r2_score(y_test,test_preds))

Training Score: 0.7840252943840298
Test Score: 0.7723821658601114


In [89]:
# Training-split metrics for the model fitted in the preceding cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
train_mse = mean_squared_error(y_train, train_preds)
print("Training Metrics:")
print(f"R2: {r2_score(y_train, train_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_train, train_preds):.3f}")
print(f"Mean Squared Error: {train_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(train_mse):.3f}")

Training Metrics:
R2: 0.784
Mean Absolute Error: 102558.361
Mean Squared Error: 29592918815.720
Root Mean Squared Error: 172025.925


In [90]:
# Test-split metrics for the model fitted in the preceding fit cell.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in later releases.
test_mse = mean_squared_error(y_test, test_preds)
print("Testing Metrics:")
print(f"R2: {r2_score(y_test, test_preds):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, test_preds):.3f}")
print(f"Mean Squared Error: {test_mse:.3f}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse):.3f}")

Testing Metrics:
R2: 0.772
Mean Absolute Error: 101255.917
Mean Squared Error: 29758128140.283
Root Mean Squared Error: 172505.444
