# Linear Regression
---

# Load packages and data

In [9]:
# Data manipulation
import pandas as pd
import numpy as np

# Regression
from sklearn.model_selection import train_test_split # split data into train and test sets
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # calculate model performance
import statsmodels.api as sm # print model summary

df = pd.read_csv('/home/simon/Predict_House_Prices/kc_house_data_cleaned_featured.csv', index_col=0)
df.head(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_sqft_living,bathrooms_floors,renovated
0,7129300520,13-10-2014,221900.0,3,1.0,1180,5650,1.0,0,0,...,1955,0,98178,47.5112,-122.257,1340,5650,0.002542,1.0,0
1,6414100192,09-12-2014,538000.0,3,2.25,2570,7242,2.0,0,0,...,1951,1991,98125,47.721,-122.319,1690,7639,0.001167,1.125,1
2,5631500400,25-02-2015,180000.0,2,1.0,770,10000,1.0,0,0,...,1933,0,98028,47.7379,-122.233,2720,8062,0.002597,1.0,0
3,2487200875,09-12-2014,604000.0,4,3.0,1960,5000,1.0,0,0,...,1965,0,98136,47.5208,-122.393,1360,5000,0.002041,3.0,0
4,1954400510,18-02-2015,510000.0,3,2.0,1680,8080,1.0,0,0,...,1987,0,98074,47.6168,-122.045,1800,7503,0.001786,2.0,0


# Model without engineered features
## Prepare data for model

In [31]:
# Create dummy features for categorical variables
df_dummy = pd.get_dummies(df, prefix='Category_', columns=['zipcode'])

# Define dependent variables by dropping unneeded features
X_dummy = df_dummy.drop(columns=['id', 'date', 'price', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'bedrooms_sqft_living', 'bathrooms_floors'], axis=1)

# Define independent variable
y = df['price']

# Add the constant term
X_dummy = sm.add_constant(X_dummy)

# Split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size=0.25, random_state=1)

- dummies are created for zipcodes
- non-relevant columns are dropped from dataset, also additionally engineered features
- data is split into 75% train and 25% test

## Train and test model

In [24]:
# Train the model
model_ols = sm.OLS(y_train,X_train).fit()

# Print the model results
print(model_ols.summary())

# Making predictions on the test data
y_pred = model_ols.predict(X_test)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.818
Model:                            OLS   Adj. R-squared:                  0.817
Method:                 Least Squares   F-statistic:                     893.3
Date:                Fri, 17 Feb 2023   Prob (F-statistic):               0.00
Time:                        17:10:17   Log-Likelihood:            -2.1637e+05
No. Observations:               16209   AIC:                         4.329e+05
Df Residuals:                   16127   BIC:                         4.335e+05
Df Model:                          81                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            1.059e+06   1.35e+05     

In [32]:
# Calculating the error scores and R² for the test set
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R²:', r2_score(y_test, y_pred))

Mean Absolute Error: 99570.59702596527
Mean Squared Error: 35403906625.00592
Root Mean Squared Error: 188159.25867468206
R²: 0.778893340001771


- features are all significant
- model performance is already quite high
- mean estimation is around 100,000 $ off
- model can explain almost 80% of the variation of the dependent variable

# Model with engineered features
## Prepare data for model

In [26]:
# Create dummy features for categorical variables
df_dummy = pd.get_dummies(df, prefix='Category_', columns=['zipcode'])

# Define dependent variables by dropping unneeded features
X_dummy = df_dummy.drop(columns=['id', 'date', 'price', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'bedrooms', 'sqft_living', 'bathrooms', 'floors'], axis=1)

# Define independent variable
y = df['price']

# Add the constant term
X_dummy = sm.add_constant(X_dummy)

# Split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size=0.25, random_state=1)

- engineered features are included, their sources are dropped

## Train and test model

In [29]:
# Train the model
model_ols = sm.OLS(y_train,X_train).fit()

# Print the model results
print(model_ols.summary([1]))

# Making predictions on the test data
y_pred = model_ols.predict(X_test)

                            OLS Regression Results                            
Dep. Variable:                    [1]   R-squared:                       0.815
Model:                            OLS   Adj. R-squared:                  0.814
Method:                 Least Squares   F-statistic:                     890.0
Date:                Fri, 17 Feb 2023   Prob (F-statistic):               0.00
Time:                        17:18:04   Log-Likelihood:            -2.1648e+05
No. Observations:               16209   AIC:                         4.331e+05
Df Residuals:                   16128   BIC:                         4.337e+05
Df Model:                          80                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.121e+06 

In [33]:
# Calculating the error scores and R² on the test set
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R²:', r2_score(y_test, y_pred))

Mean Absolute Error: 99570.59702596527
Mean Squared Error: 35403906625.00592
Root Mean Squared Error: 188159.25867468206
R²: 0.778893340001771


- utilizing engineered features lowers model performance