# Linear Regression
---

# Load Packages and Data

In [46]:
# Data manipulation
import pandas as pd
import numpy as np

# split data into train and test sets
from sklearn.model_selection import train_test_split
# calculate model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# To perform the linear regression
import statsmodels.api as sm

df = pd.read_csv('/home/simon/Predict_House_Prices/kc_house_data_cleaned_featured.csv', index_col=0)

# Model without engineered features
## Prepare data for Model

In [47]:
# Create dummy features for categorical variables
df_dummy = pd.get_dummies(df, prefix='Category_', columns=['yr_built', 'yr_renovated', 'zipcode'])

# Define dependent variables by dropping unneeded features
X_dummy = df_dummy.drop(columns=['id', 'date', 'price', 'lat', 'long', 'sqft_living15',  'sqft_lot15', 'bedrooms_sqft_living', 'bathrooms_floors'], axis=1)

# Define independent variable
y = df['price']

# Add the constant term
X_dummy = sm.add_constant(X_dummy)

# Split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size=0.25, random_state=1)

## Linear Regression Model

In [48]:
# Trainthe model
model_ols = sm.OLS(y_train,X_train).fit()

# Print the model results
print(model_ols.summary())

# Making predictions on the test data
y_pred = model_ols.predict(X_test)

# Calculating the error scores and R²
print('\nMean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R²:', r2_score(y_test, y_pred))

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.825
Model:                            OLS   Adj. R-squared:                  0.822
Method:                 Least Squares   F-statistic:                     286.6
Date:                Tue, 14 Feb 2023   Prob (F-statistic):               0.00
Time:                        20:07:33   Log-Likelihood:            -2.1602e+05
No. Observations:               16209   AIC:                         4.326e+05
Df Residuals:                   15945   BIC:                         4.346e+05
Df Model:                         263                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const           -2.303e+05   1.93e+04    -

# Model with engineered features
## Prepare data for Model

In [49]:
# Create dummy features for categorical variables
df_dummy = pd.get_dummies(df, prefix='Category_', columns=['yr_built', 'yr_renovated', 'zipcode'])

# Define dependent variables by dropping unneeded features
X_dummy = df_dummy.drop(columns=['id', 'date', 'price', 'lat', 'long', 'sqft_living15',  'sqft_lot15', 'bedrooms', 'sqft_living', 'bathrooms', 'floors'], axis=1)

# Define independent variable
y = df['price']

# Add the constant term
X_dummy = sm.add_constant(X_dummy)

# Split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size=0.25, random_state=1)

## Linear Regression Model

In [50]:
# Trainthe model
model_ols = sm.OLS(y_train,X_train).fit()

# Print the model results
print(model_ols.summary())

# Making predictions on the test data
y_pred = model_ols.predict(X_test)

# Calculating the error scores and R²
print('\nMean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R²:', r2_score(y_test, y_pred))

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.822
Model:                            OLS   Adj. R-squared:                  0.820
Method:                 Least Squares   F-statistic:                     283.1
Date:                Tue, 14 Feb 2023   Prob (F-statistic):               0.00
Time:                        20:08:08   Log-Likelihood:            -2.1616e+05
No. Observations:               16209   AIC:                         4.328e+05
Df Residuals:                   15947   BIC:                         4.349e+05
Df Model:                         261                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                -5.074e+05 