In [3]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#Read data
df_kc = pd.read_csv("King_County_House_prices_dataset.csv")

#set correct format
df_kc["id"] = df_kc["id"].astype("str")
df_kc["date"] = pd.to_datetime(df_kc["date"]).dt.date
df_kc["price"] = df_kc["price"].astype("int")
df_kc["waterfront"] = df_kc["waterfront"].astype("category")
df_kc["view"] = df_kc["view"].fillna(0).astype("int")
df_kc["condition"] = df_kc["condition"].astype("category")
df_kc["grade"] = df_kc["grade"].astype("category")
df_kc["sqft_basement"] = pd.to_numeric(df_kc["sqft_basement"], errors='coerce')
df_kc["zipcode"] = df_kc["zipcode"].astype("category")
df_kc["yr_renovated"] = df_kc["yr_renovated"].fillna(0).astype("int")

#Fill NaNs
df_kc["waterfront"] = df_kc["waterfront"].fillna(0)
for i in range(df_kc.shape[0]):
    if df_kc["yr_renovated"][i] == 0:
        df_kc["yr_renovated"][i] = df_kc["yr_built"][i]

In [4]:
#Feature engineering
df_kc["pp_sqft_living"] = df_kc["price"] / df_kc["sqft_living"]
df_kc["pp_sqft_lot"] = df_kc["price"] / df_kc["sqft_lot"]

In [5]:
#Find the single best predictor for a simple linear regression.
import statsmodels.formula.api as smf

col_names = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "waterfront",
             "grade", "yr_renovated", "lat", "sqft_living15", "sqft_lot15",
            "pp_sqft_living", "pp_sqft_lot"]
r_values = []

for predictor in col_names:
    r_values.append(smf.ols(formula="price ~ %s" % predictor, data=df_kc).fit().rsquared_adj)

r_values = pd.DataFrame(r_values, col_names)
r_values.columns = ["adj_r-squared"]
r_values.sort_values("adj_r-squared", ascending=False)

Unnamed: 0,adj_r-squared
grade,0.519751
sqft_living,0.492664
sqft_living15,0.342477
pp_sqft_living,0.309166
bathrooms,0.276543
bedrooms,0.095308
pp_sqft_lot,0.094863
lat,0.094018
waterfront,0.069815
yr_renovated,0.009468


In [10]:
#Feature selection
X = df_kc[["grade", "sqft_living", "sqft_living15", "pp_sqft_living", "bathrooms"]]
Y = df_kc["price"]

#Splitting data
print("-----  Splitting the data in train and test ----")
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

#Adding the constant
X_train = sm.add_constant(X_train) # adding a constant
X_test = sm.add_constant(X_test) # adding a constant

#Training the model
print("-----  Training the model ----")
model = sm.OLS(y_train, X_train).fit()
print_model = model.summary()

-----  Splitting the data in train and test ----
-----  Training the model ----


In [11]:
#Predictions to check the model
print("-----  Evaluating the model ----")
predictions = model.predict(X_train)
err_train = np.sqrt(mean_squared_error(y_train, predictions))
predictions_test = model.predict(X_test)
err_test = np.sqrt(mean_squared_error(y_test, predictions_test))


print(print_model)
print ("-------------")
print (f"RMSE on train data: {err_train}")
print (f"RMSE on test data: {err_test}")

-----  Evaluating the model ----
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.883
Model:                            OLS   Adj. R-squared:                  0.883
Method:                 Least Squares   F-statistic:                 2.282e+04
Date:                Thu, 15 Oct 2020   Prob (F-statistic):               0.00
Time:                        11:43:17   Log-Likelihood:            -1.9900e+05
No. Observations:               15117   AIC:                         3.980e+05
Df Residuals:                   15111   BIC:                         3.980e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const      