In [40]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import sklearn.metrics as metrics

from matplotlib import pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

from scipy import stats as stats
%matplotlib inline

In [93]:
# Location of the raw housing CSV. The path is relative, so it resolves
# against the kernel's working directory.
DATA_PATH = "../../data/kc_house_data.csv"
df = pd.read_csv(DATA_PATH)

In [81]:
# Inspect dtypes and non-null counts to spot object-typed columns and
# columns with missing values before any modelling.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 32 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                21597 non-null  int64  
 1   date              21597 non-null  object 
 2   price             21597 non-null  float64
 3   bedrooms          21597 non-null  int64  
 4   bathrooms         21597 non-null  float64
 5   sqft_living       21597 non-null  int64  
 6   sqft_lot          21597 non-null  int64  
 7   floors            21597 non-null  float64
 8   waterfront        19221 non-null  object 
 9   view              21534 non-null  object 
 10  condition         21597 non-null  object 
 11  grade             21597 non-null  object 
 12  sqft_above        21597 non-null  int64  
 13  sqft_basement     21597 non-null  object 
 14  yr_built          21597 non-null  int64  
 15  yr_renovated      17755 non-null  float64
 16  zipcode           21597 non-null  int64 

In [34]:
# Pearson correlation of every numeric column with the target `price`.
# Numeric dtypes are selected explicitly: pandas >= 2.0 no longer drops
# object columns (date, waterfront, grade, ...) silently and raises instead,
# while select_dtypes works on old and new pandas versions alike.
df.select_dtypes(include="number").corr()["price"]

id              -0.016772
price            1.000000
bedrooms         0.308787
bathrooms        0.525906
sqft_living      0.701917
sqft_lot         0.089876
floors           0.256804
sqft_above       0.605368
yr_built         0.053953
yr_renovated     0.129599
zipcode         -0.053402
lat              0.306692
long             0.022036
sqft_living15    0.585241
sqft_lot15       0.082845
Name: price, dtype: float64

In [95]:
def ohe(df, *cols):
    """One-hot encode the given columns and append the indicators to ``df``.

    Each column in ``cols`` is fitted with its own ``OneHotEncoder`` and the
    resulting indicator columns are joined onto the frame. The source columns
    are kept, and ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``.
    *cols : str
        Names of the categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one indicator column per category,
        named ``f"{col}_x0_{category}"``. When no ``cols`` are given, ``df``
        is returned as-is.
    """
    temp_df = df
    for col in cols:
        encoder = OneHotEncoder(handle_unknown="ignore")
        fitted_arr = encoder.fit_transform(temp_df[[col]]).toarray()
        # Build the names from the fitted categories instead of calling
        # get_feature_names(), which was deprecated in scikit-learn 1.0 and
        # removed in 1.2. The "x0" infix reproduces the names that call
        # generated, so existing references to these columns keep working.
        col_names = [f"{col}_x0_{category}" for category in encoder.categories_[0]]
        # Reuse the input's index: a default RangeIndex would make join()
        # misalign rows (filling NaN) for any frame that was filtered or
        # re-indexed before being passed in.
        encoded_values = pd.DataFrame(
            fitted_arr, columns=col_names, index=temp_df.index
        )
        temp_df = temp_df.join(encoded_values)
    return temp_df


In [96]:
# Append one-hot indicator columns for the two text-valued quality columns;
# the original `grade` and `condition` columns are still present afterwards.
df_clean = ohe(df, "grade", "condition")

In [98]:
# Show the construction year next to the renovation year for each record.
df.loc[:, ["yr_built", "yr_renovated"]]

Unnamed: 0,yr_built,yr_renovated
0,1955,0.0
1,1951,1991.0
2,1933,
3,1965,0.0
4,1987,0.0
...,...,...
21592,2009,0.0
21593,2014,0.0
21594,2009,0.0
21595,2004,0.0


In [66]:
# Number of records per distinct `yr_renovated` value; rows where the value
# is missing do not form a group and are therefore not counted.
renovation_counts = df.groupby("yr_renovated")["id"].count()
renovation_counts

yr_renovated
0.0       17011
1934.0        1
1940.0        2
1944.0        1
1945.0        3
          ...  
2011.0        9
2012.0        8
2013.0       31
2014.0       73
2015.0       14
Name: id, Length: 70, dtype: int64

Of the 21,597 records, 17,011 (~79%) have `yr_renovated` equal to 0.0, and a further 3,842 are missing the value altogether, so the column carries too little information and we exclude it from feature engineering.

In [101]:
# List the columns after encoding to confirm the new indicator columns
# (named "<column>_x0_<category>") were appended.
df_clean.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'grade_x0_10 Very Good',
       'grade_x0_11 Excellent', 'grade_x0_12 Luxury', 'grade_x0_13 Mansion',
       'grade_x0_3 Poor', 'grade_x0_4 Low', 'grade_x0_5 Fair',
       'grade_x0_6 Low Average', 'grade_x0_7 Average', 'grade_x0_8 Good',
       'grade_x0_9 Better', 'condition_x0_Average', 'condition_x0_Fair',
       'condition_x0_Good', 'condition_x0_Poor', 'condition_x0_Very Good'],
      dtype='object')

In [109]:
# Build the design matrix: remove the target (`price`), the record `id`,
# `yr_renovated`, and the object-typed columns, leaving the numeric
# predictors plus the one-hot indicator columns.
excluded_cols = [
    'id', 'price', 'yr_renovated', 'waterfront', 'date',
    'grade', 'condition', 'view', 'sqft_basement',
]
df_model = df_clean.drop(columns=excluded_cols)

In [110]:
# Predictors (design matrix) and response for the regression.
X = df_model
y = df["price"]

In [111]:
# Fit ordinary least squares of `price` on the design matrix. X is passed
# as-is (there is no sm.add_constant call), so no explicit intercept column
# is part of the model.
model = sm.OLS(endog=y, exog=X).fit()

In [112]:
# Display the fit statistics, coefficient table and residual diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.694
Model:,OLS,Adj. R-squared:,0.693
Method:,Least Squares,F-statistic:,1878.0
Date:,"Tue, 29 Mar 2022",Prob (F-statistic):,0.0
Time:,02:24:14,Log-Likelihood:,-294620.0
No. Observations:,21597,AIC:,589300.0
Df Residuals:,21570,BIC:,589500.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
bedrooms,-2.986e+04,1952.828,-15.293,0.000,-3.37e+04,-2.6e+04
bathrooms,5.017e+04,3296.747,15.217,0.000,4.37e+04,5.66e+04
sqft_living,159.7181,4.407,36.243,0.000,151.080,168.356
sqft_lot,0.1408,0.048,2.904,0.004,0.046,0.236
floors,3.217e+04,3699.670,8.696,0.000,2.49e+04,3.94e+04
sqft_above,-22.5210,4.419,-5.096,0.000,-31.183,-13.859
yr_built,-2614.6268,70.695,-36.984,0.000,-2753.195,-2476.059
zipcode,-511.0846,33.422,-15.292,0.000,-576.595,-445.575
lat,5.695e+05,1.08e+04,52.644,0.000,5.48e+05,5.91e+05

0,1,2,3
Omnibus:,16508.378,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1071662.114
Skew:,3.13,Prob(JB):,0.0
Kurtosis:,36.937,Cond. No.,1.04e+16


In [91]:
# Convert the design matrix to a NumPy array to inspect its dtype.
# NOTE(review): an `object` dtype here suggests a non-numeric column is
# still present in X — confirm against df_model.info().
np.asarray(X)

array([[3, 1.0, 1180, ..., 1.0, 0.0, 0.0],
       [3, 2.25, 2570, ..., 1.0, 0.0, 0.0],
       [2, 1.0, 770, ..., 0.0, 0.0, 0.0],
       ...,
       [2, 0.75, 1020, ..., 1.0, 0.0, 0.0],
       [3, 2.5, 1600, ..., 0.0, 1.0, 0.0],
       [2, 0.75, 1020, ..., 1.0, 0.0, 0.0]], dtype=object)

In [108]:
# Check dtypes and non-null counts of the design matrix; any object-typed
# or partially-null column listed here should be dropped or encoded before
# the model is fitted.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   bedrooms                21597 non-null  int64  
 1   bathrooms               21597 non-null  float64
 2   sqft_living             21597 non-null  int64  
 3   sqft_lot                21597 non-null  int64  
 4   floors                  21597 non-null  float64
 5   view                    21534 non-null  object 
 6   sqft_above              21597 non-null  int64  
 7   sqft_basement           21597 non-null  object 
 8   yr_built                21597 non-null  int64  
 9   zipcode                 21597 non-null  int64  
 10  lat                     21597 non-null  float64
 11  long                    21597 non-null  float64
 12  sqft_living15           21597 non-null  int64  
 13  sqft_lot15              21597 non-null  int64  
 14  grade_x0_10 Very Good   21597 non-null