### Linear Regression Model

In [1]:
import pandas as pd
import numpy as np

In [30]:
# read in the data
# The path is relative to the notebook's working directory.
data = pd.read_csv('../data/data_new.csv')
# preview the first five rows as the cell output
data.head()

Unnamed: 0.1,Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,yr_renovated,city,decade_built
0,0,376000.0,3.0,2.0,1340,1384,3.0,0,3,0,Seattle,2000
1,1,800000.0,4.0,3.25,3540,159430,2.0,0,3,0,Carnation,2000
2,3,324000.0,3.0,2.25,998,904,2.0,0,3,0,Seattle,2000
3,4,549900.0,5.0,2.75,3060,7015,1.0,0,5,0,Seattle,1970
4,5,320000.0,3.0,2.5,2130,6969,2.0,0,3,0,Maple Valley,2000


In [31]:
# summarise the columns, their dtypes and non-null counts of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3873 entries, 0 to 3872
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    3873 non-null   int64  
 1   price         3873 non-null   float64
 2   bedrooms      3873 non-null   float64
 3   bathrooms     3873 non-null   float64
 4   sqft_living   3873 non-null   int64  
 5   sqft_lot      3873 non-null   int64  
 6   floors        3873 non-null   float64
 7   waterfront    3873 non-null   int64  
 8   condition     3873 non-null   int64  
 9   yr_renovated  3873 non-null   int64  
 10  city          3873 non-null   object 
 11  decade_built  3873 non-null   int64  
dtypes: float64(4), int64(7), object(1)
memory usage: 363.2+ KB


In [32]:
# Drop the 'Unnamed: 0' column (presumably a row index written out by an
# earlier to_csv call — TODO confirm it carries no feature information).
# Rebinding `data` instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError. `columns=` already implies axis=1.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [33]:
# re-check the schema to confirm 'Unnamed: 0' is no longer among the columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3873 entries, 0 to 3872
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         3873 non-null   float64
 1   bedrooms      3873 non-null   float64
 2   bathrooms     3873 non-null   float64
 3   sqft_living   3873 non-null   int64  
 4   sqft_lot      3873 non-null   int64  
 5   floors        3873 non-null   float64
 6   waterfront    3873 non-null   int64  
 7   condition     3873 non-null   int64  
 8   yr_renovated  3873 non-null   int64  
 9   city          3873 non-null   object 
 10  decade_built  3873 non-null   int64  
dtypes: float64(4), int64(6), object(1)
memory usage: 333.0+ KB


#### Split the Data

We need a stratified train/test split because about 33% of the rows come from Seattle; stratifying keeps that proportion the same in the training and test sets.

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# drop city, the only non-numeric (object dtype) column
# NOTE(review): the section text motivates a stratified split by Seattle's
# share of the rows; once city is dropped it is no longer available as a
# stratification key — confirm this is intended.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: executing it twice no longer raises a KeyError.
data = data.drop(columns='city', errors='ignore')

In [36]:
# re-check the schema to confirm the city column has been removed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3873 entries, 0 to 3872
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         3873 non-null   float64
 1   bedrooms      3873 non-null   float64
 2   bathrooms     3873 non-null   float64
 3   sqft_living   3873 non-null   int64  
 4   sqft_lot      3873 non-null   int64  
 5   floors        3873 non-null   float64
 6   waterfront    3873 non-null   int64  
 7   condition     3873 non-null   int64  
 8   yr_renovated  3873 non-null   int64  
 9   decade_built  3873 non-null   int64  
dtypes: float64(4), int64(6)
memory usage: 302.7 KB


In [37]:
def replace(year:int) -> int:
    """Binary-encode a ``yr_renovated`` value.

    Args:
        year: The raw ``yr_renovated`` entry (0 presumably marks a house
            that was never renovated — verify against the data dictionary).

    Returns:
        0 when ``year`` equals 0, otherwise 1.
    """
    return 0 if year == 0 else 1
    
# Overwrite yr_renovated with its 0/1 flag, one element at a time.
# Re-running is harmless: replace() maps 0 -> 0 and 1 -> 1.
data['yr_renovated'] = data['yr_renovated'].map(replace)

In [38]:
# preview the first rows after yr_renovated has been turned into a 0/1 flag
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,yr_renovated,decade_built
0,376000.0,3.0,2.0,1340,1384,3.0,0,3,0,2000
1,800000.0,4.0,3.25,3540,159430,2.0,0,3,0,2000
2,324000.0,3.0,2.25,998,904,2.0,0,3,0,2000
3,549900.0,5.0,2.75,3060,7015,1.0,0,5,0,1970
4,320000.0,3.0,2.5,2130,6969,2.0,0,3,0,2000


In [39]:
# list the distinct decade_built values (unsorted) before encoding them
data['decade_built'].unique()

array([2000, 1970, 1920, 1960, 1900, 1990, 2010, 1950, 1940, 1980, 1910,
       1930])

In [40]:
# ordinal encoding of decade built
from sklearn.preprocessing import OrdinalEncoder

# collect the distinct decades, then order them ascending so that the
# position of each decade in `bins` can serve as its ordinal code
bins = list(data['decade_built'].unique())
bins.sort()
print(bins)

[np.int64(1900), np.int64(1910), np.int64(1920), np.int64(1930), np.int64(1940), np.int64(1950), np.int64(1960), np.int64(1970), np.int64(1980), np.int64(1990), np.int64(2000), np.int64(2010)]


In [41]:
# Build the encoder with an explicit category order: the code assigned to a
# decade is its position in the sorted `bins` list (earliest decade -> 0.0).
encoder = OrdinalEncoder(categories=[bins])

# Fit on the decade column (kept 2-D via the double brackets) and store the
# resulting float codes as a new feature next to the original column.
data['encoded_decade_built'] = encoder.fit(data[['decade_built']]).transform(data[['decade_built']])

In [43]:
# display the encoded column to sanity-check the codes
data['encoded_decade_built']

0       10.0
1       10.0
2       10.0
3        7.0
4       10.0
        ... 
3868     5.0
3869     8.0
3870    10.0
3871     7.0
3872     9.0
Name: encoded_decade_built, Length: 3873, dtype: float64