In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [41]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [42]:
data.shape

(2016, 8)

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2016 non-null   int64  
 1   baths           2016 non-null   float64
 2   size            2016 non-null   float64
 3   size_units      2016 non-null   object 
 4   lot_size        1669 non-null   float64
 5   lot_size_units  1669 non-null   object 
 6   zip_code        2016 non-null   int64  
 7   price           2016 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 126.1+ KB


In [44]:
for col in data.columns:
    print(data[col].value_counts())
    print("*"*20)

beds
3     645
2     560
4     398
1     256
5     123
6      22
9       5
7       3
8       2
15      1
14      1
Name: count, dtype: int64
********************
baths
2.0    627
1.0    493
2.5    282
3.0    198
3.5    179
1.5    137
4.0     37
4.5     21
5.0     16
5.5     13
6.0      5
7.0      4
8.5      1
0.5      1
9.0      1
6.5      1
Name: count, dtype: int64
********************
size
2080.0    12
1440.0    11
1460.0    11
1370.0    11
1670.0    11
          ..
1548.0     1
1174.0     1
1865.0     1
578.0      1
795.0      1
Name: count, Length: 879, dtype: int64
********************
size_units
sqft    2016
Name: count, dtype: int64
********************
lot_size
5000.0    61
4000.0    45
6000.0    38
1.0       26
4800.0    16
          ..
745.0      1
5043.0     1
2256.0     1
8540.0     1
4267.0     1
Name: count, Length: 959, dtype: int64
********************
lot_size_units
sqft    1449
acre     220
Name: count, dtype: int64
********************
zip_code
98115    170
98103   

In [45]:
data.isna().sum()

beds                0
baths               0
size                0
size_units          0
lot_size          347
lot_size_units    347
zip_code            0
price               0
dtype: int64

In [46]:
data.drop(['lot_size', 'lot_size_units'], axis=1, inplace=True)

In [47]:
data.head()

Unnamed: 0,beds,baths,size,size_units,zip_code,price
0,3,2.5,2590.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,98106,915000.0
2,4,3.0,2040.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,98102,950000.0


In [48]:
data.describe()

Unnamed: 0,beds,baths,size,zip_code,price
count,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,98123.638889,963625.2
std,1.255092,1.002023,920.132591,22.650819,944095.4
min,1.0,0.5,250.0,98101.0,159000.0
25%,2.0,1.5,1068.75,98108.0,601750.0
50%,3.0,2.0,1560.0,98117.0,800000.0
75%,4.0,2.5,2222.5,98126.0,1105250.0
max,15.0,9.0,11010.0,98199.0,25000000.0


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   beds        2016 non-null   int64  
 1   baths       2016 non-null   float64
 2   size        2016 non-null   float64
 3   size_units  2016 non-null   object 
 4   zip_code    2016 non-null   int64  
 5   price       2016 non-null   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 94.6+ KB


In [50]:
data['beds'].value_counts()

beds
3     645
2     560
4     398
1     256
5     123
6      22
9       5
7       3
8       2
15      1
14      1
Name: count, dtype: int64

In [51]:
data.head()

Unnamed: 0,beds,baths,size,size_units,zip_code,price
0,3,2.5,2590.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,98106,915000.0
2,4,3.0,2040.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,98102,950000.0


#### Price per sq feet

In [52]:
data['price_per_sqrt'] = data['price'] * 100000 / data['size']
data['price_per_sqrt']

0       3.069498e+07
1       4.084821e+07
2       4.656863e+07
3       5.131579e+07
4       9.117083e+07
            ...     
2011    6.642336e+07
2012    6.186727e+07
2013    5.373832e+07
2014    7.421384e+07
2015    3.853801e+07
Name: price_per_sqrt, Length: 2016, dtype: float64

In [53]:
data.describe()

Unnamed: 0,beds,baths,size,zip_code,price,price_per_sqrt
count,2016.0,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,98123.638889,963625.2,59158510.0
std,1.255092,1.002023,920.132591,22.650819,944095.4,83279520.0
min,1.0,0.5,250.0,98101.0,159000.0,6796117.0
25%,2.0,1.5,1068.75,98108.0,601750.0,44522210.0
50%,3.0,2.0,1560.0,98117.0,800000.0,55297620.0
75%,4.0,2.5,2222.5,98126.0,1105250.0,65953890.0
max,15.0,9.0,11010.0,98199.0,25000000.0,3424658000.0


In [54]:
data.head()

Unnamed: 0,beds,baths,size,size_units,zip_code,price,price_per_sqrt
0,3,2.5,2590.0,sqft,98144,795000.0,30694980.0
1,4,2.0,2240.0,sqft,98106,915000.0,40848210.0
2,4,3.0,2040.0,sqft,98107,950000.0,46568630.0
3,4,3.0,3800.0,sqft,98199,1950000.0,51315790.0
4,2,2.0,1042.0,sqft,98102,950000.0,91170830.0


In [55]:
data.drop(columns=['size_units', 'price_per_sqrt'],inplace=True)
data.head()

Unnamed: 0,beds,baths,size,zip_code,price
0,3,2.5,2590.0,98144,795000.0
1,4,2.0,2240.0,98106,915000.0
2,4,3.0,2040.0,98107,950000.0
3,4,3.0,3800.0,98199,1950000.0
4,2,2.0,1042.0,98102,950000.0


In [56]:
X = data.drop(columns=['price'])
y = data['price']

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(1612, 4) (404, 4)
(1612,) (404,)


### Applying Liner Regression

In [60]:
from sklearn.compose import make_column_transformer

In [62]:
column_trans = make_column_transformer((OneHotEncoder(sparse_output=False), ['beds']), remainder='passthrough')

In [63]:
scaler = StandardScaler()

In [64]:
lr = LinearRegression(normalize=True)

TypeError: LinearRegression.__init__() got an unexpected keyword argument 'normalize'

In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# X being feature matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lr = LinearRegression()
lr.fit(X_scaled, y)

In [67]:
from sklearn.pipeline import Pipeline

In [69]:
from sklearn.pipeline import make_pipeline

In [70]:
pipe = make_pipeline(column_trans, scaler, lr)

In [71]:
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [72]:
y_pred_lr = pipe.predict(X_test)

In [73]:
r2_score(y_test, y_pred_lr)

0.5746816285348347

### Using Lasso

In [74]:
lasso = Lasso()
pipe = make_pipeline(column_trans, scaler, lasso)
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [75]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test, y_pred_lasso)

0.5746817917322382

### Using Ridge

In [76]:
ridge = Ridge()
pipe = make_pipeline(column_trans, scaler, ridge)
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [77]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test, y_pred_ridge)

0.5746884627878555

In [None]:
import pickle
pickle.dump(pipe, open('RidgeModel.pkl', 'wb'))