In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the training data; the path is relative to the notebook's working directory.
df_train = pd.read_csv('train.csv')

In [4]:
# Preview the first rows to check column names and value formats.
df_train.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [5]:
# Dtypes and non-null counts per column; shows which columns have missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2016 non-null   int64  
 1   baths           2016 non-null   float64
 2   size            2016 non-null   float64
 3   size_units      2016 non-null   object 
 4   lot_size        1669 non-null   float64
 5   lot_size_units  1669 non-null   object 
 6   zip_code        2016 non-null   int64  
 7   price           2016 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 126.1+ KB


In [6]:
# (rows, columns) of the raw training frame.
df_train.shape

(2016, 8)

In [7]:
# Count of missing values per column.
df_train.isnull().sum()

beds                0
baths               0
size                0
size_units          0
lot_size          347
lot_size_units    347
zip_code            0
price               0
dtype: int64

In [8]:
# Drop the two lot-size columns (both contain missing values) and the size unit
# column, leaving only numeric columns.
# NOTE(review): `size_units` is discarded without checking that every row uses
# the same unit, and the preview shows `lot_size` mixing sqft and acre — confirm
# that `size` is comparable across rows before relying on it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second execution no longer raises KeyError for missing columns.
df_train = df_train.drop(
    columns=['lot_size', 'lot_size_units', 'size_units'],
    errors='ignore',
)

In [9]:
# Summary statistics for the remaining numeric columns.
# NOTE(review): the maxima (15 beds, 9 baths, price 25,000,000) sit far above the
# 75th percentiles — worth checking for outliers before fitting linear models.
df_train.describe()

Unnamed: 0,beds,baths,size,zip_code,price
count,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,98123.638889,963625.2
std,1.255092,1.002023,920.132591,22.650819,944095.4
min,1.0,0.5,250.0,98101.0,159000.0
25%,2.0,1.5,1068.75,98108.0,601750.0
50%,3.0,2.0,1560.0,98117.0,800000.0
75%,4.0,2.5,2222.5,98126.0,1105250.0
max,15.0,9.0,11010.0,98199.0,25000000.0


In [10]:
# Re-check the frame after the column drop.
df_train.head()

Unnamed: 0,beds,baths,size,zip_code,price
0,3,2.5,2590.0,98144,795000.0
1,4,2.0,2240.0,98106,915000.0
2,4,3.0,2040.0,98107,950000.0
3,4,3.0,3800.0,98199,1950000.0
4,2,2.0,1042.0,98102,950000.0


In [11]:
# Shape after the drop: the row count should be unchanged, with three fewer columns.
df_train.shape

(2016, 5)

In [12]:
# Training feature matrix. The predictors are selected by name rather than by
# position so the selection cannot silently change if the column order does.
# NOTE(review): zip_code is fed to the models as a plain number although it is a
# label rather than a quantity — consider encoding it as a categorical feature.
x = df_train[['beds', 'baths', 'size', 'zip_code']]

In [13]:
# Expect (n_rows, 4): one column per predictor.
x.shape

(2016, 4)

In [14]:
# Regression target: the `price` column.
y = df_train['price']

In [15]:
# Learn the per-column scaling statistics from the training features only,
# then standardise those same features with the fitted scaler.
sc = StandardScaler().fit(x)
x_scaled = sc.transform(x)

In [16]:
# Scaling must not change the shape of the feature matrix.
x_scaled.shape

(2016, 4)

Linear Regression

In [17]:
# Ordinary least-squares model with default settings.
lr = LinearRegression()

In [18]:
# Fit on the standardised training features and the price target.
lr.fit(x_scaled, y)

In [19]:
# Load the held-out test data; the path is relative to the notebook's working directory.
df_test = pd.read_csv('test.csv')

In [20]:
# Preview the test rows; the columns should match the raw training frame.
df_test.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,3.0,2850.0,sqft,4200.0,sqft,98119,1175000.0
1,4,5.0,3040.0,sqft,5002.0,sqft,98106,1057500.0
2,3,1.0,1290.0,sqft,6048.0,sqft,98125,799000.0
3,3,2.0,2360.0,sqft,0.28,acre,98188,565000.0
4,3,3.5,1942.0,sqft,1603.0,sqft,98107,1187000.0


In [21]:
# Drop the same three columns that are removed from the training frame, so the
# test frame ends up with the same feature columns.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second execution no longer raises KeyError for missing columns.
df_test = df_test.drop(
    columns=['lot_size', 'lot_size_units', 'size_units'],
    errors='ignore',
)

In [22]:
# Re-check the test frame after the column drop.
df_test.head()

Unnamed: 0,beds,baths,size,zip_code,price
0,3,3.0,2850.0,98119,1175000.0
1,4,5.0,3040.0,98106,1057500.0
2,3,1.0,1290.0,98125,799000.0
3,3,2.0,2360.0,98188,565000.0
4,3,3.5,1942.0,98107,1187000.0


In [23]:
# Test feature matrix: the first four columns (beds, baths, size, zip_code).
# Only `x_test` is bound here. `x` must keep pointing at the training features,
# otherwise re-running the scaler cell would fit the scaler on the test rows.
x_test = df_test.iloc[:, 0:4]

In [24]:
# Display the test features (pandas truncates the middle rows of long frames).
# NOTE(review): `x_test.head()` would be enough for a sanity check.
x_test

Unnamed: 0,beds,baths,size,zip_code
0,3,3.0,2850.0,98119
1,4,5.0,3040.0,98106
2,3,1.0,1290.0,98125
3,3,2.0,2360.0,98188
4,3,3.5,1942.0,98107
...,...,...,...,...
500,5,4.5,5580.0,98146
501,3,2.5,1390.0,98126
502,3,2.5,2950.0,98118
503,5,5.0,3010.0,98115


In [25]:
# Ground-truth prices for the test rows.
y_test = df_test['price']

In [26]:
# Reuse the already-fitted scaler `sc`; only transform is called, so no
# statistics are re-learned from the test data.
x_test_scaled = sc.transform(x_test)

In [27]:
# Linear-regression predictions for the test rows.
y_pred = lr.predict(x_test_scaled)

In [28]:
# r2_score returns the coefficient of determination (R²), not a classification
# accuracy. Store it under a model-specific name so that a later reassignment
# of `accuracy` (the Lasso section reuses that name) does not discard it.
r2_linear = r2_score(y_test, y_pred)
accuracy = r2_linear  # alias kept because the next cell displays `accuracy`

In [29]:
# Show the linear-regression score (an R² value, despite the variable name).
accuracy

0.5277088205815966

Lasso regression

In [30]:
# L1-regularised linear model. alpha=1.0 is scikit-learn's default, written out
# so the regularisation strength is visible.
# NOTE(review): the test R² reported below matches plain linear regression to
# six decimals, so this penalty is probably negligible at the scale of `price`
# — consider tuning alpha.
lasso = Lasso(alpha=1.0)

In [31]:
# Sanity check: the training features are unchanged since the scaling cell.
x_scaled.shape

(2016, 4)

In [32]:
# The target length must match the number of rows in x_scaled.
y.shape

(2016,)

In [33]:
# Fit the Lasso model on the same standardised training data as the linear model.
lasso.fit(x_scaled, y)

In [34]:
# Lasso predictions for the test rows.
y_pred_ls = lasso.predict(x_test_scaled)

In [35]:
# R² of the Lasso predictions. `accuracy` was already used for the
# linear-regression score, so the Lasso value is also kept under its own name;
# reassigning `accuracy` alone would leave no way to compare the models later.
r2_lasso = r2_score(y_test, y_pred_ls)
accuracy = r2_lasso  # alias kept because the next cell displays `accuracy`

In [36]:
# Show the Lasso score (an R² value, despite the variable name).
accuracy

0.5277089257355143

Ridge regression

In [37]:
# L2-regularised linear model. alpha=1.0 is scikit-learn's default, written out
# so the regularisation strength is visible.
# NOTE(review): the test R² reported below differs from plain linear regression
# only in the fifth decimal — consider tuning alpha.
rg = Ridge(alpha=1.0)

In [38]:
# Fit the ridge model on the same standardised training data as the other models.
rg.fit(x_scaled, y)

In [39]:
# Ridge predictions for the test rows.
y_pred_rg = rg.predict(x_test_scaled)

In [40]:
# R² of the ridge predictions against the test prices; the bare name on the
# last line makes the notebook display the value.
acc = r2_score(y_true=y_test, y_pred=y_pred_rg)
acc

0.5277406959137216