In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [3]:
data=pd.read_csv('train.csv')

In [4]:
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [5]:
data.shape

(2016, 8)

In [6]:
data.info

<bound method DataFrame.info of       beds  baths    size size_units  lot_size lot_size_units  zip_code  \
0        3    2.5  2590.0       sqft   6000.00           sqft     98144   
1        4    2.0  2240.0       sqft      0.31           acre     98106   
2        4    3.0  2040.0       sqft   3783.00           sqft     98107   
3        4    3.0  3800.0       sqft   5175.00           sqft     98199   
4        2    2.0  1042.0       sqft       NaN            NaN     98102   
...    ...    ...     ...        ...       ...            ...       ...   
2011     3    2.0  1370.0       sqft      0.50           acre     98112   
2012     1    1.0   889.0       sqft       NaN            NaN     98121   
2013     4    2.0  2140.0       sqft   6250.00           sqft     98199   
2014     2    2.0   795.0       sqft       NaN            NaN     98103   
2015     3    2.0  1710.0       sqft   4267.00           sqft     98133   

          price  
0      795000.0  
1      915000.0  
2      950000

In [7]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*20)

beds
3     645
2     560
4     398
1     256
5     123
6      22
9       5
7       3
8       2
15      1
14      1
Name: count, dtype: int64
********************
baths
2.0    627
1.0    493
2.5    282
3.0    198
3.5    179
1.5    137
4.0     37
4.5     21
5.0     16
5.5     13
6.0      5
7.0      4
8.5      1
0.5      1
9.0      1
6.5      1
Name: count, dtype: int64
********************
size
2080.0    12
1440.0    11
1460.0    11
1370.0    11
1670.0    11
          ..
1548.0     1
1174.0     1
1865.0     1
578.0      1
795.0      1
Name: count, Length: 879, dtype: int64
********************
size_units
sqft    2016
Name: count, dtype: int64
********************
lot_size
5000.0    61
4000.0    45
6000.0    38
1.0       26
4800.0    16
          ..
745.0      1
5043.0     1
2256.0     1
8540.0     1
4267.0     1
Name: count, Length: 959, dtype: int64
********************
lot_size_units
sqft    1449
acre     220
Name: count, dtype: int64
********************
zip_code
98115    170
98103   

In [8]:
data.isna().sum()

beds                0
baths               0
size                0
size_units          0
lot_size          347
lot_size_units    347
zip_code            0
price               0
dtype: int64

In [9]:
data.drop(columns=['lot_size','lot_size_units'],inplace=True)


In [10]:
data.describe()

Unnamed: 0,beds,baths,size,zip_code,price
count,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,98123.638889,963625.2
std,1.255092,1.002023,920.132591,22.650819,944095.4
min,1.0,0.5,250.0,98101.0,159000.0
25%,2.0,1.5,1068.75,98108.0,601750.0
50%,3.0,2.0,1560.0,98117.0,800000.0
75%,4.0,2.5,2222.5,98126.0,1105250.0
max,15.0,9.0,11010.0,98199.0,25000000.0


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   beds        2016 non-null   int64  
 1   baths       2016 non-null   float64
 2   size        2016 non-null   float64
 3   size_units  2016 non-null   object 
 4   zip_code    2016 non-null   int64  
 5   price       2016 non-null   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 94.6+ KB


In [12]:
data.head()

Unnamed: 0,beds,baths,size,size_units,zip_code,price
0,3,2.5,2590.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,98106,915000.0
2,4,3.0,2040.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,98102,950000.0


In [13]:
data['price_per_sqft']=data['price']*100000/data['size']
data['price_per_sqft']

0       3.069498e+07
1       4.084821e+07
2       4.656863e+07
3       5.131579e+07
4       9.117083e+07
            ...     
2011    6.642336e+07
2012    6.186727e+07
2013    5.373832e+07
2014    7.421384e+07
2015    3.853801e+07
Name: price_per_sqft, Length: 2016, dtype: float64

In [14]:
data.shape

(2016, 7)

In [15]:
data.describe()


Unnamed: 0,beds,baths,size,zip_code,price,price_per_sqft
count,2016.0,2016.0,2016.0,2016.0,2016.0,2016.0
mean,2.857639,2.15997,1735.740575,98123.638889,963625.2,59158510.0
std,1.255092,1.002023,920.132591,22.650819,944095.4,83279520.0
min,1.0,0.5,250.0,98101.0,159000.0,6796117.0
25%,2.0,1.5,1068.75,98108.0,601750.0,44522210.0
50%,3.0,2.0,1560.0,98117.0,800000.0,55297620.0
75%,4.0,2.5,2222.5,98126.0,1105250.0,65953890.0
max,15.0,9.0,11010.0,98199.0,25000000.0,3424658000.0


In [16]:
data.drop(columns=['size_units'],inplace=True)


In [17]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2016 non-null   int64  
 1   baths           2016 non-null   float64
 2   size            2016 non-null   float64
 3   zip_code        2016 non-null   int64  
 4   price           2016 non-null   float64
 5   price_per_sqft  2016 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 94.6 KB


In [18]:
data.shape

(2016, 6)

In [19]:
data.to_csv("final_dataset.csv")

In [20]:
X=data.drop(columns=['price'])
y=data['price']

In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=0)


In [22]:
print(X_train.shape)
print(y_train.shape)

(1612, 5)
(1612,)


In [24]:
column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['beds']), remainder='passthrough')

In [27]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [28]:
lr = LinearRegression()
lr.fit(X_scaled, y)


In [29]:
pipe = make_pipeline(column_trans,scaler, lr)

In [30]:
pipe.fit(X_train,y_train)



In [31]:
y_pred_lr = pipe.predict(X_test)

In [32]:
r2_score(y_test,y_pred_lr)

0.8093787330643996

In [33]:
ridge = Ridge()
pipe = make_pipeline(column_trans,scaler, ridge)

In [34]:
pipe.fit(X_train,y_train)



In [35]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

0.8099004227947176

In [36]:
import pickle
pickle.dump(pipe, open('RidgeModel.pkl','wb'))