In [83]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import pickle

In [84]:
# Read the CSV File
data = pd.read_csv('train.csv')

In [85]:
print(data.head())
print(data.shape)
print(data.info())

for column in data.columns:
    print(data[column].value_counts())
    print("*" * 20)

print(data.isna().sum())

   beds  baths    size size_units  lot_size lot_size_units  zip_code  \
0     3    2.5  2590.0       sqft   6000.00           sqft     98144   
1     4    2.0  2240.0       sqft      0.31           acre     98106   
2     4    3.0  2040.0       sqft   3783.00           sqft     98107   
3     4    3.0  3800.0       sqft   5175.00           sqft     98199   
4     2    2.0  1042.0       sqft       NaN            NaN     98102   

       price  
0   795000.0  
1   915000.0  
2   950000.0  
3  1950000.0  
4   950000.0  
(2016, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2016 non-null   int64  
 1   baths           2016 non-null   float64
 2   size            2016 non-null   float64
 3   size_units      2016 non-null   object 
 4   lot_size        1669 non-null   float64
 5   lot_size_units  1669 non-null   object 
 

In [86]:
data.drop(columns=['lot_size', 'lot_size_units'], inplace=True)
print(data.describe())
print(data.info())

              beds        baths          size      zip_code         price
count  2016.000000  2016.000000   2016.000000   2016.000000  2.016000e+03
mean      2.857639     2.159970   1735.740575  98123.638889  9.636252e+05
std       1.255092     1.002023    920.132591     22.650819  9.440954e+05
min       1.000000     0.500000    250.000000  98101.000000  1.590000e+05
25%       2.000000     1.500000   1068.750000  98108.000000  6.017500e+05
50%       3.000000     2.000000   1560.000000  98117.000000  8.000000e+05
75%       4.000000     2.500000   2222.500000  98126.000000  1.105250e+06
max      15.000000     9.000000  11010.000000  98199.000000  2.500000e+07
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   beds        2016 non-null   int64  
 1   baths       2016 non-null   float64
 2   size        2016 non-null   float64
 3   size_units  2016 n

In [87]:
data['price_per_sqft'] = data['price'] * 100000 / data['size']
print(data.describe())

              beds        baths          size      zip_code         price  \
count  2016.000000  2016.000000   2016.000000   2016.000000  2.016000e+03   
mean      2.857639     2.159970   1735.740575  98123.638889  9.636252e+05   
std       1.255092     1.002023    920.132591     22.650819  9.440954e+05   
min       1.000000     0.500000    250.000000  98101.000000  1.590000e+05   
25%       2.000000     1.500000   1068.750000  98108.000000  6.017500e+05   
50%       3.000000     2.000000   1560.000000  98117.000000  8.000000e+05   
75%       4.000000     2.500000   2222.500000  98126.000000  1.105250e+06   
max      15.000000     9.000000  11010.000000  98199.000000  2.500000e+07   

       price_per_sqft  
count    2.016000e+03  
mean     5.915851e+07  
std      8.327952e+07  
min      6.796117e+06  
25%      4.452221e+07  
50%      5.529762e+07  
75%      6.595389e+07  
max      3.424658e+09  


In [88]:
data.drop(columns=['size_units', 'price_per_sqft'], inplace=True)
print(data.head())

   beds  baths    size  zip_code      price
0     3    2.5  2590.0     98144   795000.0
1     4    2.0  2240.0     98106   915000.0
2     4    3.0  2040.0     98107   950000.0
3     4    3.0  3800.0     98199  1950000.0
4     2    2.0  1042.0     98102   950000.0


In [89]:
data.to_csv("final_dataset.csv", index=False)

In [90]:
X = data.drop(columns=['price'])
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape)
print(y_train.shape)

(1612, 4)
(1612,)


In [91]:
column_trans = make_column_transformer((OneHotEncoder(), ['beds']), remainder='passthrough')
scaler = StandardScaler(with_mean=False)  # Set with_mean=False to handle sparse matrices
lr = LinearRegression()

pipe_lr = make_pipeline(column_trans, scaler, lr)
pipe_lr.fit(X_train, y_train)
y_pred_lr = pipe_lr.predict(X_test)


In [92]:
print("No Regularization: ", r2_score(y_test, y_pred_lr))

No Regularization:  0.5746822864697891


In [93]:
pickle.dump(pipe_lr, open('LinearRegressionModel.pkl', 'wb'))