In [21]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

In [22]:
# Directory that holds the interim CSV files. The default is the location the
# notebook was originally run from; set the HOUSING_DATA_DIR environment
# variable to point at the data on another machine instead of editing the path.
DATA_DIR = Path(os.environ.get('HOUSING_DATA_DIR', '/Users/saksham/Projects/Housing-Price/data/interim'))

# Load the Connecticut listings into a DataFrame.
data_conneticut = pd.read_csv(DATA_DIR / 'Connecticut.csv')

In [23]:
# Preview the first five rows of the freshly loaded frame.
data_conneticut.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,1.0,3.93,Willington,Connecticut,6279.0,1572.0,225000.0
1,for_sale,4.0,3.0,2.34,Coventry,Connecticut,6238.0,3320.0,579900.0
2,for_sale,1.0,1.0,32.156834,Willington,Connecticut,6279.0,680.0,65000.0
3,for_sale,2.0,1.0,0.91,East Windsor,Connecticut,6016.0,960.0,215000.0
4,for_sale,2.0,1.0,0.36,Vernon,Connecticut,6066.0,860.0,144900.0


In [44]:
# Rescale the numeric columns in place: z-score standardisation for house_size
# and price, min-max scaling for bath, bed and acre_lot.
# Each fitted scaler is kept in `scalers` so a transform can be undone later,
# e.g. scalers['price'].inverse_transform(...) maps standardised prices (and
# predictions of them) back to the original price units.
# Run this cell once per freshly loaded frame: re-running it refits the scalers
# on the already-scaled columns and loses the original statistics.
# NOTE(review): every scaler is fitted on all rows, including the rows that the
# later train_test_split holds out for testing, so test-set statistics leak
# into the features and the target. Fitting on the training rows only would
# avoid that.
scalers = {
    'house_size': StandardScaler(),
    'price': StandardScaler(),
    'bath': MinMaxScaler(),
    'bed': MinMaxScaler(),
    'acre_lot': MinMaxScaler(),
}
for column, scaler in scalers.items():
    # The scalers expect 2-D (n_samples, n_features) input, hence the reshape;
    # ravel() turns the single-column result back into a flat column.
    column_values = data_conneticut[column].to_numpy().reshape(-1, 1)
    data_conneticut[column] = scaler.fit_transform(column_values).ravel()

In [45]:
# Display the frame after the rescaling step to inspect the transformed columns.
data_conneticut

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.090909,0.000000,0.000039,Willington,Connecticut,6279.0,-0.348938,-0.206817
1,for_sale,0.136364,0.083333,0.000023,Coventry,Connecticut,6238.0,0.632040,-0.010419
2,for_sale,0.000000,0.000000,0.000321,Willington,Connecticut,6279.0,-0.849529,-0.295360
3,for_sale,0.045455,0.000000,0.000009,East Windsor,Connecticut,6016.0,-0.692393,-0.212351
4,for_sale,0.045455,0.000000,0.000004,Vernon,Connecticut,6066.0,-0.748513,-0.251144
...,...,...,...,...,...,...,...,...,...
98807,for_sale,0.090909,0.083333,0.000010,New Milford,Connecticut,6755.0,-0.507758,-0.084573
98808,for_sale,0.090909,0.083333,0.000038,North Canaan,Connecticut,6018.0,-0.332664,-0.096139
98809,for_sale,0.090909,0.061979,0.000026,Sherman,Connecticut,6784.0,-0.008438,-0.173061
98810,for_sale,0.136364,0.125000,0.000012,Salisbury,Connecticut,6039.0,0.546177,0.440651


In [46]:
# Feature matrix: the numeric columns are used as they are, and the two text
# columns are one-hot encoded (one indicator column per distinct value).
# NOTE(review): the displayed frame shows a single state_Connecticut indicator,
# i.e. `state` looks constant and adds no information; zip_code is fed in as a
# plain number although it is an identifier - confirm both are intended.
numeric_features = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size']
categorical_features = ['city', 'state']
X = pd.get_dummies(
    data_conneticut[numeric_features + categorical_features],
    columns=categorical_features,
)

# Regression target: the price column of the same frame.
y = data_conneticut['price']

In [47]:
# Display the target series.
y

0       -0.206817
1       -0.010419
2       -0.295360
3       -0.212351
4       -0.251144
           ...   
98807   -0.084573
98808   -0.096139
98809   -0.173061
98810    0.440651
98811    0.346574
Name: price, Length: 98812, dtype: float64

In [48]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,city_Andover,city_Ansonia,city_Ashford,city_Avon,city_Barkhamsted,...,city_Wilton,city_Winchester,city_Windham,city_Windsor,city_Windsor Locks,city_Wolcott,city_Woodbridge,city_Woodbury,city_Woodstock,state_Connecticut
0,0.090909,0.0,3.9e-05,6279.0,-0.348938,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,0.136364,0.083333,2.3e-05,6238.0,0.63204,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,0.0,0.0,0.000321,6279.0,-0.849529,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,0.045455,0.0,9e-06,6016.0,-0.692393,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,0.045455,0.0,4e-06,6066.0,-0.748513,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [49]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the shuffle
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [50]:
# Linear-regression model, fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [51]:
# Predictions of the linear model for the held-out rows.
y_pred = model.predict(X=X_test)

In [52]:
# Score the linear model on the held-out split. The target was standardised
# in the scaling cell, so the error metrics are in units of the price's
# standard deviation, not in currency.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error

In [53]:
# Report the linear model's test-set scores, one per line.
print(
    f'MSE: {mse}',
    f'RMSE: {rmse}',
    f'MAE: {mae}',
    f'R2: {r2}',
    sep='\n',
)


MSE: 0.5931886552666533
RMSE: 0.7701874156766347
MAE: 0.22994530157194465
R2: 0.37294523082910214


In [54]:
# Fit an unconstrained regression tree on the training split and predict the
# held-out rows.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so without a seed equally good splits can be picked
# differently from run to run. 42 matches the seed used for train_test_split.
# NOTE(review): the test R2 recorded for this tree is ~0.996 versus ~0.37 for
# the linear model; before trusting it, check whether duplicated listings end
# up in both the train and test splits (e.g. data_conneticut.duplicated().sum()).
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)


In [55]:
# Score the decision tree on the held-out split, using the same four metrics
# as for the linear model.
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)
mae_tree = mean_absolute_error(y_true=y_test, y_pred=y_pred_tree)
mse_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
rmse_tree = np.sqrt(mse_tree)  # root of the mean squared error

In [56]:
# Report the decision tree's test-set scores, one per line.
print(
    f'MSE: {mse_tree}',
    f'RMSE: {rmse_tree}',
    f'MAE: {mae_tree}',
    f'R2: {r2_tree}',
    sep='\n',
)

MSE: 0.003666518783118711
RMSE: 0.06055178596142901
MAE: 0.004348629791769399
R2: 0.9961241536418531


In [58]:
# Spot-check a single tree prediction (position 89 of the test-set predictions).
y_pred_tree[89]

-0.1902156502089453