In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import joblib

In [2]:
# Load the interim Connecticut CSV.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_conneticut = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/Connecticut.csv',
)

In [3]:
# Preview the first five raw rows (head's default count, written out explicitly).
data_conneticut.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,1.0,3.93,Willington,Connecticut,6279.0,1572.0,225000.0
1,for_sale,4.0,3.0,2.34,Coventry,Connecticut,6238.0,3320.0,579900.0
2,for_sale,1.0,1.0,32.156834,Willington,Connecticut,6279.0,680.0,65000.0
3,for_sale,2.0,1.0,0.91,East Windsor,Connecticut,6016.0,960.0,215000.0
4,for_sale,2.0,1.0,0.36,Vernon,Connecticut,6066.0,860.0,144900.0


In [4]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_conn on already-scaled prices.
size_col = data_conneticut['house_size'].to_numpy().reshape(-1, 1)
data_conneticut['house_size'] = StandardScaler().fit_transform(size_col)

price_col = data_conneticut['price'].to_numpy().reshape(-1, 1)
price_scaler_conn = StandardScaler().fit(price_col)
data_conneticut['price'] = price_scaler_conn.transform(price_col)

for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_conneticut[feature].to_numpy().reshape(-1, 1)
    data_conneticut[feature] = MinMaxScaler().fit_transform(feature_col)

In [5]:
# Show the frame after scaling (pandas abbreviates the middle rows in the rendered output).
data_conneticut

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.090909,0.000000,0.000039,Willington,Connecticut,6279.0,-0.348938,-0.206817
1,for_sale,0.136364,0.083333,0.000023,Coventry,Connecticut,6238.0,0.632040,-0.010419
2,for_sale,0.000000,0.000000,0.000321,Willington,Connecticut,6279.0,-0.849529,-0.295360
3,for_sale,0.045455,0.000000,0.000009,East Windsor,Connecticut,6016.0,-0.692393,-0.212351
4,for_sale,0.045455,0.000000,0.000004,Vernon,Connecticut,6066.0,-0.748513,-0.251144
...,...,...,...,...,...,...,...,...,...
98807,for_sale,0.090909,0.083333,0.000010,New Milford,Connecticut,6755.0,-0.507758,-0.084573
98808,for_sale,0.090909,0.083333,0.000038,North Canaan,Connecticut,6018.0,-0.332664,-0.096139
98809,for_sale,0.090909,0.061979,0.000026,Sherman,Connecticut,6784.0,-0.008438,-0.173061
98810,for_sale,0.136364,0.125000,0.000012,Salisbury,Connecticut,6039.0,0.546177,0.440651


In [6]:
# Model inputs: the non-categorical columns pass through unchanged, while
# city and state are one-hot encoded. The target is the scaled price column.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_conneticut[feature_columns], columns=['city', 'state'])

y = data_conneticut['price']

In [7]:
# Inspect the target Series (the scaled price column).
y

0       -0.206817
1       -0.010419
2       -0.295360
3       -0.212351
4       -0.251144
           ...   
98807   -0.084573
98808   -0.096139
98809   -0.173061
98810    0.440651
98811    0.346574
Name: price, Length: 98812, dtype: float64

In [8]:
# Preview the encoded feature matrix (first five rows).
X.head(n=5)

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,city_Andover,city_Ansonia,city_Ashford,city_Avon,city_Barkhamsted,...,city_Wilton,city_Winchester,city_Windham,city_Windsor,city_Windsor Locks,city_Wolcott,city_Woodbridge,city_Woodbury,city_Woodstock,state_Connecticut
0,0.090909,0.0,3.9e-05,6279.0,-0.348938,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,0.136364,0.083333,2.3e-05,6238.0,0.63204,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,0.0,0.0,0.000321,6279.0,-0.849529,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,0.045455,0.0,9e-06,6016.0,-0.692393,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,0.045455,0.0,4e-06,6066.0,-0.748513,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [9]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Linear baseline fitted on the training rows.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [11]:
# Linear-model predictions for the hold-out rows (still on the scaled price axis).
y_pred = model.predict(X=X_test)

In [12]:
# Hold-out errors for the linear model, measured on the scaled price axis.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [13]:
# Report the linear-model scores, one per line.
for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}: {score}')


MSE: 0.5931886801965697
RMSE: 0.7701874318609527
MAE: 0.2299454205561855
R2: 0.37294520447589585


In [14]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# recorded scores and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
# Predictions for the hold-out rows, on the scaled price axis.
y_pred_tree = model_tree.predict(X_test)


In [15]:
# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
rmse_tree, mae_tree = np.sqrt(mse_tree), mean_absolute_error(y_true=y_test, y_pred=y_pred_tree)
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)

In [16]:
# Report the tree scores, one per line.
for label, score in (('MSE', mse_tree), ('RMSE', rmse_tree), ('MAE', mae_tree), ('R2', r2_tree)):
    print(f'{label}: {score}')

MSE: 0.003624857330025026
RMSE: 0.06020678807265029
MAE: 0.004359750737144872
R2: 0.9961681936156811


In [17]:
# Map the tree's predictions back from scaled units to the original price axis.
y_pred_tree_reshaped = np.reshape(y_pred_tree, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_conn.inverse_transform(y_pred_tree_reshaped)
# NOTE(review): taking the absolute value would hide any negative prediction rather than expose it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[89]  # spot-check a single hold-out row

array([255000.])

In [18]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/connecticut_dt.joblib')
joblib.dump(value=model, filename='../models/primary/connecticut_lr.joblib')

['../models/primary/connecticut_lr.joblib']

In [19]:
# Load the interim Delaware CSV.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_delaware = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/Delaware.csv',
)

In [20]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_del on already-scaled prices.
size_col = data_delaware['house_size'].to_numpy().reshape(-1, 1)
data_delaware['house_size'] = StandardScaler().fit_transform(size_col)

price_col = data_delaware['price'].to_numpy().reshape(-1, 1)
price_scaler_del = StandardScaler().fit(price_col)
data_delaware['price'] = price_scaler_del.transform(price_col)

for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_delaware[feature].to_numpy().reshape(-1, 1)
    data_delaware[feature] = MinMaxScaler().fit_transform(feature_col)

In [21]:
# Preview the first five rows after scaling.
data_delaware.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.133333,0.0,0.000143,New Castle,Delaware,19720.0,-0.71278,-1.002163
1,for_sale,0.2,0.111111,0.000229,New Castle,Delaware,19720.0,0.043122,-0.527026
2,for_sale,0.266667,0.165279,5.7e-05,Wilmington,Delaware,19805.0,0.140811,-0.544799
3,for_sale,0.333333,0.111111,8.6e-05,Wilmington,Delaware,19801.0,0.472763,-0.943768
4,for_sale,0.133333,0.165279,2.9e-05,Wilmington,Delaware,19802.0,0.144354,-0.998173


In [22]:
# Model inputs: the non-categorical columns pass through unchanged, while
# city and state are one-hot encoded. The target is the scaled price column.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_delaware[feature_columns], columns=['city', 'state'])
y = data_delaware['price']

In [23]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [24]:
# Linear baseline fitted on the training rows.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [25]:
# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_del.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[89]  # spot-check a single hold-out row

array([300593.96664187])

In [26]:
# Hold-out errors for the linear model, measured on the scaled price axis.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [27]:
# Report the linear-model scores, one per line.
for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}: {score}')

MSE: 0.5001263651387725
RMSE: 0.7071961291881995
MAE: 0.3765443499369424
R2: 0.5741006940105253


In [28]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# recorded scores and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
# Predictions for the hold-out rows, on the scaled price axis.
y_pred_tree = model_tree.predict(X_test)


In [29]:
# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
rmse_tree, mae_tree = np.sqrt(mse_tree), mean_absolute_error(y_true=y_test, y_pred=y_pred_tree)
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)

In [30]:
# Report the tree scores, one per line.
for label, score in (('MSE', mse_tree), ('RMSE', rmse_tree), ('MAE', mae_tree), ('R2', r2_tree)):
    print(f'{label}: {score}')

MSE: 0.3429934670908027
RMSE: 0.5856564411758849
MAE: 0.19138662311508634
R2: 0.7079124601792133


In [31]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/delaware_dt.joblib')
joblib.dump(value=model, filename='../models/primary/delaware_lr.joblib')

['../models/primary/delaware_lr.joblib']

In [32]:
# Load the interim Maine CSV.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_maine = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/Maine.csv',
)

In [33]:
# Preview the first five raw rows.
data_maine.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,4.0,3.0,2.0,Eliot,Maine,3903.0,2628.0,750000.0
1,for_sale,3.0,4.0,2.97,Eliot,Maine,3903.0,2600.0,549000.0
2,for_sale,5.0,5.0,0.95,Eliot,Maine,3903.0,3116.0,850000.0
3,for_sale,3.0,2.0,1.23,Eliot,Maine,3903.0,1116.0,469500.0
4,for_sale,3.0,3.0,2.01,Saco,Maine,4027.0,2080.0,599990.0


In [34]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_mai on already-scaled prices.
size_col = data_maine['house_size'].to_numpy().reshape(-1, 1)
data_maine['house_size'] = StandardScaler().fit_transform(size_col)

price_col = data_maine['price'].to_numpy().reshape(-1, 1)
price_scaler_mai = StandardScaler().fit(price_col)
data_maine['price'] = price_scaler_mai.transform(price_col)

for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_maine[feature].to_numpy().reshape(-1, 1)
    data_maine[feature] = MinMaxScaler().fit_transform(feature_col)

In [35]:
# Preview the first five rows after scaling.
data_maine.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.103448,0.105263,0.000145,Eliot,Maine,3903.0,0.283693,0.501779
1,for_sale,0.068966,0.157895,0.000216,Eliot,Maine,3903.0,0.266974,0.196141
2,for_sale,0.137931,0.210526,6.9e-05,Eliot,Maine,3903.0,0.575076,0.653838
3,for_sale,0.068966,0.052632,8.9e-05,Eliot,Maine,3903.0,-0.619115,0.075254
4,for_sale,0.068966,0.105263,0.000146,Saco,Maine,4027.0,-0.043515,0.273675


In [36]:
# Model inputs: the non-categorical columns pass through unchanged, while
# city and state are one-hot encoded. The target is the scaled price column.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_maine[feature_columns], columns=['city', 'state'])
y = data_maine['price']

In [37]:
# 75/25 hold-out split with a fixed seed, then fit the linear baseline.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [38]:
# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_mai.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[89]  # spot-check a single hold-out row

array([831124.29282096])

In [39]:
# Hold-out errors for the linear model, measured on the scaled price axis.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [40]:
# Report the linear-model scores, one per line.
for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}:  {score}')

MSE:  0.5406832804480621
RMSE:  0.7353116893182524
MAE:  0.37506047731475084
R2:  0.46492488726690884


In [41]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# recorded scores and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
# Predictions for the hold-out rows, on the scaled price axis.
y_pred_tree = model_tree.predict(X_test)


In [42]:
# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
rmse_tree, mae_tree = np.sqrt(mse_tree), mean_absolute_error(y_true=y_test, y_pred=y_pred_tree)
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)

In [43]:
# Report the tree scores, one per line.
for label, score in (('MSE', mse_tree), ('RMSE', rmse_tree), ('MAE', mae_tree), ('R2', r2_tree)):
    print(f'{label}:  {score}')

MSE:  0.00846854627306582
RMSE:  0.09202470468882701
MAE:  0.01272680979484548
R2:  0.9916192926328496


In [44]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/maine_dt.joblib')
joblib.dump(value=model, filename='../models/primary/maine_lr.joblib')

['../models/primary/maine_lr.joblib']

In [45]:
# Load the interim Massachusetts CSV.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_massachusetts = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/Massachusetts.csv',
)

In [46]:
# Preview the first five raw rows.
data_massachusetts.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,2.0,1.0,0.34,Agawam,Massachusetts,1001.0,676.0,180000.0
1,for_sale,3.0,2.487507,3.41,Agawam,Massachusetts,1001.0,2178.735694,25000.0
2,for_sale,2.0,2.0,32.156834,Agawam,Massachusetts,1001.0,892.0,169900.0
3,for_sale,2.0,2.0,32.156834,Agawam,Massachusetts,1001.0,1428.0,242000.0
4,for_sale,2.0,2.0,32.156834,Agawam,Massachusetts,1001.0,1659.0,299950.0


In [47]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_mas on already-scaled prices.
size_col = data_massachusetts['house_size'].to_numpy().reshape(-1, 1)
data_massachusetts['house_size'] = StandardScaler().fit_transform(size_col)

price_col = data_massachusetts['price'].to_numpy().reshape(-1, 1)
price_scaler_mas = StandardScaler().fit(price_col)
data_massachusetts['price'] = price_scaler_mas.transform(price_col)
for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_massachusetts[feature].to_numpy().reshape(-1, 1)
    data_massachusetts[feature] = MinMaxScaler().fit_transform(feature_col)

In [48]:
# Preview the first five rows after scaling.
data_massachusetts.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.010204,0.0,3.8e-05,Agawam,Massachusetts,1001.0,-0.667431,-0.545145
1,for_sale,0.020408,0.007551,0.000377,Agawam,Massachusetts,1001.0,-0.038161,-0.652
2,for_sale,0.010204,0.005076,0.003559,Agawam,Massachusetts,1001.0,-0.576981,-0.552108
3,for_sale,0.010204,0.005076,0.003559,Agawam,Massachusetts,1001.0,-0.352531,-0.502403
4,for_sale,0.010204,0.005076,0.003559,Agawam,Massachusetts,1001.0,-0.2558,-0.462453


In [49]:
# Model inputs: the non-categorical columns pass through unchanged, while
# city and state are one-hot encoded. The target is the scaled price column.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_massachusetts[feature_columns], columns=['city', 'state'])
y = data_massachusetts['price']

In [50]:
# 75/25 hold-out split with a fixed seed, then fit the linear baseline.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [51]:
# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_mas.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[89]  # spot-check a single hold-out row

array([857782.17700072])

In [52]:
# Hold-out errors for the linear model, measured on the scaled price axis.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [53]:
# Report the linear-model scores, one per line.
for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}:  {score}')

MSE:  0.5593626317556111
RMSE:  0.7479054965405797
MAE:  0.3187772905012709
R2:  0.45205076266780475


In [54]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# recorded scores and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
# Predictions for the hold-out rows, on the scaled price axis.
y_pred_tree = model_tree.predict(X_test)

In [55]:
# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
rmse_tree, mae_tree = np.sqrt(mse_tree), mean_absolute_error(y_true=y_test, y_pred=y_pred_tree)
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)

for label, score in (('MSE', mse_tree), ('RMSE', rmse_tree), ('MAE', mae_tree), ('R2', r2_tree)):
    print(f'{label}:  {score}')

MSE:  0.007542238705239226
RMSE:  0.08684606326851682
MAE:  0.0030208105098336125
R2:  0.9926116552810434


In [57]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/massachusetts_dt.joblib')
joblib.dump(value=model, filename='../models/primary/massachusetts_lr.joblib')

['../models/primary/massachusetts_lr.joblib']

In [58]:
# Load the interim New Hampshire CSV.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_NewHampshire = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/New_Hampshire.csv',
)

In [59]:
# Preview the first five raw rows.
data_NewHampshire.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,2.487507,16.5,Fitzwilliam,New Hampshire,3447.0,2178.735694,199900.0
1,for_sale,3.0,2.487507,12.69,Fitzwilliam,New Hampshire,3447.0,2178.735694,150000.0
2,for_sale,3.0,2.487507,13.93,Fitzwilliam,New Hampshire,3447.0,2178.735694,150000.0
3,for_sale,3.0,2.487507,12.17,Fitzwilliam,New Hampshire,3447.0,2178.735694,131000.0
4,for_sale,3.0,2.487507,20.5,Richmond,New Hampshire,3470.0,2178.735694,99900.0


In [60]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_NewHampshire on already-scaled prices.
size_col = data_NewHampshire['house_size'].to_numpy().reshape(-1, 1)
data_NewHampshire['house_size'] = StandardScaler().fit_transform(size_col)
price_col = data_NewHampshire['price'].to_numpy().reshape(-1, 1)
price_scaler_NewHampshire = StandardScaler().fit(price_col)
data_NewHampshire['price'] = price_scaler_NewHampshire.transform(price_col)
for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_NewHampshire[feature].to_numpy().reshape(-1, 1)
    data_NewHampshire[feature] = MinMaxScaler().fit_transform(feature_col)

In [61]:
# Preview the first five rows after scaling.
data_NewHampshire.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.133333,0.053125,0.027484,Fitzwilliam,New Hampshire,3447.0,-0.038605,-0.372857
1,for_sale,0.133333,0.053125,0.021134,Fitzwilliam,New Hampshire,3447.0,-0.038605,-0.4391
2,for_sale,0.133333,0.053125,0.0232,Fitzwilliam,New Hampshire,3447.0,-0.038605,-0.4391
3,for_sale,0.133333,0.053125,0.020267,Fitzwilliam,New Hampshire,3447.0,-0.038605,-0.464323
4,for_sale,0.133333,0.053125,0.034151,Richmond,New Hampshire,3470.0,-0.038605,-0.505608


In [62]:
# Features: non-categorical columns plus one-hot city/state; target: scaled price.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_NewHampshire[feature_columns], columns=['city', 'state'])
y = data_NewHampshire['price']

# 75/25 hold-out split with a fixed seed, then fit the linear baseline.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_NewHampshire.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[60]  # spot-check a single hold-out row

array([370801.83748778])

In [63]:
# Hold-out errors for the linear model, measured on the scaled price axis.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}:  {score}')

MSE:  0.4915712155163244
RMSE:  0.7011213985582841
MAE:  0.34476279889725914
R2:  0.4653349146290511


In [64]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# scores printed below and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.0027408512353859315
RMSE:  0.0523531396898594
MAE:  0.0029686574486224105
R2:  0.9970188704840712


In [65]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/newhampshire_dt.joblib')
joblib.dump(value=model, filename='../models/primary/newhampshire_lr.joblib')

['../models/primary/newhampshire_lr.joblib']

In [66]:
# Load the interim New Jersey CSV and preview its first five rows.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_NewJersey = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/New_Jersey.csv',
)
data_NewJersey.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,3.0,0.07,Burlington,New Jersey,8016.0,1500.0,333490.0
1,for_sale,3.0,3.0,0.07,Burlington,New Jersey,8016.0,1500.0,333490.0
2,for_sale,3.0,3.0,0.07,Burlington,New Jersey,8016.0,1500.0,333490.0
3,for_sale,3.0,3.0,0.07,Burlington,New Jersey,8016.0,1500.0,333490.0
4,for_sale,3.0,3.0,0.07,Burlington,New Jersey,8016.0,1500.0,333490.0


In [67]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_NewJersey on already-scaled prices.
size_col = data_NewJersey['house_size'].to_numpy().reshape(-1, 1)
data_NewJersey['house_size'] = StandardScaler().fit_transform(size_col)
price_col = data_NewJersey['price'].to_numpy().reshape(-1, 1)
price_scaler_NewJersey = StandardScaler().fit(price_col)
data_NewJersey['price'] = price_scaler_NewJersey.transform(price_col)
for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_NewJersey[feature].to_numpy().reshape(-1, 1)
    data_NewJersey[feature] = MinMaxScaler().fit_transform(feature_col)
data_NewJersey.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.043478,0.052632,7e-07,Burlington,New Jersey,8016.0,-0.303212,-0.356667
1,for_sale,0.043478,0.052632,7e-07,Burlington,New Jersey,8016.0,-0.303212,-0.356667
2,for_sale,0.043478,0.052632,7e-07,Burlington,New Jersey,8016.0,-0.303212,-0.356667
3,for_sale,0.043478,0.052632,7e-07,Burlington,New Jersey,8016.0,-0.303212,-0.356667
4,for_sale,0.043478,0.052632,7e-07,Burlington,New Jersey,8016.0,-0.303212,-0.356667


In [68]:
# Features: non-categorical columns plus one-hot city/state; target: scaled price.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_NewJersey[feature_columns], columns=['city', 'state'])
y = data_NewJersey['price']

# 75/25 hold-out split with a fixed seed, then fit the linear baseline.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_NewJersey.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[71]  # spot-check a single hold-out row


array([615995.07912463])

In [69]:
# Hold-out errors for the linear model, measured on the scaled price axis.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}:  {score}')

MSE:  0.39898217122669766
RMSE:  0.6316503552019089
MAE:  0.29270076559641967
R2:  0.5643052663022174


In [70]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# scores printed below and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.034119212335853924
RMSE:  0.18471386611690505
MAE:  0.029917910851837645
R2:  0.962741289699881


In [71]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/newjersey_dt.joblib')
joblib.dump(value=model, filename='../models/primary/newjersey_lr.joblib')

['../models/primary/newjersey_lr.joblib']

In [109]:
# Load the interim New York CSV and preview its first five rows.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_NewYork = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/New_York.csv',
)
data_NewYork.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,1.0,60.0,Berlin,New York,12022.0,1176.0,175000.0
1,for_sale,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,425000.0
2,for_sale,4.0,2.0,0.24,Copake,New York,12521.0,1239.0,225000.0
3,for_sale,3.0,3.0,1.9,Copake,New York,12516.0,1800.0,419000.0
4,for_sale,3.0,2.0,2.0,Copake,New York,12517.0,1482.0,365000.0


In [110]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_NewYork on already-scaled prices.
size_col = data_NewYork['house_size'].to_numpy().reshape(-1, 1)
data_NewYork['house_size'] = StandardScaler().fit_transform(size_col)
price_col = data_NewYork['price'].to_numpy().reshape(-1, 1)
price_scaler_NewYork = StandardScaler().fit(price_col)
data_NewYork['price'] = price_scaler_NewYork.transform(price_col)
for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_NewYork[feature].to_numpy().reshape(-1, 1)
    data_NewYork[feature] = MinMaxScaler().fit_transform(feature_col)


In [111]:
# Features: non-categorical columns plus one-hot city/state; target: scaled price.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_NewYork[feature_columns], columns=['city', 'state'])
y = data_NewYork['price']

# 75/25 hold-out split with a fixed seed, then fit the linear baseline.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_NewYork.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[89]  # spot-check a single hold-out row

array([134773.90504416])

In [112]:
# Hold-out errors for the linear model, measured on the scaled price axis.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}:  {score}')

MSE:  0.2478637724410383
RMSE:  0.4978591893708886
MAE:  0.19407176876044474
R2:  0.3711574299789214


In [113]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# scores printed below and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.024721734575064457
RMSE:  0.15723146814510275
MAE:  0.012466552400701592
R2:  0.9372797446256065


In [114]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/newyork_dt.joblib')
joblib.dump(value=model, filename='../models/primary/newyork_lr.joblib')

['../models/primary/newyork_lr.joblib']

In [77]:
# Load the interim Pennsylvania CSV and preview its first five rows.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_Pennsylvania = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/Pennsylvania.csv',
)
data_Pennsylvania.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,5.0,5.0,0.92,Newtown Square,Pennsylvania,19073.0,4953.0,1470000.0
1,for_sale,5.0,5.0,0.92,Newtown Square,Pennsylvania,19073.0,4953.0,1445000.0
2,for_sale,5.0,5.0,32.156834,Newtown Square,Pennsylvania,19073.0,2178.735694,1445000.0
3,for_sale,5.0,5.0,0.92,Newtown Square,Pennsylvania,19073.0,4953.0,1420000.0
4,for_sale,5.0,5.0,0.92,Newtown Square,Pennsylvania,19073.0,4953.0,1445000.0


In [78]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_Pennsylvania on already-scaled prices.
size_col = data_Pennsylvania['house_size'].to_numpy().reshape(-1, 1)
data_Pennsylvania['house_size'] = StandardScaler().fit_transform(size_col)
price_col = data_Pennsylvania['price'].to_numpy().reshape(-1, 1)
price_scaler_Pennsylvania = StandardScaler().fit(price_col)
data_Pennsylvania['price'] = price_scaler_Pennsylvania.transform(price_col)
for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_Pennsylvania[feature].to_numpy().reshape(-1, 1)
    data_Pennsylvania[feature] = MinMaxScaler().fit_transform(feature_col)

In [79]:
# Features: non-categorical columns plus one-hot city/state; target: scaled price.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_Pennsylvania[feature_columns], columns=['city', 'state'])
y = data_Pennsylvania['price']

# 75/25 hold-out split with a fixed seed, then fit the linear baseline.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
model = LinearRegression()

model.fit(X_train, y_train)

# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_Pennsylvania.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[680]  # spot-check a single hold-out row

array([1228640.11109349])

In [80]:
# Hold-out errors for the linear model, measured on the scaled price axis.
# NOTE(review): the recorded scores for this fit are wildly off (huge MSE,
# enormously negative R2) — something is wrong with these predictions;
# investigate before relying on this model.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}:  {score}')

MSE:  98211572501914.23
RMSE:  9910175.200364232
MAE:  369840.4316585278
R2:  -133296773768215.11


In [81]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# scores printed below and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.16518868000233652
RMSE:  0.4064341029027172
MAE:  0.06735857535885445
R2:  0.775799149261025


In [82]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
# NOTE(review): the linear model's recorded hold-out scores in this notebook look broken.
joblib.dump(value=model_tree, filename='../models/primary/pennsylvania_dt.joblib')
joblib.dump(value=model, filename='../models/primary/pennsylvania_lr.joblib')

['../models/primary/pennsylvania_lr.joblib']

In [83]:
# Load the interim Puerto Rico CSV and preview its first five rows.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_PuertoRico = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/Puerto_Rico.csv',
)
data_PuertoRico.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,2178.735694,65000.0


In [84]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_PuertoRico on already-scaled prices.
size_col = data_PuertoRico['house_size'].to_numpy().reshape(-1, 1)
data_PuertoRico['house_size'] = StandardScaler().fit_transform(size_col)
price_col = data_PuertoRico['price'].to_numpy().reshape(-1, 1)
price_scaler_PuertoRico = StandardScaler().fit(price_col)
data_PuertoRico['price'] = price_scaler_PuertoRico.transform(price_col)
for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_PuertoRico[feature].to_numpy().reshape(-1, 1)
    data_PuertoRico[feature] = MinMaxScaler().fit_transform(feature_col)

In [85]:
# Features: non-categorical columns plus one-hot city/state; target: scaled price.
feature_columns = ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']
X = pd.get_dummies(data_PuertoRico[feature_columns], columns=['city', 'state'])
y = data_PuertoRico['price']

# 75/25 hold-out split with a fixed seed, then fit the linear baseline.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the hold-out rows, then undo the price scaling.
y_pred = model.predict(X_test)
y_pred_reshaped = np.reshape(y_pred, (-1, 1))  # the scaler expects a 2-D column
original_price_predictions = price_scaler_PuertoRico.inverse_transform(y_pred_reshaped)
# NOTE(review): the absolute value flips any negative predicted price instead of exposing it.
absolute_predictions = np.absolute(original_price_predictions)
absolute_predictions[621]  # spot-check a single hold-out row

array([348604.45960788])

In [86]:
# Hold-out errors for the linear model, measured on the scaled price axis.
# NOTE(review): the recorded R2 for this fit is negative, i.e. worse than
# predicting the mean — worth investigating before using this model.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse, mae = np.sqrt(mse), mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

for label, score in (('MSE', mse), ('RMSE', rmse), ('MAE', mae), ('R2', r2)):
    print(f'{label}:  {score}')

MSE:  6.962377063872717
RMSE:  2.63863166506292
MAE:  0.36463972838340347
R2:  -6.226864155399053


In [87]:
# Regression tree with default hyper-parameters, fitted on the training rows.
# random_state is pinned because scikit-learn permutes candidate features at
# every split; an unseeded tree can differ between runs, which would make the
# scores printed below and the dumped model non-reproducible.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Hold-out errors for the tree, measured on the scaled price axis.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.18716180658352094
RMSE:  0.4326220135216433
MAE:  0.03081761663609676
R2:  0.805728282330946


In [88]:
# Persist both fitted estimators: the tree first, then the linear model.
# NOTE(review): only the estimators are written in this cell; no scaler is saved here.
joblib.dump(value=model_tree, filename='../models/primary/puertorico_dt.joblib')
joblib.dump(value=model, filename='../models/primary/puertorico_lr.joblib')

['../models/primary/puertorico_lr.joblib']

In [89]:
# Load the interim Rhode Island CSV and preview its first five rows.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data_RhodeIsland = pd.read_csv(
    filepath_or_buffer='/Users/akshatsharma/UPES/Sem 6/AI_Applications/Housing-Price/data/interim/Rhode_Island.csv',
)
data_RhodeIsland.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,4.0,2.0,2.02,Burrillville,Rhode Island,2830.0,1512.0,350000.0
1,for_sale,3.0,3.0,0.21,Woonsocket,Rhode Island,2895.0,3419.0,399000.0
2,for_sale,4.0,2.0,0.09,North Smithfield,Rhode Island,2896.0,3154.0,329000.0
3,for_sale,3.0,3.0,0.53,Burrillville,Rhode Island,2830.0,1642.0,399000.0
4,for_sale,3.0,1.0,32.156834,Woonsocket,Rhode Island,2895.0,1224.0,159900.0


In [90]:
# Standardise house_size and price; min-max scale bath, bed and acre_lot.
# Only the price scaler is kept, so predictions can be mapped back later.
# NOTE(review): every scaler is fitted on the full table, i.e. before the train/test split.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits price_scaler_RhodeIsland on already-scaled prices.
size_col = data_RhodeIsland['house_size'].to_numpy().reshape(-1, 1)
data_RhodeIsland['house_size'] = StandardScaler().fit_transform(size_col)
price_col = data_RhodeIsland['price'].to_numpy().reshape(-1, 1)
price_scaler_RhodeIsland = StandardScaler().fit(price_col)
data_RhodeIsland['price'] = price_scaler_RhodeIsland.transform(price_col)
for feature in ('bath', 'bed', 'acre_lot'):
    feature_col = data_RhodeIsland[feature].to_numpy().reshape(-1, 1)
    data_RhodeIsland[feature] = MinMaxScaler().fit_transform(feature_col)

In [91]:
# Features: numeric columns as-is, with city/state expanded into indicator columns.
X = pd.get_dummies(
    data_RhodeIsland[['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']],
    columns=['city', 'state'],
)
# Target: the (already standardized) price column.
y = data_RhodeIsland['price']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the linear model on the training split (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict in scaled units, then map back to the original price scale.
y_pred = model.predict(X_test)
y_pred_reshaped = y_pred.reshape(-1, 1)
original_price_predictions = price_scaler_RhodeIsland.inverse_transform(y_pred_reshaped)
# Taking the absolute value drops the sign of any negative prediction.
absolute_predictions = np.abs(original_price_predictions)
# Spot-check a single test-set prediction.
absolute_predictions[990]

array([323194.91609527])

In [92]:
# Score the linear model on the held-out split. y_test holds standardized
# prices, so these errors are in scaled units rather than dollars.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One metric per output line.
print(
    f'MSE:  {mse}',
    f'RMSE:  {rmse}',
    f'MAE:  {mae}',
    f'R2:  {r2}',
    sep='\n',
)

MSE:  0.5178219712233484
RMSE:  0.7195984791697022
MAE:  0.38653619460644156
R2:  0.4637004841688679


In [93]:
# Fit a decision-tree regressor on the Rhode Island train/test split and report
# its held-out error metrics (in standardized-price units, since y is scaled).
#
# random_state is pinned (42, matching the seed passed to train_test_split)
# because scikit-learn permutes the candidate features at every split; without
# a fixed seed, ties between equally good splits can make the fitted tree, its
# metrics, and the model saved from it change between runs.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Error metrics on the test split.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.08077301982967579
RMSE:  0.2842059461546781
MAE:  0.013032019415507015
R2:  0.9163447404046337


In [95]:
# Persist both fitted Rhode Island estimators. The linear model is written
# last, so its returned path list is the value this cell displays.
# NOTE(review): price_scaler_RhodeIsland is not saved in this cell — confirm it
# is persisted elsewhere if predictions must be converted back to prices later.
joblib.dump(value=model_tree, filename='../models/primary/rhodeisland_dt.joblib')
joblib.dump(value=model, filename='../models/primary/rhodeisland_lr.joblib')

['../models/primary/rhodeisland_lr.joblib']

In [96]:
# Load the interim Vermont listings and preview the first rows.
# The path is relative to the notebook, like the '../models/primary/' paths used
# when saving models, so the notebook is not tied to one user's home directory.
# NOTE(review): assumes data/interim/ and models/ share the same project root — confirm.
data_Vermont = pd.read_csv('../data/interim/Vermont.csv')
data_Vermont.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,2.487507,48.8,Readsboro,Vermont,5350.0,2178.735694,109900.0
1,for_sale,3.0,2.487507,48.8,Readsboro,Vermont,5350.0,2178.735694,109900.0
2,for_sale,3.0,2.487507,10.0,Whitingham,Vermont,5361.0,2178.735694,39900.0
3,for_sale,3.0,1.0,3.4,Guilford,Vermont,5301.0,1335.0,200000.0
4,for_sale,6.0,3.0,2.56,Vernon,Vermont,5354.0,3108.0,515000.0


In [97]:
# Standardize house_size in place; this scaler is not kept.
house_size_2d = data_Vermont['house_size'].to_numpy().reshape(-1, 1)
data_Vermont['house_size'] = StandardScaler().fit_transform(house_size_2d)

# Standardize the price with a named scaler so that predictions can later be
# mapped back to the original price scale via inverse_transform.
price_2d = data_Vermont['price'].to_numpy().reshape(-1, 1)
price_scaler_Vermont = StandardScaler().fit(price_2d)
data_Vermont['price'] = price_scaler_Vermont.transform(price_2d)

# Min-max scale the remaining numeric columns, each with its own fresh scaler.
for minmax_col in ('bath', 'bed', 'acre_lot'):
    minmax_2d = data_Vermont[minmax_col].to_numpy().reshape(-1, 1)
    data_Vermont[minmax_col] = MinMaxScaler().fit_transform(minmax_2d)

In [98]:
# Preview the first five rows to eyeball the columns after scaling.
data_Vermont.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.074074,0.053125,0.051348,Readsboro,Vermont,5350.0,-0.091578,-0.494546
1,for_sale,0.074074,0.053125,0.051348,Readsboro,Vermont,5350.0,-0.091578,-0.494546
2,for_sale,0.074074,0.053125,0.010505,Whitingham,Vermont,5361.0,-0.091578,-0.594871
3,for_sale,0.074074,0.0,0.003558,Guilford,Vermont,5301.0,-0.735346,-0.365413
4,for_sale,0.185185,0.071429,0.002674,Vernon,Vermont,5354.0,0.617448,0.08605


In [99]:
# Features: numeric columns as-is, with city/state expanded into indicator columns.
X = pd.get_dummies(
    data_Vermont[['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']],
    columns=['city', 'state'],
)
# Target: the (already standardized) price column.
y = data_Vermont['price']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the linear model on the training split (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict in scaled units, then map back to the original price scale.
y_pred = model.predict(X_test)
y_pred_reshaped = y_pred.reshape(-1, 1)
original_price_predictions = price_scaler_Vermont.inverse_transform(y_pred_reshaped)
# Taking the absolute value drops the sign of any negative prediction.
absolute_predictions = np.abs(original_price_predictions)
# Spot-check a single test-set prediction.
absolute_predictions[4561]

array([472747.72258154])

In [100]:
# Score the linear model on the held-out split. y_test holds standardized
# prices, so these errors are in scaled units rather than dollars.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One metric per output line.
print(
    f'MSE:  {mse}',
    f'RMSE:  {rmse}',
    f'MAE:  {mae}',
    f'R2:  {r2}',
    sep='\n',
)

MSE:  0.5577122247717182
RMSE:  0.746801328849727
MAE:  0.4027222239239513
R2:  0.4684744632411558


In [101]:
# Fit a decision-tree regressor on the Vermont train/test split and report its
# held-out error metrics (in standardized-price units, since y is scaled).
#
# random_state is pinned (42, matching the seed passed to train_test_split)
# because scikit-learn permutes the candidate features at every split; without
# a fixed seed, ties between equally good splits can make the fitted tree, its
# metrics, and the model saved from it change between runs.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Error metrics on the test split.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.0015408668148492685
RMSE:  0.039253876430860535
MAE:  0.005365840944470717
R2:  0.9985314826814637


In [102]:
# Persist both fitted Vermont estimators. The linear model is written last, so
# its returned path list is the value this cell displays.
# NOTE(review): price_scaler_Vermont is not saved in this cell — confirm it is
# persisted elsewhere if predictions must be converted back to prices later.
joblib.dump(value=model_tree, filename='../models/primary/vermont_dt.joblib')
joblib.dump(value=model, filename='../models/primary/vermont_lr.joblib')

['../models/primary/vermont_lr.joblib']

In [103]:
# Load the interim Virgin Islands listings and preview the first rows.
# The path is relative to the notebook, like the '../models/primary/' paths used
# when saving models, so the notebook is not tied to one user's home directory.
# NOTE(review): assumes data/interim/ and models/ share the same project root — confirm.
data_VirginIslands = pd.read_csv('../data/interim/Virgin_Islands.csv')
data_VirginIslands.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,5.0,2.0,0.21,Saint Thomas,Virgin Islands,802.0,1140.0,175000.0
1,for_sale,9.0,5.0,0.04,Saint Thomas,Virgin Islands,802.0,3120.0,100000.0
2,for_sale,4.0,6.0,1.04,Saint Thomas,Virgin Islands,802.0,6760.0,1599000.0
3,for_sale,3.0,1.0,32.156834,Saint Thomas,Virgin Islands,802.0,235.0,115000.0
4,for_sale,3.0,5.0,0.81,Saint Thomas,Virgin Islands,802.0,4500.0,925000.0


In [104]:
# Standardize house_size in place; this scaler is not kept.
house_size_2d = data_VirginIslands['house_size'].to_numpy().reshape(-1, 1)
data_VirginIslands['house_size'] = StandardScaler().fit_transform(house_size_2d)

# Standardize the price with a named scaler so that predictions can later be
# mapped back to the original price scale via inverse_transform.
price_2d = data_VirginIslands['price'].to_numpy().reshape(-1, 1)
price_scaler_VirginIslands = StandardScaler().fit(price_2d)
data_VirginIslands['price'] = price_scaler_VirginIslands.transform(price_2d)

# Min-max scale the remaining numeric columns, each with its own fresh scaler.
for minmax_col in ('bath', 'bed', 'acre_lot'):
    minmax_2d = data_VirginIslands[minmax_col].to_numpy().reshape(-1, 1)
    data_VirginIslands[minmax_col] = MinMaxScaler().fit_transform(minmax_2d)

In [105]:
# Features: numeric columns as-is, with city/state expanded into indicator columns.
X = pd.get_dummies(
    data_VirginIslands[['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']],
    columns=['city', 'state'],
)
# Target: the (already standardized) price column.
y = data_VirginIslands['price']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the linear model on the training split (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict in scaled units, then map back to the original price scale.
y_pred = model.predict(X_test)
y_pred_reshaped = y_pred.reshape(-1, 1)
original_price_predictions = price_scaler_VirginIslands.inverse_transform(y_pred_reshaped)
# Taking the absolute value drops the sign of any negative prediction.
absolute_predictions = np.abs(original_price_predictions)
# Spot-check a single test-set prediction.
absolute_predictions[410]

array([392271.2048091])

In [106]:
# Score the linear model on the held-out split. y_test holds standardized
# prices, so these errors are in scaled units rather than dollars.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One metric per output line.
print(
    f'MSE:  {mse}',
    f'RMSE:  {rmse}',
    f'MAE:  {mae}',
    f'R2:  {r2}',
    sep='\n',
)

MSE:  0.7554249730747764
RMSE:  0.8691518699713972
MAE:  0.3006605045441261
R2:  -0.1548348603126084


In [107]:
# Fit a decision-tree regressor on the Virgin Islands train/test split and
# report its held-out error metrics (in standardized-price units, since y is scaled).
#
# random_state is pinned (42, matching the seed passed to train_test_split)
# because scikit-learn permutes the candidate features at every split; without
# a fixed seed, ties between equally good splits can make the fitted tree, its
# metrics, and the model saved from it change between runs.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)

# Error metrics on the test split.
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

print(f'MSE:  {mse_tree}')
print(f'RMSE:  {rmse_tree}')
print(f'MAE:  {mae_tree}')
print(f'R2:  {r2_tree}')

MSE:  0.005157453204365332
RMSE:  0.07181541063285325
MAE:  0.02209870794209465
R2:  0.992115687244771


In [108]:
# Persist both fitted Virgin Islands estimators. The linear model is written
# last, so its returned path list is the value this cell displays.
# NOTE(review): price_scaler_VirginIslands is not saved in this cell — confirm it
# is persisted elsewhere if predictions must be converted back to prices later.
joblib.dump(value=model_tree, filename='../models/primary/virginislands_dt.joblib')
joblib.dump(value=model, filename='../models/primary/virginislands_lr.joblib')

['../models/primary/virginislands_lr.joblib']