In [2]:
import pandas as pd 
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [90]:
data_conneticut = pd.read_csv('D:\DESKTOP\AML_Project\Housing-Price\data\interim\Connecticut.csv')

In [91]:
data_conneticut.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,1.0,3.93,Willington,Connecticut,6279.0,1572.0,225000.0
1,for_sale,4.0,3.0,2.34,Coventry,Connecticut,6238.0,3320.0,579900.0
2,for_sale,1.0,1.0,32.156834,Willington,Connecticut,6279.0,680.0,65000.0
3,for_sale,2.0,1.0,0.91,East Windsor,Connecticut,6016.0,960.0,215000.0
4,for_sale,2.0,1.0,0.36,Vernon,Connecticut,6066.0,860.0,144900.0


In [92]:
data_conneticut['house_size'] = StandardScaler().fit_transform(data_conneticut['house_size'].values.reshape(len(data_conneticut), 1))

price_scaler_conn = StandardScaler().fit(data_conneticut['price'].values.reshape(-1, 1))  # Assign to a variable
data_conneticut['price'] = price_scaler_conn.transform(data_conneticut['price'].values.reshape(-1, 1))

data_conneticut['bath'] = MinMaxScaler().fit_transform(data_conneticut['bath'].values.reshape(len(data_conneticut), 1))
data_conneticut['bed'] = MinMaxScaler().fit_transform(data_conneticut['bed'].values.reshape(len(data_conneticut), 1))
data_conneticut['acre_lot'] = MinMaxScaler().fit_transform(data_conneticut['acre_lot'].values.reshape(len(data_conneticut), 1))

In [93]:
data_conneticut

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.090909,0.000000,0.000039,Willington,Connecticut,6279.0,-0.348938,-0.206817
1,for_sale,0.136364,0.083333,0.000023,Coventry,Connecticut,6238.0,0.632040,-0.010419
2,for_sale,0.000000,0.000000,0.000321,Willington,Connecticut,6279.0,-0.849529,-0.295360
3,for_sale,0.045455,0.000000,0.000009,East Windsor,Connecticut,6016.0,-0.692393,-0.212351
4,for_sale,0.045455,0.000000,0.000004,Vernon,Connecticut,6066.0,-0.748513,-0.251144
...,...,...,...,...,...,...,...,...,...
98807,for_sale,0.090909,0.083333,0.000010,New Milford,Connecticut,6755.0,-0.507758,-0.084573
98808,for_sale,0.090909,0.083333,0.000038,North Canaan,Connecticut,6018.0,-0.332664,-0.096139
98809,for_sale,0.090909,0.061979,0.000026,Sherman,Connecticut,6784.0,-0.008438,-0.173061
98810,for_sale,0.136364,0.125000,0.000012,Salisbury,Connecticut,6039.0,0.546177,0.440651


In [94]:
X = data_conneticut[['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']]
# hot encode the city and state
X = pd.get_dummies(X, columns=['city', 'state'])

y = data_conneticut['price']

In [95]:
y

0       -0.206817
1       -0.010419
2       -0.295360
3       -0.212351
4       -0.251144
           ...   
98807   -0.084573
98808   -0.096139
98809   -0.173061
98810    0.440651
98811    0.346574
Name: price, Length: 98812, dtype: float64

In [96]:
X.head()

Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,city_Andover,city_Ansonia,city_Ashford,city_Avon,city_Barkhamsted,...,city_Wilton,city_Winchester,city_Windham,city_Windsor,city_Windsor Locks,city_Wolcott,city_Woodbridge,city_Woodbury,city_Woodstock,state_Connecticut
0,0.090909,0.0,3.9e-05,6279.0,-0.348938,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0.136364,0.083333,2.3e-05,6238.0,0.63204,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.0,0.0,0.000321,6279.0,-0.849529,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.045455,0.0,9e-06,6016.0,-0.692393,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0.045455,0.0,4e-06,6066.0,-0.748513,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [98]:
model = LinearRegression()
model.fit(X_train, y_train)

In [99]:
y_pred = model.predict(X_test)

In [100]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [101]:
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')


MSE: 0.5931887035155777
RMSE: 0.7701874469994806
MAE: 0.22994551377862324
R2: 0.3729451798255674


In [102]:
model_tree = DecisionTreeRegressor()
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)


In [103]:
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

In [104]:
print(f'MSE: {mse_tree}')
print(f'RMSE: {rmse_tree}')
print(f'MAE: {mae_tree}')
print(f'R2: {r2_tree}')

MSE: 0.0027511114733679244
RMSE: 0.0524510388206747
MAE: 0.004041464911771258
R2: 0.9970918230573363


In [124]:
y_pred_tree_reshaped = y_pred_tree.reshape(-1, 1)
original_price_predictions = price_scaler_conn.inverse_transform(y_pred_tree_reshaped)
absolute_predictions = np.abs(original_price_predictions)
absolute_predictions[89]

array([1205009.73080667])

In [106]:
data_delaware = pd.read_csv('D:\DESKTOP\AML_Project\Housing-Price\data\interim\Delaware.csv')

In [107]:
data_delaware['house_size'] = StandardScaler().fit_transform(data_delaware['house_size'].values.reshape(len(data_delaware), 1))

price_scaler_del = StandardScaler().fit(data_delaware['price'].values.reshape(-1, 1))  # Assign to a variable
data_delaware['price'] = price_scaler_del.transform(data_delaware['price'].values.reshape(-1, 1))

data_delaware['bath'] = MinMaxScaler().fit_transform(data_delaware['bath'].values.reshape(len(data_delaware), 1))
data_delaware['bed'] = MinMaxScaler().fit_transform(data_delaware['bed'].values.reshape(len(data_delaware), 1))
data_delaware['acre_lot'] = MinMaxScaler().fit_transform(data_delaware['acre_lot'].values.reshape(len(data_delaware), 1))

In [108]:
data_delaware.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.133333,0.0,0.000143,New Castle,Delaware,19720.0,-0.71278,-1.002163
1,for_sale,0.2,0.111111,0.000229,New Castle,Delaware,19720.0,0.043122,-0.527026
2,for_sale,0.266667,0.165279,5.7e-05,Wilmington,Delaware,19805.0,0.140811,-0.544799
3,for_sale,0.333333,0.111111,8.6e-05,Wilmington,Delaware,19801.0,0.472763,-0.943768
4,for_sale,0.133333,0.165279,2.9e-05,Wilmington,Delaware,19802.0,0.144354,-0.998173


In [109]:
X = data_delaware[['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']]
X = pd.get_dummies(X, columns=['city', 'state'])
y  = data_delaware['price']

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [111]:
model = LinearRegression()
model.fit(X_train, y_train)

In [125]:
y_pred = model.predict(X_test)
y_pred_reshaped = y_pred.reshape(-1, 1)
original_price_predictions = price_scaler_del.inverse_transform(y_pred_reshaped)
absolute_predictions = np.abs(original_price_predictions)
absolute_predictions[89]

array([300593.96664185])

In [113]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [114]:
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

MSE: 0.5001263651387834
RMSE: 0.7071961291882072
MAE: 0.3765443499369436
R2: 0.5741006940105161


In [126]:
model_tree = DecisionTreeRegressor()
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)


In [116]:
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

In [117]:
print(f'MSE: {mse_tree}')
print(f'RMSE: {rmse_tree}')
print(f'MAE: {mae_tree}')
print(f'R2: {r2_tree}')

MSE: 0.3054045791501775
RMSE: 0.5526342182223043
MAE: 0.183612004825363
R2: 0.7399225328383225


In [3]:
data_delaware = pd.read_csv('/Users/saksham/Projects/Housing-Price/data/interim/Delaware.csv')

In [4]:
data_delaware['house_size'] = StandardScaler().fit_transform(data_delaware['house_size'].values.reshape(len(data_delaware), 1))
data_delaware['price'] = StandardScaler().fit_transform(data_delaware['price'].values.reshape(len(data_delaware), 1))
data_delaware['bath'] = MinMaxScaler().fit_transform(data_delaware['bath'].values.reshape(len(data_delaware), 1))
data_delaware['bed'] = MinMaxScaler().fit_transform(data_delaware['bed'].values.reshape(len(data_delaware), 1))
data_delaware['acre_lot'] = MinMaxScaler().fit_transform(data_delaware['acre_lot'].values.reshape(len(data_delaware), 1))

In [5]:
data_delaware.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,0.133333,0.0,0.000143,New Castle,Delaware,19720.0,-0.71278,-1.002163
1,for_sale,0.2,0.111111,0.000229,New Castle,Delaware,19720.0,0.043122,-0.527026
2,for_sale,0.266667,0.165279,5.7e-05,Wilmington,Delaware,19805.0,0.140811,-0.544799
3,for_sale,0.333333,0.111111,8.6e-05,Wilmington,Delaware,19801.0,0.472763,-0.943768
4,for_sale,0.133333,0.165279,2.9e-05,Wilmington,Delaware,19802.0,0.144354,-0.998173


In [7]:
X = data_delaware[['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'city', 'state']]
X = pd.get_dummies(X, columns=['city', 'state'])
y  = data_delaware['price']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model =  LinearRegression()
model.fit(X_train, y_train)

In [10]:
y_pred = model.predict(X_test)

In [11]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [12]:
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

MSE: 0.5001263651387718
RMSE: 0.707196129188199
MAE: 0.3765443499369449
R2: 0.574100694010526


In [13]:
model_tree = DecisionTreeRegressor()
model_tree.fit(X_train, y_train)
y_pred_tree = model_tree.predict(X_test)


In [14]:
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

In [15]:
print(f'MSE: {mse_tree}')
print(f'RMSE: {rmse_tree}')
print(f'MAE: {mae_tree}')
print(f'R2: {r2_tree}')

MSE: 0.34545068842689675
RMSE: 0.5877505324769147
MAE: 0.1913920942304789
R2: 0.7058199313012073
