In [214]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


#loading the dataset
file_path = 'Data.csv'
df = pd.read_csv(file_path)
df = df.drop(columns=['Unnamed: 0','title', 'list_price','Unnamed: 8', 'Unnamed: 9'], errors='ignore')
df['final_price'] = df['final_price'].str.replace(',', '')
df['final_price'] = pd.to_numeric(df['final_price'], errors='coerce')
df['sqft'] = df['sqft'].replace('N/A', np.nan)
df['sqft'] = pd.to_numeric(df['sqft'], errors='coerce')

#setting ranges
df = df[df['parking'] <= 4]
df = df[df['final_price'] <= 2000000]
df = df[df['final_price'] >= 300000]

df


Unnamed: 0,final_price,bedrooms,bathrooms,sqft,parking
0,630000.0,1.0,1.0,650.0,0.0
1,550000.0,1.0,1.0,550.0,0.0
2,502000.0,1.0,1.0,650.0,0.0
3,368000.0,0.0,1.0,250.0,1.0
6,975000.0,2.0,2.0,1300.0,1.0
...,...,...,...,...,...
9521,630000.0,2.0,2.0,1100.0,1.0
9523,785000.0,2.0,2.0,750.0,1.0
9524,1025000.0,5.0,5.0,3250.0,2.0
9525,681000.0,3.0,3.0,,1.0


In [215]:
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,final_price,bedrooms,bathrooms,sqft,parking
0,630000.0,1.0,1.0,650.0,0.0
1,550000.0,1.0,1.0,550.0,0.0
2,502000.0,1.0,1.0,650.0,0.0
3,368000.0,0.0,1.0,250.0,1.0
6,975000.0,2.0,2.0,1300.0,1.0
...,...,...,...,...,...
9520,343000.0,1.0,1.0,750.0,1.0
9521,630000.0,2.0,2.0,1100.0,1.0
9523,785000.0,2.0,2.0,750.0,1.0
9524,1025000.0,5.0,5.0,3250.0,2.0


In [216]:
#creating an 80/20 train/test split
df_cleaned_training, df_cleaned_testing = train_test_split(df_cleaned, test_size=0.2, random_state=1)


In [217]:
labels = ["final_value", "bedrooms", "bathrooms", "sqft", "parking"]

correlations = pd.DataFrame(columns=labels, index=labels)
np.fill_diagonal(correlations.values, 1)

#.corr finds r (pearson's correlation ).
correlations.iloc[0,1] = df_cleaned_training[['final_price', 'bedrooms']].corr().iloc[0,1]
correlations.iloc[0,2] = df_cleaned_training[['final_price', 'bathrooms']].corr().iloc[0,1]
correlations.iloc[0,3] = df_cleaned_training[['final_price', 'sqft']].corr().iloc[0,1]
correlations.iloc[0,4] = df_cleaned_training[['final_price', 'parking']].corr().iloc[0,1]
correlations.iloc[1,2] = df_cleaned_training[['bedrooms', 'bathrooms']].corr().iloc[0,1]
correlations.iloc[1,3] = df_cleaned_training[['bedrooms', 'sqft']].corr().iloc[0,1]
correlations.iloc[1,4] = df_cleaned_training[['bedrooms', 'parking']].corr().iloc[0,1]
correlations.iloc[2,3] = df_cleaned_training[['bathrooms', 'sqft']].corr().iloc[0,1]
correlations.iloc[2,4] = df_cleaned_training[['bathrooms', 'parking']].corr().iloc[0,1]
correlations.iloc[3,4] = df_cleaned_training[['sqft', 'parking']].corr().iloc[0,1]

correlations

Unnamed: 0,final_value,bedrooms,bathrooms,sqft,parking
final_value,1.0,0.489897,0.560742,0.647188,0.343641
bedrooms,,1.0,0.745586,0.808153,0.526631
bathrooms,,,1.0,0.775501,0.490674
sqft,,,,1.0,0.576784
parking,,,,,1.0


In [218]:
#independent variables
x1 = df_cleaned_testing['bedrooms']
x2 = df_cleaned_testing['bathrooms']
x3 = df_cleaned_testing['sqft']
x4 = df_cleaned_testing['parking']

#actual values
y_true = df_cleaned_testing['final_price']

#coefficients table
coefficients_table = pd.DataFrame(columns=['Model', 'Intercept', 'Bedrooms', 'Bathrooms', 'Sqft', 'Parking', '% error RMSE', '% error MAE'])

In [219]:
#coefficients with all 4 inputs
X = df_cleaned_training[['bedrooms', 'bathrooms', 'sqft', 'parking']].values
y = df_cleaned_training['final_price'].values

#column of ones for intercept term
X_b = np.c_[np.ones((X.shape[0], 1)), X]

XT_X = X_b.T @ X_b  #this is x^T * x
XT_y = X_b.T @ y    #this is x^T * y

#solve for the coefficients
coefficients = np.linalg.solve(XT_X, XT_y)

print(f"Estimated Coefficients: {coefficients}")

b0, b1, b2, b3, b4 = coefficients

#predicted values based on the linear model equation
y_pred = b0 + b1*x1 + b2*x2 + b3*x3 + b4*x4

#calculate MSE and MAE
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_true - y_pred))
percent_rmse = (rmse/np.mean(y))*100
percent_mae = (mae/np.mean(y))*100

#add to the calculations to the table
coefficients_table = coefficients_table.append(
    {
    "Model": "All 4 inputs",
    "Intercept": round(b0,1),
    "Bedrooms": round(b1,1),
    "Bathrooms": round(b2,1),
    "Sqft": round(b3,1),
    "Parking": round(b4,1),
    "% error RMSE": percent_rmse,
    "% error MAE": percent_mae
    }, ignore_index=True
    )

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Percent error from RMSE:", percent_rmse)
print("Percent error from MAE:", percent_mae)

Estimated Coefficients: [289755.9172089  -49989.37650169  76320.3808561     363.47889506
 -18510.13153277]
Mean Squared Error: 45455562924.18642
Root Mean Squared Error: 213203.10252007688
Mean Absolute Error: 156384.068107416
Percent error from RMSE: 31.848337564975647
Percent error from MAE: 23.36069471784595


  coefficients_table = coefficients_table.append(


In [220]:
#coefficients without parking

X = df_cleaned_training[['bedrooms', 'bathrooms', 'sqft']].values
y = df_cleaned_training['final_price'].values

X_b = np.c_[np.ones((X.shape[0], 1)), X]

XT_X = X_b.T @ X_b
XT_y = X_b.T @ y

coefficients = np.linalg.solve(XT_X, XT_y)
print(f"Estimated Coefficients: {coefficients}")

b0, b1, b2, b3 = coefficients

#actual values
y_true = df_cleaned_testing['final_price']

#predicted values based on the linear model equation
y_pred = b0 + b1*x1 + b2*x2 + b3*x3

#calculate MSE and MAE
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_true - y_pred))
percent_rmse = (rmse/np.mean(y))*100
percent_mae = (mae/np.mean(y))*100

#add to the calculations to the table
coefficients_table = coefficients_table.append(
    {
    "Model": "bedrooms, bathrooms, sqft",
    "Intercept": round(b0,1),
    "Bedrooms": round(b1,1),
    "Bathrooms": round(b2,1),
    "Sqft": round(b3,1),
    "% error RMSE": percent_rmse,
    "% error MAE": percent_mae
    }, ignore_index=True
    )

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Percent error from RMSE:", percent_rmse)
print("Percent error from MAE:", percent_mae)

#note: does not change the other coefficients much when removing parking. This applies to all combinations of other inputs.

Estimated Coefficients: [287152.58303654 -52102.00102812  75271.33416881    353.62186496]
Mean Squared Error: 45566164514.700134
Root Mean Squared Error: 213462.32575023663
Mean Absolute Error: 157293.53008750535
Percent error from RMSE: 31.88706040175066
Percent error from MAE: 23.496550396314177


  coefficients_table = coefficients_table.append(


In [221]:
#coefficients with out sqft

X = df_cleaned_training[['bedrooms', 'bathrooms', 'parking']].values
y = df_cleaned_training['final_price'].values

X_b = np.c_[np.ones((X.shape[0], 1)), X]

XT_X = X_b.T @ X_b
XT_y = X_b.T @ y

coefficients = np.linalg.solve(XT_X, XT_y)
print(f"Estimated Coefficients: {coefficients}")

b0, b1, b2, b4 = coefficients

#predicted values based on the linear model equation
y_pred = b0 + b1*x1 + b2*x2 + b4*x4

#calculate MSE and MAE
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_true - y_pred))
percent_rmse = (rmse/np.mean(y))*100
percent_mae = (mae/np.mean(y))*100

#add to the calculations to the table
coefficients_table = coefficients_table.append(
    {
    "Model": "bedrooms, bathrooms, parking",
    "Intercept": round(b0,1),
    "Bedrooms": round(b1,1),
    "Bathrooms": round(b2,1),
    "Parking": round(b4,1),
    "% error RMSE": percent_rmse,
    "% error MAE": percent_mae
    }, ignore_index=True
    )

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Percent error from RMSE:", percent_rmse)
print("Percent error from MAE:", percent_mae)

#note: made all coefficients positive when removing sqft, marking possible multicollinearity with sqft

Estimated Coefficients: [270481.24727189  44749.84084515 163772.91877949  25974.48306214]
Mean Squared Error: 54600101052.176056
Root Mean Squared Error: 233666.64514255358
Mean Absolute Error: 168996.676989783
Percent error from RMSE: 34.90518713945374
Percent error from MAE: 25.244769670380034


  coefficients_table = coefficients_table.append(


In [222]:
#coefficients without bathrooms

X = df_cleaned_training[['bedrooms', 'sqft', 'parking']].values
y = df_cleaned_training['final_price'].values

X_b = np.c_[np.ones((X.shape[0], 1)), X]

XT_X = X_b.T @ X_b
XT_y = X_b.T @ y

coefficients = np.linalg.solve(XT_X, XT_y)
print(f"Estimated Coefficients: {coefficients}")

b0, b1, b3, b4 = coefficients

#predicted values based on the linear model equation
y_pred = b0 + b1*x1 + b3*x3 + b4*x4

#calculate MSE and MAE
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_true - y_pred))
percent_rmse = (rmse/np.mean(y))*100
percent_mae = (mae/np.mean(y))*100

#add to the calculations to the table
coefficients_table = coefficients_table.append(
    {
    "Model": "bedrooms, sqft, parking",
    "Intercept": round(b0,1),
    "Bedrooms": round(b1,1),
    "Sqft": round(b3,1),
    "Parking": round(b4,1),
    "% error RMSE": percent_rmse,
    "% error MAE": percent_mae
    }, ignore_index=True
    )

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Percent error from RMSE:", percent_rmse)
print("Percent error from MAE:", percent_mae)


#note: values changed, but still negative. might suggest bathrooms not causing multicollinearity with bedrooms

Estimated Coefficients: [323915.01981119 -28551.29084959    417.75779003 -15571.68937278]
Mean Squared Error: 46333120796.156235
Root Mean Squared Error: 215251.29685127622
Mean Absolute Error: 158972.01132636276
Percent error from RMSE: 32.154297392424986
Percent error from MAE: 23.747282381260668


  coefficients_table = coefficients_table.append(


In [223]:
#coefficients without bedrooms

X = df_cleaned_training[['bathrooms', 'sqft', 'parking']].values
y = df_cleaned_training['final_price'].values

X_b = np.c_[np.ones((X.shape[0], 1)), X]

XT_X = X_b.T @ X_b
XT_y = X_b.T @ y

coefficients = np.linalg.solve(XT_X, XT_y)
print(f"Estimated Coefficients: {coefficients}")

b0, b2, b3, b4 = coefficients

#predicted values based on the linear model equation
y_pred = b0 + b2*x2 + b3*x3 + b4*x4

#calculate MSE and MAE
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_true - y_pred))
percent_rmse = (rmse/np.mean(y))*100
percent_mae = (mae/np.mean(y))*100

#add to the calculations to the table
coefficients_table = coefficients_table.append(
    {
    "Model": "bathrooms, sqft, parking",
    "Intercept": round(b0,1),
    "Bathrooms": round(b2,1),
    "Sqft": round(b3,1),
    "Parking": round(b4,1),
    "% error RMSE": percent_rmse,
    "% error MAE": percent_mae
    }, ignore_index=True
    )

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Percent error from RMSE:", percent_rmse)
print("Percent error from MAE:", percent_mae)

#note: values changed, but still negative. might suggest bedrooms not causing multicollinearity with bathrooms

Estimated Coefficients: [274785.7020654   58923.35595915    315.7614309  -23312.25841127]
Mean Squared Error: 46295079240.67394
Root Mean Squared Error: 215162.9132556862
Mean Absolute Error: 157342.30827158046
Percent error from RMSE: 32.14109462682596
Percent error from MAE: 23.50383689474627


  coefficients_table = coefficients_table.append(


In [224]:
#coefficients without bedrooms and bathrooms

X = df_cleaned_training[['sqft', 'parking']].values
y = df_cleaned_training['final_price'].values

X_b = np.c_[np.ones((X.shape[0], 1)), X]

XT_X = X_b.T @ X_b
XT_y = X_b.T @ y

coefficients = np.linalg.solve(XT_X, XT_y)
print(f"Estimated Coefficients: {coefficients}")

b0, b3, b4 = coefficients

#predicted values based on the linear model equation
y_pred = b0 + b3*x3 + b4*x4

#calculate MSE and MAE
mse = np.mean((y_true - y_pred) ** 2)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_true - y_pred))
percent_rmse = (rmse/np.mean(y))*100
percent_mae = (mae/np.mean(y))*100

#add to the calculations to the table
coefficients_table = coefficients_table.append(
    {
    "Model": "sqft, parking",
    "Intercept": round(b0,1),
    "Sqft": round(b3,1),
    "Parking": round(b4,1),
    "% error RMSE": percent_rmse,
    "% error MAE": percent_mae
    }, ignore_index=True
    )

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Percent error from RMSE:", percent_rmse)
print("Percent error from MAE:", percent_mae)

coefficients_table

#note: values changed, but still negative. might suggest bedrooms not causing multicollinearity with bathrooms

  coefficients_table = coefficients_table.append(


Estimated Coefficients: [309509.36338347    379.71892576 -19035.58683218]
Mean Squared Error: 46783081027.44611
Root Mean Squared Error: 216293.96900386777
Mean Absolute Error: 159064.1488458452
Percent error from RMSE: 32.31005204276929
Percent error from MAE: 23.76104590903391


Unnamed: 0,Model,Intercept,Bedrooms,Bathrooms,Sqft,Parking,% error RMSE,% error MAE
0,All 4 inputs,289755.9,-49989.4,76320.4,363.5,-18510.1,31.848338,23.360695
1,"bedrooms, bathrooms, sqft",287152.6,-52102.0,75271.3,353.6,,31.88706,23.49655
2,"bedrooms, bathrooms, parking",270481.2,44749.8,163772.9,,25974.5,34.905187,25.24477
3,"bedrooms, sqft, parking",323915.0,-28551.3,,417.8,-15571.7,32.154297,23.747282
4,"bathrooms, sqft, parking",274785.7,,58923.4,315.8,-23312.3,32.141095,23.503837
5,"sqft, parking",309509.4,,,379.7,-19035.6,32.310052,23.761046


In [225]:
data = {
    'Range': ['All', '<$5,000,000', '<$4,000,000', '<$3,000,000', '<$2,500,000', '<$2,000,000', '<$1,500,000', '<$1,000,000'],
    'MAE': ['190266', '189111', '192086', '184862', '179805', '164998', '148182', '120041'],
    'Data Points': ['9178', '9171', '9166', '9126', '9076', '8944', '8564', '7316']
    }

range_errors = pd.DataFrame(data)

data2 = {
    'Range': ['>$200,000', '>$300,000', '>$400,000'],
    'MAE': ['168919', '159175', '152462'],
    'Data Points': ['8921', '8776', '8097']
}

range_errors2 = pd.DataFrame(data2)

range_errors

Unnamed: 0,Range,MAE,Data Points
0,All,190266,9178
1,"<$5,000,000",189111,9171
2,"<$4,000,000",192086,9166
3,"<$3,000,000",184862,9126
4,"<$2,500,000",179805,9076
5,"<$2,000,000",164998,8944
6,"<$1,500,000",148182,8564
7,"<$1,000,000",120041,7316


In [226]:
#using <$2,000,000
range_errors2

Unnamed: 0,Range,MAE,Data Points
0,">$200,000",168919,8921
1,">$300,000",159175,8776
2,">$400,000",152462,8097
