# Simplified Melbourne House Price Prediction

Constraints:
- Not forecasting into the future.
- Linear Model.
- Max 2 features per model.
- Optimising Mean Square Error.

In [None]:
import os

In [None]:
# Move into the project root; later cells use paths relative to it.
# The location can be overridden with the DATA_SCIENCE_PROJECT_ROOT environment
# variable; the original machine-specific path is kept as the fallback so the
# notebook still behaves the same on the machine it was written on.
PROJECT_ROOT = os.environ.get(
    "DATA_SCIENCE_PROJECT_ROOT",
    'C:\\Users\\zak\\Projects\\PycharmProjects\\data-science\\',
)
if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)
else:
    # Do not crash on other machines; report and stay where we are.
    print(f"Project root {PROJECT_ROOT!r} not found; staying in {os.getcwd()!r}")
os.getcwd()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import math
import statsmodels.api as sm

from utils import setup_project_root

In [None]:
# Project-local helper imported from `utils`; presumably prepares the working
# directory / import path for the repository — TODO confirm in utils.
setup_project_root()

In [None]:
# Load the raw housing CSV; the path is relative to the current working directory.
unaltered_dataframe = pd.read_csv("data/melb_data.csv")

In [None]:
# Only two features are allowed, so start by checking which numeric columns
# correlate most strongly with Price.
numeric_corr = unaltered_dataframe.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(10, 10))
sns.heatmap(numeric_corr, cmap="coolwarm", annot=True)

## Step 0: The Super Naive Prediction
Let's start by using the unaltered data and creating a model.

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Simplest possible missing-data strategy: discard every row containing a NaN.
naive_df = unaltered_dataframe.dropna()
print(naive_df.shape[0])

# Target and the two-column design matrix for the first model.
y = naive_df["Price"]
X = naive_df[["Rooms", "Bedroom2"]]

In [None]:
# Hold out a validation set; split sizes are left at the library default and
# random_state is fixed so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

In [None]:
# Plain least-squares linear regression with default settings.
linear_model = LinearRegression()

In [None]:
# Fit on the training portion only.
linear_model.fit(train_X, train_y)

In [None]:
# Predict prices for the held-out validation rows.
price_predictions = linear_model.predict(val_X)

In [None]:
# Mean absolute percentage error of the validation predictions.
mean_absolute_percentage_error(val_y, price_predictions)

In [None]:
# Validation MSE, then its square root (RMSE), which is on the same scale as Price.
mse = mean_squared_error(val_y, price_predictions)
math.sqrt(mse)

In [None]:
# Let's plot both our variables with price, one panel per feature.
fig, axes = plt.subplots(1, 2, figsize = (15, 5))
for ax, feature in zip(axes, ["Rooms", "Bedroom2"]):
    # Pass the frame through `data=` for both panels: the positional form used
    # previously for the second panel is only accepted by newer seaborn
    # releases, and the two calls were inconsistent with each other.
    sns.scatterplot(data = naive_df, x = feature, y = "Price", ax = ax)

These two variables basically look the same. Are they each telling my model something new?

In [None]:
# Inspect the fitted parameters: raw coefficients, the intercept, and then each
# coefficient paired with the column it belongs to.
coefficients = linear_model.coef_
print(coefficients)
print(linear_model.intercept_)
print([(weight, column) for weight, column in zip(coefficients, X.columns)])

In [None]:
# Single-feature regression fits (one per panel), with the fitted line in red.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
rooms_ax, bedroom_ax = axes
sns.regplot(data=naive_df, x="Rooms", y="Price", line_kws={'color': 'red'}, ax=rooms_ax)
sns.regplot(data=naive_df, x="Bedroom2", y="Price", line_kws={'color': 'red'}, ax=bedroom_ax)

In [None]:
# Plot both variables against price and overlay the two-feature model's line.
fig, axes = plt.subplots(1, 2, figsize = (15, 5))
sns.scatterplot(x = "Rooms", y = "Price", data = naive_df, ax = axes[0])
sns.scatterplot(x = "Bedroom2", y = "Price", data = naive_df, ax = axes[1])

# To plot a line from a multi-variable model, we have to hold the other variables constant.
# A common practice is to use the mean.
mean_rooms = naive_df['Rooms'].mean()
mean_bedroom2 = naive_df['Bedroom2'].mean()

# The equation is: Price = m1*Rooms + m2*Bedroom2 + c
# Look the coefficients up by column name instead of unpacking them by position:
# if `linear_model` has since been refit on different features, this raises a
# KeyError rather than silently drawing a line with the wrong slope.
coef_by_feature = dict(zip(train_X.columns, linear_model.coef_))
m1 = coef_by_feature['Rooms']
m2 = coef_by_feature['Bedroom2']
c = linear_model.intercept_

# --- Create and plot the line for the 'Rooms' plot ---
# Create a range of values for the x-axis
rooms_x = np.linspace(naive_df['Rooms'].min(), naive_df['Rooms'].max(), 100)
# Calculate the predicted price, holding 'Bedroom2' at its mean
price_y_rooms = m1 * rooms_x + m2 * mean_bedroom2 + c
axes[0].plot(rooms_x, price_y_rooms, color='red', linewidth=2)

# --- Create and plot the line for the 'Bedroom2' plot ---
# Create a range of values for the x-axis
bedroom2_x = np.linspace(naive_df['Bedroom2'].min(), naive_df['Bedroom2'].max(), 100)
# Calculate the predicted price, holding 'Rooms' at its mean
price_y_bedroom2 = m2 * bedroom2_x + m1 * mean_rooms + c
axes[1].plot(bedroom2_x, price_y_bedroom2, color='red', linewidth=2)


In [None]:
# Visualising the predicted price vs perfect price.
# First layer: in-sample predictions (x) against the true training prices (y).
# Second layer: the true prices plotted against themselves, i.e. the diagonal
# on which a perfect prediction would lie.
pred = linear_model.predict(train_X)
sns.scatterplot(x = pred, y = train_y)
sns.scatterplot(x = train_y, y = train_y)

In [None]:
# Same predicted-vs-actual scatter without the reference diagonal
# (uses `pred` computed in the previous cell).
sns.scatterplot(x = pred, y = train_y)

What about two other variables?

In [None]:
# Create a list of the pairs of features and loop through with the results
import itertools

In [None]:
# Toy example: enumerate every unordered pair from a small list, each as a list.
lst = [1, 2, 3, 4]
els = list(map(list, itertools.combinations(lst, 2)))
els

In [None]:
# Drop incomplete rows from the shared frame itself (in place), so every later
# cell that reads `unaltered_dataframe` sees the NaN-free version.
unaltered_dataframe.dropna(inplace=True)

# Candidate predictors: every numeric column apart from the target.
numeric_features = (
    unaltered_dataframe
    .drop("Price", axis=1)
    .select_dtypes(include=[np.number])
    .dropna()
)

# Every unordered pair of candidate columns, each as a list.
els = list(map(list, itertools.combinations(numeric_features.columns, 2)))
els

In [None]:
from sklearn.model_selection import cross_val_score

Reference (Kaggle cross-validation tutorial): https://www.kaggle.com/code/alexisbcook/cross-validation

It is a little surprising that we specify negative MAE. Scikit-learn has a convention where all metrics are defined so a high number is better. Using negatives here allows them to be consistent with that convention, though negative MAE is almost unheard of elsewhere.

In [None]:
# Mean of the 5-fold negative-MAE scores for a linear model on Rooms + Distance
# (the scorer is negated, so values closer to zero are better).
cross_val_score(LinearRegression(), numeric_features[['Rooms', 'Distance']], unaltered_dataframe["Price"], cv=5, scoring='neg_mean_absolute_error').mean()


In [None]:
# Mean 5-fold negative-MAE for every pair of numeric features, keyed by the pair.
variable_pair_scores = {}
for pair in itertools.combinations(numeric_features.columns, 2):
    fold_scores = cross_val_score(
        LinearRegression(),
        numeric_features[list(pair)],
        unaltered_dataframe["Price"],
        cv=5,
        scoring='neg_mean_absolute_error',
    )
    variable_pair_scores[pair] = fold_scores.mean()
variable_pair_scores

In [None]:
# Tabulate the pair scores: one row per feature pair with its mean CV score.
pd.DataFrame(list(variable_pair_scores.items()), columns=["Variable Pair", "Score"])

In [None]:
# Same search as for pairs, but over every three-column combination.
feature_triples = list(itertools.combinations(numeric_features.columns, 3))
variable_trip_scores = {
    triple: cross_val_score(
        LinearRegression(),
        numeric_features[list(triple)],
        unaltered_dataframe["Price"],
        scoring='neg_mean_absolute_error',
        cv=5,
    ).mean()
    for triple in feature_triples
}
variable_trip_scores

In [None]:
# Tabulate the triple scores: one row per three-feature combination.
pd.DataFrame(list(variable_trip_scores.items()), columns=["Variable Trip", "Score"])

In [None]:
def feature_combinations(X, y, n_features, model=None):
    """Cross-validate a model on every combination of ``n_features`` columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns; combinations are drawn from ``X.columns``.
    y : array-like
        Target values passed to ``cross_val_score`` for every combination.
    n_features : int
        Number of columns in each combination.
    model : estimator, optional
        Estimator handed to ``cross_val_score``. When omitted, a fresh
        ``LinearRegression()`` is created for the call.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns ``"Variables"`` (a tuple
        ``(number_of_columns, column_names)``) and ``"Score"`` (the mean of the
        5-fold ``neg_mean_absolute_error`` scores).
    """
    # Build the default per call rather than in the signature, where a single
    # estimator instance would be created once at definition time and shared
    # by every call.
    if model is None:
        model = LinearRegression()

    combination_scores = {
        (len(combo), combo): cross_val_score(
            model, X[list(combo)], y, cv=5, scoring='neg_mean_absolute_error'
        ).mean()
        for combo in itertools.combinations(X.columns, n_features)
    }

    return pd.DataFrame(list(combination_scores.items()), columns=["Variables", "Score"])

In [None]:
# feature_combinations(X=numeric_features, y=unaltered_dataframe['Price'], n_features=4, model=LinearRegression())

In [None]:
# feature_combinations(5)

In [None]:
# feature_combinations(6)

In [None]:
# feature_combinations(7)

In [None]:
# Number of candidate numeric features (upper bound for n_features above).
len(numeric_features.columns)

In [None]:
# frames = []
# for i in range(12):
#     frames.append(feature_combinations(X=numeric_features, y=unaltered_dataframe['Price'], n_features=i+1, model=LinearRegression()))
#
# all_combos = pd.concat(frames)
# all_combos

In [None]:
# frames = []
# for i in range(12):
#     frames.append(feature_combinations(X=numeric_features, y=unaltered_dataframe['Price'], n_features=i+1, model=Ridge()))
#
# all_combos = pd.concat(frames)
# all_combos

In [None]:
# frames = []
# for i in range(12):
#     frames.append(feature_combinations(X=numeric_features, y=unaltered_dataframe['Price'], n_features=i+1, model=RandomForestRegressor()))
#
# all_combos = pd.concat(frames)
# all_combos

In [None]:
# Cross-validate each pair to truly know which pair is best
# Assign Cross validation score for each pair. What about more features?
# GridSearchCV

In [None]:
# Repeat the naive workflow with a different second feature: Rooms + YearBuilt.
y = naive_df["Price"]
X = naive_df[['Rooms', 'YearBuilt']]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# fit() returns the estimator, so construction and fitting can be chained.
linear_model = LinearRegression().fit(train_X, train_y)
price_predictions = linear_model.predict(val_X)

# Root of the validation mean squared error.
mse = mean_squared_error(val_y, price_predictions)
math.sqrt(mse)

In [None]:
# Ordinary least squares on every numeric feature via statsmodels.
# Linear regression only: the summary's error estimates can be used to assess
# individual features and decide which ones to discard.
y = unaltered_dataframe["Price"]
X = sm.add_constant(numeric_features)

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

# TODO:
# - test for non-linear features (x^2 terms).
# - find some regressors in the sklearn library.
# - which regressors give the best results?
# - include random forest and 2 others.

A possible next step: cross-validate this model?

In [None]:
# Independent two-column copy (council area and price) for the council comparison.
council_price_df = unaltered_dataframe[['CouncilArea', 'Price']].copy(deep=True)

In [None]:
# Count the rows per council area.
council_price_df.CouncilArea.value_counts()
# Moreland and Boroondara are my biggest council areas; is there a significant price difference between them?

In [None]:
# Two-sample t-test on sale prices: Boroondara vs Moreland.
# Welch's variant (equal_var=False) is used because nothing in this notebook
# establishes that the two councils' prices share a common variance, which the
# default Student's t-test assumes.
boroondara_prices = council_price_df.loc[council_price_df.CouncilArea == "Boroondara", "Price"]
moreland_prices = council_price_df.loc[council_price_df.CouncilArea == "Moreland", "Price"]
stats.ttest_ind(boroondara_prices, moreland_prices, equal_var=False)

In [None]:
# Mean sale price of the Moreland rows.
council_price_df.loc[council_price_df.CouncilArea == "Moreland", "Price"].mean()

In [None]:
# Mean sale price of the Boroondara rows.
council_price_df.loc[council_price_df.CouncilArea == "Boroondara", "Price"].mean()

In [None]:
# Display the working frame. NOTE(review): despite its name it was modified in
# place by the earlier dropna(inplace=True) cell, so this is not the raw CSV.
unaltered_dataframe

In [None]:
# all_combos.loc[all_combos['Score'] == all_combos['Score'].max(), 'Variables']

In [None]:
# all_combos.iloc[all_combos[['Score']].idxmax()]

In [None]:
# Inspect the two columns BuildingArea and YearBuilt side by side.
unaltered_dataframe[['BuildingArea', 'YearBuilt']]

In [None]:
from dotenv import load_dotenv
import os
# Read key/value pairs from a local .env file into the process environment (python-dotenv).
load_dotenv()

# Kaggle credential read from the environment rather than hard-coded; None when the variable is not set.
KAGGLE_API_TOKEN = os.getenv("KAGGLE_API_TOKEN")

In [None]:
# NOTE(review): `kaggle datasets download -d` normally takes an "<owner>/<dataset>" slug,
# and "titanic" is unrelated to the Melbourne data used in this notebook — confirm the intended command.
!kaggle datasets download -d titanic

Next, test $x^2$ terms: do any of the features have a non-linear relationship with price?

In [None]:
# Create x^2 features for all numerical features (excluding Price).
# Columns that are already squared (name ends in '^2') are skipped, so that
# re-running this cell does not square them again and create 'col^2^2' columns.
numerical_cols = [
    col
    for col in unaltered_dataframe.select_dtypes(include=[np.number]).columns
    if col != 'Price' and not str(col).endswith('^2')
]

for col in numerical_cols:
    unaltered_dataframe[f'{col}^2'] = unaltered_dataframe[col] ** 2

# Refresh numeric_features so that it includes the new squared columns
numeric_features = unaltered_dataframe.drop("Price", axis=1).select_dtypes(include=[np.number])
numeric_features.head()

In [None]:
# Score every combination that leaves out exactly two of the numeric features.
feature_combinations(X=numeric_features, y=unaltered_dataframe['Price'], n_features=len(numeric_features.columns)-2, model=LinearRegression())

In [None]:
# Score LinearRegression on the full feature set: range(1) yields only i == 0,
# so no columns are dropped.
frames = [
    feature_combinations(
        X=numeric_features,
        y=unaltered_dataframe['Price'],
        n_features=len(numeric_features.columns) - i,
        model=LinearRegression(),
    )
    for i in range(1)
]

all_combos = pd.concat(frames)
all_combos

In [None]:
# Same all-features score as the LinearRegression cell, using Ridge instead.
price_target = unaltered_dataframe['Price']
total_columns = len(numeric_features.columns)

frames = []
for i in range(1):  # only i == 0: every column is used
    ridge_scores = feature_combinations(
        X=numeric_features, y=price_target, n_features=total_columns - i, model=Ridge()
    )
    frames.append(ridge_scores)

all_combos = pd.concat(frames)
all_combos

In [None]:
# Score a random forest on the full feature set (range(1) yields only i == 0,
# so no columns are dropped).
# random_state is pinned so that the forest, and therefore the reported score,
# is reproducible from one run to the next.
frames = []
for i in range(1):
    frames.append(feature_combinations(X=numeric_features, y=unaltered_dataframe['Price'], n_features=len(numeric_features.columns)-i, model=RandomForestRegressor(random_state=1)))

all_combos = pd.concat(frames)
all_combos

In [None]:
import sklearn

In [None]:
# Peek at one entry of the module's public-name list; each entry is a name (string), not the object itself.
sklearn.linear_model.__all__[1]

In [None]:
from sklearn.base import is_regressor

# Score every regressor exported by sklearn.linear_model on the full feature set.
#
# `__all__` holds *names* (strings) and also lists classifiers and plain
# functions, so each name is resolved to its object, instantiated, and kept only
# if it is a regressor. n_features is fixed at "all columns" — the previous
# `len(columns) - i` shrank with the loop index and eventually went negative.
# NOTE(review): "all columns" is assumed to be the intent, matching the
# preceding single-model cells — confirm.
frames = []
skipped_models = {}
for model_name in sklearn.linear_model.__all__:
    candidate = getattr(sklearn.linear_model, model_name)
    if not isinstance(candidate, type):
        continue  # helper functions are not estimators
    try:
        estimator = candidate()
    except TypeError:
        continue  # cannot be built without arguments
    if not is_regressor(estimator):
        continue
    try:
        model_scores = feature_combinations(
            X=numeric_features,
            y=unaltered_dataframe['Price'],
            n_features=len(numeric_features.columns),
            model=estimator,
        )
    except ValueError as err:
        # Some estimators cannot be fit on this target; record why and move on.
        skipped_models[model_name] = str(err)
        continue
    frames.append(model_scores.assign(Model=model_name))

In [73]:
numeric_features

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,...,Postcode^2,Bedroom2^2,Bathroom^2,Car^2,Landsize^2,BuildingArea^2,YearBuilt^2,Lattitude^2,Longtitude^2,Propertycount^2
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.00,1900.0,-37.80790,...,9406489.0,4.0,1.0,0.0,24336.0,6241.0000,3610000.0,1429.437302,21023.086044,16152361.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.00,1900.0,-37.80930,...,9406489.0,9.0,4.0,0.0,17956.0,22500.0000,3610000.0,1429.543166,21023.376031,16152361.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.00,2014.0,-37.80720,...,9406489.0,9.0,1.0,4.0,14400.0,20164.0000,4056196.0,1429.384372,21023.289035,16152361.0
6,3,2.5,3067.0,4.0,2.0,0.0,245.0,210.00,1910.0,-37.80240,...,9406489.0,16.0,4.0,0.0,60025.0,44100.0000,3648100.0,1429.021446,21024.797000,16152361.0
7,2,2.5,3067.0,2.0,1.0,2.0,256.0,107.00,1890.0,-37.80600,...,9406489.0,4.0,1.0,4.0,65536.0,11449.0000,3572100.0,1429.293636,21023.666021,16152361.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12205,3,35.5,3757.0,3.0,2.0,1.0,972.0,149.00,1996.0,-37.51232,...,14115049.0,9.0,4.0,1.0,944784.0,22201.0000,3984016.0,1407.174152,21063.535441,4708900.0
12206,3,6.8,3016.0,3.0,1.0,0.0,179.0,115.00,1890.0,-37.86558,...,9096256.0,9.0,1.0,0.0,32041.0,13225.0000,3572100.0,1433.802149,20997.383674,40704400.0
12207,1,6.8,3016.0,1.0,1.0,1.0,0.0,35.64,1967.0,-37.85588,...,9096256.0,1.0,1.0,1.0,0.0,1270.2096,3869089.0,1433.067651,20995.824528,40704400.0
12209,2,4.6,3181.0,2.0,1.0,1.0,0.0,61.60,2012.0,-37.85581,...,10118761.0,4.0,1.0,1.0,0.0,3794.5600,4048144.0,1433.062351,21022.172595,19184400.0


In [74]:
from sklearn.preprocessing import StandardScaler

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd

# NOTE: this cell needs `numeric_features` and `unaltered_dataframe` from the
# earlier cells; on a fresh kernel it fails with a NameError.

# Standardise every feature, then wrap the resulting array back into a
# DataFrame so that the column names and the row index are preserved.
scaler = StandardScaler()
X_norm = scaler.fit_transform(numeric_features)
X_norm_df = pd.DataFrame(X_norm, index=numeric_features.index, columns=numeric_features.columns)

# fit() is called on an estimator instance, not on the class.
lr = LinearRegression()
lr.fit(X_norm_df, unaltered_dataframe["Price"])

NameError: name 'numeric_features' is not defined

In [86]:
# Fitted parameters of the model trained on the standardised features.
b_norm = lr.intercept_
w_norm = lr.coef_
print(f"model parameters:                   w: {w_norm}, b:{b_norm}")
# NOTE(review): the hard-coded line below lists only 4 weights, unlike the weight
# vector printed above for this model — it looks like a leftover from another
# exercise; confirm it is still wanted.
print( "model parameters from previous lab: w: [110.56 -21.27 -32.71 -37.97], b: 363.16")

model parameters:                   w: [ 7.38164127e+04 -2.80340365e+05  1.63350343e+06  3.44808739e+05
 -1.47454673e+04  6.94994662e+04  5.63554894e+04  3.09382883e+05
  5.23023791e+05 -3.40934423e+07  6.81800023e+07  4.67173682e+04
  4.48660562e+04  1.39263210e+05 -1.55355912e+06 -3.34131237e+05
  1.51543068e+05 -1.20877671e+04 -4.81194871e+04 -1.50291325e+05
 -6.73865515e+05 -3.40402865e+07 -6.81353358e+07 -6.03064526e+04], b:1068828.2020706234
model parameters from previous lab: w: [110.56 -21.27 -32.71 -37.97], b: 363.16
