In [None]:
import pandas as pd

HOUSING_PATH = '/kaggle/input/housing-in-london/housing_in_london_monthly_variables.csv'


def load_housing_data(path: str = HOUSING_PATH) -> pd.DataFrame:
    """Read the housing CSV file at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to ``HOUSING_PATH``.

    Returns:
        The parsed file contents, exactly as ``pd.read_csv`` produces them.
    """
    housing_frame = pd.read_csv(path)
    return housing_frame

In [None]:
# Load the dataset from the default HOUSING_PATH and preview its first rows.
housing = load_housing_data()
housing.head()

In [None]:
# Column overview: dtypes and non-null counts (reveals which columns have gaps).
housing.info()

In [None]:
# Summary statistics for the numeric columns.
housing.describe()

In [None]:
import matplotlib.pyplot as plt
from typing import Tuple

# Apply matplotlib's built-in dark theme to the figures created from here on.
plt.style.use('dark_background')

def describe_value_counts(df: pd.DataFrame, column_name: str, figsize: Tuple[int, int]) -> None:
    """Plot and print how often each value of ``column_name`` occurs in ``df``.

    A bar chart of the counts is shown first. The raw counts are then printed,
    followed by the counts divided by the total number of rows in ``df``.

    Args:
        df: Frame holding the column to summarise.
        column_name: Name of the column whose values are counted.
        figsize: Width and height passed to the bar plot.
    """
    counts: pd.Series = df[column_name].value_counts()
    # Proportions are relative to every row of `df`, not just the counted ones.
    proportions: pd.Series = counts / len(df)

    counts.plot(kind='bar', figsize=figsize)
    plt.show()

    print(f'Value counts:\n{counts}', end='\n\n')
    print(f'Value proportions:\n{proportions}')

In [None]:
# Summarise each categorical column; the wider figures leave room for the
# many distinct values of `code` and `area`.
for column_name, figsize in (
    ('borough_flag', (10, 10)),
    ('code', (30, 10)),
    ('area', (30, 10)),
):
    describe_value_counts(housing, column_name, figsize)

From this we can see that the values of the categorical string features `area` and `code` are (mostly) equally represented within the data.

The categorical numeric variable `borough_flag`, on the other hand, is represented very disproportionately, so it would likely be best to use stratified sampling to avoid the sampling bias that a purely random split could introduce.

In [None]:
# One 50-bin histogram per numeric column.
housing.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 split on `borough_flag`; the fixed seed makes the
# split repeatable across runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# `split()` yields *positional* row numbers, so they must be applied with
# `.iloc`. Using `.loc` only gives the same rows while `housing` still has its
# default 0..n-1 index and would silently pick wrong rows (or fail) otherwise.
train_indices, test_indices = next(split.split(housing, housing['borough_flag']))
strat_train_set: pd.DataFrame = housing.iloc[train_indices]
strat_test_set: pd.DataFrame = housing.iloc[test_indices]

In [None]:
# Continue exploring on a copy of the training set only, so `strat_train_set`
# stays untouched and the test set is not looked at.
housing = strat_train_set.copy()

In [None]:
# Price against sales volume, coloured by the number of crimes.
housing.plot.scatter(
    x='houses_sold',
    y='average_price',
    c='no_of_crimes',
    cmap=plt.get_cmap('cubehelix'),
    colorbar=True,
    alpha=0.6,
    figsize=(15, 10),
)
plt.show()

It seems that the large majority of crime occurs where `average_price` is higher, and that `no_of_crimes` tends to go down as `houses_sold` increases.

In [None]:
# Same breakdown, restricted to rows that have no missing value in any column.
describe_value_counts(housing.dropna(), 'borough_flag', (10, 10))

**EVERY** training sample which has a defined value for `no_of_crimes` has a `borough_flag` value of 1... hmmmm...

This does not *necessarily* mean that areas with the `borough_flag` of 1 are more crime-filled, as a defined value can still be 0 and an undefined value does not guarantee it to be 0

In [None]:
# Same axes as the crime-coloured plot, but without the colour column so rows
# lacking `no_of_crimes` are drawn too; the low alpha shows point density.
housing.plot(kind='scatter', x='houses_sold', y='average_price', alpha=0.2, figsize=(15, 10))
# Render explicitly, like the other plotting cells, instead of echoing the Axes repr.
plt.show()

With the entire other half of the dataset introduced we can start to clearly see that as `houses_sold` increases, `no_of_crimes` quickly decreases
to the point where the entire right side of this graph was not included in the above crime colormap graph

It also didn't show the *very* small values for `houses_sold`, even though `average_price` soars at those low levels

In [None]:
# Correlate the numeric columns only. The frame also holds non-numeric columns
# (e.g. `area`, `code`), and since pandas 2.0 `DataFrame.corr()` no longer drops
# them silently -- it raises instead. Selecting by dtype works on old and new pandas.
housing.select_dtypes(include='number').corr()

In [None]:
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots between the numeric columns.
scatter_matrix(housing, figsize=(20, 12))
plt.show()

From this scatter matrix, we can see that it supports what we've seen from the correlation matrix

In [None]:
# Separate the target from the predictors, starting again from the untouched
# training set. The labels are copied so they are independent of the frame.
housing_labels = strat_train_set['average_price'].copy()
housing = strat_train_set.drop(columns='average_price')

In [None]:
from sklearn.impute import SimpleImputer

# Imputer that replaces missing values with the median of each column.
imputer = SimpleImputer(strategy='median')

In [None]:
# Keep only the columns the median imputer is applied to, then learn their medians.
housing_num = housing.drop(columns=['date', 'area', 'code'])
imputer.fit(housing_num)

In [None]:
# Medians as computed directly by pandas, paired with their column names.
median_by_feature = list(zip(housing_num.columns, housing_num.median().values))

for feature_name, feature_median in median_by_feature:
    print(f'{feature_name}: {feature_median}')

In [None]:
import numpy as np

# Fill the gaps with the fitted imputer; the result is a plain NumPy array.
X: np.ndarray = imputer.transform(housing_num)

In [None]:
# Wrap the array back into a DataFrame, restoring the column names and row index.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Check the non-null counts of the imputed frame.
housing_tr.info()

In [None]:
# one hot encoding may be a bit messy here due to the large number of categorical possibilities... but we'll see how it works out

from sklearn.preprocessing import OneHotEncoder

# Whatever remains after removing these columns is treated as categorical.
housing_cat = housing.drop(columns=['houses_sold', 'no_of_crimes', 'borough_flag', 'date'])

# Learn the categories first, then encode; the displayed result is the
# encoder's one-hot output for the training rows.
cat_encoder = OneHotEncoder()
cat_encoder.fit(housing_cat)
housing_cat_1hot = cat_encoder.transform(housing_cat)
housing_cat_1hot

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: median imputation followed by standardisation.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Fit the pipeline on the numeric training columns and transform them in one go.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
from sklearn.compose import ColumnTransformer

# Iterating a DataFrame yields its column names.
num_attribs = list(housing_num)
cat_attribs = list(housing_cat)

# Numeric columns go through `num_pipeline`; categorical columns are one-hot
# encoded. Columns in neither list are dropped, because ColumnTransformer's
# `remainder` defaults to 'drop'.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

# Fit on the training predictors only; this fitted transformer is reused later.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline model: ordinary least squares on the prepared training matrix.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# NOTE: these predictions are for the very rows the model was fitted on, so
# the RMSE below is a training error, not an estimate of generalisation.
housing_predictions = lin_reg.predict(housing_prepared)

# scikit-learn metrics take (y_true, y_pred). MSE happens to be symmetric, but
# keeping the documented order matches the test-set evaluation further down
# and stays correct if the metric is ever swapped for an asymmetric one.
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

print(f'lin_rmse = {lin_rmse}')

It's off by 140 thousand...

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model. The scorer reports *negative*
# MSE (so that higher is better), hence the sign flip before the square root.
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores: np.ndarray) -> None:
    """Print the given scores followed by their mean and standard deviation.

    Args:
        scores: Array of per-fold scores to summarise.
    """
    summary = (
        ('Scores:', scores),
        ('Mean:', scores.mean()),
        ('Std:', scores.std()),
    )
    for label, value in summary:
        print(label, value)
    
# Per-fold RMSE of the linear model, with mean and spread.
display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: it is randomised (bootstrap samples, feature sub-sampling),
# so without a fixed `random_state` neither the scores nor the model that gets
# saved afterwards can be reproduced on a re-run. 42 matches the split's seed.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

# cross_val_score fits clones of the estimator, so the model fitted above on
# the full training set is left as is. Scores are negative MSE, hence the
# sign flip before the square root.
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

display_scores(forest_rmse_scores)

Now it's off by 90k, which isn't *too* bad considering the range of the target value

In [None]:
# Inspect the `max_features` setting the forest was constructed with.
forest_reg.max_features

In [None]:
import joblib

# Persist the fitted forest to disk so it can be reloaded later.
joblib.dump(forest_reg, 'forest_reg.pkl')

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 4 x 5 = 20 bootstrapped candidates plus 2 x 5 = 10 candidates
# without bootstrapping, i.e. 30 candidates, each evaluated with 5-fold CV.
param_grid = [
    {'n_estimators': [3, 10, 30, 60], 'max_features': [1, 2, 3, 4, 5]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [1, 2, 3, 4, 5]}
]

# Seed the base estimator so that differences between candidates come from the
# hyper-parameters rather than from run-to-run randomness, and so the search
# result is reproducible. Note this rebinds `forest_reg` to a fresh, unfitted forest.
forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [None]:
cvres = grid_search.cv_results_

# Mean test scores are negative MSE; convert them all to RMSE up front, then
# list each candidate's RMSE next to its parameter combination.
candidate_rmse = np.sqrt(-cvres['mean_test_score'])
for rmse, params in zip(candidate_rmse, cvres['params']):
    print(rmse, params)

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

# The forest was trained on the ColumnTransformer output, in which every
# category of every categorical attribute is its own one-hot column. There is
# therefore one importance per *output* column, not per original attribute.
# Zipping the importances with just `num_attribs + cat_attribs` stops at the
# shorter list and labels the first one-hot columns as if they were whole
# categorical attributes, so build one name per output column instead.
fitted_cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = [
    f'{attrib}={category}'
    for attrib, categories in zip(cat_attribs, fitted_cat_encoder.categories_)
    for category in categories
]

# Output order follows the transformer order: numeric columns, then one-hot columns.
attributes = num_attribs + cat_one_hot_attribs

# Fail loudly if the names ever drift out of sync with the prepared matrix.
assert len(attributes) == len(feature_importances), (len(attributes), len(feature_importances))

sorted(zip(feature_importances, attributes), reverse=True)

It seems that the categorical features barely mattered! (at least when one-hot encoded... maybe encoding them another way would be more helpful...)

Though even the non-encoded `borough_flag` barely did anything

In [None]:
# NOTE(review): this reloads the forest that was pickled *before* the grid
# search ran, not `grid_search.best_estimator_` -- confirm that evaluating the
# untuned model is what is intended here.
final_model = joblib.load('forest_reg.pkl')

# Split the held-out set into target and predictors.
y_test = strat_test_set['average_price'].copy()
X_test = strat_test_set.drop(columns='average_price')

# Transform only: the pipeline keeps the statistics learned from the training set.
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
# Predict on the prepared held-out rows.
final_predictions = final_model.predict(X_test_prepared)

# Root-mean-squared error of those predictions against the true prices.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print(f'final_rmse = {final_rmse}')

Looks like the model is off by roughly 90k (RMSE) on the test set

In [None]:
# Eyeball the first five predictions against the first five true prices.
print(final_predictions[:5], end='\n\n')
print(y_test[:5])

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(y_test, final_predictions)

Good enough ¯\\\_(ツ)\_/¯