In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly
import os

In [None]:
# Load the house sales CSV (the path is relative to the notebook's working directory).
df = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')

# Data Visualization and analysis
First, let's see what information I can extract directly from the data.

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Allow up to 130 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 130)
# Convert 'date' to a datetime dtype (the `.dt` accessor is used on it further down).
df['date'] = pd.to_datetime(df['date'])

df.head()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Correlation heatmap of the numeric columns.
# 'date' was converted to datetime above, and recent pandas versions no longer
# drop non-numeric columns from DataFrame.corr() silently, so the numeric
# (and boolean) columns are selected explicitly before correlating.
fig, ax = plt.subplots(figsize=(10,10))
corr_array = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_array, ax=ax)
plt.show()

Most of the columns can be used as they are, yet some need more work before they can serve as a solid source of information.

First, let's visualize the data geographically.

In [None]:
# Rough "map": one dot per house at its coordinates, coloured by sale price.
fig, ax = plt.subplots(figsize=(15,15))
ax.scatter(x=df['long'], y=df['lat'], c=df['price'])
plt.show()

We are also creating a subtask, which will be a prediction whether the price is going to exceed 1 million

In [None]:
# Binary sub-target: 1 when the sale price is above one million, 0 otherwise.
df['price_bin'] = (df['price'] > 1e6).astype(int)

# put the sub target next to the original target
# The order is rebuilt from the *other* columns, so re-running this cell does
# not list 'price_bin' twice (which used to duplicate the column).
cols = [col for col in df.columns.tolist() if col != 'price_bin']
cols.insert(cols.index('price') + 1, 'price_bin')

#set the columns in the right order
df = df[cols]


In [None]:
# Class balance of the binary sub-target.
df['price_bin'].value_counts()

In [None]:


# Same coordinate scatter as before, coloured by the binary sub-target.
fig, ax = plt.subplots(figsize=(15,15))
ax.scatter(x=df['long'], y=df['lat'], c=df['price_bin'])
plt.show()

Unfortunately, due to hardware issues I couldn't draw it here on a geographical map, but I did that outside of this notebook and the houses all seem to be located in one area - the city of Seattle.

Since their location is consistent across all the data, I can treat them as near neighbours and thus use **interpolation** techniques to get information about each house's neighbourhood.

The location definitely seems to be an important factor in the pricing, since houses with similar prices seem to cluster in close proximity (that is also what I expect from real-world housing data, since the price depends on the neighbourhood and the distance to city centers/beaches/malls etc.).

## WARNING Target Leak!
Whenever I use the target column to create features, it should raise a big red flag for a Target Leak. Yet if I remain careful and make sure that I only include information from outside the data point's own target value, I will be fine.

For a more careful evaluation, I will separate out a validation set with a small portion of the data (10%) that will not be used for encoding the variables (so I will basically treat those houses as non-existent). The test set can now be interpreted as a set that the model was not trained on, but when making a prediction for a house from the test set, the other houses from the test set are treated as an existing source of data. From now on I will be creating aggregation-based features only on df.iloc[test_indexes | train_indexes], as seen further in the notebook. The difference between the scores on the test and validation sets will show whether I have successfully avoided the target leak!


In [None]:
# Randomly label every row as train (0), test (1) or validation (-1) and derive
# one boolean mask per split.
# The masks must be exactly as long as the frame, so the row count is taken with
# len(df); `df.count()[0]` was the number of non-null values of the first column
# and relied on positional lookup in a label-indexed Series.
N = len(df)
val_split_ratio = 0.1
test_train_split_ratio = 0.2
val_size = int(N*val_split_ratio)
test_size = int((N-val_size)*test_train_split_ratio)
train_size = N - val_size - test_size
train_test_val_indexes = np.concatenate([
    np.ones(test_size, dtype=int),
    np.zeros(train_size, dtype=int),
    -np.ones(val_size, dtype=int),
])
# A seeded generator makes the split (and every score below) reproducible.
np.random.default_rng(42).shuffle(train_test_val_indexes)

test_indexes = train_test_val_indexes == 1
train_indexes = train_test_val_indexes == 0
val_indexes = train_test_val_indexes == -1

# Feature engineering

### Simple column manipulation
Let's first bring the date columns into a more digestible form and add some useful derived columns as well.



In [None]:
# Sale price divided by the living area and by the lot area.
df['price_per_sqrft_living'] = df['price']/df['sqft_living']
df['price_per_sqrft_lot'] = df['price']/df['sqft_lot']
# Both columns are derived from the target, so they must only be used when
# inferred from observations other than the data point itself!

In [None]:
# Difference between the year of sale and the year the house was built.
df['age_sold'] = df['date'].dt.year - df['yr_built']

# A 'yr_renovated' of 0 is treated as "not renovated".
df['was_renovated'] = df['yr_renovated'] != 0
# For non-renovated houses use the year of construction instead of 0.
df['yr_renovated'] = df['yr_renovated'].where(df['was_renovated'], df['yr_built'])

### Target encoder
I can encode the zipcode with the average price in that zipcode to make it yet another useful feature. I will use the LeaveOneOut option so that a data point's own target value is not considered, which prevents a Target Leak.

In [None]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [None]:
# Leave-one-out target encoding of the zipcode, once per price-density target.
#
# The encoder is fitted on the train+test rows only. Those rows are transformed
# WITH their own target (leave-one-out statistics), while the validation rows
# are transformed WITHOUT a target so they receive the plain per-zipcode mean:
# supplying their target would subtract it from sums it never contributed to
# and leak it into the feature.
encoding_mask = test_indexes | train_indexes
zipcodes = df['zipcode'].astype(str)

# target column -> name of the encoded feature. Each target gets its own
# column; previously the lot-based encoding overwrote the living-based one.
zipcode_feature_names = {
    'price_per_sqrft_living': 'zipcode_encoded_prices_per_sqrft_living',
    'price_per_sqrft_lot': 'zipcode_encoded_prices_per_sqrft_lot',
}

for price_col, feature_name in zipcode_feature_names.items():
    zipcode_encoder = LeaveOneOutEncoder()
    zipcode_encoder.fit(
                    X=zipcodes.iloc[encoding_mask],
                    y=df.iloc[encoding_mask][price_col]
                )

    encoded = np.full(len(df), np.nan, dtype=float)
    encoded[encoding_mask] = np.ravel(zipcode_encoder.transform(
                                        X=zipcodes.iloc[encoding_mask],
                                        y=df.iloc[encoding_mask][price_col]
                                    ))
    if (~encoding_mask).any():
        encoded[~encoding_mask] = np.ravel(zipcode_encoder.transform(
                                            X=zipcodes.iloc[~encoding_mask]
                                        ))
    df[feature_name] = encoded


### Date-sold trend normalization

Due to factors like inflation and the general trend in housing prices, prices may differ depending on when the houses were sold.

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Mean price per square foot of lot for every sale date, in chronological order.
daily_lot_price = df.groupby('date')['price_per_sqrft_lot'].mean().sort_index()
daily_lot_price.plot()
plt.show()

In [None]:
# Mean price per square foot of living area for every sale date, in chronological order.
daily_living_price = df.groupby('date')['price_per_sqrft_living'].mean().sort_index()
daily_living_price.plot()
plt.show()

Interestingly, it does not seem to have any visible trend.

In [None]:
# Augmented Dickey-Fuller test on the daily mean price per square foot of lot.
# Only the second element of the returned tuple (the p-value) is used.
# The ADF null hypothesis is that the series has a unit root, i.e. that it is
# NOT stationary - a small p-value is therefore evidence FOR stationarity.
_, p_val, _, _, _,_ = adfuller(df.groupby('date').mean().sort_values('date')['price_per_sqrft_lot'])
print(f'p-value: {p_val} for H0=Data is non-stationary (has a unit root)')

In [None]:
# Augmented Dickey-Fuller test on the daily mean price per square foot of living area.
# Only the second element of the returned tuple (the p-value) is used.
# The ADF null hypothesis is that the series has a unit root, i.e. that it is
# NOT stationary - a small p-value is therefore evidence FOR stationarity.
_, p_val, _, _, _,_ = adfuller(df.groupby('date').mean().sort_values('date')['price_per_sqrft_living'])
print(f'p-value: {p_val} for H0=Data is non-stationary (has a unit root)')

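The null hypothesis of the Augmented Dickey-Fuller test is that the series has a unit root (i.e. that it is non-stationary). A p-value close to zero rejects that hypothesis, which means the daily averages are stationary - in that case I do not have to worry about the dates and I can just skip that column.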

### Nearest neighbours (literally)
Next, I **could** (and I probably would if I had more time :) ) use some outside sources to locate the malls, beaches, parking lots, subway stations, restaurants, schools, kindergartens etc., but for the sake of this quick work I will rely on the fact that those factors will, or at least should, be **correlated** with the **house prices of the N nearest neighbours** of each point within our data range. This simplification will not explain all the variance that I could capture by collecting all of those sources, but it should explain a significant part of it, as you will see in a second. Based on that neighbourhood I will create some features to use alongside the ones in the dataset and, with all that, hopefully make some solid predictions.


In [None]:
# Rows minus distinct "long/lat" strings = rows that repeat an already seen coordinate pair.
coordinate_keys = df['long'].astype(str) + '/' + df['lat'].astype(str)
n_repeated_coordinates = df.iloc[:,0].count() - coordinate_keys.nunique()
print(f"repeated coordinates: {n_repeated_coordinates}")

First, it seems like we have some repetition in the coordinates. This is going to be a problem for the KNN algorithm, since I want to exclude only the point from which I am measuring the neighbours.

I can fix that by adding some small noise to the coordinates. Since the data has the precision to the third place, I can just append the noise after that.

In [None]:
# Number of rows whose "long/lat" pair repeats an earlier one (before adding noise).
coordinate_keys = df['long'].astype(str) + '/' + df['lat'].astype(str)
n_non_unique = df.iloc[:,0].count() - coordinate_keys.nunique()
print("Non unique coordinate pairs: {}".format(n_non_unique))

In [None]:
# Break ties between houses that share coordinates by adding a small jitter.
# The accompanying text states the coordinates are precise to the third decimal
# place and that the noise should go after it, so the jitter is drawn from
# [0, 0.001). Dividing by 100 (as before) shifted houses by up to 0.01 degrees
# and altered the recorded digits themselves.
# A seeded generator keeps the jitter reproducible.
# NOTE: the columns are modified in place - re-running this cell adds jitter again.
coordinate_rng = np.random.default_rng(42)
df['long'] += coordinate_rng.random(size=len(df))/1000
df['lat'] += coordinate_rng.random(size=len(df))/1000

In [None]:
# Same duplicate count as before, now computed on the jittered coordinates.
coordinate_keys = df['long'].astype(str) + '/' + df['lat'].astype(str)
n_non_unique = df.iloc[:,0].count() - coordinate_keys.nunique()
print("Non unique coordinate pairs after random noise: {}".format(n_non_unique))

In [None]:
from sklearn.neighbors import KNeighborsTransformer

In [None]:
def measure_neighbours(input_df, n_neighbors, target, encoding_indexes, return_distances=False):
    """Summarise `target` over the geographically nearest "encoding" rows of every row.

    For each row of `input_df` the `n_neighbors` closest rows (Euclidean distance
    on the 'lat'/'long' columns) are looked up among the rows selected by
    `encoding_indexes`, and the mean and sample standard deviation (ddof=1) of
    their `target` values are computed. A row is never its own neighbour, so its
    own target value cannot leak into its features. Rows that merely share the
    coordinates of the query row are still valid neighbours.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns 'lat', 'long' and `target`. Any index is
        accepted; rows are addressed by position.
    n_neighbors : int
        Number of neighbours to aggregate (at least 1). When fewer encoding rows
        are available, all available ones are used.
    target : str
        Name of the numeric column to aggregate.
    encoding_indexes : array-like
        Boolean mask over the rows of `input_df` (or integer row positions)
        selecting the rows that may serve as neighbours.
    return_distances : bool, default False
        Also return the mean and (population, ddof=0) standard deviation of the
        distances to the neighbours.

    Returns
    -------
    (means, stds) or (distance_means, distance_stds, means, stds)
        Lists with one float per row of `input_df`, in row order. A standard
        deviation over fewer than two neighbours is NaN.

    Raises
    ------
    ValueError
        If `n_neighbors` is smaller than 1 or `encoding_indexes` selects no row.
    """
    # Local import: only this helper needs the KD-tree.
    from scipy.spatial import cKDTree

    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    encoding_indexes = np.asarray(encoding_indexes)
    if encoding_indexes.dtype == bool:
        src_positions = np.flatnonzero(encoding_indexes)
    else:
        src_positions = encoding_indexes.astype(int)
    if len(src_positions) == 0:
        raise ValueError("encoding_indexes selects no rows")

    coordinates = input_df[['lat', 'long']].to_numpy(dtype=float)
    target_values = input_df[target].to_numpy(dtype=float)
    n_rows = len(input_df)

    # Ask for one candidate more than needed: an encoding row finds itself.
    tree = cKDTree(coordinates[src_positions])
    distances, neighbours = tree.query(coordinates, k=n_neighbors + 1)

    # cKDTree pads with index == number of tree points when it runs out of points.
    missing = neighbours >= len(src_positions)
    neighbour_positions = src_positions[np.where(missing, 0, neighbours)]

    # Exclude the query row by identity (not by distance == 0), so coincident
    # houses stay in and no coordinate noise is required for correctness.
    is_self = neighbour_positions == np.arange(n_rows)[:, None]
    keep = ~missing & ~is_self
    # Rows that did not find themselves (e.g. validation rows) still hold
    # n_neighbors + 1 candidates: drop the farthest so every row uses the same k.
    surplus = keep.sum(axis=1) > n_neighbors
    keep[surplus, -1] = False

    # pandas row-wise reductions skip the NaN placeholders of excluded candidates.
    neighbour_targets = pd.DataFrame(np.where(keep, target_values[neighbour_positions], np.nan))
    means = neighbour_targets.mean(axis=1).tolist()
    stds = neighbour_targets.std(axis=1).tolist()

    if return_distances:
        neighbour_distances = pd.DataFrame(np.where(keep, distances, np.nan))
        distance_means = neighbour_distances.mean(axis=1).tolist()
        distance_stds = neighbour_distances.std(axis=1, ddof=0).tolist()
        return distance_means, distance_stds, means, stds
    return means, stds

In [None]:
# Try the idea on a single setting first: 15 nearest neighbours, target 'price'.
n_neighbors = 15
target = 'price'


# Only train+test rows may act as neighbours; validation rows never contribute.
means, stds = measure_neighbours(df, n_neighbors, target, test_indexes | train_indexes)
df[f'{n_neighbors}_NN_{target}_mean'] = means
df[f'{n_neighbors}_NN_{target}_std'] = stds

In [None]:
# Target against the mean target of its neighbours, plus their Pearson correlation.
neighbour_mean_col = f'{n_neighbors}_NN_{target}_mean'
coef = np.corrcoef(x=df[target], y=df[neighbour_mean_col])[0,1]
sns.scatterplot(x=df[target], y=df[neighbour_mean_col])
print(f'Pearson coefficience: {coef}')
plt.show()

In [None]:
# Target against the spread of the target among its neighbours, plus their Pearson correlation.
neighbour_std_col = f'{n_neighbors}_NN_{target}_std'
coef = np.corrcoef(x=df[target], y=df[neighbour_std_col])[0,1]
sns.scatterplot(x=df[target], y=df[neighbour_std_col])
print(f'Pearson coefficience: {coef}')
plt.show()

As we can see - those are actually some of the **highest** correlation coefficients so far!

We can now automate the process to create many features like that.

In [None]:
# this can take up to a couple of minutes
n_neighbors_list = [7, 15, 27, 39]
targets = ['price', 'price_bin', 'price_per_sqrft_living', 'price_per_sqrft_lot', 'sqft_living', 'sqft_above', 'sqft_lot', 'yr_built', 'condition', 'grade']

# One mean/std feature pair per (neighbourhood size, target) combination.
for n_neighbors in n_neighbors_list:
    for target_position, target in enumerate(targets):
        if target_position == 0:
            # The distance features are stored once per neighbourhood size,
            # together with the first target.
            dist_means, dist_stds, means, stds = measure_neighbours(df, n_neighbors, target, test_indexes | train_indexes, return_distances=True)
            df[f'{n_neighbors}_NN_distance_mean'] = dist_means
            df[f'{n_neighbors}_NN_distance_std'] = dist_stds
        else:
            means, stds = measure_neighbours(df, n_neighbors, target, test_indexes | train_indexes)
        df[f'{n_neighbors}_NN_{target}_mean'] = means
        df[f'{n_neighbors}_NN_{target}_std'] = stds
 

In [None]:
# Display the frame, now including the engineered columns.
df

## Train test split
Now that we have our features, I can select the feature columns and separate the training data from the test data

In [None]:
# Feature columns: everything after the first four columns, minus the
# target-derived price densities and the raw location columns.
excluded_columns = ['price_per_sqrft_living', 'price_per_sqrft_lot', 'zipcode', 'lat', 'long']
X_cols = df.columns[4:]
X_cols = X_cols[~X_cols.isin(excluded_columns)]


In [None]:
# Slice the feature matrix with the boolean split masks.
X_train = df.loc[train_indexes, X_cols]
X_test = df.loc[test_indexes, X_cols]
X_val = df.loc[val_indexes, X_cols]

## Polynomial features

We can also try X with polynomial features, but to keep the amount of features from escaping into tens of thousands, I will only transform some of the features.

Using them I can introduce a non-linearity to our models - whether they are simple linear regression or even tree-based models.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Degree-2 polynomial expansion of the first `n_features_poly` columns; the
# remaining columns are appended unchanged.
# The transformer is fitted on the training split only and then re-used for the
# test and validation splits instead of being re-fitted on each of them.
poly = PolynomialFeatures(2)
n_features_poly = 36
X_train_poly_base = np.hstack([poly.fit_transform(X_train.iloc[:,:n_features_poly]), X_train.iloc[:,n_features_poly:]])
X_test_poly_base = np.hstack([poly.transform(X_test.iloc[:,:n_features_poly]), X_test.iloc[:,n_features_poly:]])
X_val_poly_base = np.hstack([poly.transform(X_val.iloc[:,:n_features_poly]), X_val.iloc[:,n_features_poly:]])

In [None]:
# (rows, columns) of the expanded training matrix.
X_train_poly_base.shape

# Modeling price prediction
## Regression

In [None]:
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Regression target for every split.
y_cols = 'price'
y_test = df.loc[test_indexes, y_cols]
y_train = df.loc[train_indexes, y_cols]
y_val = df.loc[val_indexes, y_cols]

In [None]:
# Candidate regressors, keyed by the short name used in the parameter grid below.
estimators = {
    'ridge' : Ridge(),
    'lasso' : Lasso(),
    'svm' : SVR(),
    'rf' : RandomForestRegressor(),
    'adab' : AdaBoostRegressor(),
    'knn' : KNeighborsRegressor()
}


# Placeholder pipeline: the step names are what the parameter grid refers to;
# the None steps and the final regressor are swapped in by the grid search.
dummy_pipeline = Pipeline([('preproc', None),
                           ('preproc_step2', None),
                           ('dim_redu', None),
                           ('reg', LinearRegression())]) 

So let's try some estimators

In [None]:
# One grid per estimator. Keys name pipeline steps ('preproc', 'preproc_step2',
# 'dim_redu', 'reg'); 'reg__<param>' keys are hyper-parameters of the regressor
# placed in the 'reg' step. None as an option means "skip this step".
est_parameters = [
    {
        'preproc': (PowerTransformer(), None),
        'preproc_step2': (StandardScaler(), MinMaxScaler(), None), 
        'dim_redu': (PCA(), PCA(20), None,),
        'reg' : (estimators['ridge'],),
        'reg__alpha' : 10.0**np.array([5,6,7,8,9])
    },
    {
        'preproc': (PowerTransformer(), None),
        'preproc_step2': (StandardScaler(), MinMaxScaler(), None), 
        'dim_redu': (PCA(), PCA(20), None,),
        'reg' : (estimators['svm'],),
        'reg__degree' : (1, 2, 3,),
        'reg__C' : 10.0**np.array([-1, -2, -3,])
    },
    {
        'preproc': (PowerTransformer(), None),
        'dim_redu': (PCA(), PCA(20), None,),
        'reg' : (estimators['rf'],),
        'reg__n_estimators' : (50, 100, 200),
        'reg__max_depth' : (5, 10, 25),        
    },
    {
        'preproc': (PowerTransformer(), None),
        'dim_redu': (PCA(), PCA(20), None,),
        'reg' : (estimators['adab'],),
        'reg__n_estimators' : (30, 100, 300),
        'reg__learning_rate' : (0.01, 0.1, 1),
    },
    {
        'preproc': (PowerTransformer(), None),
        'preproc_step2': (StandardScaler(), MinMaxScaler(), None), 
        'dim_redu': (PCA(), PCA(20), None,),
        'reg' : (estimators['knn'],),
        'reg__n_neighbors' : (3, 10, 15)        
    },
]

The grid above is really just a sample. I did not run it all at once: the parameters were chosen by looking at which direction I should tweak them in, then I would rerun the grid search. I tried out many things in the process, switched between the default and the poly-transformed features, etc.

P.S. I would normally try to implement a Bayesian search instead of a standard grid search, but I couldn't get any off-the-shelf solution to work the way I wanted, so - because this was basically a weekend project anyway - I had to settle for the workaround described above.

In [None]:
# Do NOT rerun this cell unless you want to spend a couple of hours frying eggs on your processor.
# The grid search is kept commented out for reference only.

# grid = GridSearchCV(estimator=dummy_pipeline, param_grid=est_parameters, scoring='neg_root_mean_squared_error')
# grid.fit(X=X_train_poly_base, y=y_train)
# print(grid.best_params_)
# print("Root Mean Squared Error: {:.3f}".format(-grid.best_score_))

Out of the standard approaches the winner (by quite a large margin) was a regularized linear model with degree-2 polynomial features and alpha = 10e7. Surprisingly though, any preprocessing made it worse (even if only by a very small margin), and dimensionality reduction worsened it significantly.

Before we settle on this, since AdaBoost did only slightly worse, let's try out a more sophisticated state-of-the-art model called XGBoost (although, if the linear model came out on top, that may suggest a generally linear nature of the data, and even an algorithm as robust as XGBoost may struggle to beat the linear model).

In [None]:
import xgboost as xgb

In [None]:
# Alternatives that were tried for the XGBoost input (kept for reference):
# X_transformed = PowerTransformer().fit_transform(X=X_train_poly_base,y=y_train)
# X_transformed = X_train_poly_base

# the non-transformed features proved to be the best here
X_transformed = X_train

# Wrap the training features and labels in XGBoost's DMatrix container for xgb.cv.
data_dmatrix = xgb.DMatrix(data=X_transformed,label=y_train)

In [None]:
# best params after GridSearch 

params = {
          'objective':'reg:squarederror',
          'max_depth': 5, 
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'learning_rate': 0.1,
          'gamma': 10
          }

# 3-fold cross-validation with RMSE as the metric, for at most 450 boosting
# rounds and with early stopping configured at 50 rounds.
cv_results = xgb.cv(dtrain=data_dmatrix,
                    params=params,
                    nfold=3,
                    num_boost_round=450,
                    early_stopping_rounds=50,
                    metrics="rmse",
                    as_pandas=True)

In [None]:
# Cross-validation results returned by xgb.cv (a pandas object, since as_pandas=True).
cv_results

It was close, so now I will take both the winner of the grid search and the best XGBoost model and test them on our test set to get the final winner.

In [None]:
# XGBoost regressor configured with the same hyper-parameter values as the
# `params` dict used for cross-validation.
xgboost_reg = xgb.XGBRegressor(objective ='reg:squarederror', 
                               gamma=10,
                               learning_rate = 0.1,
                               colsample_bytree = 0.7,
                               subsample=0.7,
                               max_depth = 5,
                               )

In [None]:
# Fit on the training split only (non-transformed features).
xgboost_reg.fit(X=X_train, y=y_train)

In [None]:
# Test-set metrics of the XGBoost regressor.
y_pred = xgboost_reg.predict(X_test)
r2 = xgboost_reg.score(X=X_test, y=y_test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"R^2: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

Let's examine feature importances

In [None]:
# Two-column table: feature name and the importance reported by the fitted model.
feature_importances = pd.DataFrame({
    "feature": X_cols,
    "importance": xgboost_reg.feature_importances_,
})

In [None]:
# Horizontal bar chart of the 40 most important features, most important first.
fig, ax = plt.subplots(figsize=(10,20))
sns.barplot(data=feature_importances.sort_values(by='importance', ascending=False).iloc[:40,:], y='feature', x='importance', ax=ax)

It seems like the new features were more than worth it!

In [None]:
# Validation-set metrics of the XGBoost regressor.
y_pred = xgboost_reg.predict(X_val)
r2 = xgboost_reg.score(X=X_val, y=y_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
print(f"R^2: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

In [None]:
# Ridge regression with alpha = 10e7 (i.e. 1e8), one of the values in the grid above.
linear_model = Ridge(alpha=10e7)

In [None]:
# Fit on the polynomial-expanded training features.
linear_model.fit(X=X_train_poly_base, y=y_train)

In [None]:
# Test-set metrics of the Ridge model.
y_pred = linear_model.predict(X_test_poly_base)
r2 = linear_model.score(X=X_test_poly_base, y=y_test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"R^2: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

In [None]:
## task 1 solution: TOP regressor (or a draw with XGBoost, but faster to train and make predictions)

# Validation-set metrics of the Ridge model.
y_pred = linear_model.predict(X_val_poly_base)
r2 = linear_model.score(X=X_val_poly_base, y=y_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
print(f"R^2: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

That is way too close to choose a clear winner - but perhaps after some more fine-tuning and more features XGBoost would take over in a more visible way.

An interesting observation we can make here: the MAE is significantly smaller than the RMSE, which can imply that the predictions were heavily influenced by **outliers**. Perhaps the price of really expensive houses is very hard to predict. If that is the case, though, we can probably expect our classification models to work much better, because they only have to guess that the house is expensive.

Another thing is the score on the validation set. Those houses were non-existent to our model even when creating features, so because its score is very close to the score on the test set, we can safely say that the Target Leak has been avoided successfully!

There still remain many things to try out, like a ton of new features or different transformations, but that will have to be left for real projects, when I will have more time and a team to do so :)

To boost the score itself we could try things like stacking or Deep Learning techniques, but that is not what I wanted to show in this work: for this amount of data and the level of functionality expected in commercial projects, those techniques would not fit very well. They are simply too quirky, unnecessarily difficult to deploy and maintain, and can be too slow, expensive and resource-heavy.

## Benchmarking
Finally, let's check if our features made the positive change

In [None]:
# Number of leading feature columns used for the benchmark model
# (presumably the columns of the raw dataset, without engineered features - confirm).
n_features_poly_noft = 15
X_train.iloc[:,:n_features_poly_noft]

In [None]:
# Benchmark: the same kind of model, trained on the first
# `n_features_poly_noft` feature columns only.
# A dedicated transformer and model are used so that the `poly` and
# `linear_model` objects fitted earlier are not silently replaced (re-running an
# earlier evaluation cell afterwards would otherwise use the wrong model), and
# the transformer is fitted on the training split only.
poly_noft = PolynomialFeatures(2)
X_train_poly_base_noft = poly_noft.fit_transform(X_train.iloc[:,:n_features_poly_noft])
X_test_poly_base_noft = poly_noft.transform(X_test.iloc[:,:n_features_poly_noft])

linear_model_noft = Ridge(alpha=10e2)
linear_model_noft.fit(X=X_train_poly_base_noft, y=y_train)
linear_model_noft.score(X=X_test_poly_base_noft, y=y_test)

Success! The improvement I have made over that R^2 is more than significant!

## Classification

The cool thing about this task and dataset is that the classification task is nothing else but the regression task passed through a 0/1 threshold. For that reason I do not necessarily have to go through the process of finding the right model again - I can just reuse the model that we have and only change its target column and its loss function (e.g. to Sigmoid(output)), and the optimal parameters for regression should be very close to the optimal parameters for classification!

Without going through the hyperparameter tuning again - the results confirmed that the parameters were best when left the same.

In [None]:
# Switch the targets of every split to the binary sub-target
# (this rebinds the y_* names used by the regression cells above).
y_cols = 'price_bin'
y_test = df.loc[test_indexes, y_cols]
y_train = df.loc[train_indexes, y_cols]
y_val = df.loc[val_indexes, y_cols]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, roc_auc_score


In [None]:
#this can take up to a few minutes 
# `max_iter` is an iteration count and must be an integer: the float literal 1e6
# is rejected by the parameter validation of recent scikit-learn releases.
linear_model = LogisticRegression(C=10e-7, max_iter=1_000_000)

linear_model.fit(X=X_train_poly_base, y=y_train)

As expected, optimal parameters seem to be exactly the same

In [None]:
# Per-class precision / recall / F1 of the logistic model on the test split.
y_pred = linear_model.predict(X_test_poly_base)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Same report on the validation split.
y_pred = linear_model.predict(X_val_poly_base)
print(classification_report(y_true=y_val, y_pred=y_pred))

In [None]:
# ROC curve and AUC of the logistic model on the test split, using the
# predicted probability of the positive class (column 1 of predict_proba).
y_scores = linear_model.predict_proba(X_test_poly_base)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_scores[:,1])
auc = roc_auc_score(y_true=y_test, y_score=y_scores[:,1])
# x/y must be passed by keyword (positional vectors are rejected by recent
# seaborn). estimator=None draws every ROC point instead of averaging the
# points that share the same false-positive rate.
sns.lineplot(x=fpr, y=tpr, estimator=None)
plt.title(f"ROC curve, AUC score = {auc}")

In [None]:
# XGBoost model for the binary sub-target.
# NOTE(review): a regressor class is used with the 'binary:logistic' objective;
# the following cells threshold its `predict` output at 0.5, so the predictions
# are presumably probabilities - confirm. XGBClassifier would be the
# conventional choice for this task.
xgboost_bin = xgb.XGBRegressor(objective ='binary:logistic', 
                               gamma=4,
                               learning_rate = 0.1,
                               colsample_bytree = 0.7,
                               subsample=0.7,
                               max_depth = 5,
                               )
xgboost_bin.fit(X=X_train, y=y_train)

In [None]:
# Test split: threshold the model output at 0.5 to obtain class predictions.
# `y_scores` is reused by the ROC cell further down.
y_scores = xgboost_bin.predict(X_test)
y_pred = y_scores > 0.5
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Validation split: same 0.5 threshold on the model output.
y_scores_val = xgboost_bin.predict(X_val)
y_pred = y_scores_val > 0.5
print(classification_report(y_true=y_val, y_pred=y_pred))

In [None]:
## task 2 solution: TOP binary classificator

# ROC curve and AUC on the test split, from the `y_scores` computed for it above.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_scores)
auc = roc_auc_score(y_true=y_test, y_score=y_scores)
# x/y must be passed by keyword (positional vectors are rejected by recent
# seaborn). estimator=None draws every ROC point instead of averaging the
# points that share the same false-positive rate.
sns.lineplot(x=fpr, y=tpr, estimator=None)
plt.title(f"ROC curve, AUC score = {auc}")

In summary: the best model for the regression turned out to be a regularized linear regression (Ridge) with polynomial features, with XGBoost very close to it, and for the binary classification XGBoost took over significantly with an amazing score.

What was left to do: more features, more features and once again more features, then maybe some more tuning of the models. I could also try some feature selection based on the feature importances and perhaps choose the best model based on the AIC or BIC criterion.