In [None]:
import numpy as np
import pandas as pd 

from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input/'):
    for fname in files:
        print(os.path.join(root, fname))

# Global plot styling and NumPy seed used by the rest of the notebook.
plt.style.use('fivethirtyeight')
np.random.seed(3325)
tick_style = {'axes.edgecolor':'0.2', 'axes.spines.right': False, 'axes.spines.top': False}
sns.set_style('ticks', tick_style)
context_rc = {'axes.linewidth': 1.4, "lines.linewidth": 2.5}
sns.set_context('notebook', rc=context_rc)

In [None]:
from matplotlib.ticker import FuncFormatter
def kformat(x, p):
    """Format a tick value compactly: 950 -> '950', 12_000 -> '12k', 3_000_000 -> '3M'.

    Parameters
    ----------
    x : float
        Tick value to format.
    p : int
        Tick position. Unused; present because the function is wrapped in a
        ``FuncFormatter``, which calls it with two arguments.

    Returns
    -------
    str
        The value rounded to a whole number of units, thousands ('k') or
        millions ('M').
    """
    # Pick the unit from the magnitude so negative values get a suffix too.
    magnitude = abs(x)
    # Thresholds sit half a unit below the boundary: anything that would be
    # printed as '1000' / '1000k' after rounding moves up to the next unit.
    if magnitude < 999.5:
        return f'{x:.0f}'
    if magnitude < 999_500:
        return f'{x / 1e3:.0f}k'
    return f'{x / 1e6:.0f}M'
# Wrap kformat in a matplotlib FuncFormatter so it can be installed on an axis
# via set_major_formatter.
kticker = FuncFormatter(kformat)

# Load Data

In [None]:
# Read the house sales CSV and parse the 'YYYYMMDDT000000' date strings into datetimes.
data = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%dT000000')
)
data.head()

In [None]:
# Histogram of every column except the sale date.
non_date_columns = data.drop(columns='date')
non_date_columns.hist(bins=20, figsize=(21, 16));

# Feature Engineering

In [None]:
# Columns that get a log1p-transformed companion named '<column>_log'.
cols = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
log_cols = [c + '_log' for c in cols]
data = data.assign(**{log_name: np.log1p(data[raw_name]) for raw_name, log_name in zip(cols, log_cols)})

# Show the distributions of the transformed columns.
data[log_cols].hist(figsize=(21, 10), bins=20);

In [None]:
# Where yr_renovated is 0, substitute the year the house was built.
never_renovated = data['yr_renovated'] == 0
data.loc[never_renovated, 'yr_renovated'] = data.loc[never_renovated, 'yr_built']

In [None]:
# Flag rows whose log basement area is positive, then replace the zero entries
# with the mean of the positive ones.
basement_log = data['sqft_basement_log']
data['has_basement'] = basement_log > 0

positive_mean = basement_log[basement_log > 0].mean()
data.loc[basement_log == 0, 'sqft_basement_log'] = positive_mean

In [None]:
# Calendar month and year of the sale date as separate columns.
sale_date = data['date'].dt
data = data.assign(month=sale_date.month, year=sale_date.year)

In [None]:
# X holds the candidate feature names: every column except the raw (un-logged)
# columns in `cols`, the identifiers and the target itself. y is the target column.
excluded = set(cols) | {'price', 'date', 'price_log', 'id'}
X = [name for name in data.columns if name not in excluded]
y = 'price_log'
print(X)

## Visualize Relationships with Price

In [None]:
# Scatter each candidate feature against log price on a 5x5 grid of panels.
fig, axes = plt.subplots(5, 5, figsize=(21, 16))
flat_axes = np.ravel(axes)

for idx, (feature, ax) in enumerate(zip(X, flat_axes)):
    sns.scatterplot(x=feature, y='price_log', data=data, ax=ax)
    ax.yaxis.grid(True, alpha=0.3)
    ax.yaxis.set_major_formatter(kticker)
    # Keep the y label only on the left-most panel of each row.
    if idx % 5:
        ax.set_ylabel('')

# Remove the panels that no feature was drawn on.
for unused_ax in flat_axes[len(X):]:
    unused_ax.remove()
fig.tight_layout()

In [None]:
import folium
from folium.plugins import HeatMap

# Map centred on the mean coordinates of all sales.
map_center = [data.lat.mean(), data.long.mean()]
m = folium.Map(location=map_center, zoom_start=10)

# One heat-map point per distinct (lat, long), weighted by the mean price there.
heatmap_data = data.groupby(['lat', 'long']).price.mean().reset_index().values
heat_layer = folium.FeatureGroup(name='Heat Map').add_to(m)
HeatMap(heatmap_data, name='Price', radius=25, max_zoom=18, min_opacity=0.3).add_to(heat_layer)

m

In [None]:
import re

def haversine_distance(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points given in degrees.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float, numpy.ndarray or pandas.Series
        Latitudes and longitudes in degrees. The inputs are combined
        element-wise, so arrays/Series and scalars can be mixed.

    Returns
    -------
    Same kind as the combined inputs
        Distance in the unit of ``R`` (kilometres), rounded to 4 decimals.
    """
    R = 6371.0088  # Mean radius of the earth, in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    # Floating-point error can push `a` marginally outside [0, 1], which would
    # turn the square roots below into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    # np.round works for scalars, arrays and Series alike; the built-in
    # round() is not supported by plain NumPy arrays.
    return np.round(R * c, 4)

# Load the landmark table, then derive numeric coordinates and a feature-safe
# name for every landmark.
landmarks = pd.read_csv('/kaggle/input/seattle-landmarks-map/Seattle_Landmarks_Map.csv')
#Convert "(47.60870201200004, -122.30853251899998)" into lat and long features
# Raw string: '\(' in a plain literal is an invalid escape sequence.
landmarks = landmarks.join(landmarks.Shape.str.extract(r'\((?P<lat>.+), (?P<long>.+)\)').astype(float))
#Convert "101 Pike Place Street" to "landmark_101_Pike_Place_Street"
# regex=True is explicit because the default of str.replace differs between
# pandas versions; without it the pattern may be matched as literal text.
landmarks['FNAME'] = 'landmark_' + landmarks.NAME.str.replace('[^0-9a-zA-Z]+', '_', regex=True)

#For each landmark, add distance between house and landmark
# All columns are computed first and the frame is built in one go instead of
# inserting one column at a time into an empty DataFrame.
landmark_coords = landmarks.groupby('FNAME')[['lat', 'long']].mean()
landmark_df = pd.DataFrame(
    {
        fname: haversine_distance(data.lat, data.long, coords.lat, coords.long)
        for fname, coords in landmark_coords.iterrows()
    },
    index=data.index,
)

In [None]:
from sklearn.decomposition import PCA

#400 landmarks is a little too much, so let's convert them into components
pca_dim = 3

landmark_cols = ['Landmark_' + str(i + 1) for i in range(pca_dim)]
# Carry landmark_df's index over explicitly so the join below aligns on it
# rather than on an implicit 0..n-1 index.
landmark_components = pd.DataFrame(
    PCA(pca_dim).fit_transform(landmark_df.values),
    index=landmark_df.index,
    columns=landmark_cols,
)
# Make the cell safe to re-run: drop components left by a previous run (join
# would otherwise fail on overlapping column names) and register each feature
# name in X only once.
data = data.drop(columns=landmark_cols, errors='ignore').join(landmark_components)
X.extend(c for c in landmark_cols if c not in X)

In [None]:
# Map centred on the mean coordinates of all sales.
m = folium.Map(location=[data.lat.mean(), data.long.mean()], zoom_start=10)

# One marker per landmark name, placed at the mean of its coordinates.
for name, coords in landmarks.groupby('NAME')[['lat', 'long']].mean().iterrows():
    # The popup shows the landmark name. The row's own `.index` is just the
    # ['lat', 'long'] labels, so it must not be used as the label.
    folium.Marker([coords.lat, coords.long], popup=str(name)).add_to(m)

m

## Feature Engineering #2

In [None]:
# #Leaving zipcode as an integer will make the model think that zipcode order matters
# data['zipcode'] = data.zipcode.astype(str)
# data = pd.get_dummies(data)
# X.extend([c for c in data.columns if c.startswith('zipcode_')])
# X.remove('zipcode')

# Feature Selection & Linear Modeling

In [None]:
# Time-based split: sales dated before train_bound are used for training,
# sales on or after it for testing.
train_bound = '2015-03-01'
train = data.loc[data['date'] < train_bound]
test = data.loc[data['date'] >= train_bound]

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Distances to the nearest training sales, added as the features knn_1 .. knn_10.
n_neighbors = 10
neighbor_cols = ['knn_' + str(c + 1) for c in range(n_neighbors)]
# Never feed knn_* columns from an earlier run of this cell back into the search.
knn_features = [c for c in X if c not in neighbor_cols]

knn = KNeighborsRegressor(n_neighbors=n_neighbors, n_jobs=-1)
knn.fit(train[knn_features].values, train[y].values)

# kneighbors() without an argument queries the fitted training points and does
# not count a point as its own neighbour. Passing the training rows explicitly
# would return each row itself at distance 0 as knn_1 -- a pattern that test
# rows can never show.
train_dist = knn.kneighbors()[0]
test_dist = knn.kneighbors(test[knn_features].values)[0]
neighbor_df = pd.concat([
    pd.DataFrame(train_dist, index=train.index, columns=neighbor_cols),
    pd.DataFrame(test_dist, index=test.index, columns=neighbor_cols),
])

# Drop stale knn_* columns first so the cell can be re-run, and register each
# feature name only once.
data = data.drop(columns=neighbor_cols, errors='ignore').join(neighbor_df)
X.extend(c for c in neighbor_cols if c not in X)
# Re-split so train/test carry the new columns.
train, test = data[data.date < train_bound], data[data.date >= train_bound]

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

# Shallow random forest used as the importance estimator inside Boruta.
# `criterion` is left at its default (mean squared error): the 'mse' spelling is
# rejected by newer scikit-learn releases, and the default is the same loss.
rf_model = RandomForestRegressor(n_estimators=300, max_depth=5, n_jobs=-1)

# Boruta feature selection on the training rows only; results are read later
# from selection.ranking_.
selection = BorutaPy(rf_model, n_estimators='auto', verbose=2, random_state=1, max_iter=len(X))
selection.fit(train[X].values, train[y].values);

In [None]:
# Horizontal bar per feature showing its Boruta rank, over coloured rank bands.
fig, ax = plt.subplots(figsize=(18, 12))
ax.barh(X, selection.ranking_)
ax.set_title("Feature Ranking (lower is better)")
ax.set_xlabel("Rank")

# Bands: red above 2.5, yellow for 1.5-2.5, green below 1.5.
worst_rank = max(selection.ranking_)
rank_bands = [(2.5, worst_rank, 'red'), (1.5, 2.5, 'yellow'), (0, 1.5, 'green')]
for band_start, band_end, band_color in rank_bands:
    ax.axvspan(band_start, band_end, alpha=0.3, color=band_color)
fig.tight_layout();

In [None]:
# Keep only the features whose selection rank is 1 or 2.
# From here on X is a NumPy array of column names rather than a list.
keep_mask = selection.ranking_ <= 2
X = np.array(X)[keep_mask]

In [None]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Models to compare, keyed by display name.
# NOTE(review): `normalize=` (RidgeCV / LassoCV) and `silent=` (LGBMRegressor) are
# not accepted by recent scikit-learn / LightGBM releases -- confirm against the
# installed versions before upgrading. They are kept here because removing
# `normalize=` changes the effective regularisation strength.
models = {
    # Built fresh instead of reusing `rf_model`: that instance was handed to
    # BorutaPy, which may have changed its parameters (n_estimators='auto',
    # random_state) while running the selection.
    'RandomForest': RandomForestRegressor(n_estimators=300, max_depth=5, n_jobs=-1),
    'Ridge': RidgeCV(alphas=(0.01, 0.05, 0.1, 0.3, 1.0, 5.0, 10.0), normalize=True, cv=5),
    'Lasso': LassoCV(eps=0.001, n_alphas=1000, tol=1e-2, alphas=None, normalize=True, cv=5, n_jobs=-1, positive=True),
    # `criterion` left at its default (mean squared error); the 'mse' spelling is
    # rejected by newer scikit-learn releases.
    'ExtraTrees': ExtraTreesRegressor(n_estimators=600, max_depth=10, n_jobs=-1),
    'LightGBM': lgb.LGBMRegressor(n_estimators=1000, num_leaves=31, silent=False, max_depth=-1, learning_rate=0.1, n_jobs=-1, importance_type='gain'),
    'XGBoost': xgb.XGBRegressor(n_estimators=2000, verbosity=1, eta=0.1, max_depth=8, n_jobs=-1)
}

# Test-set predictions (on the log-price scale) per model name.
preds = {}
for name, model in models.items():
    #Include polynomial and interaction terms for the linear models only
    if name in ['Ridge', 'Lasso']:
        pf = PolynomialFeatures()
        X_train = pf.fit_transform(train[X].values)
        X_test = pf.transform(test[X].values)
    else:
        X_train = train[X].values
        X_test = test[X].values

    #Scale features (applied to every model); the scaler is fitted on train only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    #Fit and store predictions
    model.fit(X_train, train[y].values)
    preds[name] = model.predict(X_test)

In [None]:
from sklearn import metrics

def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (0.1 means 10%).

    Each absolute error is divided by the magnitude of its own true value and
    the ratios are then averaged. Dividing by the mean of ``y_true`` instead
    gives a normalised MAE, not a percentage error.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same shape

    Returns
    -------
    float
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Floor the denominator so true values of 0 do not divide by zero.
    denom = np.maximum(1e-6, np.abs(y_true))
    return np.mean(np.abs(y_true - y_pred) / denom)

# One row of test-set metrics per model. Predictions are on the log1p scale, so
# they are mapped back with expm1 for the price-scale metrics.
# Rows are collected in a list and the frame is built once: DataFrame.append is
# not available in current pandas, and growing a frame row by row is quadratic.
score_rows = []
for name, pred in preds.items():
    price_pred = np.expm1(pred)
    score_rows.append({
        'MSE': metrics.mean_squared_error(test.price.values, price_pred),
        'MAE': metrics.mean_absolute_error(test.price.values, price_pred),
        'MSE (log)': metrics.mean_squared_error(test.price_log.values, pred),
        'MAPE': mape(test.price.values, price_pred),
        'R^2': metrics.r2_score(test.price.values, price_pred),
    })
scores_df = pd.DataFrame(score_rows, index=list(preds))
scores_df

In [None]:
# Bar chart of the fitted LightGBM model's feature importances, largest first.
fig, ax = plt.subplots(figsize=(16, 6))
importances = models['LightGBM'].feature_importances_
order = np.argsort(importances)
descending = order[::-1]
sns.barplot(x=importances[descending], y=X[descending])
ax.set_title('Feature Importance')
fig.tight_layout()