In [1]:
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/20192.ai4eng/master/init.py
import init; init.init(force_download=False); init.get_weblink()

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import local.lib.timeseries as ts
from local.lib import calhousing as ch
%matplotlib inline


## The publicly available [`cal_housing`](https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html) repository

In [2]:
!head local/data/cal_housing_small.data
!wc local/data/cal_housing_small.data

In [3]:
d = pd.read_csv("local/data/cal_housing_small.data")
print (d.shape)
d.head()

## Understand data

In [4]:
import seaborn as sns
g = sns.pairplot(d)


## Show house locations on map

Observe how house values are higher in urban areas.

In [5]:
from bokeh.plotting import *
#from bokeh.charts import *
from bokeh.models import *
import bokeh
from matplotlib import cm
from sklearn.preprocessing import MinMaxScaler

def latlng_to_meters(lat, lng):
    """Convert latitude/longitude in degrees to spherical Mercator metres.

    Uses a sphere of radius 6378137 m. Scalars and numpy arrays are both
    accepted, since only numpy ufuncs and arithmetic are applied.

    Parameters
    ----------
    lat : latitude(s) in degrees.
    lng : longitude(s) in degrees.

    Returns
    -------
    (mx, my) : x coordinate (derived from ``lng``) and y coordinate
        (derived from ``lat``), in that order.
    """
    # Half of the sphere's circumference: the x extent reached at 180 degrees.
    half_equator_m = 2 * np.pi * 6378137 / 2.0

    def _degrees_to_metres(degrees):
        return degrees * half_equator_m / 180.0

    # Mercator stretches latitude; the result is still expressed in degrees here.
    stretched_lat = np.log(np.tan((90 + lat) * np.pi / 360.0)) / (np.pi / 180.0)

    return _degrees_to_metres(lng), _degrees_to_metres(stretched_lat)


def xplot_map(lat, lon, color=None, size=10):
    """Draw points on an OpenStreetMap tile layer, optionally coloured by value.

    Parameters
    ----------
    lat, lon : array-like
        Latitudes and longitudes in degrees, one entry per point.
    color : array-like or None
        One numeric value per point (1-D or a single column). Values are
        min-max scaled to 0..255 and looked up in matplotlib's ``rainbow``
        colormap; a colour bar figure is shown next to the map. When None,
        points are drawn with bokeh's default colour and no colour bar is built.
    size : circle size forwarded to bokeh.

    Returns
    -------
    None. The plot is displayed through bokeh's ``show``.
    """
    cmap = cm.rainbow

    def _hex_colour(level):
        # Calling the colormap with an int indexes its lookup table and yields
        # RGBA floats in [0, 1]; alpha is dropped and RGB is formatted as hex.
        return "#%02x%02x%02x" % tuple(int(channel * 255) for channel in cmap(int(level))[:3])

    # latlng_to_meters returns (x, y): x comes from longitude, y from latitude.
    x_m, y_m = latlng_to_meters(lat, lon)

    openmap_url = 'http://c.tile.openstreetmap.org/{Z}/{X}/{Y}.png'
    tools = "pan,wheel_zoom,reset"

    # NOTE(review): newer bokeh releases use width/height instead of
    # plot_width/plot_height — confirm against the installed version.
    p = figure(tools=tools, plot_width=700, plot_height=600)
    p.add_tile(WMTSTileSource(url=openmap_url))
    p.axis.visible = False

    if color is None:
        # Nothing to map colours from: plot the points alone.
        p.circle(np.array(x_m), np.array(y_m), size=size)
        show(p)
        return

    # MinMaxScaler needs a 2-D column; flatten afterwards so each level is a scalar.
    values = np.asarray(color, dtype=float).reshape(-1, 1)
    levels = MinMaxScaler(feature_range=(0, 255)).fit_transform(values).ravel()
    colors = [_hex_colour(level) for level in levels]

    # Colour bar: 20 stacked rectangles spanning the value range.
    cb = figure(plot_width=40, plot_height=600, tools=tools)
    yc = np.linspace(values.min(), values.max(), 20)
    bar_levels = np.linspace(0, 255, 20).astype(int)
    dy = yc[1] - yc[0]
    cb.rect(x=0.5, y=yc, color=[_hex_colour(level) for level in bar_levels], width=1, height=dy)
    cb.xaxis.visible = False

    p.circle(np.array(x_m), np.array(y_m), color=colors, size=size)
    show(gridplot([[p, cb]]))

In [6]:
ds = d.sample(500)
xplot_map(ds["latitude"].values, 
         ds["longitude"].values, ds["medianHouseValue"].values.reshape(-1,1)/1e5) 

## Separate variable to predict

In [7]:
# DataFrame.as_matrix() / Series.as_matrix() no longer exist in current pandas;
# .values returns the same underlying ndarray.
# Features are every column except the last one.
# NOTE(review): assumes medianHouseValue is the last column of d — verify against the CSV header.
X = d.values[:,:-1]
y = d["medianHouseValue"].values
print (X.shape, y.shape)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import median_absolute_error, r2_score, mean_squared_error

In [9]:
Xtr, Xts, ytr, yts = train_test_split(X,y, test_size=0.3)
print (Xtr.shape, ytr.shape, Xts.shape, yts.shape)

## A linear regression

- check [sklearn LinearRegression doc](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) to understand the `score` function.

In [10]:
lr = LinearRegression()
lr.fit(Xtr, ytr)
lr.score(Xtr, ytr), lr.score(Xts, yts)

In [11]:
r2_score(yts, lr.predict(Xts))

In [12]:
median_absolute_error(yts, lr.predict(Xts))

In [13]:
mean_squared_error(yts, lr.predict(Xts))

**however we will create our score**

mean relative absolute error

In [14]:
def rel_mrae(estimator, X, y):
    """Mean relative absolute error of ``estimator`` on ``(X, y)``.

    Computes ``mean(|prediction - y| / y)``. The ``(estimator, X, y)``
    signature lets it be passed as a ``scoring`` callable, as done elsewhere
    in this notebook.

    Parameters
    ----------
    estimator : fitted object exposing ``predict(X)``.
    X : inputs forwarded to ``estimator.predict``.
    y : true target values; used as the denominator, so zeros in ``y``
        produce inf/nan entries.

    Returns
    -------
    The mean of the element-wise relative absolute errors.
    """
    predicted = estimator.predict(X)
    relative_errors = np.abs(predicted - y) / y
    return np.mean(relative_errors)

In [15]:
rel_mrae(lr, Xtr, ytr), rel_mrae(lr, Xts, yts)

let's understand prediction errors

In [16]:
preds = lr.predict(Xts)
errors = np.abs(preds-yts)/yts

In [17]:
# One scatter panel per listed feature: relative test error on the x axis,
# the feature value on the y axis.
cols = ["longitude","latitude","housingMedianAge", "totalRooms","totalBedrooms"]
n_panels = len(cols)
plt.figure(figsize=(20,3))
for panel, feature_name in enumerate(cols, start=1):
    plt.subplot(1, n_panels, panel)
    # NOTE(review): assumes column panel-1 of Xts holds feature_name,
    # i.e. cols follows the CSV column order — verify.
    plt.scatter(errors, Xts[:, panel - 1])
    plt.xlabel("relative error")
    plt.ylabel(feature_name)
    plt.grid()

We observe there is no significant correlation between the error and any of the variables. It does seem that when `housingMedianAge` is smaller, the error is also smaller, and when `totalBedrooms` is higher the error is also smaller. However, this seems to involve only a fraction of the houses. The correlation coefficients seem to capture this.



In [18]:
# Correlation coefficient between each inspected feature column and the
# relative error, collected into a one-column table indexed by feature name.
feature_error_corr = [
    np.corrcoef(Xts[:, position], errors)[0, 1]
    for position in range(len(cols))
]
corrcoefs = pd.DataFrame({"corrcoef": feature_error_corr}, index=cols)
corrcoefs


## How sure can we be of our model performance

resample, train and measure

- **bootstrap**: resample and put back
- **cross validation**: resample and partition

In [56]:
from progressbar import progressbar as pbar

def bootstrap_score(estimator, X, y, test_size, n_iter=10):
    """Estimate train/test ``rel_mrae`` over repeated random train/test splits.

    On each repetition the data is split with ``train_test_split``, the
    estimator is fitted on the train part and scored with ``rel_mrae`` on
    both parts.

    Parameters
    ----------
    estimator : object with ``fit`` and ``predict``. It is refitted in place
        on every repetition, so on return it holds the fit from the last split.
    X, y : features and targets forwarded to ``train_test_split``.
    test_size : forwarded to ``train_test_split``.
    n_iter : number of random splits to average over (default 10, the value
        that was previously hard-coded).

    Returns
    -------
    ((train_mean, train_std), (test_mean, test_std)) of the collected scores.
    """
    trscores, tsscores = [], []
    for _ in range(n_iter):
        Xtr, Xts, ytr, yts = train_test_split(X, y, test_size=test_size)
        estimator.fit(Xtr, ytr)
        trscores.append(rel_mrae(estimator, Xtr, ytr))
        tsscores.append(rel_mrae(estimator, Xts, yts))

    return (np.mean(trscores), np.std(trscores)), (np.mean(tsscores), np.std(tsscores))

In [57]:
estimator = LinearRegression()
(trmean, trstd), (tsmean, tsstd) = bootstrap_score(estimator, X, y, test_size=0.3)
print ("train score %.3f (±%.4f)"%(trmean, trstd))
print ("test score  %.3f (±%.4f)"%(tsmean, tsstd))

**the `sklearn` library provides several validation methods**

**Bootstrapping**: data is sampled randomly at every split

In [65]:
from sklearn.model_selection import ShuffleSplit, KFold,cross_val_score

In [78]:
ss = ShuffleSplit(n_splits=3, test_size=0.3)

for a,b in ss.split(range(10)):
    print (a, b)

In [68]:
z = cross_val_score(lr, X, y, cv = ShuffleSplit(n_splits=10, test_size=0.3), scoring=rel_mrae)
print (z)
print ("test score  %.3f (±%.4f)"%(np.mean(z), np.std(z)))


**Cross Validation**: data is partitioned

In [79]:
ss = KFold(n_splits=3)

for a,b in ss.split(range(10)):
    print (a, b)

In [75]:
z = cross_val_score(lr, X, y, cv = KFold(n_splits=10), scoring=rel_mrae)
print (z)
print ("test score  %.3f (±%.4f)"%(np.mean(z), np.std(z)))


assess the score with a **learning curve**

In [0]:
from sklearn.model_selection import ShuffleSplit

In [121]:
cv = ShuffleSplit(n_splits=10, test_size=.3)


ch.plot_learning_curve(estimator, estimator.__class__.__name__, X, y, 
                       cv=cv, scoring=rel_mrae, ylim=(0,0.7))

## Diagnosing

**Linear regression BASELINE**

In [166]:
estimator = LinearRegression()
cv = ShuffleSplit(n_splits=10, test_size=.3)
ch.plot_learning_curve(estimator, estimator.__class__.__name__, X, y, cv=cv, scoring=rel_mrae, ylim=(0,0.7))

**We have UNDERFITTING (high bias)**
1. increase model complexity
2. get more columns

In [131]:
# try first increasing model complexity --> a bit better but with overfitting
# experiment with different max_depth values

from sklearn.tree import DecisionTreeRegressor
estimator = DecisionTreeRegressor(max_depth=8)
ch.plot_learning_curve(estimator, estimator.__class__.__name__, X, y, cv=cv, scoring=rel_mrae, ylim=(0,0.7))

In [147]:
# try now with more columns (we have them!!!) --> improves a bit

d2 = pd.read_csv("local/data/cal_housing_small_full.data")
d2.head()

In [148]:
# Same linear model, now trained on the wider table: every column except the
# target is used as a feature.
estimator = LinearRegression()
X2 = d2[[col for col in d2.columns if col!="medianHouseValue"]].values
# Series.as_matrix() no longer exists in current pandas; .values is the equivalent.
y2 = d2["medianHouseValue"].values
print (X2.shape, y2.shape)

ch.plot_learning_curve(estimator, estimator.__class__.__name__, X2, y2, cv=cv, scoring=rel_mrae)

## Diagnosing

**Random Forest BASELINE**


In [151]:
from sklearn.ensemble import RandomForestRegressor
estimator = RandomForestRegressor(max_depth=10)
(trmean, trstd), (tsmean, tsstd) = bootstrap_score(estimator, X, y, test_size=0.3)
print ("train score %.3f (±%.4f)"%(trmean, trstd))
print ("test score  %.3f (±%.4f)"%(tsmean, tsstd))

In [152]:
ch.plot_learning_curve(estimator, estimator.__class__.__name__, X, y, cv=cv, scoring=rel_mrae)

**We have OVERFITTING (high variance)**
- reduce model complexity
- get more data

In [156]:
# try first reduce model complexity --> more BIAS
estimator = RandomForestRegressor(max_depth=4)
ch.plot_learning_curve(estimator, estimator.__class__.__name__, X, y, cv=cv, scoring=rel_mrae)

In [163]:
# try now with more data (we have A LOT!!!) 

d3 = pd.read_csv("local/data/cal_housing.data")
print ("TOTAL AVAILABLE DATA", d3.shape)
# Work on a random subset of 10000 rows of the full file.
d3 = d3.sample(10000)
estimator = RandomForestRegressor(max_depth=10)
# as_matrix() no longer exists in current pandas; .values is the equivalent.
# NOTE(review): assumes medianHouseValue is the last column of d3 — verify against the CSV header.
X3 = d3.values[:,:-1]
y3 = d3["medianHouseValue"].values
print ("building learning curve with", X3.shape, y3.shape)

ch.plot_learning_curve(estimator, estimator.__class__.__name__, X3, y3, cv=cv, scoring=rel_mrae)

## What if we made the wrong choice


- **Linear Regression** UNDERFITTING and we choose to acquire more data: No improvement!!!

In [159]:
estimator = LinearRegression()
ch.plot_learning_curve(estimator, estimator.__class__.__name__, X3, y3, cv=cv, scoring=rel_mrae)

- **Random Forest** OVERFITTING and choose to add more columns: some improvement!!!

In [160]:
estimator = RandomForestRegressor(max_depth=10)
ch.plot_learning_curve(estimator, estimator.__class__.__name__, X2, y2, cv=cv, scoring=rel_mrae)

- let's now choose more data and more columns (a luxury!!!)

In [168]:
d4 = pd.read_csv("local/data/cal_housing_full.data")
print ("TOTAL AVAILABLE DATA", d4.shape)
# Work on a random subset of 10000 rows of the full file.
d4 = d4.sample(10000)
estimator = RandomForestRegressor(max_depth=10)
# as_matrix() no longer exists in current pandas; .values is the equivalent.
# NOTE(review): assumes medianHouseValue is the last column of d4 — verify against the CSV header.
X4 = d4.values[:,:-1]
y4 = d4["medianHouseValue"].values
print ("building learning curve with", X4.shape, y4.shape)

ch.plot_learning_curve(estimator, estimator.__class__.__name__, X4, y4, cv=cv, scoring=rel_mrae)