<a href="https://colab.research.google.com/github/rsfwalters/NEU-OB-MLP/blob/main/Housing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing Data

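This notebook trains a small neural network to predict house prices using the California Housing dataset that ships with scikit-learn. The dataset is derived from the 1990 U.S. census: each of its 20,640 rows describes one census block group using 8 numeric features (median income, house age, average number of rooms and bedrooms, population, average occupancy, latitude, and longitude). The label we want to predict is the block group's median house value, expressed in units of \$100,000.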

## 1. Load Data

- Load the full dataset
- Show some rows as a pandas table
- Select which columns are the inputs and which one is the label


In [None]:
import pandas as pd
import numpy as np
import torch
import plotly.graph_objects as go

# Print wide DataFrames on one row of text instead of wrapping columns onto extra lines.
pd.options.display.expand_frame_repr = False

# Details about this dataset:
#   https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset
# Other datasets can be loaded from OpenML:
#   https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html
from sklearn.datasets import fetch_california_housing

# First load: everything in a single Pandas DataFrame, so we can inspect the features
housing_frame = fetch_california_housing(as_frame=True)["frame"]

In [None]:
# Plain-text view of the table: the first and last rows, with the middle elided as "..."
print(housing_frame)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude  MedHouseVal
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88    -122.23        4.526
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86    -122.22        3.585
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85    -122.24        3.521
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85    -122.25        3.413
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85    -122.25        3.422
...       ...       ...       ...        ...         ...       ...       ...        ...          ...
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48    -121.09        0.781
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49    -121.21        0.771
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43    -121.22    

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles); note how much the feature scales differ
print(housing_frame.describe())

             MedInc      HouseAge      AveRooms     AveBedrms    Population      AveOccup      Latitude     Longitude   MedHouseVal
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000
mean       3.870671     28.639486      5.429000      1.096675   1425.476744      3.070655     35.631861   -119.569704      2.068558
std        1.899822     12.585558      2.474173      0.473911   1132.462122     10.386050      2.135952      2.003532      1.153956
min        0.499900      1.000000      0.846154      0.333333      3.000000      0.692308     32.540000   -124.350000      0.149990
25%        2.563400     18.000000      4.440716      1.006079    787.000000      2.429741     33.930000   -121.800000      1.196000
50%        3.534800     29.000000      5.229129      1.048780   1166.000000      2.818116     34.260000   -118.490000      1.797000
75%        4.743250     37.000000      6.052381      1.099526   1725.000000 

In [None]:
# Second load: without `as_frame`, the features and the labels come back
# as two separate arrays, which is the form we'll feed to the model.
housing = fetch_california_housing()
data, labels = housing.data, housing.target

# Expect 20640 items with 8 features each, and one label per item
print(data.shape, labels.shape, sep="\n")

(20640, 8)
(20640,)


In [None]:
# Look at how the value we want to predict is distributed across all items
fig = go.Figure(data=[go.Histogram(x=labels)])
fig.update_layout(
    title="Histogram of label variable (median house price) ",
    xaxis_title="Median House Price (x $100K)",
    yaxis_title="Counts",
)
fig.show()

In [None]:
# Split the data into a train, validation, and test set.
# We'll try to extract as much information as we can from the train set.
# Periodically, we can check the validation set to see how we're doing (like a practice quiz)
# Only once, at the very end, we check our performance on the test set.

# Use a fixed seed so that every run of this cell produces the *same* split.
# Without it, re-running the notebook would move items between the sets, so
# results from different runs would not be comparable and "test" items from
# one run could have been trained on in another.
SPLIT_SEED = 0
TEST_FRACTION = 0.1  # Share of all items held out for the final test
VAL_FRACTION = 0.1   # Share of all items held out for validation

N_items = data.shape[0]

split_rng = np.random.default_rng(SPLIT_SEED)
shuffled_indices = split_rng.permutation(N_items)
N_test = int(TEST_FRACTION * N_items)
N_val = int(VAL_FRACTION * N_items)

# Apply the same permutation to data and labels so each item keeps its own label
shuffled_data = data[shuffled_indices]
shuffled_labels = labels[shuffled_indices]
test_data, test_labels = shuffled_data[:N_test], shuffled_labels[:N_test]
val_data, val_labels = shuffled_data[N_test:N_test+N_val], shuffled_labels[N_test:N_test+N_val]
train_data, train_labels = shuffled_data[N_test+N_val:], shuffled_labels[N_test+N_val:]

# The three sets must partition the dataset: every item is used exactly once
assert len(train_data) + len(val_data) + len(test_data) == N_items

print("Training set:", train_data.shape, train_labels.shape)
print("Validation set:", val_data.shape, val_labels.shape)
print("Test set:", test_data.shape, test_labels.shape)

Training set: (16512, 8) (16512,)
Validation set: (2064, 8) (2064,)
Test set: (2064, 8) (2064,)


In [None]:
# Cast every split to 32-bit floats; that is plenty of precision for our purposes.
# The data type of our data must match the data type of our model layers later on.

train_data, train_labels = (split.astype(np.float32) for split in (train_data, train_labels))
val_data, val_labels = (split.astype(np.float32) for split in (val_data, val_labels))
test_data, test_labels = (split.astype(np.float32) for split in (test_data, test_labels))

In [None]:
# Convert the data for use in pytorch
from torch.utils.data import TensorDataset, DataLoader


BATCH_SIZE = 32  # How many items each batch contains


def make_dataloader(features, targets, shuffle):
    """Wrap a pair of NumPy arrays in a batched PyTorch ``DataLoader``.

    Args:
        features: Array of input items, one row per item.
        targets: Array of labels, aligned with ``features`` along the first axis.
        shuffle: Whether to visit the items in a new random order on every pass.

    Returns:
        A ``DataLoader`` that yields ``(features, targets)`` tensor batches of
        up to ``BATCH_SIZE`` items.
    """
    return DataLoader(
        TensorDataset(torch.as_tensor(features), torch.as_tensor(targets)),
        batch_size=BATCH_SIZE,
        # Pinned memory helps transfer data to the GPU faster. Without a GPU it
        # has no benefit and PyTorch emits a warning, so only enable it when useful.
        pin_memory=torch.cuda.is_available(),
        shuffle=shuffle,
    )


# Training batches drive the gradient descent steps, so we reshuffle them every epoch
train_dataloader = make_dataloader(train_data, train_labels, shuffle=True)

# Validation and test batches are only used for measuring the loss; their order does not matter
val_dataloader = make_dataloader(val_data, val_labels, shuffle=False)
test_dataloader = make_dataloader(test_data, test_labels, shuffle=False)

## 2. Define Model

- highlight choices of layer types
- widths
- depth
- normalization

Keep as simple as possible.
Comment out options, and add comments pointing to things to play with.

In [None]:
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiplicativeLR
import torch.nn.functional as F

# Use the GPU when one is available; otherwise fall back to the CPU so the notebook still runs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's random number generator so that re-running this cell always
# starts from the same initial weights. That way, a change in the results
# comes from a change you made, not from a lucky or unlucky initialization.
MODEL_SEED = 0
torch.manual_seed(MODEL_SEED)

# The model will take a batch of input items and predict outputs
model = nn.Sequential(
    nn.Linear(8, 32),  # 8 input features per item
    nn.ReLU(),
    nn.Linear(32, 32),
    nn.ReLU(),
    nn.Linear(32, 1)   # We end with a single output value; the predicted price
    # Can we improve performance by restricting the output to a reasonable range?
)
model = model.to(device)  # Send the model to the chosen device

# We will use the optimizer to adjust the model parameters after predicting each batch
optim = Adam(model.parameters())

# After each epoch, we can reduce the learning rate.
# Here, every call to sched.step() multiplies the current learning rate by 0.95.
sched = MultiplicativeLR(optim, lr_lambda=lambda epoch: 0.95)

## 3. Train and Evaluate

- qualitative and quantitative metrics
- show worst predictions
- examples of outputs
- training and validation 

- learning rate
- loss functions 
- regularization

**Note:** If you adjust the training procedure, be sure to re-initialize the model by re-running the model-definition cell above. Otherwise, your model will not start "from scratch"; it will resume training from the point where it left off.


In [None]:
from tqdm.notebook import tqdm, trange

In [None]:
# We'll keep note of our training loss at every batch, and our validation loss at every epoch
train_loss_tracking, val_loss_tracking = [], []
epochs = 10

# Move each batch to whichever device the model already lives on, rather than assuming a GPU
device = next(model.parameters()).device


def average_mse(model, dataloader, device, **tqdm_kwargs):
    """Measure the model's mean squared error over every item in a dataloader.

    Args:
        model: The network to evaluate. It is switched to evaluation mode.
        dataloader: Yields ``(batch_data, batch_labels)`` pairs.
        device: Device that each batch is moved to before the forward pass.
        **tqdm_kwargs: Forwarded to ``tqdm`` to configure the progress bar.

    Returns:
        A 0-dimensional tensor holding the squared error summed over all
        items and divided by the number of items in ``dataloader.dataset``.
    """
    model.eval()  # Set the model into evaluation mode.
    total_squared_error = torch.zeros((), device=device)
    with torch.no_grad():  # No gradients are needed when we are only measuring
        for batch_data, batch_labels in tqdm(dataloader, **tqdm_kwargs):
            batch_data = batch_data.to(device)
            batch_labels = batch_labels.to(device)

            predictions = model(batch_data).squeeze(-1)
            # Sum here, since we'll average over the whole dataset ourselves
            total_squared_error += F.mse_loss(predictions, batch_labels, reduction="sum")
    return total_squared_error / len(dataloader.dataset)


for epoch in trange(epochs, desc="epochs", leave=True, position=0):
    # Perform an epoch of training
    model.train()  # Set the model into training mode. This affects some layer behavior, such as dropout and batchnorm
    for batch_data, batch_labels in tqdm(train_dataloader, desc="batches", leave=False, position=1):
        batch_data = batch_data.to(device)
        batch_labels = batch_labels.to(device)

        # Drop only the trailing size-1 dimension: (batch, 1) -> (batch,).
        # A bare .squeeze() would also drop the batch dimension of a 1-item batch,
        # leaving a shape that no longer matches the labels.
        predictions = model(batch_data).squeeze(-1)
        loss = F.mse_loss(predictions, batch_labels)
        optim.zero_grad()
        loss.backward()
        optim.step()

        train_loss_tracking.append(loss.item())

    # Check our performance on the validation set
    avg_val_loss = average_mse(model, val_dataloader, device, desc="batches", leave=False, position=1)
    val_loss_tracking.append(avg_val_loss.item())
    sched.step()  # Adjust our learning rate after every epoch


# Finally at the very end, check our performance on the test set
avg_test_loss = average_mse(model, test_dataloader, device)

epochs:   0%|          | 0/10 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

batches:   0%|          | 0/516 [00:00<?, ?it/s]

batches:   0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Report the test-set loss, rounded to three decimal places
test_loss_value = round(avg_test_loss.item(), 3)
print("Average test loss:", test_loss_value)

Average test loss: 0.683


In [None]:
from plotly.subplots import make_subplots

fig = go.Figure()
fig.update_layout(
    title="Training and Validation loss",
    xaxis_title="Batch number",
    yaxis_title="Loss value",
)
# The training loss was recorded once per batch
fig.add_trace(go.Scatter(x=np.arange(len(train_loss_tracking)), y=train_loss_tracking, name="Training"))

# Each item in the validation loss list was measured *after* an epoch finished,
# and each epoch consists of multiple batches. So the k-th validation loss
# (counting from 0) belongs at batch number (k + 1) * batches_per_epoch;
# placing it at k * batches_per_epoch would draw it one full epoch too early.
batches_per_epoch = len(train_dataloader)
x = (np.arange(len(val_loss_tracking)) + 1) * batches_per_epoch
fig.add_trace(go.Scatter(x=x, y=val_loss_tracking, name="Validation"))
fig.show()
