# Q1. Predicting housing prices

In this problem, you will generate models for predicting house prices from given house features. The file “house_prices.txt” contains the data.

There are four features used in this regression:

* one binary feature (whether or not the house has covered parking)
* one numerical feature (size, measured in thousands of square meters)
* two categorical features (architectural style and location)

Each of the categorical features is represented as a vector of size 3. This gives us feature vectors of size 8 in total for each house. 200 examples are given in the training set.

In [None]:
#Importing libraries
import pathlib
import pandas as pd
import numpy as np
import torch
from torch import nn, optim, utils
from sklearn import metrics
#import matplotlib.pyplot as plt

In [None]:
#Selecting the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

#Training function
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimization epoch over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module to train; it is switched to training mode.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The mean of the batch losses weighted by batch size, each measured
        before its parameter update, so the value can be logged per epoch to
        draw a loss-versus-epoch curve. ``nan`` when the dataloader yields
        no batches.
    """
    model.train()
    total_loss = 0.0
    n_samples = 0
    for X, y in dataloader:
        yhat = model(X)
        loss = loss_fn(yhat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #Weight by batch size so a smaller final batch does not skew the mean
        batch_size = len(X)
        total_loss += loss.item() * batch_size
        n_samples += batch_size
    return total_loss / n_samples if n_samples else float("nan")

#Test function (loss)
def test_loss(features, target, model, loss_fn):
    """Compute the loss of ``model`` on a whole dataset without gradients.

    The forward pass runs in evaluation mode so that mode-dependent layers
    (e.g. dropout, batch normalization) behave as at inference time; the
    model's previous training/evaluation mode is restored afterwards.

    Args:
        features: Input tensor passed to ``model``.
        target: Target tensor passed to ``loss_fn``.
        model: Module to evaluate.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.

    Returns:
        The loss as a Python float rounded to 6 decimal places.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            loss = loss_fn(model(features), target).item()
    finally:
        model.train(was_training)
    return round(loss, 6)


#Despite its name this is a helper, not a unit test: opt out of pytest collection
test_loss.__test__ = False

#Test function (accuracy)
def test_accuracy(features, target, model):
    """Compute the classification accuracy of ``model`` on a whole dataset.

    The predicted class is the arg-max over dimension 1 of the model output.
    The forward pass runs in evaluation mode (so dropout / batch-norm layers
    behave as at inference time) and the model's previous mode is restored
    afterwards.

    Args:
        features: Input tensor passed to ``model``.
        target: 1-D tensor of class indices, one per row of ``features``.
        model: Module to evaluate.

    Returns:
        The accuracy as a percentage string rounded to 4 decimals,
        e.g. ``"86.6667%"``.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            yhat = model(features)
            accuracy = torch.sum(torch.eq(torch.argmax(yhat, dim=1), target))/target.size(dim=0)
    finally:
        model.train(was_training)
    accuracy = accuracy.item()*100
    return f"{round(accuracy, 4)}%"


#Despite its name this is a helper, not a unit test: opt out of pytest collection
test_accuracy.__test__ = False

In [None]:
#Downloading the data
%%bash
# Fetch the Google Drive file with this ID; no output path is given, and the recorded run saved it as /content/house_prices.txt
gdown 1hLxbWVrnmSnZDGjvu8HeQex6ezbXeLB6

Downloading...
From: https://drive.google.com/uc?id=1hLxbWVrnmSnZDGjvu8HeQex6ezbXeLB6
To: /content/house_prices.txt
  0%|          | 0.00/6.08k [00:00<?, ?B/s]100%|██████████| 6.08k/6.08k [00:00<00:00, 20.3MB/s]


In [None]:
#Creating a path to our data
#The download cell gives gdown no output path; in the recorded Colab run the
#file landed in /content. Outside Colab that folder usually does not exist,
#so fall back to the current working directory.
DATA_DIR = pathlib.Path("/content")
if not DATA_DIR.is_dir():
    DATA_DIR = pathlib.Path.cwd()

In [None]:
#Loading the data
#The file is parsed once (first five lines skipped, tab-separated, first nine
#columns) and then split: column 5 is the price target, the other eight
#columns are the features, kept in their original order.
_raw_1 = pd.read_csv(
    DATA_DIR / "house_prices.txt",
    delimiter="\t",
    skiprows=[0,1,2,3,4],
    header=None,
    usecols=[0,1,2,3,4,5,6,7,8],
    names=["parking", "sq_meters", "art_deco", "bungalow", "colonial", "price", "west", "east", "north"],
    dtype=np.float32
)

#Feature matrix: every column except the price
features_1 = _raw_1.drop(columns="price")

#Target kept as a one-column DataFrame so that it converts to an (n, 1) array
target_1 = _raw_1[["price"]]


## a) Split the dataset into training and testing sets

Split the dataset into training and testing sets. Keep 80% of the data for training and 20% of the data for testing.

In [None]:
#Shuffling the row indices of the dataset
n_1 = len(features_1)
ind_1 = np.random.permutation(np.arange(n_1))

#Sizes of the two parts: 80% for training, the remainder for testing
n_train_1 = int(0.8 * n_1)
n_test_1 = n_1 - n_train_1

#Cutting the shuffled indices at the train/test boundary
ind_train_1, ind_test_1 = ind_1[:n_train_1], ind_1[n_train_1:]

#Plain numpy views of the features and of the target
features_np_1, target_np_1 = features_1.to_numpy(), target_1.to_numpy()

#Selecting the rows of each part
features_train_1, target_train_1 = features_np_1[ind_train_1], target_np_1[ind_train_1]
features_test_1, target_test_1 = features_np_1[ind_test_1], target_np_1[ind_test_1]

## b) Build a neural network to predict house prices

Use the provided dataloaders, define a neural network with two hidden layers of 10 units each both with ReLU activation. The output unit should have no activation function.

In [None]:
#Moving the training arrays to the selected device and pairing them in a dataset
features_train_tensor_1, target_train_tensor_1 = (
    torch.from_numpy(array).to(device)
    for array in (features_train_1, target_train_1)
)
dataset_train_1 = utils.data.TensorDataset(features_train_tensor_1, target_train_tensor_1)

#Same for the test arrays
features_test_tensor_1, target_test_tensor_1 = (
    torch.from_numpy(array).to(device)
    for array in (features_test_1, target_test_1)
)
dataset_test_1 = utils.data.TensorDataset(features_test_tensor_1, target_test_tensor_1)

#Mini-batch loaders (batches of 40); only the training loader reshuffles
dataloader_train_1 = utils.data.DataLoader(dataset=dataset_train_1, batch_size=40, shuffle=True)
dataloader_test_1 = utils.data.DataLoader(dataset=dataset_test_1, batch_size=40, shuffle=False)

Define a neural network with two hidden layers of 10 units each both with ReLU activation. The output unit should have no activation function.

In [None]:
#Defining our neural network
class Net_1(nn.Module):
    """Regression network: 8 inputs -> 10 -> 10 -> 1 output.

    Both hidden layers use ReLU; the single output unit has no activation.
    """

    def __init__(self):
        super().__init__()
        #First hidden layer
        self.lin1 = nn.Linear(in_features=8, out_features=10)
        self.activ1 = nn.ReLU()
        #Second hidden layer
        self.lin2 = nn.Linear(in_features=10, out_features=10)
        self.activ2 = nn.ReLU()
        #Linear output unit
        self.out = nn.Linear(in_features=10, out_features=1)

    def forward(self, x):
        """Map a batch of 8-feature rows to one prediction per row."""
        hidden1 = self.activ1(self.lin1(x))
        hidden2 = self.activ2(self.lin2(hidden1))
        return self.out(hidden2)

## c) Train your neural network using gradient descent

Using mean square loss, train the network with an appropriate optimizer for a few hundred epochs and plot the loss versus the number of training epochs.

In [None]:
#Defining an instance of our network and other parameters required for training
net_1 = Net_1().to(device)
loss_fn_1 = nn.MSELoss()
optimizer_1 = optim.Adam(net_1.parameters(), lr=1e-3)

#Training our network, recording the loss on the whole training set after
#every epoch so that the loss-versus-epoch curve asked for in this part can
#be drawn from loss_history_1 (one value per epoch)
epochs_1 = 200
loss_history_1 = []
for epoch_1 in range(epochs_1):
    train(dataloader_train_1, net_1, loss_fn_1, optimizer_1)
    loss_history_1.append(test_loss(features_train_tensor_1, target_train_tensor_1, net_1, loss_fn_1))

## d) Assess the performance of your neural network

Assess the performance of your neural network on the training data and the testing data. Comment on the results.

In [None]:
#Reporting the loss of our network: training set first, test set second
print(
    test_loss(features_train_tensor_1, target_train_tensor_1, net_1, loss_fn_1),
    test_loss(features_test_tensor_1, target_test_tensor_1, net_1, loss_fn_1),
    sep="\n",
)

0.002593
0.003833


The predictions seem to be good enough: the mean squared errors are of the order of magnitude $10^{-3}$ (that is, a root-mean-square error of roughly $0.05$–$0.06$), while the true values are in the $10^{-1}$ range.

# Q2. Classification

Consider the following simulated classification dataset with N = 300 samples labeled with one of three class labels.

In [None]:
#Downloading the data
%%bash
# Fetch two Google Drive files by ID; in the recorded run they were saved as spiral_features.npy and spiral_target.npy (in that order) under /content
gdown 1_n5odu1KSpBAqO_Ddbqy2Ty0mG12Dr0A
gdown 1C7GK_uY0srlUdLfHfGa4IZcfthmmdROr

Downloading...
From: https://drive.google.com/uc?id=1_n5odu1KSpBAqO_Ddbqy2Ty0mG12Dr0A
To: /content/spiral_features.npy
  0%|          | 0.00/4.93k [00:00<?, ?B/s]100%|██████████| 4.93k/4.93k [00:00<00:00, 18.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1C7GK_uY0srlUdLfHfGa4IZcfthmmdROr
To: /content/spiral_target.npy
  0%|          | 0.00/428 [00:00<?, ?B/s]100%|██████████| 428/428 [00:00<00:00, 2.13MB/s]


In [None]:
#Loading the data: the features are cast to float32, the labels are kept as stored
features_2 = np.load(DATA_DIR.joinpath("spiral_features.npy")).astype("float32")
target_2 = np.load(DATA_DIR.joinpath("spiral_target.npy"))

#Visualizing the data
#plt.scatter(features_2[:, 0], features_2[:, 1], c=target_2, s=20)
#plt.show()

## a) Split the dataset into training and testing sets

Split the dataset into training and testing sets. Keep 80% of the data for training and 20% of the data for testing.

In [None]:
#Shuffling the sample indices
n_2 = len(features_2)
ind_2 = np.random.permutation(np.arange(n_2))

#Sizes of the two parts: 80% for training, the remainder for testing
n_train_2 = int(0.8 * n_2)
n_test_2 = n_2 - n_train_2

#Cutting the shuffled indices at the train/test boundary
ind_train_2, ind_test_2 = ind_2[:n_train_2], ind_2[n_train_2:]

#Selecting the samples and labels of each part
features_train_2, target_train_2 = features_2[ind_train_2], target_2[ind_train_2]
features_test_2, target_test_2 = features_2[ind_test_2], target_2[ind_test_2]

## b) Train a logistic regression model

Using the provided datasets and dataloaders, build a 3-class logistic regression classifier for this dataset using PyTorch and train it for 2000 epochs using gradient descent.

In [None]:
#Creating datasets
features_train_tensor_2 = torch.from_numpy(features_train_2).to(device)
target_train_tensor_2 = torch.from_numpy(target_train_2).to(device)
dataset_train_2 = utils.data.TensorDataset(features_train_tensor_2, target_train_tensor_2)

features_test_tensor_2 = torch.from_numpy(features_test_2).to(device)
target_test_tensor_2 = torch.from_numpy(target_test_2).to(device)
dataset_test_2 = utils.data.TensorDataset(features_test_tensor_2, target_test_tensor_2)

#Creating dataloaders
dataloader_train_2 = utils.data.DataLoader(dataset_train_2, batch_size=40, shuffle=True)

dataloader_test_2 = utils.data.DataLoader(dataset_test_2, batch_size=40, shuffle=False)


def _polynomial_features_2(points, degree):
    """Expand two input columns into the monomials ``x0**i * x1**j``.

    Args:
        points: Tensor whose columns 0 and 1 hold the two input coordinates.
        degree: Both exponents ``i`` and ``j`` run over ``range(degree)``,
            so the result has ``degree**2`` columns.

    Returns:
        Tensor with one row per row of ``points`` and ``degree**2`` columns,
        ordered with ``i`` as the outer index and ``j`` as the inner one
        (the first column is ``x0**0 * x1**0``).
    """
    columns = [
        points[:, 0]**i * points[:, 1]**j
        for i in range(degree)
        for j in range(degree)
    ]
    if not columns:
        #degree == 0: no monomials, keep the row count with zero columns
        return points.new_empty((points.size(dim=0), 0))
    return torch.stack(columns, dim=1)


#Adding "higher degree columns" to our logistic regression dataset so our predictions become more accurate
#(exponents 0..deg_logreg_2-1 for each coordinate, i.e. 16 columns here)
deg_logreg_2 = 4

features_train_tensor_logreg_2 = _polynomial_features_2(features_train_tensor_2, deg_logreg_2)
dataset_train_logreg_2 = utils.data.TensorDataset(features_train_tensor_logreg_2, target_train_tensor_2)
dataloader_train_logreg_2 = utils.data.DataLoader(dataset_train_logreg_2, batch_size=40, shuffle=True)

features_test_tensor_logreg_2 = _polynomial_features_2(features_test_tensor_2, deg_logreg_2)
dataset_test_logreg_2 = utils.data.TensorDataset(features_test_tensor_logreg_2, target_test_tensor_2)
dataloader_test_logreg_2 = utils.data.DataLoader(dataset_test_logreg_2, batch_size=40, shuffle=False)

In [None]:
#Defining our logistic regression module
class LogReg_2(nn.Module):
    """3-class (multinomial) logistic regression on 16 input features.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it softmax outputs would apply
    softmax twice, which flattens the gradients and keeps the loss from
    approaching zero. Use ``predict_proba`` to obtain probabilities.
    """

    def __init__(self):
        super(LogReg_2, self).__init__()
        self.out = nn.Linear(16, 3)
        #Kept for converting logits to probabilities; not used in forward
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the logits, one row of 3 class scores per input row."""
        return self.out(x)

    def predict_proba(self, x):
        """Return class probabilities (each row sums to 1)."""
        return self.softmax(self.forward(x))

In [None]:
#Instantiating the logistic regression module on the selected device,
#together with its loss and optimizer
logreg_2 = LogReg_2()
logreg_2.to(device)
loss_fn_logreg_2 = nn.CrossEntropyLoss()
optimizer_logreg_2 = optim.Adam(params=logreg_2.parameters(), lr=0.001)

#Running the training epochs on the polynomial-feature loader
epochs_logreg_2 = 2000
for epoch_logreg_2 in range(epochs_logreg_2):
    train(
        dataloader=dataloader_train_logreg_2,
        model=logreg_2,
        loss_fn=loss_fn_logreg_2,
        optimizer=optimizer_logreg_2,
    )

## c) Assess the performance of your logistic regression model

Assess the performance of your model on the training data and the testing data in terms of both loss and accuracy. Comment on the results.

In [None]:
#Reporting loss then accuracy: training set first, blank line, test set second
print(
    test_loss(features_train_tensor_logreg_2, target_train_tensor_2, logreg_2, loss_fn_logreg_2),
    test_accuracy(features_train_tensor_logreg_2, target_train_tensor_2, logreg_2),
    "",
    test_loss(features_test_tensor_logreg_2, target_test_tensor_2, logreg_2, loss_fn_logreg_2),
    test_accuracy(features_test_tensor_logreg_2, target_test_tensor_2, logreg_2),
    sep="\n",
)

0.681968
90.4167%

0.703065
86.6667%


The results are pretty good, however, there is still some room for improvement.

## d) Build a neural network classifier

Build a 3-class classifier using a neural network with one hidden layer of 100 units and a ReLU activation. Train the network with a gradient descent algorithm for 2000 iterations.

In [None]:
#Defining our neural network
class Net_2(nn.Module):
    """3-class classifier: 2 inputs -> 100 hidden units (ReLU) -> 3 outputs.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it softmax outputs would apply
    softmax twice, which flattens the gradients and keeps the loss from
    approaching zero. Use ``predict_proba`` to obtain probabilities.
    """

    def __init__(self):
        super(Net_2, self).__init__()
        self.lin = nn.Linear(2, 100)
        self.activ = nn.ReLU()
        self.out = nn.Linear(100, 3)
        #Kept for converting logits to probabilities; not used in forward
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the logits, one row of 3 class scores per input row."""
        x = self.lin(x)
        x = self.activ(x)
        return self.out(x)

    def predict_proba(self, x):
        """Return class probabilities (each row sums to 1)."""
        return self.softmax(self.forward(x))

In [None]:
#Instantiating the network on the selected device, together with its loss
#and optimizer
net_2 = Net_2()
net_2.to(device)
loss_fn_2 = nn.CrossEntropyLoss()
optimizer_2 = optim.Adam(params=net_2.parameters(), lr=0.001)

#Running the training epochs
epochs_2 = 2000
for epoch_2 in range(epochs_2):
    train(
        dataloader=dataloader_train_2,
        model=net_2,
        loss_fn=loss_fn_2,
        optimizer=optimizer_2,
    )

## e) Assess the performance of your neural network

Assess the performance of your neural network on the training data and the testing data in terms of both loss and accuracy. Comment on the results.

In [None]:
#Reporting loss then accuracy: training set first, blank line, test set second
print(
    test_loss(features_train_tensor_2, target_train_tensor_2, net_2, loss_fn_2),
    test_accuracy(features_train_tensor_2, target_train_tensor_2, net_2),
    "",
    test_loss(features_test_tensor_2, target_test_tensor_2, net_2, loss_fn_2),
    test_accuracy(features_test_tensor_2, target_test_tensor_2, net_2),
    sep="\n",
)

0.560955
99.1667%

0.551863
100.0%


The error and the accuracy are nearly the same on both datasets and the accuracy is close to perfect. These things imply that we did a good job of constructing a neural network.

## f) Interpret the confusion matrix

Compute the confusion matrix for your classifier on both the training and testing data and interpret the results. You may either code your own confusion matrix or use the following function from Scikit-Learn.

In [None]:
#Computing the network outputs on both sets without tracking gradients
with torch.no_grad():
    yhat_train_2 = net_2(features_train_tensor_2)
    yhat_test_2 = net_2(features_test_tensor_2)

#Printing the confusion matrices (built-in function): training set, blank line, test set
print(
    metrics.confusion_matrix(
        y_true=target_train_tensor_2.cpu(),
        y_pred=torch.argmax(yhat_train_2, dim=1).cpu(),
    ),
    "",
    metrics.confusion_matrix(
        y_true=target_test_tensor_2.cpu(),
        y_pred=torch.argmax(yhat_test_2, dim=1).cpu(),
    ),
    sep="\n",
)

[[85  1  0]
 [ 0 78  0]
 [ 0  1 75]]

[[14  0  0]
 [ 0 22  0]
 [ 0  0 24]]


Since everything outside the main diagonal is zero or close to zero, that means that our network is very successful in doing its job of classifying the points.

## g) Create your own deep neural network!

Create your own deep neural network and try to get as high an accuracy score on the test set as possible. Be creative: this is an opportunity for you to demonstrate what you can do!

In [None]:
#Shuffling the sample indices
n_2b = len(features_2)
ind_2b = np.random.permutation(np.arange(n_2b))

#Sizes of the three parts: about 2/3 train, 1/6 validation, the remainder test
n_train_2b = int(0.667 * n_2b)
n_validation_2b = int(0.167 * n_2b)
n_test_2b = n_2b - n_train_2b - n_validation_2b

#Cutting the shuffled indices at the two boundaries
ind_train_2b, ind_validation_2b, ind_test_2b = np.split(
    ind_2b, [n_train_2b, n_train_2b + n_validation_2b]
)

#Selecting the samples and labels of each part
features_train_2b, target_train_2b = features_2[ind_train_2b], target_2[ind_train_2b]
features_validation_2b, target_validation_2b = features_2[ind_validation_2b], target_2[ind_validation_2b]
features_test_2b, target_test_2b = features_2[ind_test_2b], target_2[ind_test_2b]

In [None]:
#Moving each split to the selected device and pairing features with labels
features_train_tensor_2b, target_train_tensor_2b = (
    torch.from_numpy(array).to(device)
    for array in (features_train_2b, target_train_2b)
)
dataset_train_2b = utils.data.TensorDataset(features_train_tensor_2b, target_train_tensor_2b)

features_validation_tensor_2b, target_validation_tensor_2b = (
    torch.from_numpy(array).to(device)
    for array in (features_validation_2b, target_validation_2b)
)
dataset_validation_2b = utils.data.TensorDataset(features_validation_tensor_2b, target_validation_tensor_2b)

features_test_tensor_2b, target_test_tensor_2b = (
    torch.from_numpy(array).to(device)
    for array in (features_test_2b, target_test_2b)
)
dataset_test_2b = utils.data.TensorDataset(features_test_tensor_2b, target_test_tensor_2b)

#Mini-batch loaders (batches of 25); only the training loader reshuffles
dataloader_train_2b = utils.data.DataLoader(dataset=dataset_train_2b, batch_size=25, shuffle=True)
dataloader_validation_2b = utils.data.DataLoader(dataset=dataset_validation_2b, batch_size=25, shuffle=False)
dataloader_test_2b = utils.data.DataLoader(dataset=dataset_test_2b, batch_size=25, shuffle=False)

In [None]:
#Defining my custom neural network with 2 hidden layers (100 units and 80 units)
class Net_2b(nn.Module):
    """3-class classifier: 2 inputs -> 100 (ReLU) -> 80 (ReLU) -> 3 outputs.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it softmax outputs would apply
    softmax twice, which flattens the gradients and keeps the loss from
    approaching zero. Use ``predict_proba`` to obtain probabilities.
    """

    def __init__(self):
        super(Net_2b, self).__init__()
        self.lin1 = nn.Linear(2, 100)
        self.activ1 = nn.ReLU()
        self.lin2 = nn.Linear(100, 80)
        self.activ2 = nn.ReLU()
        self.out = nn.Linear(80, 3)
        #Kept for converting logits to probabilities; not used in forward
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the logits, one row of 3 class scores per input row."""
        x = self.lin1(x)
        x = self.activ1(x)
        x = self.lin2(x)
        x = self.activ2(x)
        return self.out(x)

    def predict_proba(self, x):
        """Return class probabilities (each row sums to 1)."""
        return self.softmax(self.forward(x))

In [None]:
#Instantiating my network on the selected device, together with its loss
#and optimizer
net_2b = Net_2b()
net_2b.to(device)
loss_fn_2b = nn.CrossEntropyLoss()
optimizer_2b = optim.Adam(params=net_2b.parameters(), lr=0.0001)

#Running the training epochs
epochs_2b = 1000
for epoch_2b in range(epochs_2b):
    train(
        dataloader=dataloader_train_2b,
        model=net_2b,
        loss_fn=loss_fn_2b,
        optimizer=optimizer_2b,
    )

In [None]:
#Reporting loss then accuracy: training set first, blank line, validation set second
print(
    test_loss(features_train_tensor_2b, target_train_tensor_2b, net_2b, loss_fn_2b),
    test_accuracy(features_train_tensor_2b, target_train_tensor_2b, net_2b),
    "",
    test_loss(features_validation_tensor_2b, target_validation_tensor_2b, net_2b, loss_fn_2b),
    test_accuracy(features_validation_tensor_2b, target_validation_tensor_2b, net_2b),
    sep="\n",
)

0.587577
98.5%

0.558024
100.0%


In [None]:
#Reporting loss then accuracy on the held-out test set
print(
    test_loss(features_test_tensor_2b, target_test_tensor_2b, net_2b, loss_fn_2b),
    test_accuracy(features_test_tensor_2b, target_test_tensor_2b, net_2b),
    sep="\n",
)

0.571689
100.0%


I am consistently getting a $100\%$ success rate on the test dataset, which means that our network is working great (even better than the first one). However, we are limited because we only have $300$ input datapoints, so there might be an even better neural network that we could only find if we had more data.

To find the optimal hyperparameters, I varied the number of layers, number of units per layer, the learning rate, number of total epochs, and I ended up with this model. I deleted the loops which helped me find the optimal values more efficiently, because we no longer need them.