In [None]:
# ======= Variables =========

# Edit the settings below, then run this cell before any other cell.
# Note: write file paths with forward slashes (/), never backslashes (\).

# Seed shared by the train/test split and by PyTorch
RANDOM_SEED = 123

# Input: CSV file holding the complete dataset
DATASET_FOR_MODEL_CSV_FILE_PATH = "C:/cal-housing-prices/dataset/dataset.csv"

# Train/test split: size of the held-out part and where both parts are saved
TEST_SET_SIZE = 0.1  # fraction of the dataset to be used as test set
TRAINING_SET_CSV_FILE_PATH = "C:/cal-housing-prices/dataset/training_set.csv"
TEST_SET_CSV_FILE_PATH = "C:/cal-housing-prices/dataset/test_set.csv"


In [2]:
# ========= Libraries =========

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, Dataset, random_split

## ML: Data Preparation and Model Definition

In [None]:
# ========= CustomDataset class =========

# Seed PyTorch's global random number generator with the notebook-wide RANDOM_SEED.
# NOTE(review): this runs once, when this cell is executed; later cells that use
# PyTorch randomness continue from whatever state the generator is in at that point,
# so results are presumably only repeatable for a top-to-bottom run — confirm.
torch.manual_seed(RANDOM_SEED)  # Set the random seed in PyTorch for reproducibility

class CustomDataset(Dataset):
    """
    Map-style PyTorch dataset that reads its samples straight from a pandas dataframe.

    Each row of the frame is one sample: every column except the last is a
    feature, and the last column is the output/target. Rows are converted to
    tensors lazily, at the moment an item is requested, so the frame is not
    copied on construction.

    Args:
        dataframe: Frame whose cells can all be converted to float
            (numeric or boolean); its last column holds the target.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of samples, i.e. the number of rows in the frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        Return the sample at position ``idx``.

        Args:
            idx: Zero-based row position (not an index label).

        Returns:
            A ``(data, target)`` tuple of float32 tensors: ``data`` is 1-D with
            one entry per feature column, ``target`` is a 0-dim scalar.
        """
        row = self.dataframe.iloc[idx]
        # Use .iloc for both pieces: plain ``row[-1]`` is a *label* lookup when the
        # column labels are integers (KeyError), and integer-as-position lookup on
        # other Series is deprecated in recent pandas.
        # Converting through to_numpy(dtype="float32") also copes with rows that
        # pandas hands back as object dtype (e.g. a mix of bool and float columns).
        features = row.iloc[:-1].to_numpy(dtype="float32")
        data = torch.tensor(features, dtype=torch.float32)
        target = torch.tensor(float(row.iloc[-1]), dtype=torch.float32)
        return data, target


In [8]:
# ========= Create Training and Test sets =========

# Read the complete dataset from its CSV file
df = pd.read_csv(DATASET_FOR_MODEL_CSV_FILE_PATH)

# Draw the held-out Test rows at random; the fixed random_state makes the draw repeatable
dfTest = df.sample(frac=TEST_SET_SIZE, random_state=RANDOM_SEED, axis='index')
# Every row that was not drawn stays behind and forms the Training set
df = df.drop(index=dfTest.index)

# Write both splits to disk (Training first, then Test), leaving out the index column
df.to_csv(TRAINING_SET_CSV_FILE_PATH, index=False)
dfTest.to_csv(TEST_SET_CSV_FILE_PATH, index=False)

In [12]:
# ========= Neural Network and Training classes =========

class SimpleNN(torch.nn.Module):
    """
    Feed-forward network with a single hidden layer.

    Layout: Linear(input_size -> neurons) -> ReLU -> Linear(neurons -> output_size).

    Args:
        input_size: Number of values in each input sample.
        output_size: Number of values produced per sample.
        neurons: Width of the hidden layer.
    """

    def __init__(self, input_size=3, output_size=1, neurons=10):
        super().__init__()
        hidden_width = neurons
        self.fc1 = torch.nn.Linear(in_features=input_size, out_features=hidden_width)
        self.fc2 = torch.nn.Linear(in_features=hidden_width, out_features=output_size)

    def forward(self, x):
        """Run ``x`` through the hidden layer, ReLU and output layer, and return the result."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)
    
class NeuralNetworkRegressor:
    """
    Trains a ``SimpleNN`` on a pandas dataframe and uses it to fill in predictions.

    Convention used by every method: all columns of a frame except the last are
    input features, and the last column is the target. Every column (target
    included) is min-max scaled using the per-column minimum and maximum
    recorded during ``train``.

    Args:
        input_size: Number of feature columns fed to the network.
        output_size: Number of network outputs. ``train`` and ``predict`` handle a
            single target column, so this is expected to be 1.
        neurons: Width of the network's hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        batch_size: Mini-batch size used while training.
        epochs: Number of passes over the training data.
    """

    def __init__(self, input_size=5, output_size=1, neurons=10, learning_rate=0.001, batch_size=32, epochs=100):
        # column name -> (min, max), filled in by normalize(train=True)
        self.norm_params = {}
        self.model = SimpleNN(input_size=input_size, output_size=output_size, neurons=neurons)
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = torch.device("cpu")
        self.model.to(self.device)
        self.loss_function = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)

    def normalize(self, dataframe, train=False):
        """
        Min-max scale every column of ``dataframe`` in place and return the frame.

        Args:
            dataframe: Frame to scale. It is modified in place.
            train: When True, compute each column's (min, max) from this frame and
                remember it in ``self.norm_params``. When False, reuse the values
                remembered earlier.

        Returns:
            The same frame object, with each column mapped to
            ``(value - min) / (max - min)``.

        Raises:
            KeyError: If ``train`` is False and a column has no remembered (min, max).
        """
        for column in dataframe.columns:
            if train:
                min_value = dataframe[column].min()
                max_value = dataframe[column].max()
                self.norm_params[column] = (min_value, max_value)
            else:
                min_value, max_value = self.norm_params[column]

            value_range = max_value - min_value
            if value_range == 0:
                # Constant column: dividing by zero would turn it into NaN and
                # poison training. Dividing by 1 maps it to 0 instead, and
                # denormalize() still restores the original constant.
                value_range = 1
            dataframe[column] = (dataframe[column] - min_value) / value_range
        return dataframe

    def denormalize(self, dataframe):
        """
        Undo ``normalize``: map every column of ``dataframe`` back to its original scale.

        The frame is modified in place and returned. Uses the (min, max) pairs stored
        in ``self.norm_params``; raises ``KeyError`` for a column that has none.
        """
        for column in dataframe.columns:
            min_value, max_value = self.norm_params[column]
            dataframe[column] = dataframe[column] * (max_value - min_value) + min_value
        return dataframe

    def train(self, dataframe):
        """
        Fit the network on ``dataframe`` (features in all but the last column, target last).

        Works on a normalized copy, so the caller's frame is left untouched. Records
        the normalization parameters that ``predict`` later reuses, and prints the
        summed mini-batch loss after every epoch.
        """
        dataframe = self.normalize(dataframe.copy(), train=True)
        dataset = CustomDataset(dataframe)
        train_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.epochs):
            total_loss = 0

            for data, target in train_loader:
                data, target = data.to(self.device), target.to(self.device)

                self.model.zero_grad()
                output = self.model(data)
                # targets arrive as a flat batch; reshape to (batch, 1) to match the output
                loss = self.loss_function(output, target.view(-1, 1))

                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            print(f"Epoch: {epoch + 1}, Loss: {total_loss}")

    def predict(self, dataframe):
        """
        Replace the last column of ``dataframe`` with the model's predictions.

        The last column is treated as a placeholder for the target (e.g. initialized
        to 0); its incoming values are ignored. The frame is modified in place -
        normalized, filled with predictions, then denormalized - so on return the
        feature columns are back on their original scale and the last column holds
        the predictions on the target's original scale.

        Args:
            dataframe: Frame with the same columns that ``train`` was given.

        Returns:
            The same frame object that was passed in.

        Raises:
            KeyError: If a column was not seen by ``train``.
        """
        dataframe = self.normalize(dataframe, train=False)

        # All rows in one batch: a single forward pass instead of one per row.
        features = torch.tensor(
            dataframe.iloc[:, :-1].to_numpy(dtype="float32"), dtype=torch.float32
        ).to(self.device)

        self.model.eval()
        with torch.no_grad():  # inference only, no autograd graph needed
            output = self.model(features)

        # Flatten (rows, 1) -> (rows,). A network with more than one output yields
        # the wrong number of values here and the assignment below fails.
        predictions = output.reshape(-1).cpu().numpy().astype("float64")
        dataframe.iloc[:, -1] = predictions

        dataframe = self.denormalize(dataframe)
        return dataframe
    

## Train Model

In [9]:
# ========= Load Training and Test sets =========

# Read the two saved splits back from their CSV files into pandas dataframes.

# Training set
dfTraining = pd.read_csv(filepath_or_buffer=TRAINING_SET_CSV_FILE_PATH)

# Test set
dfTest = pd.read_csv(filepath_or_buffer=TEST_SET_CSV_FILE_PATH)

In [None]:
# ========= Training =========

# Re-seed right before the model is built so this cell gives the same initial
# weights and batch order every time it is run, including when it is re-run
# on its own without re-running the earlier cells.
torch.manual_seed(RANDOM_SEED)

model = NeuralNetworkRegressor(
    # every column except the last (the target) is a feature, so the network's
    # input width follows the data instead of being hard-coded
    input_size=dfTraining.shape[1] - 1,
    output_size=1,
    neurons=100,
    learning_rate=0.001,
    batch_size=32,
    epochs=100,
)

model.train(dfTraining)

## Training Set Prediction

In [None]:
# Predict on a copy so dfTraining keeps the true targets; predict() returns the
# frame it was given, with the last column replaced by the predictions
dfPredict = model.predict(dfTraining.copy())

# plot the training target vs prediction in scatter plot. also plot the y = x line
# (targets come from dfTraining, the frame the model was trained on, rather than
# from `df`, which only exists if the split cell was run in this session)
plt.scatter(dfTraining.iloc[:,-1], dfPredict.iloc[:,-1], s=5, marker='o')
plt.plot([0, dfTraining.iloc[:,-1].max()], [0, dfTraining.iloc[:,-1].max()], color='black', linewidth=1)
plt.title("Training Set")
plt.xlabel("Target")
plt.ylabel("Prediction")
plt.show()

## Test Set Prediction

In [None]:
# Predict on a copy so dfTest keeps the true targets; predict() returns the
# frame it was given, with the last column replaced by the predictions
dfTestPredict = model.predict(dfTest.copy())

# plot the test target vs prediction in scatter plot. also plot the y = x line
plt.scatter(dfTest.iloc[:,-1], dfTestPredict.iloc[:,-1], s=5, marker='o')
plt.plot([0, dfTest.iloc[:,-1].max()], [0, dfTest.iloc[:,-1].max()], color='black', linewidth=1)
plt.title("Test Set")
plt.xlabel("Target")
plt.ylabel("Prediction")
plt.show()