## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

## Load the data

In [2]:
# Randomly read one of the training files

file_num = 1
file_num = random.randint(1, 5)

train_data_path = "data/train_data" + str(file_num) + ".csv"
train_label_path = "data/train_label" + str(file_num) + ".csv"

df = pd.read_csv(train_data_path)
labels = pd.read_csv(train_label_path)

# Combine both DataFrames
df["BEDS"] = labels["BEDS"]

## Data Preprocessing

In [3]:
df.head()

Unnamed: 0,BROKERTITLE,TYPE,PRICE,BATH,PROPERTYSQFT,ADDRESS,STATE,MAIN_ADDRESS,ADMINISTRATIVE_AREA_LEVEL_2,LOCALITY,SUBLOCALITY,STREET_NAME,LONG_NAME,FORMATTED_ADDRESS,LATITUDE,LONGITUDE,BEDS
0,Brokered by Douglas Elliman -111 Fifth Ave,Condo for sale,315000,2.0,1400,2 E 55th St Unit 803,"New York, NY 10022","2 E 55th St Unit 803, New York, NY 10022",New York County,New York,Manhattan,East 55th Street,Regis Residence,"Regis Residence, 2 E 55th St #803, New York, N...",40.761255,-73.974483,2
1,Brokered by Serhant,Condo for sale,195000000,10.0,17545,Central Park Tower Penthouse-217 W 57th New Yo...,"New York, NY 10019",Central Park Tower Penthouse-217 W 57th New Yo...,United States,New York,New York County,New York,West 57th Street,"217 W 57th St, New York, NY 10019, USA",40.766393,-73.980991,7
2,Brokered by Sowae Corp,House for sale,260000,2.0,2015,620 Sinclair Ave,"Staten Island, NY 10312","620 Sinclair Ave, Staten Island, NY 10312",United States,New York,Richmond County,Staten Island,Sinclair Avenue,"620 Sinclair Ave, Staten Island, NY 10312, USA",40.541805,-74.196109,4
3,Brokered by Sotheby's International Realty - E...,Townhouse for sale,55000000,2.5,14175,5 E 64th St,"New York, NY 10065","5 E 64th St, New York, NY 10065",United States,New York,New York County,New York,East 64th Street,"5 E 64th St, New York, NY 10065, USA",40.767224,-73.969856,7
4,Brokered by Sowae Corp,House for sale,690000,2.0,4004,584 Park Pl,"Brooklyn, NY 11238","584 Park Pl, Brooklyn, NY 11238",United States,New York,Kings County,Brooklyn,Park Place,"584 Park Pl, Brooklyn, NY 11238, USA",40.674363,-73.958725,5


In [4]:
# Drop some columns

cols_to_drop = [
    "BROKERTITLE", "ADDRESS", "STATE", "MAIN_ADDRESS", "ADMINISTRATIVE_AREA_LEVEL_2",
    "LOCALITY", "SUBLOCALITY", "STREET_NAME", "LONG_NAME", "FORMATTED_ADDRESS",
]

df = df.drop(cols_to_drop, axis=1)

# df = df.drop(["BROKERTITLE"], axis=1)
# df = df.drop(["ADDRESS"], axis=1)
# df = df.drop(["STATE"], axis=1)
# df = df.drop(["MAIN_ADDRESS"], axis=1)
# df = df.drop(["ADMINISTRATIVE_AREA_LEVEL_2"], axis=1)
# df = df.drop(["LOCALITY"], axis=1)
# df = df.drop(["SUBLOCALITY"], axis=1)
# df = df.drop(["STREET_NAME"], axis=1)
# df = df.drop(["LONG_NAME"], axis=1)
# df = df.drop(["FORMATTED_ADDRESS"], axis=1)

In [5]:
df.head()

Unnamed: 0,TYPE,PRICE,BATH,PROPERTYSQFT,LATITUDE,LONGITUDE,BEDS
0,Condo for sale,315000,2.0,1400,40.761255,-73.974483,2
1,Condo for sale,195000000,10.0,17545,40.766393,-73.980991,7
2,House for sale,260000,2.0,2015,40.541805,-74.196109,4
3,Townhouse for sale,55000000,2.5,14175,40.767224,-73.969856,7
4,House for sale,690000,2.0,4004,40.674363,-73.958725,5


In [6]:
# Encode the TYPE column
encoder_TYPE = OneHotEncoder(sparse_output=False)
encoded_data = encoder_TYPE.fit_transform(df[["TYPE"]])
encoded_df = pd.DataFrame(encoded_data, columns=encoder_TYPE.get_feature_names_out(["TYPE"]))
df = pd.concat([df.drop(["TYPE"], axis=1), encoded_df], axis=1)

In [7]:
df.head()

Unnamed: 0,PRICE,BATH,PROPERTYSQFT,LATITUDE,LONGITUDE,BEDS,TYPE_Co-op for sale,TYPE_Coming Soon,TYPE_Condo for sale,TYPE_Contingent,TYPE_For sale,TYPE_Foreclosure,TYPE_House for sale,TYPE_Land for sale,TYPE_Mobile house for sale,TYPE_Multi-family home for sale,TYPE_Pending,TYPE_Townhouse for sale
0,315000,2.0,1400,40.761255,-73.974483,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,195000000,10.0,17545,40.766393,-73.980991,7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,260000,2.0,2015,40.541805,-74.196109,4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,55000000,2.5,14175,40.767224,-73.969856,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,690000,2.0,4004,40.674363,-73.958725,5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Scale the PRICE column
scaler_PRICE = MinMaxScaler()
df["PRICE_SCALED"] = scaler_PRICE.fit_transform(df[["PRICE"]])
df = df.drop(["PRICE"], axis=1)

# Scale the BATH column
scaler_BATH = MinMaxScaler()
df["BATH_SCALED"] = scaler_BATH.fit_transform(df[["BATH"]])
df = df.drop(["BATH"], axis=1)

# Scale the PROPERTYSQFT column
scaler_PROPERTYSQFT = MinMaxScaler()
df["PROPERTYSQFT_SCALED"] = scaler_PROPERTYSQFT.fit_transform(df[["PROPERTYSQFT"]])
df = df.drop(["PROPERTYSQFT"], axis=1)

# Scale the LATITUDE column
scaler_LATITUDE = MinMaxScaler()
df["LATITUDE_SCALED"] = scaler_LATITUDE.fit_transform(df[["LATITUDE"]])
df = df.drop(["LATITUDE"], axis=1)

# Scale the LONGITUDE column
scaler_LONGITUDE = MinMaxScaler()
df["LONGITUDE_SCALED"] = scaler_LONGITUDE.fit_transform(df[["LONGITUDE"]])
df = df.drop(["LONGITUDE"], axis=1)

In [9]:
df.head()

Unnamed: 0,BEDS,TYPE_Co-op for sale,TYPE_Coming Soon,TYPE_Condo for sale,TYPE_Contingent,TYPE_For sale,TYPE_Foreclosure,TYPE_House for sale,TYPE_Land for sale,TYPE_Mobile house for sale,TYPE_Multi-family home for sale,TYPE_Pending,TYPE_Townhouse for sale,PRICE_SCALED,BATH_SCALED,PROPERTYSQFT_SCALED,LATITUDE_SCALED,LONGITUDE_SCALED
0,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000146,0.046512,0.017675,0.633396,0.503643
1,7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090803,0.232558,0.26496,0.645833,0.49177
2,4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.00012,0.046512,0.027095,0.102276,0.099262
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.02561,0.05814,0.213344,0.647842,0.512086
4,5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.00032,0.046512,0.057559,0.423098,0.532397


In [10]:
X = np.array(df.drop(["BEDS"], axis=1), dtype="float32")
y = np.array(df[["BEDS"]], dtype="float32")

## Linear Regression (cuz why not?)

In [11]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model = lr_model.fit(X, y)

pred = [round(x[0]) for x in lr_model.predict(X)]
label = [round(x[0]) for x in y]

mse = mean_squared_error(label, pred)
rmse = root_mean_squared_error(label, pred)
r2 = r2_score(label, pred)
acc = accuracy_score(label, pred)

print("Mean Squared Error =", mse)
print("Root Mean Squared Error =", rmse)
print("R Squared Error =", r2)
print(f"Accuracy = {round(acc * 100, 3)} %")

Mean Squared Error = 2.012349309026757
Root Mean Squared Error = 1.4185729833275258
R Squared Error = 0.6596304296874749
Accuracy = 40.459 %


## Single Layer NN using PyTorch

In [12]:
# Import PyTorch
import torch
from torch import nn

# Set device to run on
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device = "cpu"

In [13]:
# Create custom dataset
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X).to(device)
        self.y = torch.tensor(y).to(device)

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
data = MyDataset(X, y)

In [14]:
# Define the model
class NeuralNetwork(nn.Module):
    def __init__(self, input_shape, output_shape):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_shape, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_shape),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
    
model = NeuralNetwork(data.X.shape[1], data.y.shape[1]).to(device)

In [15]:
# Model parameters
loss_fn = nn.MSELoss()
learning_rate = 1e-3
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training parameters
batch_size = 128
num_epochs = 100

In [16]:
# Function to train the model
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader)
    model.train()

    train_loss = 0

    for batch, (X, y) in enumerate(dataloader):
        pred = model(X)
        loss = loss_fn(pred, y)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    avg_train_loss = train_loss / batch_size

    return avg_train_loss

In [17]:
# Train the model
from torch.utils.data import DataLoader
from tqdm import tqdm

train_dataloader = DataLoader(data, batch_size, shuffle=True)

best_loss = 999

for epoch in tqdm(range(0, num_epochs)):
    train_loss = train(train_dataloader, model, loss_fn, optimizer)
    if train_loss <= best_loss:
        best_loss = train_loss
        torch.save(model, "model.pt")

print(f"Best model loss = {best_loss}")

100%|██████████| 100/100 [00:03<00:00, 29.30it/s]

Best model loss = 0.3345192540436983





In [18]:
saved_model = NeuralNetwork(data.X.shape[1], data.y.shape[1]).to(device)
saved_model = torch.load("model.pt")
pred = []
label = []
for X, y in data:
    pred.append(round(model(X).cpu().detach().numpy()[0]))
    label.append(y.cpu().detach().numpy()[0])

mse = mean_squared_error(label, pred)
rmse = root_mean_squared_error(label, pred)
r2 = r2_score(label, pred)
acc = accuracy_score(label, pred)

print("Mean Squared Error =", mse)
print("Root Mean Squared Error =", rmse)
print("R Squared Error =", r2)
print(f"Accuracy = {round(acc * 100, 3)} %")

Mean Squared Error = 1.7403704792708028
Root Mean Squared Error = 1.319231018158231
R Squared Error = 0.7056330293455736
Accuracy = 45.398 %
