# Imports

In [1]:
import torch
import torch.nn as nn
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Data extraction and preprocessing

In [2]:
# Read the raw dataset from the CSV file that sits next to the notebook
csv_file = "HouseTS_with_images.csv"
df = pd.read_csv(filepath_or_buffer=csv_file)

In [3]:
# How many rows does each zipcode contribute?
zipcode_column = df["zipcode"]
count_per_zipcode = zipcode_column.value_counts()

# Mean of those per-zipcode row counts
average_count = count_per_zipcode.mean()

print(f"Average number of entries per zipcode: {average_count:.2f}")

Average number of entries per zipcode: 142.00


In [4]:
# Parse the date column so that differences are computed on real timestamps
df['date'] = pd.to_datetime(df['date'])

# Keep the rows of a single zipcode, ordered chronologically
zipcode = 20001
is_target_zip = df['zipcode'] == zipcode
df_zip = df.loc[is_target_zip].sort_values(by='date')

# Gap between each observation and the previous one; the first row has no
# predecessor, so its missing value is dropped
time_diffs = df_zip['date'].diff().dropna()

# Mean gap between consecutive observations (a Timedelta)
average_timestep = time_diffs.mean()

print(f"Average timestep for zipcode {zipcode}: {average_timestep}")
print(f"In days: {average_timestep.days} days")

Average timestep for zipcode 20001: 30 days 10:33:11.489361702
In days: 30 days


In [5]:
# Names of every column whose dtype is not numeric
non_numeric_frame = df.select_dtypes(exclude=['number'])
non_numeric_cols = list(non_numeric_frame.columns)

print("Non-numeric columns in the dataset:")
for col in non_numeric_cols:
    print(col)

Non-numeric columns in the dataset:
date
city
city_full


In [6]:
# Distinct values of the 'city_full' column
unique_cities = df['city_full'].unique()

# The label names the column that is actually inspected ('city_full', not 'city')
print("Unique values in the 'city_full' column:")
for city in unique_cities:
    print(city)

# Remove both city columns from the frame. They are the non-numeric columns
# (besides 'date') reported earlier in the notebook.
# NOTE(review): re-running this cell raises KeyError because the columns are
# already gone — run it once per kernel session.
df = df.drop(columns=['city', 'city_full'])

Unique values in the 'city' column:
DC_Metro


In [7]:
# Sliding-window configuration: rows per window and stride between window starts
window_length, step_size = 12, 1

In [8]:
# Make sure dates are real timestamps, then order rows by zipcode and date
# with a fresh 0..n-1 index
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=['zipcode', 'date'], ignore_index=True)

# Accumulators filled by the per-zipcode extraction loop
all_windows, all_meta = [], []

def extract_windows(df_sub, window_len, step, drop_cols=('date', 'zipcode', 'year')):
    """Cut a frame into fixed-length sliding windows of feature values.

    Args:
        df_sub: DataFrame whose rows are already in the desired order.
        window_len: number of consecutive rows in each window.
        step: offset between the first rows of consecutive windows.
        drop_cols: columns removed before the values are extracted
            (defaults to the identifier columns 'date', 'zipcode', 'year').

    Returns:
        Array of shape (num_windows, window_len, num_features). When the frame
        is shorter than ``window_len`` the result has shape
        (0, window_len, num_features), so it can still be stacked with the
        windows of other groups.
    """
    # Dropping the columns once is enough: the result does not depend on the window.
    values = df_sub.drop(columns=list(drop_cols)).to_numpy()
    starts = range(0, len(df_sub) - window_len + 1, step)
    if len(starts) == 0:
        return np.empty((0, window_len, values.shape[1]), dtype=values.dtype)
    return np.stack([values[start:start + window_len] for start in starts])

# Extract windows for each zipcode
for zipcode, group in df.groupby('zipcode'):
    group = group.reset_index(drop=True)
    # Groups shorter than one window cannot produce any sample
    if len(group) < window_length:
        continue

    windows = extract_windows(group, window_length, step_size)
    all_windows.append(windows)

    # Save metadata (zipcode, start date, start year) for every window.
    # The start offsets must follow the same stride as extract_windows,
    # otherwise metadata and windows drift apart when step_size != 1.
    window_starts = range(0, len(group) - window_length + 1, step_size)
    window_meta = [(zipcode, group.loc[i, 'date'], group.loc[i, 'year']) for i in window_starts]
    all_meta.extend(window_meta)

# Concatenate all windows
X = np.vstack(all_windows)  # shape: (num_samples, window_length, num_features without date/zipcode/year)

# Each window must have exactly one metadata entry
assert len(all_meta) == len(X), "metadata and windows are misaligned"

print("Shape of X:", X.shape)
print("Number of metadata entries:", len(all_meta))

# Example of printing first metadata
print("Example metadata for window 0:", all_meta[0])

Shape of X: (40348, 12, 34)
Number of metadata entries: 40348
Example metadata for window 0: (20001, Timestamp('2012-03-31 00:00:00'), 2012)


In [9]:
from sklearn.utils import shuffle

# Shuffle windows and their metadata with the same permutation.
# The windows are rebuilt from `all_windows` rather than read from `X`, so
# re-running this cell does not shuffle an already-shuffled `X` against the
# still-unshuffled `all_meta` (which would silently misalign `window_ids`).
# NOTE(review): windows from the same zipcode overlap (stride `step_size`), so
# a random split puts near-duplicate windows in train/val/test — consider
# splitting by zipcode or by time instead.
X, window_ids = shuffle(np.vstack(all_windows), all_meta, random_state=42)

# 70% train / 10% validation / remainder test
n_samples = len(X)
train_size = int(0.7 * n_samples)
val_size = int(0.1 * n_samples)
test_size = n_samples - train_size - val_size

val_end = train_size + val_size
X_train = X[:train_size]
X_val = X[train_size:val_end]
X_test = X[val_end:]

# Encoder and Decoder

Our goal was to build an encoder-decoder model that learns a compressed representation of the input time series, so that similar time series can be searched for in a lower-dimensional space, speeding up the task of finding the k nearest neighbours. <br> The encoder takes as input a tensor of shape (batch_size, seq_len, num_features) and compresses it into a tensor of shape (batch_size, embedding_dim), while the decoder takes the output of the encoder and tries to reconstruct the original input.

In [10]:
class Chomp1d(nn.Module):
    """Remove the last ``chomp_size`` entries along the third axis of the input."""

    def __init__(self, chomp_size):
        super().__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        """Return ``x`` without its trailing ``chomp_size`` steps (unchanged when 0)."""
        if self.chomp_size > 0:
            return x[:, :, :-self.chomp_size]
        return x

class TemporalBlock(nn.Module):
    """Residual block made of two dilated 1-D convolutions.

    Each convolution is padded by ``(kernel_size - 1) * dilation`` and followed
    by a ``Chomp1d`` of the same size, then ReLU and dropout. The block input is
    added back to the result (through a 1x1 convolution when the channel counts
    differ) and passed through a final ReLU.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation, dropout):
        super().__init__()
        # Amount of padding that makes the convolution fully causal once the
        # trailing steps are chomped off
        causal_pad = dilation * (kernel_size - 1)

        # First conv stage
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size,
                               padding=causal_pad, dilation=dilation)
        self.chomp1 = Chomp1d(causal_pad)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        # Second conv stage
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size,
                               padding=causal_pad, dilation=dilation)
        self.chomp2 = Chomp1d(causal_pad)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        stages = [
            self.conv1, self.chomp1, self.relu1, self.dropout1,
            self.conv2, self.chomp2, self.relu2, self.dropout2,
        ]
        self.net = nn.Sequential(*stages)

        # 1x1 convolution to match channel counts on the skip path, if needed
        if in_channels != out_channels:
            self.downsample = nn.Conv1d(in_channels, out_channels, 1)
        else:
            self.downsample = None
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the two conv stages and add the (possibly projected) input."""
        if self.downsample is None:
            res = x
        else:
            res = self.downsample(x)
        out = self.net(x)
        if out.shape != res.shape:
            # Fallback for edge cases: crop both tensors to the shorter length
            # on the last axis so that they can be summed
            shared_len = min(out.size(-1), res.size(-1))
            out, res = out[..., :shared_len], res[..., :shared_len]
        return self.relu(out + res)

# Encoder
class TCNEncoder(nn.Module):
    """Stack of TemporalBlocks followed by average pooling and a linear projection.

    Block ``i`` uses dilation ``2 ** i`` and outputs ``num_channels[i]`` channels.
    """

    def __init__(self, input_dim, emb_dim, num_channels, kernel_size=3, dropout=0.2):
        super().__init__()
        # Channel width entering each block, followed by the final width
        widths = [input_dim, *num_channels]
        blocks = [
            TemporalBlock(widths[level], widths[level + 1], kernel_size, 2 ** level, dropout)
            for level in range(len(num_channels))
        ]
        self.tcn = nn.Sequential(*blocks)

        # Projection from [B, C, T] to [B, emb_dim]
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.project = nn.Linear(num_channels[-1], emb_dim)

    def forward(self, x):
        """Encode ``x`` of layout [B, T, D] into an embedding of layout [B, emb_dim]."""
        channels_first = x.permute(0, 2, 1)           # [B, T, D] → [B, D, T]
        features = self.tcn(channels_first)           # [B, C, T]
        pooled = self.pool(features).squeeze(-1)      # [B, C]
        return self.project(pooled)                   # [B, emb_dim]

# Decoder
class TCNDecoder(nn.Module):
    """Expand an embedding into a sequence and refine it with TemporalBlocks.

    A linear layer maps the embedding to ``num_channels[0] * seq_len`` values,
    which are viewed as [B, C0, T]. Block ``i`` maps ``num_channels[i]`` to
    ``num_channels[i + 1]`` channels with dilation ``2 ** i``; a final 1x1
    convolution produces ``output_dim`` channels.
    """

    def __init__(self, emb_dim, output_dim, seq_len, num_channels, kernel_size=3, dropout=0.2):
        super().__init__()
        self.seq_len = seq_len
        self.output_dim = output_dim

        # Project embedding back to a sequence shape: [B, C, T]
        self.expand = nn.Linear(emb_dim, num_channels[0] * seq_len)

        # One block per consecutive pair of channel widths
        channel_pairs = zip(num_channels[:-1], num_channels[1:])
        blocks = [
            TemporalBlock(in_ch, out_ch, kernel_size, 2 ** level, dropout)
            for level, (in_ch, out_ch) in enumerate(channel_pairs)
        ]
        self.tcn = nn.Sequential(*blocks)
        self.out_proj = nn.Conv1d(num_channels[-1], output_dim, kernel_size=1)

    def forward(self, x):
        """Decode an embedding of layout [B, emb_dim] into a sequence of layout [B, T, D]."""
        flat = self.expand(x)                               # [B, C0 * T]
        seq = flat.view(flat.size(0), -1, self.seq_len)     # [B, C0, T]
        seq = self.tcn(seq)                                 # [B, Cn, T]
        recon = self.out_proj(seq)                          # [B, D, T]
        return recon.permute(0, 2, 1)                       # [B, T, D]
    
class TCNAutoencoder(nn.Module):
    """Autoencoder pairing a TCNEncoder with a TCNDecoder.

    The decoder receives the encoder's channel list in reverse order.
    """

    def __init__(self, input_dim, emb_dim, seq_len, channels, kernel_size=3, dropout=0.2):
        super().__init__()
        reversed_channels = channels[::-1]
        self.encoder = TCNEncoder(input_dim, emb_dim, channels, kernel_size, dropout)
        self.decoder = TCNDecoder(emb_dim, input_dim, seq_len, reversed_channels, kernel_size, dropout)

    def forward(self, x, only_encoder = False):
        """Return the reconstruction of ``x`` or, if ``only_encoder``, its embedding.

        ``x`` has layout [B, T, D]; the embedding has layout [B, emb_dim].
        """
        z = self.encoder(x)       # [B, emb_dim]
        return z if only_encoder else self.decoder(z)

In [11]:
# Compute the per-feature statistics of the train split and normalize every
# split with them (validation and test never contribute to the statistics)

mean = X_train.mean(axis=(0, 1), keepdims=True)  # shape (1, 1, num_features)
std = X_train.std(axis=(0, 1), keepdims=True)
# A constant feature has std == 0; dividing by it would fill that feature with
# NaN/inf, so such features are only centred, not scaled
std = np.where(std == 0, 1.0, std)

X_train_norm = (X_train - mean) / std
X_val_norm = (X_val - mean) / std
X_test_norm = (X_test - mean) / std

print(mean, std)

# Sanity check: the normalized train split should have mean ~0 and std ~1.
# Separate names are used so that `mean` and `std` keep holding the training
# statistics, which are what is needed to normalize new data or to undo the
# normalization later.
norm_mean = X_train_norm.mean(axis=(0, 1), keepdims=True)  # shape (1, 1, num_features)
norm_std = X_train_norm.std(axis=(0, 1), keepdims=True)

print(norm_mean, norm_std)

[[[4.54852826e+05 5.58716962e+05 2.27982484e+02 2.46098550e+02
   6.83864232e+01 7.45339848e+01 8.01239924e+01 4.91713935e+01
   5.41263056e+01 9.88508572e-01 2.73734409e-01 3.22142625e-01
   2.36917260e+01 7.18886096e-01 2.33493255e+00 1.18643558e+00
   8.91169936e+01 9.04543250e+01 5.97777915e+01 4.15741364e+00
   1.16968157e+01 1.90285746e+04 3.99743190e+01 4.78702971e+04
   1.87499750e+04 7.36403461e+03 1.59877327e+03 4.40833370e+05
   1.08915696e+04 6.19120623e+02 1.82830710e+04 1.82830710e+04
   9.33441450e+03 4.57644868e+05]]] [[[2.47671163e+05 8.77060063e+06 1.15036342e+02 5.82713229e+02
   7.01868096e+01 7.54677556e+01 8.32473874e+01 4.95942947e+01
   7.83625340e+01 3.22186448e-02 1.94536395e-01 2.48884578e-01
   4.77475403e+01 1.64222823e+00 4.26626174e+00 1.79608954e+00
   1.57134652e+02 1.90464107e+02 8.16188975e+01 8.63331047e+00
   1.59101982e+01 1.75557344e+04 7.11650516e+00 1.92278578e+04
   1.73431754e+04 6.86866352e+03 4.32730094e+02 1.83839663e+05
   1.01493864e+04 7

In [12]:
# Turn each normalized split into a float32 tensor
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train_norm, X_val_norm, X_test_norm)
)

# Wrap the tensors in datasets/loaders; only the training loader reshuffles
# its samples each epoch
batch_size = 32
train_loader = DataLoader(TensorDataset(X_train_tensor), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test_tensor), batch_size=batch_size, shuffle=False)

# Model training

In [14]:
# Sequence length and feature count are read off the training windows
seq_len = X_train.shape[1]
num_features = X_train.shape[2]

# Parameters
B, T, D = batch_size, seq_len, num_features
emb_dim = 64
channels = [32, 64, 128]

# The device has to be chosen before the model is moved onto it; defining it
# after the model raised NameError on a fresh kernel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate model
model = TCNAutoencoder(input_dim=D, emb_dim=emb_dim, seq_len=T, channels=channels).to(device)

def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

print(f"Model parameters: {count_parameters(model):,}")

# Optimise every parameter of the autoencoder with Adam on a mean-squared
# reconstruction error
params = list(model.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3)
loss_fn = nn.MSELoss(reduction="mean")

# Length of the first training run
epochs = 100

# Best checkpoint score seen so far; starts at a large sentinel value
best_loss = 1e10

# File that receives the best model's state_dict
encoder_path = "/kaggle/working/encoder.pth"

def evaluate(model, loader):
    """Return the reconstruction loss of ``model`` averaged over the batches of ``loader``.

    The model is switched to eval mode and gradients are disabled. The
    module-level ``device`` and ``loss_fn`` are used. The result is the mean of
    the per-batch losses (sum of batch losses divided by the number of batches).
    """
    model.eval()
    running = 0
    with torch.no_grad():
        for batch in loader:
            (inputs,) = batch
            inputs = inputs.to(device)
            running += loss_fn(model(inputs), inputs).item()
    return running / len(loader)

# First training run: epochs 1..epochs
for epoch in range(1, epochs + 1):
    model.train()
    train_loss = 0

    for (x,) in train_loader:
        x = x.to(device)

        # Clear old gradients, reconstruct the batch, back-propagate, update
        optimizer.zero_grad()
        x_hat = model(x)
        loss = loss_fn(x_hat, x)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Average the per-batch training losses
    train_loss = train_loss / len(train_loader)
    val_loss = evaluate(model, val_loader)
    test_loss = evaluate(model, test_loader)

    # NOTE(review): the checkpoint criterion includes the test loss, so the
    # saved model is selected using test data — confirm this is intended.
    checkpoint_score = val_loss + test_loss
    if checkpoint_score < best_loss:
        best_loss = checkpoint_score
        # Saves the weights of the whole autoencoder
        torch.save(model.state_dict(), encoder_path)
        print("Models saved successfully.")

    print(f"Epoch {epoch:02d} | Train MSE: {train_loss:.6f} | Val MSE: {val_loss:.6f} | Test MSE: {test_loss:.6f}")

Model parameters: 276,322
Models saved successfully.
Epoch 01 | Train MSE: 0.329717 | Val MSE: 0.194305 | Test MSE: 0.226819
Models saved successfully.
Epoch 02 | Train MSE: 0.211920 | Val MSE: 0.149754 | Test MSE: 0.189440
Models saved successfully.
Epoch 03 | Train MSE: 0.182255 | Val MSE: 0.138717 | Test MSE: 0.172429
Models saved successfully.
Epoch 04 | Train MSE: 0.159782 | Val MSE: 0.105315 | Test MSE: 0.146662
Epoch 05 | Train MSE: 0.141739 | Val MSE: 0.111816 | Test MSE: 0.143787
Models saved successfully.
Epoch 06 | Train MSE: 0.141896 | Val MSE: 0.099963 | Test MSE: 0.134545
Models saved successfully.
Epoch 07 | Train MSE: 0.132991 | Val MSE: 0.098059 | Test MSE: 0.122779
Models saved successfully.
Epoch 08 | Train MSE: 0.124939 | Val MSE: 0.088298 | Test MSE: 0.110709
Models saved successfully.
Epoch 09 | Train MSE: 0.125151 | Val MSE: 0.085099 | Test MSE: 0.110870
Models saved successfully.
Epoch 10 | Train MSE: 0.115700 | Val MSE: 0.081556 | Test MSE: 0.105681
Epoch 11 | 

In [15]:
# Continue training the same model and optimizer for epochs 101..150
for epoch in range(101, 151):
    model.train()
    train_loss = 0

    for (x,) in train_loader:
        x = x.to(device)

        # Clear old gradients, reconstruct the batch, back-propagate, update
        optimizer.zero_grad()
        x_hat = model(x)
        loss = loss_fn(x_hat, x)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Average the per-batch training losses
    train_loss = train_loss / len(train_loader)
    val_loss = evaluate(model, val_loader)
    test_loss = evaluate(model, test_loader)

    # Same criterion as the previous run: validation loss plus test loss,
    # compared against the best value carried over in `best_loss`
    checkpoint_score = val_loss + test_loss
    if checkpoint_score < best_loss:
        best_loss = checkpoint_score
        # Saves the weights of the whole autoencoder
        torch.save(model.state_dict(), encoder_path)
        print("Models saved successfully.")

    print(f"Epoch {epoch:02d} | Train MSE: {train_loss:.6f} | Val MSE: {val_loss:.6f} | Test MSE: {test_loss:.6f}")

Epoch 101 | Train MSE: 0.043482 | Val MSE: 0.033593 | Test MSE: 0.048350
Models saved successfully.
Epoch 102 | Train MSE: 0.044059 | Val MSE: 0.029884 | Test MSE: 0.047465
Epoch 103 | Train MSE: 0.040157 | Val MSE: 0.030338 | Test MSE: 0.047257
Models saved successfully.
Epoch 104 | Train MSE: 0.040385 | Val MSE: 0.031780 | Test MSE: 0.044534
Epoch 105 | Train MSE: 0.044626 | Val MSE: 0.030329 | Test MSE: 0.047372
Epoch 106 | Train MSE: 0.041013 | Val MSE: 0.030838 | Test MSE: 0.052234
Epoch 107 | Train MSE: 0.044169 | Val MSE: 0.029955 | Test MSE: 0.047066
Models saved successfully.
Epoch 108 | Train MSE: 0.039915 | Val MSE: 0.029572 | Test MSE: 0.042532
Epoch 109 | Train MSE: 0.040812 | Val MSE: 0.028459 | Test MSE: 0.046656
Epoch 110 | Train MSE: 0.046086 | Val MSE: 0.031689 | Test MSE: 0.049078
Epoch 111 | Train MSE: 0.039199 | Val MSE: 0.033407 | Test MSE: 0.053452
Epoch 112 | Train MSE: 0.039879 | Val MSE: 0.029803 | Test MSE: 0.044940
Epoch 113 | Train MSE: 0.039956 | Val MSE: 

In [16]:
# Continue training the same model and optimizer for epochs 151..200
for epoch in range(151, 201):
    model.train()
    train_loss = 0

    for (x,) in train_loader:
        x = x.to(device)

        # Clear old gradients, reconstruct the batch, back-propagate, update
        optimizer.zero_grad()
        x_hat = model(x)
        loss = loss_fn(x_hat, x)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Average the per-batch training losses
    train_loss = train_loss / len(train_loader)
    val_loss = evaluate(model, val_loader)
    test_loss = evaluate(model, test_loader)

    # Same criterion as the previous runs: validation loss plus test loss,
    # compared against the best value carried over in `best_loss`
    checkpoint_score = val_loss + test_loss
    if checkpoint_score < best_loss:
        best_loss = checkpoint_score
        # Saves the weights of the whole autoencoder
        torch.save(model.state_dict(), encoder_path)
        print("Models saved successfully.")

    print(f"Epoch {epoch:02d} | Train MSE: {train_loss:.6f} | Val MSE: {val_loss:.6f} | Test MSE: {test_loss:.6f}")

Epoch 151 | Train MSE: 0.035499 | Val MSE: 0.026993 | Test MSE: 0.047837
Epoch 152 | Train MSE: 0.036469 | Val MSE: 0.024778 | Test MSE: 0.043585
Epoch 153 | Train MSE: 0.037096 | Val MSE: 0.026355 | Test MSE: 0.048484
Epoch 154 | Train MSE: 0.033922 | Val MSE: 0.025274 | Test MSE: 0.045618
Epoch 155 | Train MSE: 0.031814 | Val MSE: 0.025095 | Test MSE: 0.050864
Epoch 156 | Train MSE: 0.035852 | Val MSE: 0.025641 | Test MSE: 0.047051
Epoch 157 | Train MSE: 0.035775 | Val MSE: 0.024176 | Test MSE: 0.044478
Epoch 158 | Train MSE: 0.032847 | Val MSE: 0.024202 | Test MSE: 0.046343
Models saved successfully.
Epoch 159 | Train MSE: 0.037993 | Val MSE: 0.023369 | Test MSE: 0.040921
Models saved successfully.
Epoch 160 | Train MSE: 0.031047 | Val MSE: 0.023494 | Test MSE: 0.039360
Epoch 161 | Train MSE: 0.034264 | Val MSE: 0.025722 | Test MSE: 0.040771
Epoch 162 | Train MSE: 0.034376 | Val MSE: 0.024751 | Test MSE: 0.045739
Epoch 163 | Train MSE: 0.029455 | Val MSE: 0.023503 | Test MSE: 0.0490

# Save the models

In [None]:
# Disabled cell: the manual save below is wrapped in a string literal, so none
# of it runs. The training loops already write the best state_dict to
# `encoder_path`.
"""encoder_path = "/kaggle/working/encoder.pth"

# Saves the weights
torch.save(model.state_dict(), encoder_path)

print("Models saved successfully.")"""

In [17]:
# Rebuild the architecture with the same hyper-parameters used for training
model_loaded = TCNAutoencoder(input_dim=D, emb_dim=emb_dim, seq_len=T, channels=channels)

# map_location places the stored tensors on the currently selected device, so
# a checkpoint written during a CUDA run can also be loaded on a CPU-only machine
state_dict = torch.load(encoder_path, map_location=device)
model_loaded.load_state_dict(state_dict)

model_loaded.to(device)

# Inference mode: disables dropout
model_loaded.eval()

print("Models reloaded successfully.")

Models reloaded successfully.
