In [None]:
import numpy as np
import pandas as pd
import torch
import random
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm

from torch.utils.data import DataLoader, Dataset

# Show NumPy floats in fixed-point notation (no scientific notation), three decimals.
np.set_printoptions(suppress=True, precision=3)

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
# City whose street features and cluster labels are used for training.
# Available: hannover, braunschweig, wolfsburg
source_city = 'hannover'

# Load data: feature matrix (column 0 is renamed to `id` further down) and the labels table.
feats_mx = np.load(f'../data/feats_mx2_{source_city}.npy')
labels = pd.read_csv(f'../data/labels_{source_city}.csv')


# Seed every RNG used by the notebook (torch, NumPy, stdlib random) for repeatable runs.
torch.manual_seed(123)
np.random.seed(123)
random.seed(123)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # A bare expression inside an `if` is never echoed by the notebook, so print explicitly.
    print(f"Using GPU: {torch.cuda.get_device_name()}")

# Hyper-parameters
embedding_dims = 256   # width of the embedding produced by the network
batch_size = 32
epochs = 300
k_avg_streets = 10     # number of closest source streets matched to each target street

# Triplet Loss Margin
margin = 1

In [None]:
f"Cluster Size: {labels['cluster'].max()+1}"

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column except column 0 (left untouched), writing the result
# back into feats_mx. The fitted `scaler` is reused later in the notebook.
scaler = StandardScaler()
feature_columns = feats_mx[..., 1:]
scaler.fit(feature_columns)
feats_mx[..., 1:] = scaler.transform(feature_columns)

In [None]:
feats_mx[0,:]

In [None]:
# Build the training table: wrap the feature matrix in a DataFrame, call its
# first column 'id', inner-join it onto the labels (there are fewer labels than
# street feature rows) and finally discard the join key.
feats_df = pd.DataFrame(feats_mx)
feats_df = feats_df.rename(columns={feats_df.columns[0]: 'id'})
train_df = labels.merge(feats_df, on="id").drop(columns=['id'])

In [None]:
train_df.head()

In [None]:
train_df.iloc[:,:20]

In [None]:
##### Create Dataset Class ######

class RoadData(Dataset):
    """Street-feature dataset: triplets for training, plain feature rows otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        With ``train=True`` the first column holds the label and all remaining
        columns hold the features. With ``train=False`` every column is a feature.
    train : bool
        Selects triplet mode (True) or plain-row mode (False).

    In triplet mode ``ds[i]`` returns ``(anchor, positive, negative, anchor_label)``:
    the positive is a random *other* row with the anchor's label, the negative a
    random row with a different label. If the anchor is the only row carrying its
    label, the anchor itself is returned as the positive.
    """

    def __init__(self, df, train=True):
        self.is_train = train

        if self.is_train:
            self.feats = df.iloc[:, 1:].values.astype(np.float32)
            self.labels = df.iloc[:, 0].values
            # Positional row numbers, NOT df.index: these values are compared with
            # `item` and used to index self.feats, so index labels of a filtered or
            # re-indexed frame would point at the wrong (or non-existent) rows.
            self.index = np.arange(len(df))
        else:
            self.feats = df.values.astype(np.float32)

    def __len__(self):
        return len(self.feats)

    @staticmethod
    def _pick(candidates):
        """Return one uniformly random element of a non-empty 1-D array."""
        # Index through randrange instead of handing the array to random.choice,
        # so no truthiness/emptiness probing is ever done on a NumPy array.
        return candidates[random.randrange(len(candidates))]

    def __getitem__(self, item):
        anchor = self.feats[item]

        if not self.is_train:
            return anchor

        anchor_label = self.labels[item]
        same_label = self.labels == anchor_label
        not_anchor = self.index != item

        positive_list = self.index[same_label & not_anchor]
        if len(positive_list) > 0:
            positive = self.feats[self._pick(positive_list)]
        else:
            # Single-member cluster: no other positive exists. Fall back to the
            # anchor itself so the sample still contributes its negative term.
            positive = anchor

        negative_list = self.index[~same_label & not_anchor]
        if len(negative_list) == 0:
            raise IndexError(
                "Cannot sample a negative: every row shares the anchor's label"
            )
        negative = self.feats[self._pick(negative_list)]

        return anchor, positive, negative, anchor_label
        



Papers have shown that selecting the positive and negative samples at random is not the best approach, so there is room for improvement here.

In [None]:
#### Training dataset (triplet mode) and its shuffled mini-batch loader ####
train_ds = RoadData(df=train_df, train=True)
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)

In [None]:
train_df.shape

In [None]:
##### Implementation of Model and Loss ######

# Model quite simple. Try different ones?
class TripletNet(nn.Module):
    """Fully connected embedding network used for triplet training.

    Parameters
    ----------
    emb_dim : int
        Width of every hidden layer and of the returned embedding.
    in_features : int
        Number of input features per row. Defaults to 90, the width that was
        previously hard-coded in the first layer.

    Only ``self.dense`` takes part in ``forward``. ``fc1``, ``fc2`` and
    ``ge_extraction`` belong to an alternative two-branch design that is
    currently disabled; they are still constructed, in the same order, so that
    parameter names in ``state_dict`` and the sequence of weight
    initialisations stay unchanged.
    """

    def __init__(self, emb_dim=128, in_features=90):
        super(TripletNet, self).__init__()

        # --- currently unused branches (see class docstring) -----------------
        self.fc1 = nn.Sequential(nn.Linear(16, emb_dim),
                                 nn.ReLU(),
                                 nn.Linear(emb_dim, emb_dim),
                                 nn.ReLU(),
                                 nn.Linear(emb_dim, emb_dim),
                                 nn.ReLU()
                                 )

        self.fc2 = nn.Sequential(nn.Linear(emb_dim + 8, emb_dim),
                                 nn.ReLU(),
                                 nn.Linear(emb_dim, emb_dim),
                                 nn.ReLU(),
                                 nn.Linear(emb_dim, emb_dim)
                                 )

        self.ge_extraction = nn.Sequential(nn.Linear(128, 64),
                                           nn.ReLU(),
                                           nn.Linear(64, 32),
                                           nn.ReLU(),
                                           nn.Linear(32, 16),
                                           nn.ReLU(),
                                           nn.Linear(16, 8),
                                           nn.ReLU()
                                           )

        # --- active network: 9 linear layers with ReLU in between ------------
        # Layout is Linear, (ReLU, Linear) x 8, so the last layer is linear and
        # the embedding is not clipped to non-negative values.
        dense_layers = [nn.Linear(in_features, emb_dim)]
        for _ in range(8):
            dense_layers += [nn.ReLU(), nn.Linear(emb_dim, emb_dim)]
        self.dense = nn.Sequential(*dense_layers)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, emb_dim)`` embeddings."""
        return self.dense(x)


class TripletLoss(nn.Module):
    """Hinge-style triplet loss on squared Euclidean distances.

    For each triplet the loss is ``max(0, d(a, p) - d(a, n) + margin)``;
    the batch result is the mean over all triplets.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def calc_euclidean(self, x1, x2):
        """Row-wise *squared* Euclidean distance (no square root is applied)."""
        delta = x1 - x2
        return torch.pow(delta, 2).sum(dim=1)

    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        """Return the mean triplet loss of the batch as a scalar tensor."""
        d_pos = self.calc_euclidean(anchor, positive)
        d_neg = self.calc_euclidean(anchor, negative)
        per_triplet = nn.functional.relu(d_pos - d_neg + self.margin)
        return torch.mean(per_triplet)

In [None]:
### Model, optimiser and loss ###
# The embedding network is created and moved to the selected device in one go
# (TorchScript compilation of model/loss stays disabled).
model = TripletNet(embedding_dims).to(device)

optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = TripletLoss(margin=margin)

In [None]:
#### TRAINING ####
model.train()
for epoch in tqdm(range(epochs), desc="Epochs"):
    running_loss = []
    batch_iter = tqdm(train_loader, desc="Training", leave=False)
    for anchor, positive, negative, _anchor_label in batch_iter:
        optimizer.zero_grad()

        # The same network embeds all three members of the triplet.
        anchor_out = model(anchor.to(device))
        positive_out = model(positive.to(device))
        negative_out = model(negative.to(device))

        loss = criterion(anchor_out, positive_out, negative_out)
        loss.backward()
        optimizer.step()

        # Keep the batch loss as a detached CPU value for the epoch average.
        running_loss.append(loss.detach().cpu().numpy())

    epoch_loss = np.mean(running_loss)
    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch+1, epochs, epoch_loss))

In [None]:
### Save Params ###
#torch.save(model.state_dict(), "trained_model.pth")

## Test Model

We need to evaluate the model. We do this by application.

For every target street, we select the closest n source streets and calculate the mismatch.

In [None]:
# Cities to transfer between. Available: hannover, braunschweig, wolfsburg
source_city, target_city = 'hannover', 'braunschweig'

# Date range and hour-of-day bounds passed to the traffic-data queries (kept as strings).
train_date_begin, train_date_end = "2019-01-10", "2019-01-31"
hour_from, hour_to = "7", "21"

In [None]:

# Load the road-feature matrices of the source and the target city and order
# their rows by ascending value of column 0 (the GWN input is created in the
# same sorted order).
source_feats_mx = np.load(f'../data/feats_mx2_{source_city}.npy')
source_feats_mx = source_feats_mx[source_feats_mx[:, 0].argsort()]

target_feats_mx = np.load(f'../data/feats_mx2_{target_city}.npy')
target_feats_mx = target_feats_mx[target_feats_mx[:, 0].argsort()]

In [None]:
# Standardize both matrices in place with the scaler fitted earlier in the
# notebook; column 0 is left untouched.
for city_feats in (source_feats_mx, target_feats_mx):
    city_feats[..., 1:] = scaler.transform(city_feats[..., 1:])

In [None]:
### Keep only source roads that have historical speed data ###

# Source roads with bad data quality (too few measurements) are removed here.
import sys
import os
sys.path.append(os.path.join(sys.path[0], '..'))

import yaml
from src import db_requests
from src import data_preperation


# Database credentials come from a local YAML file, not from the notebook.
with open("../db.yaml", 'r') as dbfi:
    db_credentials = yaml.safe_load(dbfi)

# Threshold handed to the query below.
min_measurements = 3000

source_hist_data = db_requests.getTrafficDataMinMeasurements(
    source_city,
    min_measurements,
    train_date_begin,
    train_date_end,
    hour_from,
    hour_to,
    db_credentials=db_credentials,
)

# Ids present in the traffic data; feature rows whose column-0 value is not
# among them are dropped.
source_ids = source_hist_data['id'].unique()
source_feats_mx = source_feats_mx[np.isin(source_feats_mx[:, 0], source_ids)]

In [None]:
# Split each matrix into its row-index -> id lookup (column 0) and the pure
# road features (all remaining columns).
source_idx2id, source_feats_mx = source_feats_mx[:, 0], source_feats_mx[:, 1:]
target_idx2id, target_feats_mx = target_feats_mx[:, 0], target_feats_mx[:, 1:]

In [None]:
feats_mx.shape

In [None]:
source_feats_mx.shape

In [None]:
##### SOURCE EMBEDDINGS #######
# Wrap the source features in the inference-mode dataset; batches are read in
# row order (no shuffling) so embedding i belongs to feature row i.
source_feats_df = pd.DataFrame(source_feats_mx)
source_feats_ds = RoadData(source_feats_df, train=False)
source_loader = DataLoader(source_feats_ds, batch_size=batch_size, shuffle=False, num_workers=4)

# Embed batch by batch without tracking gradients, then stack the results.
with torch.no_grad():
    source_batches = [model(batch.to(device)).cpu().numpy() for batch in tqdm(source_loader)]

source_embeddings = np.concatenate(source_batches)
source_embeddings.shape

In [None]:
##### TARGET EMBEDDINGS #######
# Wrap the target features in the inference-mode dataset; batches are read in
# row order (no shuffling) so embedding i belongs to feature row i.
target_feats_df = pd.DataFrame(target_feats_mx)
target_feats_ds = RoadData(target_feats_df, train=False)
target_loader = DataLoader(target_feats_ds, batch_size=batch_size, shuffle=False, num_workers=4)

# Embed batch by batch without tracking gradients, then stack the results.
with torch.no_grad():
    target_batches = [model(batch.to(device)).cpu().numpy() for batch in tqdm(target_loader)]

target_embeddings = np.concatenate(target_batches)
target_embeddings.shape

In [None]:
# Returns the indices of the k vectors in A closest to b (not sorted!)
def closest_vector_filtered(b, A, sl_filter, k=1):
    """Indices of the ``k`` rows of ``A`` with the smallest squared distance to ``b``.

    Parameters
    ----------
    b : np.ndarray, shape (d,)
        Query vector.
    A : np.ndarray, shape (n, d)
        Candidate vectors.
    sl_filter : boolean mask or index array over the rows of ``A``
        Rows to exclude (the caller passes the rows with a different speed
        limit). Their distance is set to +inf so they are chosen last.
    k : int
        Number of neighbours wanted.

    Returns
    -------
    np.ndarray of int
        ``k`` row indices in no particular order. If ``k >= n`` all ``n``
        indices are returned. NOTE: when fewer than ``k`` rows survive the
        filter, excluded rows are still used to fill the result up to ``k``.
    """
    subs = b[None, :] - A
    sq_dist = np.einsum('ij,ij->i', subs, subs)

    # +inf needs a floating dtype; integer inputs would otherwise overflow.
    if not np.issubdtype(sq_dist.dtype, np.floating):
        sq_dist = sq_dist.astype(float)

    # Use +inf rather than a large finite constant: a real squared distance can
    # exceed any fixed number, which would let a filtered row win.
    sq_dist[sl_filter] = np.inf

    n_candidates = sq_dist.shape[0]
    if k >= n_candidates:
        # argpartition requires kth < n; with so few candidates all of them qualify.
        return np.arange(n_candidates)
    return np.argpartition(sq_dist, k)[:k]

In [None]:
# For every target street, look up its k closest source streets in embedding space.
k = k_avg_streets
mapping = {}
# Column 1 of the feature matrices is treated as the speed limit.
source_speed_limits = source_feats_mx[:, 1]
for i, emb_i in enumerate(target_embeddings):
    sl_i = target_feats_mx[i, 1]
    # True for every source street whose speed limit differs from the target's;
    # those rows are pushed to the back of the candidate ranking.
    sl_filter = source_speed_limits != sl_i
    mapping[i] = closest_vector_filtered(emb_i, source_embeddings, sl_filter, k=k)

In [None]:
mapping

In [None]:
# ID to ID mapping: translate row indices on both sides into integer street ids.
id_mapping = {
    int(target_idx2id[target_idx]): [int(source_idx2id[source_idx]) for source_idx in source_idxs]
    for target_idx, source_idxs in mapping.items()
}

In [None]:
id_mapping

In [None]:
###### FETCH DATA #####

# load traffic speed data
# The source-city query was already executed with exactly these arguments when
# the source roads were filtered (result: `source_hist_data`), so reuse it
# instead of hitting the database a second time.
source_data = source_hist_data
target_data = db_requests.getTrafficDataMinMeasurements(target_city, min_measurements, train_date_begin, train_date_end, hour_from, hour_to, db_credentials=db_credentials)

# Get Street Features: id, length, speed_limit as max_speed, type, source, target
streetFeats = db_requests.getStreetGraph(db_credentials)

In [None]:
# DCRNN Function needs a format like this:
#     id1  id2  id3
# t1
# t2
# t3
# One row per time stamp, one column per street id; duplicates are averaged.
# The aggregation is passed by name ('mean'): handing pandas the NumPy callable
# np.mean triggers a FutureWarning in recent pandas versions.
source_mx = pd.pivot_table(source_data, values='speed', index='time', columns=['id'], aggfunc='mean')
target_mx = pd.pivot_table(target_data, values='speed', index='time', columns=['id'], aggfunc='mean')

In [None]:
target_mx.head()

In [None]:
# Build a frame shaped like target_mx in which every target street's column is
# replaced by the row-wise mean over the speed series of its matched source streets.
target_source_mx = pd.DataFrame(index=target_mx.index, columns=target_mx.columns)
for target_road_id in target_mx.columns:
    matched_ids = id_mapping[target_road_id]
    target_source_mx[target_road_id] = source_mx.loc[:, matched_ids].mean(axis=1)

In [None]:
target_source_mx.head()

In [None]:
# Get an Error Value

# Fill NANs with 0 in both frames.
# This is done on the frames themselves: writing into `.values` only reaches the
# frame (and therefore the difference below) when pandas happens to hand out a
# writable view, which is not guaranteed (it may be a copy or read-only).
# NOTE(review): a missing measurement is counted as value 0, so cells where only
# one side has data contribute the full observed value to the error.
target_mx = target_mx.fillna(0)
target_source_mx = target_source_mx.fillna(0)

T = target_mx.to_numpy(dtype=float)
T_new = target_source_mx.to_numpy(dtype=float)

# Element-wise difference: observed target values minus values borrowed from the source.
diff_mx = T - T_new

In [None]:
# np.set_printoptions(precision=3)

In [None]:
# Mean absolute error per column; show the first 100 entries.
MAE_cols = np.abs(diff_mx).mean(axis=0)
MAE_cols[:100]

In [None]:
MAE_cols.shape

In [None]:
# Overall MAE: average of the per-column errors.
MAE = MAE_cols.mean()
MAE

- 13.760675093470507

In [None]:
# Root mean squared error per column, then averaged over all columns.
rmse = np.sqrt((diff_mx ** 2).mean(axis=0))
rmse.mean()

- 14.630686162941847



In [None]:
### SAVE MAPPING FOR GWN MODEL
# Set `safe` to True to write the target-id -> source-ids mapping to a JSON file.
safe = False
if safe:
    import json
    mapping_save_path = f'../data/mapping_{source_city}_{target_city}.json'
    # Serialise first, then write the text in one go.
    with open(mapping_save_path, 'w') as mapping_file:
        mapping_file.write(json.dumps(id_mapping))

14.479080117425092

### Look into bigger Errors

In [None]:
idxs = np.where(MAE_cols>50)[0]
idxs

In [None]:
first_idx = idxs[6]

In [None]:
# Get id
target_mx.columns[first_idx]

In [None]:
target_mx.iloc[:,first_idx]

In [None]:
target_source_mx.iloc[:,first_idx]

In [None]:
first_id = target_mx.columns[first_idx]

In [None]:
first_source_id = id_mapping[first_id]

In [None]:
streetFeats.loc[streetFeats['id']==first_id]

In [None]:
first_source_id

In [None]:
streetFeats.loc[streetFeats['id'].isin(first_source_id)]