In [1]:
import pandas as pd
import numpy as np
import yaml
import os
import pickle
import random
import sys
from tqdm import tqdm


sys.path.append('../src')

import threads as threads
from utils import create_attacked_sets, pkl2h5, read_train_file, pkl2h5_wo_time

# Make only GPU index 0 visible to CUDA-aware libraries loaded after this point.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Input and output currently live in the same directory, so bind the string once.
data_path = '/home/schestakov/projects/re-identification/data/sf/db'
data_save_path = data_path

In [2]:
# Load the complete list of trajectories from disk.
# Layout as documented by the original author:
#   Trajectory: [[x1,y1,t1], [x2,y2,t2], ... , [xn,yn,tn]]
# The context manager closes the file handle as soon as unpickling is done
# (the previous bare open() left it to the garbage collector).
# NOTE: pickle can execute arbitrary code on load; only read files you trust.
with open(os.path.join(data_path, "traj_list.pkl"), "rb") as f:
    traj_list = pickle.load(f)

# Total number of trajectories; shown as the cell output.
n_samples = len(traj_list)
n_samples

695675

In [3]:
# Shuffle in place with a fixed seed so the train/val/test split derived from
# this order is reproducible after a kernel restart. A dedicated Random
# instance is used so the global `random` state is left untouched.
# NOTE: the shuffle mutates traj_list; re-running this cell without reloading
# the list permutes the already-shuffled order again.
random.Random(42).shuffle(traj_list)

In [4]:
# 80% / 10% / 10% split into train / validation / test.
# The two cut positions are computed once and reused for the slices.
train_end = int(0.8*n_samples)
val_end = int(0.9*n_samples)
train, val, test = traj_list[:train_end], traj_list[train_end:val_end], traj_list[val_end:]

In [5]:
######## Implementation phase only: optionally shrink the test split #########
# Uncomment the next line to work with a reduced test set.
#test = test[:10000]
# Report the sizes of the train and test splits, one per line.
print(len(train), len(test), sep="\n")

556540
69568


In [5]:
# Persist the three splits as pickle files inside data_save_path.
for filename, split in (
    ('train.pkl', train),
    ('val.pkl', val),
    ('test.pkl', test),
):
    with open(os.path.join(data_save_path, filename), "wb") as f:
        pickle.dump(split, f)

### Apply Threats (Attacks)

In [6]:
###### Add threats (attacks) to the test split #######

# The test split is divided into an "own" set and an "other" set.
# For each threat:
#     - attack the own set
#     - attack the other set (included so the evaluation is not just about
#       finding attack structures in a trace)
#     - add labels: true if the attack was applied to the own set, else false

# Earlier variant, kept for reference: size the own set as a share of test.
#own_share = 0.1
#own_set = test[:int(own_share*len(test))]
#other_set = test[int(own_share*len(test)):]

# Fixed-size own set: the first `own_size` test trajectories; the rest is "other".
own_size = 1000
own_set, other_set = test[:own_size], test[own_size:]

# Report both sizes, one per line.
print(len(own_set), len(other_set), sep="\n")

1000
68568


In [None]:
# Registry of the threats (attacks) that are applied to the test data.
# Keys follow the pattern "<ABBREVIATION>:<description>"; each value is a
# function from the project-local `threads` module, or a list of such
# functions for a combined attack. Both key and value are handed to
# create_attacked_sets in the loop that consumes this dict.
attacks_dict = {
    'GWN:white_noise': threads.add_white_noise,
    'SNR:signal_to_noise': threads.add_signal_noise,
    'OSNR:outliers_with_snr': threads.add_outliers_with_signal_to_noise_ratio,
    'RIP:rounding': threads.remove_non_significant_bits, 
    'DS:downsample': threads.downsample,
    'RRPP:replace_random_points_with_path': threads.replace_random_points_with_path,
    'RNSPP:replace_non_skeleton_points_with_path': threads.replace_non_skeleton_points_with_path,
    'RAP:resample_along_path': threads.resample_along_path,
    'C:cropping': threads.cropping,
    # NOTE(review): the key advertises DS+GWN+C (i.e. cropping), but the list
    # holds add_white_noise, downsample and resample_along_path (RAP) —
    # confirm whether the key or the function list reflects the intent.
    'Multi:DS+GWN+C': [threads.add_white_noise, threads.downsample, threads.resample_along_path],
    }

In [9]:
# Run every registered attack and collect the results into three parallel
# lists: attacked trajectories, their labels and their attack descriptions.
total_set = []
total_set_labels = []
total_set_description = []
for attack_name, attack_function in attacks_dict.items():
    # create_attacked_sets is a project helper; it returns one sequence each
    # of trajectories, labels and descriptions for the given attack.
    attacked_traj, labels, attack_descr = create_attacked_sets(
        own_set, other_set, attack_name, attack_function
    )

    # extend() appends in place; the previous `lst = lst + new` rebuilt and
    # re-copied the whole accumulated list on every iteration.
    total_set.extend(attacked_traj)
    total_set_labels.extend(labels)
    total_set_description.extend(attack_descr)


In [10]:
# Shuffle trajectories, labels and descriptions jointly so that the three
# sequences stay aligned element by element.
zipped_set = list(zip(total_set, total_set_labels, total_set_description))
# Fixed seed on a dedicated Random instance: the resulting order is
# reproducible and the global `random` state is not touched.
random.Random(42).shuffle(zipped_set)
# Un-zip again; note that zip(*...) yields tuples, so the three names are
# bound to tuples (not lists) from here on.
total_set, total_set_labels, total_set_description = zip(*zipped_set)

In [11]:
# Persist the own set plus the attacked set with its labels and descriptions
# as pickle files inside data_save_path.
for filename, payload in (
    ('own_set.pkl', own_set),
    ('total_set.pkl', total_set),
    ('total_set_labels.pkl', total_set_labels),
    ('total_set_description.pkl', total_set_description),
):
    with open(os.path.join(data_save_path, filename), "wb") as f:
        pickle.dump(payload, f)

In [2]:
# Reload every artefact written by the cells above, so the notebook can be
# resumed from this point.
def _load_pickle(filename):
    """Unpickle `filename` from the current `data_save_path` and return the object."""
    with open(os.path.join(data_save_path, filename), "rb") as handle:
        return pickle.load(handle)


train = _load_pickle('train.pkl')
val = _load_pickle('val.pkl')
test = _load_pickle('test.pkl')
own_set = _load_pickle('own_set.pkl')
total_set = _load_pickle('total_set.pkl')
total_set_labels = _load_pickle('total_set_labels.pkl')
total_set_description = _load_pickle('total_set_description.pkl')


### Preprocess further for DL models

In [3]:
# Absolute path of the Julia executable used to launch the .jl scripts below.
julia_path = "/home/schestakov/downloads/julia-1.8.5/bin/julia"
# Hyper-parameter JSON file handed to the Julia scripts via --parampath.
hyper_param_path = "./hyper-parameters_sf.json"

In [5]:
# From here on, outputs go into a "50" sub-directory of the save path
# (presumably named after the 50 m grid cell size — TODO confirm).
# The guard keeps the cell idempotent: the unconditional join appended another
# "/50" on every re-run (.../50/50). Caveat: a base directory that is itself
# named "50" would be left unchanged.
if os.path.basename(os.path.normpath(data_save_path)) != '50':
    data_save_path = os.path.join(data_save_path, '50')

#### Train

In [7]:
##### Training ####

cityname = "sf"

# Write the training and validation splits to their own .h5 files.
for split, h5_name in ((train, "train.h5"), (val, "val.h5")):
    pkl2h5(split, data_save_path, h5_name)

# The spatial region is built from a <cityname>.h5 file holding all trajectories.
pkl2h5([*train, *val, *test], data_save_path, f"{cityname}.h5")

Completed writing 556540 to /home/schestakov/projects/re-identification/data/sf/db/50/train.h5
Completed writing 69567 to /home/schestakov/projects/re-identification/data/sf/db/50/val.h5
Completed writing 695675 to /home/schestakov/projects/re-identification/data/sf/db/50/sf.h5


In [8]:
from subprocess import check_call
# Run preprocess.jl on train.h5 and val.h5. According to the original note the
# script produces train.src, train.trg, val.src and val.trg.
# check_call raises CalledProcessError when Julia exits with a non-zero status,
# so a failed preprocessing run cannot go unnoticed by the cells that read its
# output. On success it returns 0, which is shown as the cell output.
check_call([julia_path, "preprocess.jl", "--datapath", data_save_path, "--parampath", hyper_param_path])

Building spatial region with:
        cityname=sf,
        minlon=-122.5183,
        minlat=37.085,
        maxlon=-121.5927,
        maxlat=38.3473,
        xstep=50.0,
        ystep=50.0,
        minfreq=50
Creating paramter file /home/schestakov/projects/re-identification/data/sf/db/50/sf-param-cell50
Processed 100000 trips
Processed 200000 trips
Processed 300000 trips
Processed 400000 trips
Processed 500000 trips
Processed 600000 trips
*
Cell count at max_num_hotcells:40000
*
138
*
Cell count at max_num_hotcells:40000 is 138
Vocabulary size 40004 with cell size 50.0 (meters)
Creating training and validation datasets...
Opening H5 file at /home/schestakov/projects/re-identification/data/sf/db/50/train.h5
Scaned 10000 trips...
Scaned 20000 trips...
Scaned 30000 trips...
Scaned 40000 trips...
Scaned 50000 trips...
Scaned 60000 trips...
Scaned 70000 trips...
Scaned 80000 trips...
Scaned 90000 trips...
Scaned 100000 trips...
Scaned 110000 trips...
Scaned 120000 trips...
Scaned 130000 tr

0

In [9]:
# For our model the preprocessed text files are read back in here; the next
# cells convert them to numpy and save them as .npz.
max_len = 100  # forwarded to read_train_file (presumably the sequence length cap — TODO confirm)


def _read_split(filename):
    """Read `filename` from `data_save_path` with the project reader and `max_len`."""
    return read_train_file(os.path.join(data_save_path, filename), max_len)


train_src, train_src_len = _read_split("train.src")
train_trg, train_trg_len = _read_split("train.trg")
val_src, val_src_len = _read_split("val.src")
val_trg, val_trg_len = _read_split("val.trg")

Importing file: /home/schestakov/projects/re-identification/data/sf/db/50/train.src


11130800it [01:14, 149434.44it/s]


Importing file: /home/schestakov/projects/re-identification/data/sf/db/50/train.trg


11130800it [01:29, 123764.90it/s]


Importing file: /home/schestakov/projects/re-identification/data/sf/db/50/val.src


1391340it [00:07, 176284.65it/s]


Importing file: /home/schestakov/projects/re-identification/data/sf/db/50/val.trg


1391340it [00:08, 155375.16it/s]


In [10]:
# Convert the sequences returned by read_train_file into numpy arrays.
# The conversions are kept one per statement so each source object can be
# released as soon as its array exists.
train_src = np.array(train_src)
train_trg = np.array(train_trg)
val_src = np.array(val_src)
val_trg = np.array(val_trg)

# Same conversion for the matching length vectors.
train_src_len = np.array(train_src_len)
train_trg_len = np.array(train_trg_len)
val_src_len = np.array(val_src_len)
val_trg_len = np.array(val_trg_len)

# Sanity check: print the four data shapes, one per line.
print(train_src.shape, train_trg.shape, val_src.shape, val_trg.shape, sep="\n")

(556539, 19, 100)
(556539, 19, 100)
(69566, 19, 100)
(69566, 19, 100)


In [11]:
# No separate validation set is used, so the validation samples are appended
# to the training arrays along the first axis (np.concatenate's default).
# NOTE: this rebinds the train_* names; re-running the cell appends the
# validation data a second time.
train_src = np.concatenate([train_src, val_src])
train_trg = np.concatenate([train_trg, val_trg])
train_src_len = np.concatenate([train_src_len, val_src_len])
train_trg_len = np.concatenate([train_trg_len, val_trg_len])



In [12]:
# Write the four training arrays into a single compressed .npz archive.
train_npz_path = os.path.join(data_save_path,"train.npz")
np.savez_compressed(
    train_npz_path,
    src=train_src,
    trg=train_trg,
    src_len=train_src_len,
    trg_len=train_trg_len,
)


In [None]:
# Loading example: read the four arrays back from train.npz.
# np.load returns an NpzFile that keeps the archive open; the context manager
# closes it once the arrays have been read into memory.
with np.load(os.path.join(data_save_path,"train.npz")) as loaded:
    src = loaded['src']
    trg = loaded['trg']
    src_len = loaded['src_len']
    trg_len = loaded['trg_len']

#### Evaluation

In [13]:
# Peek at the first point of the first trajectory in own_set.
own_set[0][0]

[37.78235000000001, -122.42520000000002]

In [14]:
##### Evaluation ####


# Format of .pkl files (as documented by the original author)
# List of Trajectories: [T1, T2, ... , Tn]
# Trajectory: T = [[x1,y1,t1], [x2,y2,t2], ... , [xn,yn,tn]]
# NOTE(review): the sample point displayed for own_set[0][0] holds only two
# values, so the time component may not be present — confirm the format.

# Set to True to (re)load the evaluation sets from disk instead of using the
# objects already in memory.
load = False
if load:
    # NOTE(review): data_save_path is re-pointed to the "50" sub-directory
    # before this cell, while own_set.pkl / total_set.pkl are written before
    # that change — confirm the files exist at this location.
    # Context managers close the file handles right after unpickling.
    with open(os.path.join(data_save_path, "own_set.pkl"), "rb") as f:
        own_set = pickle.load(f)
    with open(os.path.join(data_save_path, "total_set.pkl"), "rb") as f:
        total_set = pickle.load(f)


# own_set and total_set are the two sets converted for evaluation here.
print(f"own: {len(own_set)} \ntotal: {len(total_set)}")

# For evaluation: write both sets to .h5 with the "without time" converter.
pkl2h5_wo_time(own_set, data_save_path, "own.h5")
pkl2h5_wo_time(total_set, data_save_path, "total.h5")

own: 1000 
total: 20000
Completed writing 1000 to /home/schestakov/projects/re-identification/data/sf/db/50/own.h5
Completed writing 20000 to /home/schestakov/projects/re-identification/data/sf/db/50/total.h5


In [15]:
from subprocess import check_call
# Run traj2gridseq.jl once per .h5 file; per the original note it maps the
# trajectories to the grid and saves the result as a .t file.
# check_call raises CalledProcessError on a non-zero exit status. The previous
# call() returned the status, which was discarded inside the loop, so a failed
# Julia run passed silently.
filenames = ['own', 'total']
for name in filenames:
    check_call([julia_path, "traj2gridseq.jl", "--datapath", data_save_path, "--filename", name,  "--parampath", hyper_param_path])

Building spatial region with:
        cityname=sf,
        minlon=-122.5183,
        minlat=37.085,
        maxlon=-121.5927,
        maxlat=38.3473,
        xstep=50.0,
        ystep=50.0,
        minfreq=50
Reading parameter file from /home/schestakov/projects/re-identification/data/sf/db/50/sf-param-cell50
Loaded /home/schestakov/projects/re-identification/data/sf/db/50/sf-param-cell50 into region
Building spatial region with:
        cityname=sf,
        minlon=-122.5183,
        minlat=37.085,
        maxlon=-121.5927,
        maxlat=38.3473,
        xstep=50.0,
        ystep=50.0,
        minfreq=50
Reading parameter file from /home/schestakov/projects/re-identification/data/sf/db/50/sf-param-cell50
Loaded /home/schestakov/projects/re-identification/data/sf/db/50/sf-param-cell50 into region
