In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
import os, os.path 
import numpy 
import pickle
from glob import glob

"""Change to the data folder"""
train_path = "./new_train/new_train/"
vali_path = "./new_val_in/new_val_in"
# Number of sequences in each dataset split:
#   train: 205942, val: 3200, test: 36272
# Sequences are sampled at a 10 Hz rate.

### Create a dataset class 

In [2]:
class ArgoverseDataset(Dataset):
    """Map-style dataset that serves one unpickled file per index.

    Every entry matched by the ``*`` glob under ``data_path`` counts as one
    sample; samples are ordered by sorted path.

    Args:
        data_path: directory whose entries are pickled samples.
        transform: optional callable applied to each unpickled sample.

    NOTE(review): samples are read with ``pickle.load``, which can execute
    arbitrary code -- only point this class at trusted data.
    """

    def __init__(self, data_path: str, transform=None):
        super().__init__()
        self.data_path = data_path
        self.transform = transform
        # Sorting gives a stable index -> file mapping.
        self.pkl_list = sorted(glob(os.path.join(self.data_path, '*')))

    def __len__(self):
        """Number of entries found under ``data_path``."""
        return len(self.pkl_list)

    def __getitem__(self, idx):
        """Unpickle the ``idx``-th file and return it, transformed if requested."""
        with open(self.pkl_list[idx], 'rb') as handle:
            sample = pickle.load(handle)
        return self.transform(sample) if self.transform else sample


# Initialize one dataset per split; each indexes the files under its folder.
train_dataset  = ArgoverseDataset(data_path=train_path)
vali_dataset = ArgoverseDataset(data_path=vali_path)

### Create a loader to enable batch processing

In [3]:
batch_sz = 4

def my_collate(batch):
    """Collate a list of scene dicts into batched input/output tensors.

    For each scene, positions and velocities are joined with ``numpy.dstack``
    (for 3-D inputs this concatenates along the last, feature, axis), and the
    per-scene arrays are then stacked along a new leading batch axis, giving
    [ batch_sz x agent_sz x seq_len x feature ].

    Args:
        batch: list of dicts holding arrays under the keys ``'p_in'``,
            ``'v_in'``, ``'p_out'`` and ``'v_out'``.

    Returns:
        ``[inp, out]`` -- a list of two ``torch.float32`` tensors.
    """
    inp = numpy.stack([numpy.dstack([scene['p_in'], scene['v_in']]) for scene in batch])
    out = numpy.stack([numpy.dstack([scene['p_out'], scene['v_out']]) for scene in batch])
    # Positions and velocities are real-valued: a LongTensor would silently
    # truncate them to integers, so keep them as floats. Stacking into one
    # ndarray first also avoids the slow list-of-arrays tensor construction.
    inp = torch.as_tensor(inp, dtype=torch.float32)
    out = torch.as_tensor(out, dtype=torch.float32)
    return [inp, out]

# Both loaders share the same settings: fixed sample order, the custom collate
# function, and no worker subprocesses.
_loader_kwargs = dict(batch_size=batch_sz, shuffle=False, collate_fn=my_collate, num_workers=0)
train_loader = DataLoader(train_dataset, **_loader_kwargs)
vali_loader = DataLoader(vali_dataset, **_loader_kwargs)

In [4]:
len(train_dataset)

205942

In [5]:
len(vali_dataset)

3200

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [7]:
#for i in range(0, len(train_dataset)):

x = train_dataset[0]['p_in']
#    print(type(x))
    #print(x.shape)

In [8]:
type(x)

numpy.ndarray

In [9]:
x[0]

array([[3277.29638672, 1947.62609863],
       [3277.29614258, 1947.62597656],
       [3277.29614258, 1947.62585449],
       [3277.29614258, 1947.62597656],
       [3277.29589844, 1947.62561035],
       [3277.29589844, 1947.62561035],
       [3277.29589844, 1947.62548828],
       [3277.29614258, 1947.62573242],
       [3277.29638672, 1947.62573242],
       [3277.29614258, 1947.62573242],
       [3277.29614258, 1947.62561035],
       [3277.29638672, 1947.62573242],
       [3277.29638672, 1947.62585449],
       [3277.29663086, 1947.62597656],
       [3277.29638672, 1947.62585449],
       [3277.29638672, 1947.62585449],
       [3277.29638672, 1947.62561035],
       [3277.29638672, 1947.62573242],
       [3277.29638672, 1947.62585449]])

In [10]:
# Collect, for the first 10000 training scenes, the input and output x/y
# coordinates of the agent identified by the scene's `agent_id`, as flat lists.
p_x = []
p_y = []
out_x = []
out_y = []
for n in range(0, 10000):
    # Load the scene once: every train_dataset[n] access re-reads the pickle.
    scene = train_dataset[n]
    # Row whose track id matches the scene's agent id.
    i = np.where(scene['track_id'] == scene['agent_id'])[0][0]
    position = scene['p_in'][i].reshape(38)
    label = scene['p_out'][i].reshape(60)
    # The flattened layout is x0, y0, x1, y1, ...; extend in place instead of
    # rebuilding the whole list with `+` on every iteration (quadratic).
    p_x.extend(position[::2])
    p_y.extend(position[1::2])
    out_x.extend(label[::2])
    out_y.extend(label[1::2])
    if n % 1000 == 0:
        print("n is " + str(n))


n is 0
n is 1000
n is 2000
n is 3000
n is 4000
n is 5000
n is 6000
n is 7000
n is 8000
n is 9000


In [11]:
# Sanity check: number of coordinates gathered in each flat list.
for coords in (p_x, p_y, out_x, out_y):
    print(len(coords))

190000
190000
300000
300000


In [12]:
# One row per scene: 19 input values and 30 output values per coordinate.
# -1 lets NumPy infer the number of scenes from the data, so this cell does
# not have to repeat the scene count used when the lists were built.
p_x = np.array(p_x).reshape((-1, 19))
p_y = np.array(p_y).reshape((-1, 19))
out_x = np.array(out_x).reshape((-1, 30))
out_y = np.array(out_y).reshape((-1, 30))

In [13]:
def get_out(n, label):
    """Return entry ``n`` of every row of ``label`` as a Python list.

    Args:
        n: index picked out of each row.
        label: sized, integer-indexable collection of rows (e.g. a 2-D array).

    Returns:
        ``[label[0][n], label[1][n], ...]`` with one entry per row.
    """
    return [label[row][n] for row in range(len(label))]

In [14]:
y_0 = get_out(0, out_x)

In [15]:
len(y_0)

10000

In [16]:
# Collect, for every validation scene, the input x/y coordinates of the agent
# identified by the scene's `agent_id`, as flat lists.
t_x = []
t_y = []
for n in range(0, len(vali_dataset)):
    # Load the scene once: every vali_dataset[n] access re-reads the pickle.
    scene = vali_dataset[n]
    # Row whose track id matches the scene's agent id.
    i = np.where(scene['track_id'] == scene['agent_id'])[0][0]
    position = scene['p_in'][i].reshape(38)
    # The flattened layout is x0, y0, x1, y1, ...; extend in place instead of
    # rebuilding the whole list with `+` on every iteration (quadratic).
    t_x.extend(position[::2])
    t_y.extend(position[1::2])
    if n % 1000 == 0:
        print("n is " + str(n))


n is 0
n is 1000
n is 2000
n is 3000


In [17]:
len(t_x)

60800

In [18]:
len(t_y)

60800

In [19]:
# One row per validation scene, 19 input values per coordinate; -1 lets NumPy
# infer the number of scenes instead of hard-coding it.
t_x = np.array(t_x).reshape((-1, 19))
t_y = np.array(t_y).reshape((-1, 19))

In [72]:
#vali_dataset[0]

In [75]:
# `scene_idx` of every validation sample, in dataset order. The list is sized
# from the dataset itself: a preallocated, hard-coded length would raise
# IndexError on a larger dataset and leave None entries on a smaller one.
s_id = [vali_dataset[i]['scene_idx'] for i in range(len(vali_dataset))]
s_id

[10002,
 10015,
 10019,
 10028,
 1003,
 10069,
 10075,
 10078,
 10082,
 10095,
 10096,
 10116,
 10161,
 10165,
 10189,
 10196,
 10200,
 10214,
 10219,
 10232,
 10240,
 10242,
 10250,
 10251,
 10278,
 10279,
 10281,
 10298,
 10303,
 10312,
 10313,
 10317,
 10340,
 10355,
 10357,
 10360,
 10361,
 10374,
 104,
 10413,
 10439,
 10449,
 10467,
 10472,
 10474,
 10487,
 10495,
 10517,
 10521,
 10523,
 10535,
 10545,
 10560,
 10561,
 10589,
 10605,
 10646,
 1065,
 10662,
 10693,
 10699,
 10701,
 10705,
 10744,
 10750,
 10759,
 1076,
 10778,
 1078,
 10797,
 10802,
 10810,
 10816,
 10819,
 10828,
 10836,
 10896,
 10909,
 10910,
 10916,
 10945,
 10947,
 10951,
 10967,
 10994,
 10995,
 10999,
 11002,
 11015,
 11019,
 11025,
 11039,
 11042,
 11067,
 11070,
 11089,
 11100,
 11104,
 11110,
 11114,
 11148,
 11152,
 11157,
 11173,
 11193,
 11201,
 11204,
 11211,
 11220,
 11221,
 11225,
 11227,
 1123,
 11232,
 11244,
 11260,
 11261,
 11268,
 11281,
 11286,
 11305,
 11312,
 11340,
 1137,
 11372,
 11381,


In [76]:
# Frame that is later written to 'sub_dfc.csv': start with empty columns
# labelled 0..59, then fill column 0 with the validation scene ids.
# NOTE: the prediction loop further down writes columns 1..60, so column 60
# is appended there rather than created here.
dfc = pd.DataFrame(columns=np.arange(60))
dfc[0] = list(s_id)
dfc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,10002,,,,,,,,,,...,,,,,,,,,,
1,10015,,,,,,,,,,...,,,,,,,,,,
2,10019,,,,,,,,,,...,,,,,,,,,,
3,10028,,,,,,,,,,...,,,,,,,,,,
4,1003,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,9897,,,,,,,,,,...,,,,,,,,,,
3196,99,,,,,,,,,,...,,,,,,,,,,
3197,9905,,,,,,,,,,...,,,,,,,,,,
3198,9910,,,,,,,,,,...,,,,,,,,,,


In [78]:
# Linear-regression baseline: predict the future x (resp. y) coordinates from
# the observed x (resp. y) coordinates only.
# LinearRegression accepts a 2-D target and solves each output column as its
# own least-squares problem, so one fit per axis replaces a separate fit (and
# a Python-level column copy) for every future step.
regressor_x = LinearRegression().fit(p_x, out_x)
regressor_y = LinearRegression().fit(p_y, out_y)
pred_x = regressor_x.predict(t_x)  # one row per validation scene, one column per step
pred_y = regressor_y.predict(t_y)

# Column 0 holds the scene id; predictions interleave x and y per step:
# step i -> column 2*i + 1 (x) and column 2*i + 2 (y).
for i in range(pred_x.shape[1]):
    dfc[2 * i + 1] = pred_x[:, i]
    dfc[2 * i + 2] = pred_y[:, i]

In [79]:
dfc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,10002,1714.788506,337.228093,1715.770160,338.178221,1716.744825,339.182978,1717.662611,340.107479,1718.639533,...,1738.847846,361.868088,1739.780176,362.773865,1740.711759,363.769535,1741.642076,364.765002,1742.499082,365.717179
1,10015,725.516588,1229.649441,725.460338,1229.407828,725.380382,1229.202429,725.309051,1229.011473,725.242812,...,723.520020,1225.754191,723.411709,1225.639305,723.327407,1225.528407,723.250348,1225.432552,723.166557,1225.317566
2,10019,574.021669,1244.656077,574.239441,1244.590012,574.477481,1244.515491,574.714584,1244.435796,574.966838,...,580.031888,1242.736048,580.284681,1242.660719,580.537067,1242.586704,580.756047,1242.515877,580.990602,1242.451207
3,10028,1691.338758,315.593776,1691.972160,316.180711,1692.607087,316.769144,1693.250038,317.378249,1693.890185,...,1707.306417,331.075932,1707.931304,331.678899,1708.567855,332.327606,1709.193429,332.960671,1709.777407,333.554785
4,1003,2122.749869,677.068091,2121.456154,675.987435,2120.154004,674.865093,2118.889223,673.781991,2117.574461,...,2090.550863,649.908295,2089.307012,648.891224,2088.029607,647.846243,2086.804865,646.801834,2085.596353,645.823442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,9897,256.421638,805.446967,256.666989,805.258694,256.924563,805.046931,257.173135,804.832768,257.427186,...,262.989747,799.874370,263.259405,799.652229,263.534216,799.432042,263.810267,799.203696,264.084828,798.988585
3196,99,587.862670,1153.989829,587.896663,1153.380172,587.930974,1152.713310,587.965669,1152.018946,587.998699,...,588.665872,1137.085028,588.694670,1136.419879,588.724737,1135.751289,588.754977,1135.087015,588.787665,1134.434757
3197,9905,1755.582719,444.199834,1755.422131,444.536716,1755.260784,444.852227,1755.096159,445.156072,1754.928098,...,1751.444949,452.171990,1751.273567,452.486907,1751.110731,452.815072,1750.953185,453.135181,1750.793228,453.436180
3198,9910,574.602209,1288.831811,574.499954,1288.608010,574.401896,1288.393276,574.301177,1288.182044,574.191353,...,571.529813,1283.104124,571.390309,1282.878224,571.251984,1282.641295,571.107633,1282.413269,570.981107,1282.197185


In [80]:
dfc.to_csv('sub_dfc.csv', index=False)

In [None]:

# Preview the prediction frame. The notebook only defines `dfc`; a bare `df`
# raises NameError on a fresh kernel.
dfc.head()

In [None]:
#train_dataset[0]

### Visualize the batch of sequences

In [None]:
import matplotlib.pyplot as plt
import random

agent_id = 0

def show_sample_batch(sample_batch, agent_id):
    """Scatter-plot one agent's input and output positions for each scene in a batch.

    One subplot is drawn per scene, with axis ticks hidden.

    Args:
        sample_batch: ``(inp, out)`` pair of tensors indexed as
            ``[scene, agent, step, feature]``; features 0 and 1 are plotted as
            the x and y positions.
        agent_id: index along the agent axis of the agent to draw.
    """
    inp, out = sample_batch
    batch_sz = inp.size(0)

    # squeeze=False always yields a 2-D array of Axes; without it a batch of a
    # single scene returns a bare Axes object and the .ravel() below fails.
    fig, axs = plt.subplots(1, batch_sz, figsize=(15, 3), facecolor='w', edgecolor='k',
                            squeeze=False)
    fig.subplots_adjust(hspace = .5, wspace=.001)
    axs = axs.ravel()
    for i in range(batch_sz):
        axs[i].xaxis.set_ticks([])
        axs[i].yaxis.set_ticks([])

        # first two feature dimensions are (x,y) positions
        axs[i].scatter(inp[i, agent_id,:,0], inp[i, agent_id,:,1])
        axs[i].scatter(out[i, agent_id,:,0], out[i, agent_id,:,1])

        
# Draw only the first batch from the training loader: the loop body ends with
# `break`, so later batches are never fetched.
for i_batch, sample_batch in enumerate(train_loader):
    # Each batch is the [inp, out] pair produced by the loader's collate function.
    inp, out = sample_batch
    """TODO:
      Deep learning model
      training routine
    """
    show_sample_batch(sample_batch, agent_id)
    
    break