In [9]:
import numpy as np
import pandas as pd
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset

from sklearn.preprocessing import MinMaxScaler

In [3]:
location = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")

In [4]:
df = pd.read_csv('train.csv')
df['LENGTH'] = df['POLYLINE'].apply(lambda x: len(json.loads(x)))
df = df[df['LENGTH'] > 8]
sample = df[df.MISSING_DATA==False].sample(n=10000).copy(deep=True)

In [6]:
coordinates = [item for trajectory in list(sample['POLYLINE'].apply(lambda x: json.loads(x))) for item in trajectory]

In [10]:
coordinates = np.array(coordinates)
scaler = MinMaxScaler()
scaler.fit(coordinates)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [12]:
class CustomDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.x = X_data
        self.y = y_data
        self.n_samples = len(y_data)
    
    def __getitem__(self, idx):
        data = self.x[idx], self.y[idx]
        return data
    
    def __len__(self):
        return self.n_samples

In [13]:
def convert_data(trajectory):
    trajectory = np.array(json.loads(trajectory))
    trajectory = scaler.transform(trajectory)
    length = trajectory.shape[0]
    data = []
    label = []
    for i in range(8, length):
        data.append(trajectory[i-8:i])
        label.append(trajectory[i])
    data = np.stack(data, axis=0)
    label = np.stack(label, axis=0)
    return data, label

def prepare_data(data):
    results = list(data['POLYLINE'].apply(lambda x: convert_data(x)))
    X, y = list(zip(*results))
    X = np.concatenate(X, axis=0)
    y = np.concatenate(y, axis=0)
    X = torch.from_numpy(X).float()
    y = torch.from_numpy(y).float()
    return X, y

In [14]:
(train_X, train_y), (valid_X, valid_y), (test_X, test_y) = prepare_data(sample[:7000]), prepare_data(sample[7000:8000]), prepare_data(sample[8000:10000])

In [15]:
train_set = CustomDataset(train_X, train_y)
valid_set = CustomDataset(valid_X, valid_y)
test_set = CustomDataset(test_X, test_y)


train_loader = DataLoader(train_set, batch_size=1000, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_set, batch_size=1000, shuffle=True, num_workers=4)
test_loader = DataLoader(test_set, batch_size=1, shuffle=True, num_workers=1)

In [22]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_size, num_layers=2, dropout=0.2):
        super(Model, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.in2lstm1 = nn.Linear(input_size, hidden_size)
        self.lstm1 = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, bidirectional=False, batch_first=True, dropout=dropout)
        
        self.in2lstm2 = nn.Linear(input_size, hidden_size)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, bidirectional=False, batch_first=True, dropout=dropout)
        
        self.fc0 = nn.Linear(hidden_size, hidden_size*2)
        self.fc1 = nn.Linear(hidden_size*2, int(hidden_size/2))
        self.fc2 = nn.Linear(int(hidden_size/2), output_size)
        
        self.tanh = nn.Tanh()
        
    def forward(self, x):
        lstm_out1, _ = self.lstm1(self.in2lstm1(x))
        lstm_out2, _ = self.lstm2(self.in2lstm2(x))
        out = self.tanh(self.fc0(lstm_out1 + lstm_out2))
        out = self.tanh(self.fc1(out))
        output = self.fc2(out)[:, -1]
        return output

In [None]:
INPUT_SIZE=2
OUTPUT_SIZE=2
HIDDEN_SIZE=128
NUM_LAYERS = 2
LEARNING_RATE = 0.0005
N_EPOCHS = 100
BATCH_SIZE = train_loader.batch_size

model = Model(INPUT_SIZE, OUTPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)


for epoch in range(1, N_EPOCHS+1):
    train_loss_cache = []
    valid_loss_cache = []

    train_loss = 0
    valid_loss = 0
    for X, y in train_loader:
        optimizer.zero_grad()
        prediction = model(X)
        loss = criterion(prediction, y)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    with torch.no_grad():
        for X, y in valid_loader:
            prediction = model(X)
            loss = criterion(prediction, y)
            valid_loss += loss.item()
            
    train_loss = train_loss/BATCH_SIZE
    valid_loss = valid_loss/BATCH_SIZE
    
    train_loss_cache.append(train_loss)
    valid_loss_cache.append(valid_loss)
    
    print(f'Epoch: {epoch}, Train Loss: {train_loss/BATCH_SIZE}, Valid Loss: {valid_loss/BATCH_SIZE}')

Epoch: 1, Train Loss: 1.2519536109102774e-06, Valid Loss: 4.0710645043873225e-09
Epoch: 2, Train Loss: 2.2558230040885976e-08, Valid Loss: 2.0078584384464194e-09
Epoch: 3, Train Loss: 1.2302679622735014e-08, Valid Loss: 1.176501764348359e-09
Epoch: 4, Train Loss: 7.905334110546392e-09, Valid Loss: 8.584805545979179e-10
Epoch: 5, Train Loss: 5.795697501525865e-09, Valid Loss: 6.245593076528167e-10
Epoch: 6, Train Loss: 4.8794509184517666e-09, Valid Loss: 4.238547135173576e-10
Epoch: 7, Train Loss: 4.193665038201289e-09, Valid Loss: 2.9650086116816965e-10
Epoch: 8, Train Loss: 3.923364332422352e-09, Valid Loss: 3.7653248909919056e-10
Epoch: 9, Train Loss: 3.5696005988938853e-09, Valid Loss: 4.3098550668219106e-10
Epoch: 10, Train Loss: 3.4494811361582834e-09, Valid Loss: 2.51320924689935e-10
Epoch: 11, Train Loss: 3.3752880535757864e-09, Valid Loss: 2.9557667085100546e-10
Epoch: 12, Train Loss: 3.1910504430925354e-09, Valid Loss: 2.967395894302172e-10
Epoch: 13, Train Loss: 3.41877207893