## Importing libraries

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from matplotlib import rc
from matplotlib.ticker import MaxNLocator

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler


plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
rcParams['figure.figsize'] = 12, 8

tqdm.pandas()

## Setting the global seed

In [2]:
# Seed every RNG that Lightning manages so repeated runs are comparable.
pl.seed_everything(seed=42)

Global seed set to 42


42

## Setting up GPU acceleration

In [3]:
# Prefer Apple's MPS backend (the original target), then CUDA, then CPU, so the
# notebook still runs on machines where MPS is not available.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Loading the data

In [4]:
# Load set1.csv, parsing SEQUENCE_DTTM into datetimes at read time
data = pd.read_csv('set1.csv', parse_dates=['SEQUENCE_DTTM'])
# Show the frame's (rows, columns) shape together with its first five rows
data.shape,data.head()

((13714, 7),
    OBJECT_ID     VID       SEQUENCE_DTTM        LAT        LON  \
 0          1  100008 2022-12-07 14:00:00  36.906850 -76.089022   
 1          2  100015 2022-12-07 14:00:00  36.950000 -76.026834   
 2          3  100016 2022-12-07 14:00:00  36.906783 -76.089084   
 3          4  100019 2022-12-07 14:00:00  37.003000 -76.283167   
 4          5  100016 2022-12-07 14:00:01  36.906783 -76.089084   
 
    SPEED_OVER_GROUND  COURSE_OVER_GROUND  
 0                  1                1641  
 1                 11                2815  
 2                  0                2632  
 3                148                2460  
 4                  0                2632  )

## Preprocessing the data

In [5]:
# Converting the dataframe into per-row feature records, target column last.
# The time-of-day parts come from the vectorised `.dt` accessor rather than a
# Python-level `iterrows()` loop, which does the same work one row at a time.
# `rows` is still a list of dicts (one per input row, same keys, same order).
rows = pd.DataFrame({
    'hour': data['SEQUENCE_DTTM'].dt.hour,
    'min': data['SEQUENCE_DTTM'].dt.minute,
    'sec': data['SEQUENCE_DTTM'].dt.second,
    'latitude': data['LAT'],
    'longitude': data['LON'],
    'COG': data['COURSE_OVER_GROUND'],
    'SOG': data['SPEED_OVER_GROUND'],
    # VID is the last column and will be the target
    'VID': data['VID'],
}).to_dict('records')

  0%|          | 0/13714 [00:00<?, ?it/s]

In [6]:
# Assemble the per-row feature dicts into a single frame (one column per dict key)
features_df = pd.DataFrame(rows)
features_df.head()

Unnamed: 0,hour,min,sec,latitude,longitude,COG,SOG,VID
0,14,0,0,36.90685,-76.089022,1641,1,100008
1,14,0,0,36.95,-76.026834,2815,11,100015
2,14,0,0,36.906783,-76.089084,2632,0,100016
3,14,0,0,37.003,-76.283167,2460,148,100019
4,14,0,1,36.906783,-76.089084,2632,0,100016


## Splitting the data into train and test sets

In [7]:
# First 80% of the rows (rounded down) go to the training split
train_size = int(0.8 * features_df.shape[0])
train_size

10971

In [8]:
# Positional split: rows before `train_size` train the model, the rest test it
train_df = features_df.iloc[:train_size]
test_df = features_df.iloc[train_size:]
train_df.shape, test_df.shape

((10971, 8), (2743, 8))

In [9]:
# Raw frame shape, for comparison against the train/test split sizes above
data.shape

(13714, 7)

In [10]:
# Fit a min-max scaler that maps every column of the training split to [-1, 1].
# Only the training rows are used for fitting; the test split is transformed
# later with this same fitted scaler.
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_df)

In [11]:
# Scale the training split and wrap the resulting array in a frame that keeps
# the original column names and row labels
train_df = pd.DataFrame(
    data=scaler.transform(train_df),
    index=train_df.index,
    columns=train_df.columns,
)

In [12]:
# Apply the scaler fitted on the training split to the test split, keeping the
# test frame's column names and row labels
test_df = pd.DataFrame(
    data=scaler.transform(test_df),
    index=test_df.index,
    columns=test_df.columns,
)

In [13]:
# Inspect the first scaled training rows
train_df.head()

Unnamed: 0,hour,min,sec,latitude,longitude,COG,SOG,VID
0,-1.0,-1.0,-1.0,-0.995191,0.377254,-0.08808,-0.992395,-0.263158
1,-1.0,-1.0,-1.0,-0.393756,0.732772,0.564323,-0.91635,0.473684
2,-1.0,-1.0,-1.0,-0.996125,0.376899,0.462629,-1.0,0.578947
3,-1.0,-1.0,-1.0,0.344972,-0.732641,0.367046,0.125475,0.894737
4,-1.0,-1.0,-0.966102,-0.996125,0.376899,0.462629,-1.0,0.578947


## Creating tensors

In [None]:
# Converting the training and test sets into sequences
def create_sequences(input_data: pd.DataFrame, target_column, seq_length):
    """Slice a frame into overlapping windows paired with the next row's target.

    For every start position ``i`` the window is rows ``i .. i + seq_length - 1``
    (by position) and the label is the ``target_column`` value of the row
    immediately after the window.

    Args:
        input_data: Frame whose rows are ordered the way the windows should run.
        target_column: Name of the column the labels are read from.
        seq_length: Number of consecutive rows in each window.

    Returns:
        A list of ``(window, label)`` tuples, where ``window`` is a DataFrame
        of ``seq_length`` rows. The list is empty when the frame has
        ``seq_length`` rows or fewer.
    """
    sequences = []
    data_size = len(input_data)
    # Resolve the target column once instead of on every iteration.
    targets = input_data[target_column]

    for i in tqdm(range(data_size - seq_length)):
        seq = input_data.iloc[i:i + seq_length]

        # Positional lookup: a label-based `targets[i + seq_length]` breaks on
        # any frame whose index does not start at 0 (e.g. a test split that
        # keeps its original row labels).
        label = targets.iloc[i + seq_length]
        sequences.append((seq, label))

    return sequences

In [None]:
# Toy frame for exercising create_sequences: one feature column, one label column
sample_df = pd.DataFrame({
    'f1': list(range(1, 6)),
    'lab': list(range(6, 11)),
})
sample_df

In [None]:
# Window the toy frame: 4 rows per sequence, label taken from the 'lab' column
sample_sequences = create_sequences(
    input_data=sample_df,
    target_column='lab',
    seq_length=4,
)
sample_sequences

In [None]:
# First (window, label) pair: the window, a blank line, then its label
print(sample_sequences[0][0], end='\n\n')
print(f'label: {sample_sequences[0][1]}')

In [None]:
# Window size and label column used for the real training data
SEQ_LENGTH = 50
TARGET_COLUMN = 'VID'

# Build (window, label) pairs from the scaled training split
ais_sequence = create_sequences(
    input_data=train_df,
    target_column=TARGET_COLUMN,
    seq_length=SEQ_LENGTH,
)



In [None]:
# First training (window, label) pair: the window, a blank line, then its label
print(ais_sequence[0][0], end='\n\n')
print(f'label: {ais_sequence[0][1]}')

In [17]:
# train_sequences = train_df.astype(np.float32).to_numpy().tolist()
# test_sequences = test_df.astype(np.float32).to_numpy().tolist()

In [19]:
def create_dataset(sequences):
    """Convert each sequence into a tensor with an added dimension at position 1.

    Args:
        sequences: Iterable of array-likes accepted by ``torch.tensor``
            (e.g. lists of floats).

    Returns:
        A list with one tensor per input sequence; a 1-D sequence of length
        ``n`` becomes a tensor of shape ``(n, 1)``.
    """
    # The previous body referenced an undefined `s` and returned nothing, so
    # any call raised NameError; iterate over `sequences` and return the list.
    dataset = [torch.tensor(s).unsqueeze(1) for s in sequences]
    return dataset

10971