In [1]:
import numpy as np
import pandas as pd
import matplotlib
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
%matplotlib inline

from os import path, getcwd
from glob import glob
import pykalman
import datetime as dt

In [2]:
data_dir = '../data/workshop-content18/3-snc/data/'
ais_pathnames = glob(data_dir + '*.txt')
ais_basenames = [path.basename(pn) for pn in ais_pathnames]
for bn in ais_basenames:
    print(bn)

PMV_AIS_NewWestminster_Current.txt
PMV_AIS_Deltaport_Current.txt
PMV_AIS_Deltaport_History.txt
PMV_AIS_NewWestminster_History.txt


In [3]:
delta_cur_basename = next(bn for bn in ais_basenames if 'Deltaport_Current' in bn)
delta_his_basename = next(bn for bn in ais_basenames if 'Deltaport_History' in bn)

In [4]:
#max_rows=int(3e7)
delta_cur = pd.read_csv(
    data_dir + delta_cur_basename, sep='\t',
    parse_dates=['ReceivedTime'],usecols=['UserID','ROT','SOG','Longitude','Latitude',
                                         'TrueHeading','ReceivedTime'])

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
delta_cur = delta_cur.dropna(axis=0)
delta_cur.head()

Unnamed: 0,UserID,ROT,SOG,Longitude,Latitude,TrueHeading,ReceivedTime
0,316005621,-127.0,0.1,-122.77156,49.23065,511.0,2013-10-22 01:05:24.510
1,316018851,0.0,1.7,-123.05445,49.29853,110.0,2013-10-22 01:05:25.400
2,316003679,-127.0,0.1,-123.10751,49.31308,511.0,2013-10-22 01:05:25.853
3,316014621,127.0,12.3,-123.09534,49.2994,210.0,2013-10-22 01:05:26.027
4,316005721,-127.0,0.1,-123.10684,49.31094,511.0,2013-10-22 01:05:26.620


In [6]:
if delta_cur.isna().iloc[-1,:].sum() > 10:
    delta_cur = delta_cur.iloc[:-1, :]

print('# na:', (delta_cur.isna().sum(axis=1) > 5).sum())

# na: 0


In [7]:
delta_cur.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2560654 entries, 0 to 2560653
Data columns (total 7 columns):
UserID          object
ROT             float64
SOG             float64
Longitude       float64
Latitude        float64
TrueHeading     float64
ReceivedTime    datetime64[ns]
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 236.3+ MB


In [8]:
#extract only lat and lon needed
mask = (delta_cur['Longitude']>-127)&(delta_cur['Longitude']<-120)
mask = mask & (delta_cur['Latitude']>48)|(delta_cur['Latitude']<50)

delta_cur = delta_cur[mask]

In [9]:
# 6 minutes = 360 seconds
delta_cur['Time2'] = delta_cur['ReceivedTime'].apply(lambda x: int(360*round(x.timestamp()/360)))

In [10]:
#delta_cur = delta_cur.drop(['ReceivedTime'],axis=1)

In [11]:
delta_cur.head()

Unnamed: 0,UserID,ROT,SOG,Longitude,Latitude,TrueHeading,ReceivedTime,Time2
0,316005621,-127.0,0.1,-122.77156,49.23065,511.0,2013-10-22 01:05:24.510,1382403960
1,316018851,0.0,1.7,-123.05445,49.29853,110.0,2013-10-22 01:05:25.400,1382403960
2,316003679,-127.0,0.1,-123.10751,49.31308,511.0,2013-10-22 01:05:25.853,1382403960
3,316014621,127.0,12.3,-123.09534,49.2994,210.0,2013-10-22 01:05:26.027,1382403960
4,316005721,-127.0,0.1,-123.10684,49.31094,511.0,2013-10-22 01:05:26.620,1382403960


In [12]:
delta_cur.shape

(2502883, 8)

In [13]:
nPoints_by_UserID = pd.value_counts(delta_cur.UserID)
nPoints_threshold = 1000
print('num rows for which userid count <= 1000:', nPoints_by_UserID.loc[nPoints_by_UserID <= 1000].values.sum())
userID_allowed = nPoints_by_UserID.index[nPoints_by_UserID > nPoints_threshold]

num rows for which userid count <= 1000: 54122


In [14]:
#number of ships selected
len(list(userID_allowed))

266

In [15]:
delta_cur = delta_cur[delta_cur['UserID'].isin(list(userID_allowed))]

In [16]:
delta_cur.head()

Unnamed: 0,UserID,ROT,SOG,Longitude,Latitude,TrueHeading,ReceivedTime,Time2
0,316005621,-127.0,0.1,-122.77156,49.23065,511.0,2013-10-22 01:05:24.510,1382403960
1,316018851,0.0,1.7,-123.05445,49.29853,110.0,2013-10-22 01:05:25.400,1382403960
2,316003679,-127.0,0.1,-123.10751,49.31308,511.0,2013-10-22 01:05:25.853,1382403960
3,316014621,127.0,12.3,-123.09534,49.2994,210.0,2013-10-22 01:05:26.027,1382403960
4,316005721,-127.0,0.1,-123.10684,49.31094,511.0,2013-10-22 01:05:26.620,1382403960


In [17]:
delta_cur.to_csv("delta_processed.csv",sep='\t')

In [18]:
delta_cur.head()

Unnamed: 0,UserID,ROT,SOG,Longitude,Latitude,TrueHeading,ReceivedTime,Time2
0,316005621,-127.0,0.1,-122.77156,49.23065,511.0,2013-10-22 01:05:24.510,1382403960
1,316018851,0.0,1.7,-123.05445,49.29853,110.0,2013-10-22 01:05:25.400,1382403960
2,316003679,-127.0,0.1,-123.10751,49.31308,511.0,2013-10-22 01:05:25.853,1382403960
3,316014621,127.0,12.3,-123.09534,49.2994,210.0,2013-10-22 01:05:26.027,1382403960
4,316005721,-127.0,0.1,-123.10684,49.31094,511.0,2013-10-22 01:05:26.620,1382403960
