In [3]:
# data handling
import pandas as pd 
import numpy as np 
from datetime import datetime
import scipy 
import math 

# visualizations
import matplotlib.pyplot as plt 
import seaborn as sns

# pickle 
import pickle 

# sklearn 
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# pytorch 
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data 
from torch.autograd import Variable
import torch.optim as optim 

# optuna 
import optuna 

In [7]:
SEED = 1

# check for GPU 
DEVICE = 'cpu'
cuda = torch.cuda.is_available()
if cuda: 
    print('cuda available..')
    DEVICE = 'cuda'
    torch.cuda.manual_seed(SEED)
print(f'Device: {DEVICE}')

cuda available..
Device: cuda


# Data Notes
---
Files (by fishing gear types):  
1. Drifting longline vessels 
2. Fixed gear vessels 
3. Pole and line vessels 
4. Purse seines vessels
5. Trawler vessels 
6. Troller vessels 
7. Vessels with unknown gear types  
  
CSV Table Schema:  
- mmsi: anonymized vessel identifier
- timestamp: unix timestamp
- distance_from_shore: distance from shore in meters
- distance_from_port: distance from port in meters
- speed: vessel speed in knots
- course: vessel's course over ground (represented in degrees)
    - Some records will have course values of 511, which is larger than the maximum value of 360 degrees. That means the data is not available according to the [US Coast Guard Class A AIS Position Report Documentation](https://www.navcen.uscg.gov/?pageName=AISMessagesA).
- lat: latitude in decimal degrees
- lon: longitude in decimal degrees
- is_fishing: lable indicating fishing activity
    - 0 = not fishing
    - \> 0 = fishing; data values between 0 and 1 indicate the average score for the position if scored by multiple people
    - -1 = no data
- source: the training data batch; data was prepared by GFW, Dalhousie, and a crowd sourcing campaign (false positives are marked as false_positives)

In [9]:
# filepaths 
drifting_longlines_file = './data/drifting_longlines.csv'
fixed_gear_file = './data/fixed_gear.csv'
pole_and_line_file = './data/pole_and_line.csv'
purse_seines_file = './data/purse_seines.csv'
trawlers_file = './data/trawlers.csv'
trollers_file = './data/trollers.csv'
unknown_file = './data/unknown.csv'

# for mapping vessel paths later 
# ocean_map = gpd.read_file('./mapping/world_map/ne_50m_ocean.shp')
# south_atlantic = gpd.read_file('./mapping/south_atlantic/iho.shp')
# bornholm = gpd.read_file('./mapping/bornholm/europeislands.shp')
# arrecife = gpd.read_file('./mapping/north_atlantic/iho.shp')
# phillippine_sea = gpd.read_file('./mapping/phillippine_sea/iho.shp')
# north_pacific = gpd.read_file('./mapping/north_pacific/eez_iho.shp')
# crs = {'init': 'epsg:4326'}

# Exploratory Analysis / Initial Data Wrangling on Separated Datasets  
---  
  


## Drifting Longlines


In [27]:
# read in data from the drifting long line vessel file 
drifting_longlines_df = pd.read_csv(drifting_longlines_file)

# peak at the data
drifting_longlines_df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,12639560000000.0,1327137000.0,232994.28125,311748.65625,8.2,230.5,14.865583,-26.853662,-1.0,dalhousie_longliner
1,12639560000000.0,1327137000.0,233994.265625,312410.34375,7.3,238.399994,14.86387,-26.8568,-1.0,dalhousie_longliner
2,12639560000000.0,1327137000.0,233994.265625,312410.34375,6.8,238.899994,14.861551,-26.860649,-1.0,dalhousie_longliner
3,12639560000000.0,1327143000.0,233994.265625,315417.375,6.9,251.800003,14.822686,-26.865898,-1.0,dalhousie_longliner
4,12639560000000.0,1327143000.0,233996.390625,316172.5625,6.1,231.100006,14.821825,-26.867579,-1.0,dalhousie_longliner


In [28]:
# check data characteristics 
drifting_longlines_df.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,13968730.0,13968730.0,13968730.0,13968730.0,13968630.0,13968630.0,13968730.0,13968730.0,13968730.0
mean,129385000000000.0,1434290000.0,584531.1,789750.5,5.464779,181.4876,-8.997629,3.758693,-0.9743015
std,78873570000000.0,39842750.0,542006.8,691543.8,4.043567,105.0503,24.39311,109.5971,0.2119947
min,5601266000000.0,1325376000.0,0.0,0.0,0.0,0.0,-75.19017,-180.0,-1.0
25%,62603840000000.0,1410706000.0,101909.2,213020.6,2.1,90.7,-26.0155,-88.08668,-1.0
50%,118485900000000.0,1447302000.0,457639.3,637524.9,5.5,181.1,-14.97954,-1.716495,-1.0
75%,198075800000000.0,1466506000.0,960366.4,1210432.0,8.5,271.1,4.48579,100.9811,-1.0
max,281205800000000.0,1480032000.0,4430996.0,7181037.0,102.3,511.0,83.33266,179.9938,1.0


In [29]:
# data shape 
drifting_longlines_df.shape

(13968727, 10)

In [30]:
# drop rows without data for is_fishing 
drifting_longlines_df = drifting_longlines_df[drifting_longlines_df['is_fishing'] != -1]

# recheck data shape
drifting_longlines_df.shape

(219741, 10)

In [31]:
# check label distribution 
drifting_longlines_df['is_fishing'].value_counts()

1.000000    138163
0.000000     79574
0.666667      1076
0.333333       809
0.750000       110
0.250000         9
Name: is_fishing, dtype: int64

In [15]:
# recheck data characteristics after dropping rows missing is_fishing data 
drifting_longlines_df.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,13830560.0,13830560.0,13830560.0,13830560.0,13830470.0,13830470.0,13830560.0,13830560.0,13830560.0
mean,129314700000000.0,1434526000.0,584525.2,789468.8,5.466335,181.5022,-8.986096,3.826357,-0.9940241
std,78845010000000.0,39945210.0,542829.3,692495.9,4.050818,105.0709,24.41379,109.6255,0.07786687
min,5601266000000.0,1325376000.0,0.0,0.0,0.0,0.0,-75.19017,-180.0,-1.0
25%,62603840000000.0,1411057000.0,101200.3,211600.6,2.1,90.7,-26.02216,-88.06982,-1.0
50%,118485900000000.0,1447828000.0,456818.1,636824.2,5.5,181.2,-14.97248,-1.744785,-1.0
75%,198075800000000.0,1466633000.0,961199.1,1210012.0,8.5,271.1,4.522833,101.0775,-1.0
max,281205800000000.0,1480032000.0,4430996.0,7181037.0,102.3,511.0,83.33266,179.9938,0.75


In [16]:
# check for null values 
drifting_longlines_df.isnull().sum()

mmsi                    0
timestamp               0
distance_from_shore     0
distance_from_port      0
speed                  98
course                 98
lat                     0
lon                     0
is_fishing              0
source                  0
dtype: int64

In [32]:
# reformat unix timestamps into datetime
drifting_long_lines_formatted_timestamps = pd.to_datetime(drifting_longlines_df['timestamp'], unit = 's')
drifting_longlines_df.insert(2, 'timestamp_reformat', drifting_long_lines_formatted_timestamps)
drifting_longlines_df.head() 

Unnamed: 0,mmsi,timestamp,timestamp_reformat,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
330,12639560000000.0,1338658000.0,2012-06-02 17:19:50,111123.328125,147593.8125,8.2,162.800003,18.608902,-17.205967,1.0,dalhousie_longliner
423,12639560000000.0,1340094000.0,2012-06-19 08:12:54,329078.53125,362363.84375,5.0,333.899994,18.839836,-19.453779,1.0,dalhousie_longliner
570,12639560000000.0,1345706000.0,2012-08-23 07:06:20,86831.046875,196301.0625,0.7,260.100006,19.27672,-17.323196,1.0,dalhousie_longliner
638,12639560000000.0,1346391000.0,2012-08-31 05:24:34,98881.34375,166465.140625,4.2,43.400002,18.852455,-17.255033,1.0,dalhousie_longliner
677,12639560000000.0,1346825000.0,2012-09-05 06:11:05,74247.757812,177480.859375,7.0,333.5,19.101482,-17.147205,1.0,dalhousie_longliner


In [33]:
# add separate month, day of the week, year, and hour columns derived from timstamps
drifting_longlines_df.insert(3, 'year', drifting_longlines_df['timestamp_reformat'].dt.year)
# round to closest hour 
drifting_longlines_df.insert(4, 'hour', drifting_longlines_df['timestamp_reformat'].dt.round('H').dt.hour)
drifting_longlines_df.insert(5, 'day_of_week', drifting_longlines_df['timestamp_reformat'].dt.day_name())
drifting_longlines_df.insert(6, 'month', drifting_longlines_df['timestamp_reformat'].dt.month_name())

drifting_longlines_df.head()

Unnamed: 0,mmsi,timestamp,timestamp_reformat,year,hour,day_of_week,month,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
330,12639560000000.0,1338658000.0,2012-06-02 17:19:50,2012,17,Saturday,June,111123.328125,147593.8125,8.2,162.800003,18.608902,-17.205967,1.0,dalhousie_longliner
423,12639560000000.0,1340094000.0,2012-06-19 08:12:54,2012,8,Tuesday,June,329078.53125,362363.84375,5.0,333.899994,18.839836,-19.453779,1.0,dalhousie_longliner
570,12639560000000.0,1345706000.0,2012-08-23 07:06:20,2012,7,Thursday,August,86831.046875,196301.0625,0.7,260.100006,19.27672,-17.323196,1.0,dalhousie_longliner
638,12639560000000.0,1346391000.0,2012-08-31 05:24:34,2012,5,Friday,August,98881.34375,166465.140625,4.2,43.400002,18.852455,-17.255033,1.0,dalhousie_longliner
677,12639560000000.0,1346825000.0,2012-09-05 06:11:05,2012,6,Wednesday,September,74247.757812,177480.859375,7.0,333.5,19.101482,-17.147205,1.0,dalhousie_longliner


In [34]:
# count the unique mmsi's 
len(pd.unique(drifting_longlines_df['mmsi']))

110

## Fixed Gear 

In [20]:
# read in the data from the fixed gear vessels file 
fixed_gear_df = pd.read_csv(fixed_gear_file)

# peak at data 
fixed_gear_df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,7572519000000.0,1347664000.0,0.0,36054.625,0.0,0.0,42.798748,-8.944992,-1.0,gfw
1,7572519000000.0,1348056000.0,0.0,36054.625,0.0,0.0,42.798717,-8.945075,-1.0,gfw
2,7572519000000.0,1350409000.0,0.0,90970.296875,0.0,198.199997,43.106419,-9.215466,-1.0,gfw
3,7572519000000.0,1350410000.0,0.0,90970.296875,0.0,186.899994,43.106434,-9.215431,-1.0,gfw
4,7572519000000.0,1350411000.0,0.0,90970.296875,0.0,190.5,43.10643,-9.215442,-1.0,gfw


In [21]:
# check data characteristics
fixed_gear_df.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0,1559137.0
mean,153075200000000.0,1421486000.0,37618.78,59898.48,2.227195,187.7938,50.95086,1.274018,-0.96591
std,89763830000000.0,37828300.0,109018.8,126972.9,3.41279,117.7506,5.894565,8.512244,0.2173124
min,7572519000000.0,1325625000.0,0.0,0.0,0.0,0.0,-83.2646,-179.2441,-1.0
25%,88780180000000.0,1387594000.0,0.0,5656.715,0.0,77.4,47.45566,-3.909275,-1.0
50%,130528900000000.0,1427254000.0,0.0,26906.59,0.1,205.5,50.50242,-2.333808,-1.0
75%,261683000000000.0,1455255000.0,34131.26,55143.91,3.8,287.0,56.02126,8.220293,-1.0
max,280291300000000.0,1480032000.0,3099833.0,11816760.0,102.3,511.0,84.79108,170.9277,1.0


In [23]:
# data shape
fixed_gear_df.shape

(1559137, 10)

In [24]:
# drop rows without data for is_fishing 
fixed_gear_df = fixed_gear_df[fixed_gear_df['is_fishing'] != -1]
fixed_gear_df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
26358,7572519000000.0,1420185000.0,38012.222656,103307.679688,1.8,29.0,43.41658,-9.328747,0.0,gfw
26359,7572519000000.0,1420185000.0,38909.199219,103389.921875,1.0,13.9,43.418369,-9.326941,0.0,gfw
26360,7572519000000.0,1420186000.0,38909.199219,103389.921875,0.8,28.5,43.420197,-9.325145,0.0,gfw
26361,7572519000000.0,1420187000.0,38909.199219,103389.921875,1.3,46.5,43.421734,-9.323038,0.0,gfw
26362,7572519000000.0,1420188000.0,38909.199219,103389.921875,0.7,217.300003,43.420063,-9.331503,0.0,gfw


In [25]:
# recheck data shape 
fixed_gear_df.shape

(41858, 10)

In [26]:
# check label distribution 
fixed_gear_df['is_fishing'].value_counts()

0.000000    29790
1.000000    10665
0.666667      538
0.250000      423
0.333333      405
0.800000       29
0.750000        6
0.400000        2
Name: is_fishing, dtype: int64

In [35]:
# check for null values 
fixed_gear_df.isnull().sum()

mmsi                   0
timestamp              0
distance_from_shore    0
distance_from_port     0
speed                  0
course                 0
lat                    0
lon                    0
is_fishing             0
source                 0
dtype: int64

In [37]:
# reformat unix timestamps into datetime 
fixed_gear_formatted_timestamps = pd.to_datetime(fixed_gear_df['timestamp'], unit = 's')
fixed_gear_df.insert(2, 'timestamp_reformat', fixed_gear_formatted_timestamps)
fixed_gear_df.head()

ValueError: cannot insert timestamp_reformat, already exists

In [38]:
# add separate month, day of the week, year, and hour columns derived from timstamps
fixed_gear_df.insert(3, 'year', fixed_gear_df['timestamp_reformat'].dt.year)
# round to closest hour
fixed_gear_df.insert(4, 'hour', fixed_gear_df['timestamp_reformat'].dt.round('H').dt.hour)
fixed_gear_df.insert(5, 'day_of_week', fixed_gear_df['timestamp_reformat'].dt.day_name())
fixed_gear_df.insert(6, 'month', fixed_gear_df['timestamp_reformat'].dt.month_name())

fixed_gear_df.head()

Unnamed: 0,mmsi,timestamp,timestamp_reformat,year,hour,day_of_week,month,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
26358,7572519000000.0,1420185000.0,2015-01-02 07:44:07,2015,8,Friday,January,38012.222656,103307.679688,1.8,29.0,43.41658,-9.328747,0.0,gfw
26359,7572519000000.0,1420185000.0,2015-01-02 07:56:08,2015,8,Friday,January,38909.199219,103389.921875,1.0,13.9,43.418369,-9.326941,0.0,gfw
26360,7572519000000.0,1420186000.0,2015-01-02 08:08:09,2015,8,Friday,January,38909.199219,103389.921875,0.8,28.5,43.420197,-9.325145,0.0,gfw
26361,7572519000000.0,1420187000.0,2015-01-02 08:23:08,2015,8,Friday,January,38909.199219,103389.921875,1.3,46.5,43.421734,-9.323038,0.0,gfw
26362,7572519000000.0,1420188000.0,2015-01-02 08:38:07,2015,9,Friday,January,38909.199219,103389.921875,0.7,217.300003,43.420063,-9.331503,0.0,gfw


In [39]:
# count the unique mmsi's 
len(pd.unique(fixed_gear_df['mmsi']))

36

## Pole and Lines 

In [40]:
# read in data from the pole and lines vessel file
pole_and_lines_df = pd.read_csv(pole_and_line_file)

# peak at the data 
pole_and_lines_df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,18483460000000.0,1340882000.0,0.0,2236.013184,0.0,0.0,28.967354,-13.537797,-1.0,gfw
1,18483460000000.0,1340884000.0,0.0,2236.013184,0.0,125.199997,28.967373,-13.537838,-1.0,gfw
2,18483460000000.0,1340885000.0,0.0,2236.013184,0.0,0.0,28.967354,-13.537838,-1.0,gfw
3,18483460000000.0,1340888000.0,0.0,2236.013184,0.0,0.0,28.967354,-13.537838,-1.0,gfw
4,18483460000000.0,1340925000.0,1999.950928,2828.357666,8.7,203.100006,28.929653,-13.543955,-1.0,gfw


In [41]:
# check data characteristics 
pole_and_lines_df.describe()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
count,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0,161315.0
mean,76598410000000.0,1414174000.0,48944.34,73830.4,2.111584,132.245911,33.456753,-4.570087,-0.967918
std,57405770000000.0,43006470.0,203649.9,233360.4,3.696588,117.424918,5.665633,53.524248,0.205442
min,18483460000000.0,1327882000.0,0.0,0.0,0.0,0.0,-41.853848,-70.921013,-1.0
25%,18483460000000.0,1368384000.0,0.0,2236.013,0.0,3.3,28.96594,-28.527719,-1.0
50%,87031420000000.0,1423536000.0,0.0,14421.85,0.0,115.199997,33.029419,-16.734444,-1.0
75%,87031420000000.0,1456109000.0,22802.95,53243.41,1.8,228.699997,38.531128,-13.539565,-1.0
max,214572700000000.0,1480031000.0,2110362.0,3005100.0,102.300003,360.0,77.078987,177.63298,1.0


In [42]:
# data shape 
pole_and_lines_df.shape

(161315, 10)

In [43]:
# drop rows without data for is_fishing 
pole_and_lines_df = pole_and_lines_df[pole_and_lines_df['is_fishing'] != -1]
pole_and_lines_df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
46903,18483460000000.0,1427865000.0,0.0,2236.013184,0.0,137.899994,28.96549,-13.539565,0.0,gfw
46904,18483460000000.0,1427866000.0,0.0,2236.013184,0.0,144.800003,28.965481,-13.539565,0.0,gfw
46905,18483460000000.0,1427867000.0,0.0,2236.013184,0.0,0.0,28.965487,-13.539558,0.0,gfw
46906,18483460000000.0,1427867000.0,0.0,2236.013184,0.0,112.0,28.965469,-13.539557,0.0,gfw
46907,18483460000000.0,1427868000.0,0.0,2236.013184,0.0,95.300003,28.965473,-13.53956,0.0,gfw


In [44]:
# recheck data shape 
pole_and_lines_df.shape

(4250, 10)

In [48]:
# check label distribution 
pole_and_lines_df['is_fishing'].value_counts()

0.000000    3176
1.000000     833
0.333333     189
0.750000      31
0.166667      12
0.400000       7
0.666667       2
Name: is_fishing, dtype: int64