## Isolation Forest Anomaly Detection
An unsupervised machine learning model is used because the dataset from Marine Cadastre (https://hub.marinecadastre.gov/pages/vesseltraffic) is unlabelled.

In [1]:
# Import necessary libraries and utilities
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
# import matplotlib.pyplot as plt

# Purpose: flag anomalous AIS records with an Isolation Forest and export them.
# Pipeline: load CSV -> select features -> impute -> fit/predict -> filter -> save.

# Configuration: everything a reader may want to tune lives here.
INPUT_CSV = './preprocessed_ais.csv'
ANOMALIES_CSV = 'isolation_forest_anomalies.csv'
FEATURE_COLUMNS = [
    'VesselType', 'Length', 'Width', 'calculated_speed',
    'heading_deviation', 'sog_mps', 'distance', 'time_diff',
]
N_ESTIMATORS = 100
CONTAMINATION = 0.05  # proportion of rows the model is told to treat as outliers
RANDOM_STATE = 42     # fixed seed so repeated runs flag the same rows

# Load the preprocessed AIS data
ais = pd.read_csv(INPUT_CSV)

# Features selection based on engineered features and other relevant attributes.
# Missing values are imputed with 0 because IsolationForest rejects NaN input.
features = ais[FEATURE_COLUMNS].fillna(0)

# Train the Isolation Forest and label every row in one pass
# (fit_predict is equivalent to fit followed by predict on the same data).
isolation_forest = IsolationForest(
    n_estimators=N_ESTIMATORS,
    contamination=CONTAMINATION,
    random_state=RANDOM_STATE,
)
ais['forest_anomaly'] = isolation_forest.fit_predict(features)

# Extract rows marked as anomalies (the model labels outliers as -1)
anomalies = ais[ais['forest_anomaly'] == -1]

# Print summary of anomalies
print(f"Number of anomalies detected: {len(anomalies)}")
print(anomalies.head())

# Export anomalies generated to csv and confirm where they were written,
# so the cell's code reproduces the confirmation line in its recorded output.
anomalies.to_csv(ANOMALIES_CSV, index=False)
print(f"Anomalies saved to '{ANOMALIES_CSV}'")

Number of anomalies detected: 121806
    MMSI         BaseDateTime  ...  cumulative_distance  forest_anomaly
8     11  2023-01-01 01:08:06  ...            92.792298              -1
11    11  2023-01-01 01:28:59  ...           131.328400              -1
17    11  2023-01-01 02:02:04  ...           191.851842              -1
19    11  2023-01-01 02:20:08  ...           207.813865              -1
20    11  2023-01-01 02:32:03  ...           214.458147              -1

[5 rows x 24 columns]
Anomalies saved to 'isolation_forest_anomalies.csv'
