## AutoEncoder Anomaly detection¶

Using unsupervised learning Machine Learning model based on the fact that the dataset from Mariine Cadastre (https://hub.marinecadastre.gov/pages/vesseltraffic) is unlabelled data.

In [3]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the preprocessed AIS data
ais = pd.read_csv('preprocessed_ais.csv')

# Feature selection based on important attributes and engineered features
features = ais[['VesselType', 'Length', 'Width', 'calculated_speed', 'heading_deviation', 'sog_mps', 'distance', 'time_diff']]

# Ensure there are no missing values
features = features.fillna(0).to_numpy()

# Define the Autoencoder model
input_dim = features.shape[1]
autoencoder = Sequential([
    Dense(32, activation='relu', input_dim=input_dim),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(16, activation='relu'),
    Dense(32, activation='relu'),
    Dense(input_dim, activation='sigmoid')
])

# Autoencoder compilation 
autoencoder.compile(optimizer='adam', loss='mse')

# Model training
autoencoder.fit(features, features, epochs=10, batch_size=32, validation_split=0.1, verbose=1)

# Generate predictions
reconstructions = autoencoder.predict(features)

# Calculate the reconstruction error
reconstruction_errors = np.mean(np.square(reconstructions - features), axis=1)

# Set threshold for anomaly detection
threshold = np.percentile(reconstruction_errors, 95)

# Mark anomalies in the dataset
ais['autoencoder_anomaly'] = (reconstruction_errors > threshold).astype(int)

# Extract rows marked as anomalies
anomalies = ais[ais['autoencoder_anomaly'] == 1]

# Print summary of anomalies
print(f"Number of anomalies detected: {len(anomalies)}")
print(anomalies.head())

# Export generated anomalies to csv
anomalies.to_csv('autoencoder_anomalies.csv', index=False)
print("Anomalies saved to 'autoencoder_anomalies.csv'")

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m68516/68516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 413us/step - loss: 251572.9844 - val_loss: 1096212.7500
Epoch 2/10
[1m68516/68516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 402us/step - loss: 328110.5625 - val_loss: 1096212.7500
Epoch 3/10
[1m68516/68516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 402us/step - loss: 251961.0156 - val_loss: 1096212.7500
Epoch 4/10
[1m68516/68516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 403us/step - loss: 245495.7969 - val_loss: 1096212.7500
Epoch 5/10
[1m68516/68516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 399us/step - loss: 247311.2188 - val_loss: 1096212.7500
Epoch 6/10
[1m68516/68516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 398us/step - loss: 311732.6250 - val_loss: 1096212.7500
Epoch 7/10
[1m68516/68516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 397us/step - loss: 284674.0625 - val_loss: 1096212.7500
Epoch 8/10
[1m68516/68516[0m [32m━━━━━━━━━