## Preprocessing of AIS (Automatic Identification System) dataset
Preprocess the data to remove noise and engineer features that will help with anomaly detection.

In [None]:
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Load AIS data
# One CSV per day; all of them are stacked into a single frame below.
ais_files = [
    'AIS_2023_12_24.csv',
    'AIS_2023_12_25.csv',
    'AIS_2023_12_26.csv',
]

# Read every daily file into its own DataFrame
dataframes = [pd.read_csv(path) for path in ais_files]

# Concatenate DataFrames; ignore_index=True gives the combined frame a fresh,
# unique 0..n-1 index instead of repeating each file's own row numbers.
ais = pd.concat(dataframes, ignore_index=True)

# The per-file frames are no longer needed once combined. Dropping the only
# remaining reference lets their memory be reclaimed instead of keeping a
# second full copy of the data alive for the rest of the run.
del dataframes

total_rows = len(ais)
print(f"total rows: {total_rows}")

# Report the size of the raw, combined dataset before any cleaning
print("Initial shape of the dataset:", ais.shape)

# Drop rows that are exact duplicates of an earlier row
ais = ais.drop_duplicates()
print("After dropping duplicates:", ais.shape)

# Keep only positions whose latitude lies in [-90, 90] and whose longitude
# lies in [-180, 180]; rows with a missing coordinate fail the test and are
# removed as well.
valid_position = ais['LAT'].between(-90, 90) & ais['LON'].between(-180, 180)
ais = ais[valid_position]
print("After invalid coordinates:", ais.shape)

# Parse BaseDateTime; values that cannot be parsed become NaT and are dropped.
parsed_timestamps = pd.to_datetime(ais['BaseDateTime'], errors='coerce')
ais['BaseDateTime'] = parsed_timestamps
ais = ais.dropna(subset=['BaseDateTime'])

# Order the reports chronologically within each vessel (MMSI)
ais = ais.sort_values(by=['MMSI', 'BaseDateTime'])
print("After handling timestamps:", ais.shape)

# Feature engineering for Distance, Time Difference, Speed, etc
# Function to calculate the distance
def calculate_distance(row1, row2):
    """Return the geodesic distance in meters between two position records.

    Both arguments are indexed with the keys ``'LAT'`` and ``'LON'``.
    If a ``ValueError`` is raised while measuring, 0 is returned instead of
    propagating the error.
    """
    try:
        origin = (row1['LAT'], row1['LON'])
        destination = (row2['LAT'], row2['LON'])
        meters = geodesic(origin, destination).meters
    except ValueError:
        meters = 0
    return meters

# Add distance and time differences
# Both columns start at 0.0. The first report of every vessel keeps that
# placeholder because it has no earlier report to be compared with.
ais['distance'] = 0.0
ais['time_diff'] = 0.0

for _, group in ais.groupby('MMSI'):
    group = group.sort_values(by='BaseDateTime')
    if len(group) < 2:
        # A single report has no consecutive pair to measure.
        continue

    # Extract the coordinates once per vessel. Building a full row Series
    # with .iloc for every report is needless overhead on millions of rows.
    lats = group['LAT'].to_numpy()
    lons = group['LON'].to_numpy()

    # Distance from each report to the one before it, via the same helper
    # (and therefore the same geodesic computation) as before.
    distances = [
        calculate_distance(
            {'LAT': prev_lat, 'LON': prev_lon},
            {'LAT': cur_lat, 'LON': cur_lon},
        )
        for prev_lat, prev_lon, cur_lat, cur_lon in zip(
            lats[:-1], lons[:-1], lats[1:], lons[1:]
        )
    ]

    # Seconds elapsed since the previous report. The first entry of diff()
    # is NaT (no predecessor) and is sliced off.
    time_diffs = group['BaseDateTime'].diff().dt.total_seconds().to_numpy()[1:]

    # Write the results back by index label for every report but the first.
    later_reports = group.index[1:]
    ais.loc[later_reports, 'distance'] = distances
    ais.loc[later_reports, 'time_diff'] = time_diffs

# Conversion factor and cut-off used below
KNOTS_TO_MPS = 0.514444  # 1 knot = 0.514444 m/s
MAX_SPEED_MPS = 50  # Arbitrary threshold for speed (50 m/s ~ 180 km/h)

# Remove rows with zero time difference. This also removes each vessel's
# first report, whose time_diff was left at its 0.0 placeholder.
ais = ais[ais['time_diff'] > 0]

# Remove rows with distance less than 5 (meters, as produced by
# calculate_distance). The explicit copy makes `ais` an independent frame,
# so the column assignments below do not raise SettingWithCopyWarning.
ais = ais[ais['distance'] >= 5].copy()
print("After removing rows with distance < 5:", ais.shape)

# Speed calculation (distance/time in meters per second).
# time_diff is strictly positive here, so the division is safe.
ais['calculated_speed'] = ais['distance'] / ais['time_diff']

# Heading deviation calculation: change in heading since the vessel's
# previous remaining report. Headings are angles, so the change is wrapped to
# the shortest way round the circle (359 -> 1 is a 2 degree turn, not 358).
# NOTE(review): AIS feeds commonly use Heading == 511 for "not available";
# such values are not filtered here -- confirm whether they should be.
heading_change = ais.groupby('MMSI')['Heading'].diff().abs() % 360
ais['heading_deviation'] = np.minimum(heading_change, 360 - heading_change).fillna(0)

# Convert speed over ground (SOG) to m/s to maintain uniformity with calculated_speed
ais['sog_mps'] = ais['SOG'] * KNOTS_TO_MPS

# Cumulative distance traveled by vessel (sums only the rows that survived
# the filters above)
ais['cumulative_distance'] = ais.groupby('MMSI')['distance'].cumsum()

# Replace missing values with median or fill with zeros.
# Reassigning the result avoids mutating the frame in place.
ais = ais.fillna({
    'calculated_speed': ais['calculated_speed'].median(),
    'heading_deviation': 0,
    'sog_mps': ais['sog_mps'].median(),
})

# Remove extreme outliers in calculated speed
ais = ais[ais['calculated_speed'] < MAX_SPEED_MPS]

print("Final dataset shape after preprocessing:", ais.shape)

# Export preprocessed data
ais.to_csv('preprocessed_ais.csv', index=False)
print("Preprocessed data saved to 'preprocessed_ais.csv'")

total rows: 21595522
Initial shape of the dataset: (21595522, 17)
