In [None]:
import os
import pandas as pd
from datetime import datetime

In [None]:
# Load the combined DCA table that the later cells merge with the MMSI lookup.
dca_data = pd.read_csv("processed/combined.csv")
# Preview only the first rows instead of rendering the whole frame as cell output.
dca_data.head()

In [None]:
# Load the MMSI spreadsheet and keep only the two columns needed to map a
# call sign ("kallesignal") to an MMSI number.
mmsi_data = pd.read_excel("data/MMSI_rc_20211027_.xlsx")
mmsi_link = mmsi_data[["mmsi", "kallesignal"]]
# Preview only the first rows instead of rendering the whole frame as cell output.
mmsi_link.head()

In [None]:

# Join the DCA rows with the MMSI lookup on the radio call sign (default inner
# join, so rows without a matching call sign are left out), drop the duplicated
# key column and parse both time columns into datetimes.
dca_mmsi = (
    dca_data
    .merge(mmsi_link, left_on="Radiokallesignal (ERS)", right_on="kallesignal")
    .drop(columns=["kallesignal"])
    .assign(
        Starttidspunkt=lambda frame: pd.to_datetime(frame["Starttidspunkt"]),
        Stopptidspunkt=lambda frame: pd.to_datetime(frame["Stopptidspunkt"]),
    )
)
# Only the interval bounds and the vessel id are needed downstream.
dca_date_slice = dca_mmsi[["Starttidspunkt", "Stopptidspunkt", "mmsi"]]
# Optional filters, currently disabled:
# dca_date_slice = dca_date_slice.where(dca_date_slice["Starttidspunkt"].dt.year == 2015).dropna()
# TODO: Date should be read from the AIS file name
# dca_date_slice = dca_date_slice.where(dca_date_slice["Starttidspunkt"].dt.date == datetime.strptime('20150101', "%Y%m%d").date()).dropna()
# dca_date_slice


In [None]:
def get_dca_with_mmsi(dca_data_path, mmsi_data_path):
    """Load the DCA table and attach an MMSI number to every row.

    The DCA CSV is joined with the MMSI spreadsheet on the radio call sign
    (``"Radiokallesignal (ERS)"`` on the DCA side, ``"kallesignal"`` on the
    MMSI side). The join uses pandas' default inner merge, so DCA rows whose
    call sign is not present in the spreadsheet are left out.

    Parameters
    ----------
    dca_data_path
        Location of the DCA CSV, passed straight to ``pd.read_csv``.
    mmsi_data_path
        Location of the MMSI Excel sheet, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Starttidspunkt`` and ``Stopptidspunkt`` (parsed with
        ``pd.to_datetime``) and ``mmsi``.
    """
    catch_reports = pd.read_csv(dca_data_path)
    callsign_lookup = pd.read_excel(mmsi_data_path)[["mmsi", "kallesignal"]]

    joined = catch_reports.merge(
        callsign_lookup,
        left_on="Radiokallesignal (ERS)",
        right_on="kallesignal",
    )
    # The right-hand key duplicates the DCA call-sign column after the join.
    joined = joined.drop(columns=["kallesignal"])

    for time_column in ("Starttidspunkt", "Stopptidspunkt"):
        joined[time_column] = pd.to_datetime(joined[time_column])

    wanted_columns = ["Starttidspunkt", "Stopptidspunkt", "mmsi"]
    return joined[wanted_columns]

In [None]:
def process_chunk(chunk, start_times, stop_times, dca_mmsi):
    """Flag AIS rows that fall inside a DCA interval of the same vessel.

    A row is flagged when its ``date_time_utc`` lies within
    ``[start_times[k], stop_times[k]]`` (both ends inclusive) for at least one
    ``k`` whose ``dca_mmsi[k]`` equals the row's ``mmsi``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        AIS rows with ``date_time_utc`` and ``mmsi`` columns. A boolean
        ``fishing`` column is written onto this frame in place.
    start_times, stop_times, dca_mmsi : array-like
        Interval starts, interval stops and the vessel id of each interval;
        the three are combined element-wise, so entry ``k`` of each describes
        the same interval.

    Returns
    -------
    pandas.DataFrame
        The same ``chunk`` object, now carrying the ``fishing`` column.

    Notes
    -----
    The comparison builds boolean matrices of shape
    ``(len(chunk), number of intervals)``, so memory use grows with the
    product of both sizes.
    """
    # Column vectors: one row per AIS message, broadcast against the intervals.
    message_times = chunk["date_time_utc"].values[:, None]
    message_mmsi = chunk["mmsi"].values[:, None]

    # Cell (i, k) is True when message i lies inside interval k of the same vessel.
    hits = (
        (message_times >= start_times)
        & (message_times <= stop_times)
        & (message_mmsi == dca_mmsi)
    )

    # One matching interval is enough to flag the message.
    chunk["fishing"] = hits.any(axis=1)
    return chunk

In [None]:
def process_ais(file_path, dca_data):
    """Load one AIS file and flag the messages sent during a DCA interval.

    Parameters
    ----------
    file_path : str
        Path to a zip-compressed, semicolon-separated AIS file. The day the
        file covers is taken from the file name: the first four and the last
        four characters of the base name are stripped and the remainder must
        be a ``YYYYMMDD`` date.
    dca_data : pandas.DataFrame
        Frame with datetime columns ``Starttidspunkt`` and ``Stopptidspunkt``
        and an ``mmsi`` column.

    Returns
    -------
    pandas.DataFrame
        The AIS rows with ``date_time_utc`` parsed to datetimes and a boolean
        ``fishing`` column added by ``process_chunk``.

    Raises
    ------
    ValueError
        If the date part of the file name is not a valid ``YYYYMMDD`` string.
    """
    # Read AIS file
    column_dtypes = {"mmsi": int, "date_time_utc": object, "lon": float, "lat": float, "sog": float, "cog": float, "true_heading": int, "nav_status": int, "message_nr": int}
    ais_data = pd.read_csv(file_path, sep=";", dtype=column_dtypes, compression="zip")
    ais_data["date_time_utc"] = pd.to_datetime(ais_data["date_time_utc"])

    # Get date from AIS filename
    ais_date = os.path.basename(file_path)[4:-4]

    # Keep every DCA interval that overlaps the day covered by this file.
    # Selecting only intervals that *start* on that day would drop operations
    # that began on an earlier day and are still running, so AIS messages sent
    # during them would wrongly end up with fishing == False.
    # NOTE(review): the bounds are compared directly against date_time_utc, so
    # this assumes the DCA timestamps use the same time zone - confirm.
    starts = dca_data["Starttidspunkt"]
    stops = dca_data["Stopptidspunkt"]
    day_start = pd.Timestamp(datetime.strptime(ais_date, "%Y%m%d"), tz=starts.dt.tz)
    day_end = day_start + pd.Timedelta(days=1)
    overlaps_day = (starts < day_end) & (stops >= day_start)

    # Plain boolean indexing keeps the column dtypes intact; only rows missing
    # one of the three values actually used below are discarded.
    dca_slice = dca_data.loc[overlaps_day].dropna(
        subset=["Starttidspunkt", "Stopptidspunkt", "mmsi"]
    )

    # Convert interval times to numpy arrays
    start_times = dca_slice["Starttidspunkt"].values
    stop_times = dca_slice["Stopptidspunkt"].values
    dca_mmsis = dca_slice["mmsi"].values

    print(f"{ais_date}, Size of AIS: {len(ais_data)}, Size of DCA: {len(dca_slice)}")
    result = process_chunk(ais_data, start_times, stop_times, dca_mmsis)
    return result
    

In [None]:
def mark_ais_fishing(ais_data_path, dca_date_slice, save_destination, max_files=None):
    """Run ``process_ais`` on the AIS files of a directory and save the results.

    Every regular file in ``ais_data_path`` is passed to ``process_ais`` and
    the returned frame is written to ``save_destination`` as
    ``<file name without extension>.parquet``.

    Parameters
    ----------
    ais_data_path : str
        Directory holding the AIS files, e.g. ``"data/ais_data/ais2016/"``.
    dca_date_slice : pandas.DataFrame
        DCA intervals, forwarded unchanged to ``process_ais``.
    save_destination : str
        Output directory; it is created if it does not exist yet.
    max_files : int, optional
        Process only the first ``max_files`` files (in sorted name order),
        which is handy for a quick trial run. ``None`` (the default)
        processes every file.
    """
    # to_parquet does not create missing directories.
    os.makedirs(save_destination, exist_ok=True)

    # os.listdir returns names in arbitrary order and includes sub-directories;
    # sort for a reproducible run and keep regular files only.
    ais_list = sorted(
        name
        for name in os.listdir(ais_data_path)
        if os.path.isfile(os.path.join(ais_data_path, name))
    )
    if max_files is not None:
        ais_list = ais_list[:max_files]

    for i, ais_day in enumerate(ais_list):
        print(i, end=" ")
        ais_df = process_ais(os.path.join(ais_data_path, ais_day), dca_date_slice)
        filename = os.path.splitext(ais_day)[0]
        ais_df.to_parquet(os.path.join(save_destination, f"{filename}.parquet"))

In [None]:
# Build the DCA interval table (start, stop, mmsi) used to label the AIS data.
dca_slice = get_dca_with_mmsi(
    dca_data_path="processed/combined.csv",
    mmsi_data_path="data/MMSI_rc_20211027_.xlsx",
)

In [None]:
# Label the 2016 AIS files and write the results as parquet.
mark_ais_fishing(
    ais_data_path="data/ais_data/ais2016/",
    dca_date_slice=dca_slice,
    save_destination="ais_processed/",
)

In [None]:
# Spot-check one of the written parquet files.
test_frame = pd.read_parquet("ais_processed/ais_20160515.parquet")
# Preview only the first rows instead of rendering the whole frame as cell output.
test_frame.head()