In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime

In [None]:
def get_dca_with_mmsi(dca_data_path, mmsi_data_path):
    """
    Load DCA records and attach the MMSI that belongs to each radio call sign.

    Parameters
    ----------
    dca_data_path : path to a CSV file that provides the columns
        "Radiokallesignal (ERS)", "Starttidspunkt", "Stopptidspunkt" and "Varighet".
    mmsi_data_path : path to an Excel sheet that provides the columns
        "mmsi" and "kallesignal".

    Returns
    -------
    DataFrame with the columns "Starttidspunkt" and "Stopptidspunkt" (parsed
    with pd.to_datetime), "mmsi" and "Varighet". The join uses pandas' default
    inner merge, so DCA rows whose call sign is absent from the sheet are dropped.
    """
    dca_records = pd.read_csv(dca_data_path)
    callsign_to_mmsi = pd.read_excel(mmsi_data_path)[["mmsi", "kallesignal"]]

    joined = dca_records.merge(
        callsign_to_mmsi,
        left_on="Radiokallesignal (ERS)",
        right_on="kallesignal",
    )
    # The call sign from the lookup sheet is only needed as the join key.
    joined = joined.drop(columns=["kallesignal"])

    for time_column in ("Starttidspunkt", "Stopptidspunkt"):
        joined[time_column] = pd.to_datetime(joined[time_column])

    output_columns = ["Starttidspunkt", "Stopptidspunkt", "mmsi", "Varighet"]
    return joined[output_columns]

In [None]:
def get_fish_trips_with_mmsi(fish_trip_data_path, mmsi_data_path):
    """
    Load fishing trips and attach the MMSI that belongs to each radio call sign.

    Parameters
    ----------
    fish_trip_data_path : path to a CSV file that provides the columns
        "ERS", "Avgangstidspunkt", "Ankomsttidspunkt" and "trip_id".
    mmsi_data_path : path to an Excel sheet that provides the columns
        "mmsi" and "kallesignal".

    Returns
    -------
    DataFrame with the columns "Avgangstidspunkt" and "Ankomsttidspunkt"
    (parsed with pd.to_datetime), "mmsi" and "trip_id". The join uses pandas'
    default inner merge, so trips whose call sign is absent from the sheet
    are dropped.
    """
    trips = pd.read_csv(fish_trip_data_path)
    callsign_to_mmsi = pd.read_excel(mmsi_data_path)[["mmsi", "kallesignal"]]

    joined = trips.merge(
        callsign_to_mmsi,
        left_on="ERS",
        right_on="kallesignal",
    )
    # The call sign from the lookup sheet is only needed as the join key.
    joined = joined.drop(columns=["kallesignal"])

    for time_column in ("Avgangstidspunkt", "Ankomsttidspunkt"):
        joined[time_column] = pd.to_datetime(joined[time_column])

    output_columns = ["Avgangstidspunkt", "Ankomsttidspunkt", "mmsi", "trip_id"]
    return joined[output_columns]

In [None]:
def calculate_in_interval(chunk, other, start_column, stop_column):
    """
    Build a boolean matrix telling which rows of `chunk` fall inside which
    intervals of `other`.

    Entry [i, j] is True when row i of `chunk` has the same "mmsi" as row j of
    `other` and its "date_time_utc" lies within
    [other[start_column][j], other[stop_column][j]], both bounds inclusive.

    Returns
    -------
    Boolean array of shape (len(chunk), len(other)).
    """
    # Column vectors for the chunk side so that comparisons broadcast against
    # the 1-D arrays taken from `other`.
    timestamps = chunk["date_time_utc"].values[:, None]
    row_vessels = chunk["mmsi"].values[:, None]

    interval_vessels = other["mmsi"].values
    lower_bounds = other[start_column].values
    upper_bounds = other[stop_column].values

    after_start = timestamps >= lower_bounds
    before_stop = timestamps <= upper_bounds
    vessel_matches = row_vessels == interval_vessels
    return after_start & before_stop & vessel_matches
    

In [None]:
def _values_for_matches(in_interval, values, message):
    """Return, for every chunk row, the value of its matching interval (NaN when none matches)."""
    values = np.asarray(values)
    # Same invariant as before: a row may fall into at most one interval.
    assert (in_interval.sum(axis=1) <= 1).all(), message

    # nonzero() also copes with an interval table that has no rows at all.
    row_positions, interval_positions = np.nonzero(in_interval)

    # Numeric values land in a float column (as they did when written into a
    # NaN-initialised column); anything else is kept as objects.
    is_numeric = np.issubdtype(values.dtype, np.number)
    matched = np.full(in_interval.shape[0], np.nan, dtype=np.float64 if is_numeric else object)
    matched[row_positions] = values[interval_positions]
    return matched


def ais_modification(chunk, dca_slice, fishing_trips):
    """
    Annotate AIS rows with the fishing trip and DCA record they fall into.

    Adds three columns to `chunk` (in place) and returns it:
      - "trip_id": the "trip_id" of the trip in `fishing_trips` whose
        [Avgangstidspunkt, Ankomsttidspunkt] interval contains the row, else NaN.
      - "duration": the "Varighet" of the record in `dca_slice` whose
        [Starttidspunkt, Stopptidspunkt] interval contains the row, else NaN.
      - "fishing": True when the row falls inside any DCA interval.

    Values are assigned by position, so the result does not depend on the
    index labels of `chunk` (the previous `chunk.loc[i, ...]` loop only worked
    for a 0..n-1 index and otherwise wrote to the wrong rows or appended rows).

    Raises
    ------
    AssertionError
        If a row matches more than one DCA record or more than one trip.
    """
    is_in_interval_trip = calculate_in_interval(chunk, fishing_trips, "Avgangstidspunkt", "Ankomsttidspunkt")
    is_in_interval_dca = calculate_in_interval(chunk, dca_slice, "Starttidspunkt", "Stopptidspunkt")

    duration = _values_for_matches(
        is_in_interval_dca,
        dca_slice["Varighet"].values,
        "Should only be a single duration per interval",
    )
    trip_ids = _values_for_matches(
        is_in_interval_trip,
        fishing_trips["trip_id"].values,
        "Should only be a single trip per interval",
    )

    # Plain arrays are assigned positionally, independent of chunk's index.
    chunk["trip_id"] = trip_ids
    chunk["duration"] = duration
    chunk["fishing"] = is_in_interval_dca.any(axis=1)
    return chunk

In [None]:
def _rows_overlapping_day(frame, start_column, stop_column, day):
    """Return the complete rows of `frame` whose [start, stop] interval touches the calendar day `day`."""
    # Build the day bounds in the same timezone as the column so that the
    # comparison works for both naive and tz-aware timestamps.
    day_start = pd.Timestamp(day, tz=frame[start_column].dt.tz)
    day_end = day_start + pd.Timedelta(days=1)
    overlaps = (frame[start_column] < day_end) & (frame[stop_column] >= day_start)
    # Boolean indexing keeps the column dtypes; dropna() keeps the previous
    # behaviour of discarding rows that contain any missing value.
    return frame[overlaps].dropna()


def process_ais(file_path, dca_data, fishing_trips):
    """
    Read one zipped AIS file and annotate it with DCA and fishing-trip data.

    The day is taken from the file name: the first four and the last four
    characters of the base name are stripped and the remainder is parsed as
    YYYYMMDD.

    DCA records and trips are pre-filtered to those whose interval overlaps
    that day. Filtering on the start date alone dropped every interval that
    began on an earlier day but was still running, so the AIS rows inside it
    were left unlabelled.

    Parameters
    ----------
    file_path : path to a zip-compressed, ";"-separated AIS CSV file.
    dca_data : DataFrame with "Starttidspunkt", "Stopptidspunkt", "mmsi", "Varighet".
    fishing_trips : DataFrame with "Avgangstidspunkt", "Ankomsttidspunkt", "mmsi", "trip_id".

    Returns
    -------
    Whatever ais_modification returns for the AIS frame and the two filtered slices.

    Raises
    ------
    ValueError
        If the date part of the file name does not match YYYYMMDD.
    """
    # Get date from AIS filename first, so a badly named file fails before the read
    ais_date = os.path.basename(file_path)[4:-4]
    ais_day = datetime.strptime(ais_date, "%Y%m%d")

    # Read AIS file
    column_dtypes = {"mmsi": int, "date_time_utc": object, "lon": float, "lat": float, "sog": float, "cog": float, "true_heading": int, "nav_status": int, "message_nr": int}
    ais_data = pd.read_csv(file_path, sep=";", dtype=column_dtypes, compression="zip")
    ais_data["date_time_utc"] = pd.to_datetime(ais_data["date_time_utc"])

    # Filter DCA data and fishing trips to the intervals that overlap the AIS day
    dca_slice = _rows_overlapping_day(dca_data, "Starttidspunkt", "Stopptidspunkt", ais_day)
    fish_trip_slice = _rows_overlapping_day(fishing_trips, "Avgangstidspunkt", "Ankomsttidspunkt", ais_day)

    print(f"{ais_date}, Size of AIS: {len(ais_data)}, Size of DCA: {len(dca_slice)}")
    result = ais_modification(ais_data, dca_slice, fish_trip_slice)
    return result
    

In [None]:
def mark_ais_fishing(ais_data_path, dca_date_slice, fishing_trips, save_destination, max_files=1):
    """
    Annotate the AIS files in a directory and store each result as parquet.

    Every selected file is passed to process_ais, and the returned frame is
    written to `save_destination` under the same base name with a ".parquet"
    extension.

    Parameters
    ----------
    ais_data_path : directory holding the AIS files (one per day).
    dca_date_slice : DCA frame forwarded to process_ais.
    fishing_trips : fishing-trip frame forwarded to process_ais.
    save_destination : output directory; created if it does not exist.
    max_files : how many files to process, taken from the start of the
        name-sorted listing. Defaults to 1, which matches the previously
        hard-coded limit; pass None to process every file in the directory.
    """
    # os.listdir returns entries in arbitrary order; sort so that the selection
    # (and the processing order) is reproducible.
    ais_list = sorted(os.listdir(ais_data_path))
    if max_files is not None:
        ais_list = ais_list[:max_files]

    os.makedirs(save_destination, exist_ok=True)

    for i, ais_day in enumerate(ais_list):
        print(i, end=" ")
        ais_df = process_ais(os.path.join(ais_data_path, ais_day), dca_date_slice, fishing_trips)
        filename = os.path.splitext(ais_day)[0]
        ais_df.to_parquet(os.path.join(save_destination, f"{filename}.parquet"))

In [None]:
# Fishing trips joined with the MMSI lookup sheet.
fishing_trips = get_fish_trips_with_mmsi(
    fish_trip_data_path="processed/fishing_trips_temp.csv",
    mmsi_data_path="data/MMSI_rc_20211027_.xlsx",
)

In [None]:
# DCA records joined with the MMSI lookup sheet.
dca_slice = get_dca_with_mmsi(
    dca_data_path="processed/dca/combined.csv",
    mmsi_data_path="data/MMSI_rc_20211027_.xlsx",
)

In [None]:
# Annotate the 2016 AIS files and write the results as parquet.
mark_ais_fishing(
    ais_data_path="data/ais_data/AIS_data_2016/",
    dca_date_slice=dca_slice,
    fishing_trips=fishing_trips,
    save_destination="ais_processed/ais2016",
)

In [None]:
# Spot-check one processed day: rows flagged as fishing that also carry a positive trip id.
# NOTE(review): this reads from processed/ais/AIS_data_2016/, whereas the
# mark_ais_fishing call in this notebook writes to ais_processed/ais2016 —
# confirm which location is intended.
test_frame = pd.read_parquet("processed/ais/AIS_data_2016/ais_20161116.parquet")

# where() blanks out the non-matching rows; dropna() then removes every row
# that still contains a missing value in any column.
f_slice = test_frame.where(
    test_frame["fishing"].eq(True) & test_frame["trip_id"].gt(0)
).dropna()
f_slice
# To look at individual rows, iterate over f_slice.iterrows() and print
# row["date_time_utc"] and row["duration"].