In [None]:
from zipfile import ZipFile
import os
import pandas as pd
import numpy as np
# Show up to 100 columns and 100 rows when a DataFrame is rendered.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

In [None]:
def read_and_combine(data_folder):
    """Read every ``.csv`` file in ``data_folder`` and stack them into one frame.

    Files are parsed with ``;`` as the field separator. They are read in
    sorted file-name order so that the row order of the result is the same
    on every run and every platform (``os.listdir`` gives no ordering
    guarantee).

    Parameters
    ----------
    data_folder : str
        Directory that is scanned (non-recursively) for ``.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files. Each file keeps its own row
        index, so index labels may repeat across files.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` file.
    """
    csv_names = sorted(name for name in os.listdir(data_folder) if name.endswith(".csv"))
    if not csv_names:
        # pd.concat([]) would also raise ValueError, but without saying which folder was empty.
        raise ValueError(f"No .csv files found in {data_folder!r}")

    dframes = [
        pd.read_csv(os.path.join(data_folder, name), sep=";", low_memory=False)
        for name in csv_names
    ]
    return pd.concat(dframes)

In [None]:
def prepare_data(df, time_column: str):
    """Select the columns used for trip building, de-duplicate, and parse the time column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain the six columns listed in ``columns`` below.
    time_column : str
        Name of the timestamp column to keep and parse
        (the notebook passes ``"Avgangstidspunkt"`` or ``"Ankomsttidspunkt"``).

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the selected columns, duplicate rows removed,
        and ``time_column`` converted with ``pd.to_datetime`` (day-first,
        per-element format inference). ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    columns = ["Melding ID", "Radiokallesignal (ERS)", time_column, "Havn (kode)", "Kvantum type (kode)", "Rundvekt"]
    selected = df[columns].drop_duplicates()
    parsed_time = pd.to_datetime(selected[time_column], dayfirst=True, format="mixed")
    # Build the result with assign() rather than writing into the filtered slice;
    # item assignment on it can raise SettingWithCopyWarning on pandas < 3.
    return selected.assign(**{time_column: parsed_time})

In [None]:
# Root folder holding the "dep" and "por" CSV sub-folders.
# Point this at "data" instead of "data_test" to run on the full data set.
DATA_ROOT = "data_test"

# Load both CSV folders; each one is parsed on its own timestamp column.
dep_data = prepare_data(read_and_combine(os.path.join(DATA_ROOT, "dep")), "Avgangstidspunkt")
por_data = prepare_data(read_and_combine(os.path.join(DATA_ROOT, "por")), "Ankomsttidspunkt")

In [None]:
def prepare_dataframe_for_fishing_trips(df_dep, df_por, vessel_id):
    """Build the per-message departure and arrival tables for one vessel.

    Parameters
    ----------
    df_dep, df_por : pandas.DataFrame
        Departure and arrival frames as produced by ``prepare_data``.
    vessel_id
        Value of ``"Radiokallesignal (ERS)"`` that selects the vessel.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * Departures: one row per ``"Melding ID"`` with ``"Rundvekt"`` summed
          and the first value of every other column, sorted by
          ``"Avgangstidspunkt"``.
        * Arrivals: ``"Rundvekt"`` summed per message and quantity type, spread
          into one column per ``"Kvantum type (kode)"`` value, joined with the
          message's call sign, arrival time and port, de-duplicated and sorted
          by ``"Ankomsttidspunkt"``.
    """
    callsign = "Radiokallesignal (ERS)"
    message_id = "Melding ID"
    quantity_type = "Kvantum type (kode)"

    # ---- Departures: collapse to one row per message.
    vessel_dep = df_dep[df_dep[callsign] == vessel_id]
    dep_spec = {
        message_id: "first",
        callsign: "first",
        "Avgangstidspunkt": "first",
        quantity_type: "first",
        "Havn (kode)": "first",
        "Rundvekt": "sum",
    }
    departures = (
        vessel_dep.groupby(message_id, as_index=False)
        .agg(dep_spec)
        .sort_values("Avgangstidspunkt")
    )

    # ---- Arrivals: total weight per (message, quantity type), then one column per type.
    vessel_por = df_por[df_por[callsign] == vessel_id]
    por_spec = {
        message_id: "first",
        callsign: "first",
        "Ankomsttidspunkt": "first",
        quantity_type: "first",
        "Rundvekt": "sum",
    }
    weight_per_type = vessel_por.groupby([message_id, quantity_type], as_index=False).agg(por_spec)
    weight_wide = weight_per_type.pivot(index=message_id, columns=quantity_type, values="Rundvekt")

    # Re-attach the message-level attributes. The raw frame has several rows per
    # message, so the join multiplies rows; drop_duplicates collapses them again.
    message_info = vessel_por[[message_id, callsign, "Ankomsttidspunkt", "Havn (kode)"]].set_index(message_id)
    arrivals = (
        weight_wide.join(message_info, on=message_id)
        .drop_duplicates()
        .sort_values("Ankomsttidspunkt")
    )

    return departures, arrivals
    

In [None]:
# Keep this
def prepare_timestamps(df_dep, df_por):
    """Merge one vessel's departures and arrivals into a single time-ordered event table.

    Parameters
    ----------
    df_dep : pandas.DataFrame
        Departure rows; must contain ``"Avgangstidspunkt"``.
    df_por : pandas.DataFrame
        Arrival rows; must contain ``"Ankomsttidspunkt"``.

    Returns
    -------
    pandas.DataFrame
        All rows of both inputs with two extra columns:

        * ``"Type"``: ``"DEP"`` when ``"Avgangstidspunkt"`` is present,
          otherwise ``"POR"``.
        * ``"Timestamp"``: the departure time for ``"DEP"`` rows and the
          arrival time for ``"POR"`` rows.

        Rows are sorted by ``"Timestamp"`` and re-indexed ``0..n-1``. The
        inputs are not modified.
    """
    time_stamps = pd.concat([df_dep, df_por])

    # A row that has no departure time came from the arrival frame.
    # np.where works positionally, which matters because the concatenated
    # index may hold repeated labels.
    is_arrival = time_stamps["Avgangstidspunkt"].isna()
    time_stamps["Type"] = np.where(is_arrival, "POR", "DEP")
    time_stamps["Timestamp"] = np.where(
        is_arrival, time_stamps["Ankomsttidspunkt"], time_stamps["Avgangstidspunkt"]
    )

    # Stable sort: events with identical timestamps keep their concat order
    # (departure rows before arrival rows) instead of an unspecified one.
    return time_stamps.sort_values("Timestamp", kind="stable").reset_index(drop=True)

In [None]:
def create_single_trip(start, end):
    """Combine a departure row and an arrival row into one trip record.

    Parameters
    ----------
    start : pandas.Series
        Event row that opens the trip. Its ``"Havn (kode)"`` entry is kept
        under the label ``"Havn_start (kode)"``.
    end : pandas.Series
        Event row that closes the trip. Its ``"Havn (kode)"`` entry is kept
        under the label ``"Havn_slutt (kode)"``.

    Returns
    -------
    pandas.Series
        The remaining entries of ``start`` followed by those of ``end``.

    Raises
    ------
    KeyError
        If a label that is to be dropped is missing from ``start`` or ``end``.
    """
    # Per-event bookkeeping labels that are removed from both rows.
    shared = ["Melding ID", "Timestamp", "Type", "Rundvekt", "KG", "OB", "Kvantum type (kode)"]

    # The start row also loses the arrival time and the call sign
    # (the call sign is taken from the end row instead).
    departure_part = start.drop(labels=shared + ["Ankomsttidspunkt", "Radiokallesignal (ERS)"])
    departure_part = departure_part.rename({"Havn (kode)": "Havn_start (kode)"})

    arrival_part = end.drop(labels=shared + ["Avgangstidspunkt"])
    arrival_part = arrival_part.rename({"Havn (kode)": "Havn_slutt (kode)"})

    return pd.concat([departure_part, arrival_part])

In [None]:
# Algorithm to define fishing trips for a single vessel
def define_fishing_trips(time_stamps):
    """Pair departure and arrival events of one vessel into trips.

    The events are scanned in order while tracking a candidate ``start`` and
    ``end`` row:

    * the first ``"DEP"`` row seen while no trip is open becomes ``start``;
    * while a trip is open, every ``"POR"`` row whose ``"KG"`` value equals its
      ``"OB"`` value becomes ``end`` (a later one replaces an earlier one);
    * the next ``"DEP"`` row after an ``end`` was found closes the trip and
      opens a new one starting at that row.

    A trip still open at the end of the table is emitted only if it already
    has an ``end``.

    Parameters
    ----------
    time_stamps : pandas.DataFrame
        Event table with at least the columns ``"Type"``, ``"KG"`` and
        ``"OB"``, plus whatever ``create_single_trip`` needs.

    Returns
    -------
    pandas.DataFrame
        One row per trip, as built by ``create_single_trip``. An empty
        DataFrame is returned when no complete trip is found.
    """
    trips = []
    start, end = None, None
    for _, data in time_stamps.iterrows():
        if start is None and data["Type"] == "DEP":
            start = data

        if start is not None and data["Type"] == "POR" and data["KG"] == data["OB"]:
            end = data

        if end is not None and data["Type"] == "DEP":
            trips.append(create_single_trip(start, end))
            start = data
            end = None

    # Add the remaining trip
    if start is not None and end is not None:
        trips.append(create_single_trip(start, end))

    if not trips:
        # pd.concat([]) raises ValueError; a vessel without any complete trip
        # is a normal outcome, not an error.
        return pd.DataFrame()

    return pd.concat(trips, axis=1).T

# define_fishing_trips(time_stamps)

In [None]:
def define_fishing_trips_all_vessels(dep_data, por_data):
    """Build the trip table for every vessel that appears in ``dep_data``.

    For each distinct ``"Radiokallesignal (ERS)"`` value the per-vessel
    pipeline is run (``prepare_dataframe_for_fishing_trips`` ->
    ``prepare_timestamps`` -> ``define_fishing_trips``). Vessels whose arrival
    table lacks a ``"KG"`` or an ``"OB"`` column are reported and skipped.

    Parameters
    ----------
    dep_data, por_data : pandas.DataFrame
        Departure and arrival frames as produced by ``prepare_data``.

    Returns
    -------
    pandas.DataFrame
        All trips of all vessels with a fresh ``0..n-1`` index and a
        ``"trip_id"`` column: the call sign followed by the sum of the
        departure and arrival POSIX timestamps as a string. If no vessel
        yields a trip, an empty frame is returned that contains only the
        call sign, departure time, arrival time and ``"trip_id"`` columns.
    """
    callsign_col = "Radiokallesignal (ERS)"
    vessels = dep_data[callsign_col].unique()
    print("Total unique vessels: ", len(vessels))

    trips_vessel = []
    for vessel in vessels:
        df_dep, df_por = prepare_dataframe_for_fishing_trips(dep_data, por_data, vessel)

        # Skip vessels that do not contain KG or OB in POR data
        if "KG" not in df_por.columns:
            print("KG not in vessel: ", vessel)
            continue
        elif "OB" not in df_por.columns:
            print("OB not in vessel: ", vessel)
            continue

        time_stamps = prepare_timestamps(df_dep, df_por)
        trips = define_fishing_trips(time_stamps)
        if trips.empty:
            # Nothing to contribute for this vessel.
            continue
        trips_vessel.append(trips)

    if not trips_vessel:
        # pd.concat([]) raises ValueError; return an empty result instead.
        return pd.DataFrame(columns=[callsign_col, "Avgangstidspunkt", "Ankomsttidspunkt", "trip_id"])

    all_trips = pd.concat(trips_vessel).reset_index(drop=True)
    dep_epoch = all_trips["Avgangstidspunkt"].apply(lambda x: x.timestamp())
    por_epoch = all_trips["Ankomsttidspunkt"].apply(lambda x: x.timestamp())
    all_trips["trip_id"] = all_trips[callsign_col] + (dep_epoch + por_epoch).astype(str)
    return all_trips

In [None]:
# Run the full pipeline on the frames loaded above; the result has one row per trip.
all_trips = define_fishing_trips_all_vessels(dep_data, por_data)