In [None]:
import pandas as pd
import numpy as np

# Load the 2018 ERS catch-message (DCA) export. The file is semicolon-separated
# and writes numbers with a decimal comma.
dca_path = "data/elektronisk-rapportering-ers-2018-fangstmelding-dca.csv"
dca_data = pd.read_csv(dca_path, sep=";", decimal=",")

In [None]:
# Key columns: message id plus start and stop time.
id_columns = ["Melding ID", "Starttidspunkt", "Stopptidspunkt"]

# Columns carried through the rest of the notebook; everything else is discarded.
keep_columns = [
    # message, timing and vessel call sign
    "Melding ID",
    "Meldingstidspunkt",
    "Starttidspunkt",
    "Stopptidspunkt",
    "Radiokallesignal (ERS)",
    "Varighet",
    # start/stop position and depth, tow distance
    "Startposisjon bredde",
    "Startposisjon lengde",
    "Havdybde start",
    "Stopposisjon bredde",
    "Stopposisjon lengde",
    "Havdybde stopp",
    "Trekkavstand",
    # gear, species and weight
    "Redskap FAO (kode)",
    "Hovedart FAO",
    "Art FAO",
    "Rundvekt",
    # vessel dimensions
    "Bruttotonnasje 1969",
    "Bruttotonnasje annen",
    "Bredde",
    "Fartøylengde",
    # area codes
    "Hovedområde start (kode)",
    "Hovedområde stopp (kode)",
]

reduced_data = dca_data.loc[:, keep_columns]

In [None]:
# Keep only bottom-trawl rows (FAO gear code "OTB") that carry species information.
# Boolean indexing removes the other rows outright. `DataFrame.where` would instead
# blank them to all-NaN rows, which upcasts integer columns to float and only works
# because the following dropna happens to discard the blanked rows.
is_bottom_trawl = reduced_data["Redskap FAO (kode)"] == "OTB"
reduced_data = reduced_data.loc[is_bottom_trawl].dropna(subset=["Art FAO"])

In [None]:
# Show the filtered frame as the cell output for a visual check.
reduced_data

In [None]:
# Total round weight ("Rundvekt") for every (message id, start time, stop time)
# combination. The result is a Series indexed by those three keys.
haul_keys = ["Melding ID", "Starttidspunkt", "Stopptidspunkt"]
rows_by_haul = reduced_data.groupby(haul_keys)
catch_sums = rows_by_haul["Rundvekt"].sum()
catch_sums

In [None]:
# Count rows that repeat an earlier (message id, start, stop, species) combination.
duplicate_key = ["Melding ID", "Starttidspunkt", "Stopptidspunkt", "Art FAO"]
is_repeat = reduced_data.duplicated(subset=duplicate_key)
is_repeat.sum()

In [None]:
# Spread round weight into one column per top species (14 of them), plus an "ANDRE"
# column holding the weight of every other species caught under the same keys.
top_species = ['Torsk', 'Sei', 'Hyse', 'Uer (vanlig)', 'Dypvannsreke', 'Lange', 'Snabeluer', 'Blåkveite', 'Flekksteinbit', 'Lysing', 'Gråsteinbit', 'Breiflabb', 'Kveite', 'Lyr']
haul_keys = ["Melding ID", "Starttidspunkt", "Stopptidspunkt"]

# From here on `reduced_data` itself only holds top-species rows. `catch_sums` was
# computed before this filter, so its totals still include the other species.
reduced_data = reduced_data.loc[reduced_data["Art FAO"].isin(top_species)]

# DataFrame.pivot raises ValueError when the same (keys, species) pair occurs more
# than once, so the duplicate count checked earlier must be 0 for this to run.
reduced_data_pivot = reduced_data.pivot(index=haul_keys, columns="Art FAO", values="Rundvekt").reset_index()

# `catch_sums` is a Series indexed by the same three keys; the merge matches the
# pivot's key columns against those index levels and adds the total as "Rundvekt".
reduced_data_weight = reduced_data_pivot.merge(catch_sums, on=haul_keys)

# reindex() yields every top-species column even when a species never occurs in the
# data (that used to raise KeyError); a species not caught counts as 0.
species_weights = reduced_data_weight.reindex(columns=top_species).fillna(0)

# Vectorized replacement for the former row-wise apply: total minus the top species.
reduced_data_weight["ANDRE"] = reduced_data_weight["Rundvekt"] - species_weights.sum(axis=1)
reduced_data_weight[top_species] = species_weights
reduced_data_weight.head()

In [None]:
# Drop the per-species columns, then remove rows that became exact duplicates.
species_columns = ["Art FAO", "Rundvekt"]
rows_without_species = reduced_data.drop(columns=species_columns)
reduced_data = rows_without_species.drop_duplicates()

In [None]:
# Attach the per-species weight columns to the remaining rows, then collapse the
# two gross-tonnage columns into a single "Bruttotonnasje" column.
haul_keys = ["Melding ID", "Starttidspunkt", "Stopptidspunkt"]
tonnage_columns = ["Bruttotonnasje 1969", "Bruttotonnasje annen"]

merged = reduced_data.merge(reduced_data_weight, on=haul_keys)

# A missing tonnage counts as 0, so the result is the sum of whatever is present.
# Summing column-wise replaces the former row-by-row apply, which was slow and did
# not produce a usable column for an empty frame.
gross_tonnage = merged[tonnage_columns].fillna(0).sum(axis=1)

# Build a new frame instead of mutating in place; the combined column goes last.
complete_data = merged.drop(columns=tonnage_columns).assign(Bruttotonnasje=gross_tonnage)

In [None]:
# Order rows by report time, then by start time. Both columns may still be text at
# this point, and sorting text is lexicographic, which only matches chronological
# order for ISO-formatted strings. Sort on parsed timestamps instead; the columns
# themselves are left unchanged. Unparseable values sort last rather than raising.
# NOTE(review): parsing uses format="mixed" as elsewhere in this notebook. If the
# export writes day-first dates, dayfirst=True may be needed everywhere - confirm
# against the raw file.
complete_data = complete_data.sort_values(
    ["Meldingstidspunkt", "Starttidspunkt"],
    key=lambda col: pd.to_datetime(col, format="mixed", errors="coerce"),
    ignore_index=True,
)

# Check for time overlap

In [None]:
message_ids = complete_data["Melding ID"].unique()
call_signs = complete_data["Radiokallesignal (ERS)"].unique()
print(call_signs)
complete_data["Starttidspunkt"] = pd.to_datetime(complete_data["Starttidspunkt"], format="mixed")
complete_data["Stopptidspunkt"] = pd.to_datetime(complete_data["Stopptidspunkt"], format="mixed")


def _drop_overlapping_messages(messages):
    """Return `messages` without rows that start inside the previously kept row.

    Rows are visited in their existing order. A row is dropped when its start time
    satisfies ``kept_start <= start < kept_stop`` for the most recent row that was
    kept; otherwise it is kept and becomes the new reference. The message id plays
    no part in the comparison. Comparisons involving NaT are False, so rows with a
    missing start or stop time are never dropped here.

    The previous loop advanced past the row following every dropped row without
    comparing it, so a run of several rows overlapping the same kept row lost only
    every other one. Every row is now compared against the last kept row.
    """
    starts = messages["Starttidspunkt"].tolist()
    stops = messages["Stopptidspunkt"].tolist()
    keep = np.ones(len(messages), dtype=bool)
    last_kept = None
    for pos, start in enumerate(starts):
        if last_kept is not None and starts[last_kept] <= start < stops[last_kept]:
            keep[pos] = False
        else:
            last_kept = pos
    # The mask is positional, so it is unaffected by the frame's index labels.
    return messages.loc[keep]


# Drop time overlapping messages for each vessel.
# groupby(sort=False) visits vessels in order of first appearance and hands over each
# vessel's rows directly, with their dtypes intact (no where()/dropna round trip).
all_messages = [
    _drop_overlapping_messages(messages)
    for _, messages in complete_data.groupby("Radiokallesignal (ERS)", sort=False)
]

# pd.concat refuses an empty list; fall back to an empty frame with the same columns.
complete_data_no_dupes = pd.concat(all_messages) if all_messages else complete_data.iloc[0:0]

In [None]:
# Row counts before and after removing overlapping messages.
print(f"{len(complete_data)} {len(complete_data_no_dupes)}")

In [None]:
def _count_incomplete_rows(frame):
    """Return how many rows of `frame` contain at least one missing value."""
    return frame.isna().any(axis=1).sum()


# A missing tow distance is replaced by 0 so it does not cause the row to be dropped.
complete_data_no_dupes = complete_data_no_dupes.assign(
    Trekkavstand=complete_data_no_dupes["Trekkavstand"].replace(np.nan, 0)
)

# dropna() without a subset discards every row that still has a NaN in ANY column,
# not only the area columns. The count is printed before and after the drop.
print(_count_incomplete_rows(complete_data_no_dupes))
complete_data_no_dupes = complete_data_no_dupes.dropna()
print(_count_incomplete_rows(complete_data_no_dupes))

In [None]:
# complete_data_no_dupes.to_csv("final.csv", index=False)

# Formatting

In [None]:
# Order by start time and parse the report timestamp into a datetime column.
# The frame is the last expression so the cell renders it.
df = (
    complete_data_no_dupes
    .sort_values("Starttidspunkt")
    .assign(
        Meldingstidspunkt=lambda frame: pd.to_datetime(frame["Meldingstidspunkt"], format="mixed")
    )
)
df

# Combining data

In [1]:
from main import process_ers_data
import pandas as pd
import os

In [2]:
# Read the selected yearly DCA exports from data/ and collect the raw frames.
# Same CSV dialect as before: semicolon-separated, decimal comma.
file_list = [
    "elektronisk-rapportering-ers-2018-fangstmelding-dca.csv",
    "elektronisk-rapportering-ers-2019-fangstmelding-dca.csv",
]

process_after = []
for dca_file in file_list:
    # Guard kept so the list can be swapped for a directory listing.
    if not dca_file.endswith(".csv"):
        continue
    df = pd.read_csv(os.path.join("data", dca_file), sep=";", decimal=",")
    process_after.append(df)

  df = pd.read_csv(os.path.join("data", dca_file), sep=";", decimal=",")
  df = pd.read_csv(os.path.join("data", dca_file), sep=";", decimal=",")


In [3]:
# Stack the yearly exports. ignore_index gives every row a unique label; without it
# each file's 0..n-1 labels repeat, which makes label-based drops ambiguous.
pr_aft = pd.concat(process_after, ignore_index=True)

# sort_values returns a new frame, so the result must be assigned - the bare call
# used before had no effect. "Starttidspunkt" is still text straight from read_csv,
# so sort on parsed timestamps; unparseable values sort last rather than raising.
# NOTE(review): if the export writes day-first dates, dayfirst=True may be needed.
pr_aft = pr_aft.sort_values(
    "Starttidspunkt",
    key=lambda col: pd.to_datetime(col, format="mixed", errors="coerce"),
    ignore_index=True,
)
pr_aft = process_ers_data(pr_aft)

In [4]:
# Split the processed data into calendar months by report time and write one CSV
# per month to processed/<year>_<MonthName>.csv.
# To split by start time instead, use key="Starttidspunkt" in the Grouper.
group_by_month = pr_aft.groupby(pd.Grouper(key="Meldingstidspunkt", freq="ME"))

month_map = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December",
}

# to_csv does not create missing directories.
os.makedirs("processed", exist_ok=True)

df_by_month = []
for _, month_df in group_by_month:
    # A time-based Grouper also yields the months that have no rows; they have no
    # first timestamp to name the file after, so skip them.
    if month_df.empty:
        continue
    month_df.index = pd.DatetimeIndex(month_df["Meldingstidspunkt"])
    df_by_month.append(month_df)

    first_stamp = month_df.index[0]
    # The index is written too, so the CSV has a leading "Meldingstidspunkt" index
    # column in addition to the regular column of that name (unchanged behaviour).
    month_df.to_csv(f"processed/{first_stamp.year}_{month_map[first_stamp.month]}.csv")