In [None]:
import pandas as pd
from pathlib import Path
import re
import numpy as np

In [None]:
# Country lookup (semicolon-separated); create_report uses it to translate an
# `iso31661` code into its `m49` code.
M49_COUNTRIES_URL = "https://raw.githubusercontent.com/sepal-contrib/sepal_mgci/master/component/parameter/m49_countries.csv"
iso_table = pd.read_csv(M49_COUNTRIES_URL, sep=";")

# Bioclimatic belt lookup; get_belt_desc uses it to translate a belt `code`
# into its `desc`.
belt_table = pd.read_csv("data/sources/5_belts_translation.csv")
# Columns (and their order) of the final report table: create_report returns
# `report_df[columns]`, so every name listed here must be filled in there.
columns = [
    "SeriesID",
    "SeriesDescription",
    "GeoAreaCode",
    "GeoAreaName",
    "TimePeriod",
    "Value",
    "Time_Detail",
    "Source",
    "FootNote",
    "Nature",
    "Units",
    "Reporting Type",
    "Observation Status",
    "Impact Type",
    "Bioclimatic Belt",
    "ISOalpha3",
    "Type",
    "SeriesCode",
]

# Land cover transition table: get_impact looks up `impact_code` by the
# (`from_code`, `to_code`) pair. Impact labels are stripped of surrounding
# whitespace so that equal labels compare equal.
transition = pd.read_csv("data/sources/1_Land_cover_transition_csv.csv").assign(
    impact=lambda table: table["impact"].str.strip()
)

# Distinct (impact, impact_code) pairs, sorted by both keys; get_impact_desc uses
# it to translate an impact code into its label.
# Built from the two key columns only: no other column of `transition` is
# aggregated just to be thrown away. Rows with a missing key are dropped, which
# matches the default behaviour of grouping by these two keys.
impact_table = (
    transition[["impact", "impact_code"]]
    .dropna()
    .drop_duplicates()
    .sort_values(["impact", "impact_code"])
    .reset_index(drop=True)
)

In [None]:
def get_impact(row):
    """Return the impact code of the transition described by a row.

    row: mapping-like row providing "from_class" and "to_class".

    Returns 0 when either class is falsy (for example 0); otherwise the
    `impact_code` of the first `transition` entry whose `from_code` and
    `to_code` equal the two classes. An IndexError is raised when no entry
    matches.
    """
    origin, destination = row["from_class"], row["to_class"]

    # Both the initial and the final class must be set to look up a transition
    if not (origin and destination):
        return 0

    is_match = (transition.from_code == origin) & (transition.to_code == destination)
    matching_codes = transition.loc[is_match, "impact_code"]

    return matching_codes.values[0]


def get_impact_desc(row):
    """Return the impact label for the code stored in row["impact"].

    Falls back to "All" when `impact_table` has no entry with that code.
    """
    labels = impact_table.loc[impact_table.impact_code == row["impact"], "impact"]

    if len(labels):
        return labels.values[0]
    return "All"


def get_belt_desc(row):
    """Return the bioclimatic belt description for row["belt_class"].

    Falls back to "Total" when `belt_table` has no entry with that code.
    """
    descriptions = belt_table.loc[belt_table.code == row["belt_class"], "desc"]

    if len(descriptions):
        return descriptions.values[0]
    return "Total"

In [None]:
def get_pdma(df, iso_code):
    """
    Get the rows of one country together with its per-belt and per-impact subtotals.

    The returned frame stacks, in this order:
      1. the rows of `df` whose `iso_code` equals the given one (unchanged),
      2. one subtotal row per `belt_class`, with `impact` set to "All",
      3. one subtotal row per `impact`, with `belt_class` set to "Total".

    df (DataFrame): grouped dataframe from raw data; it must provide the
        columns `iso_code`, `belt_class`, `impact` and `sum`.
    iso_code (str): country iso code

    Returns a new DataFrame; `df` is not modified. Index labels of the three
    parts are kept as they are, so labels can repeat in the result.
    """

    # Work on a copy of the country rows so the caller's frame is untouched
    country_rows = df[df.iso_code == iso_code].copy(deep=True)

    # Subtotal per belt. Only `sum` is aggregated: the other columns are not
    # part of the subtotal and do not need to be summable.
    by_belt = country_rows.groupby(["iso_code", "belt_class"], as_index=False)[
        "sum"
    ].sum()
    by_belt["impact"] = "All"

    # Subtotal per impact
    by_impact = country_rows.groupby(["iso_code", "impact"], as_index=False)[
        "sum"
    ].sum()
    by_impact["belt_class"] = "Total"

    return pd.concat([country_rows, by_belt, by_impact])

In [None]:
def create_report(df, iso_code, years, i):
    """Build the report table of one country.

    df (DataFrame): grouped dataframe, forwarded to get_pdma
    iso_code (str): country iso code; also written to `GeoAreaName`
    years (sequence): start and end year; the end year becomes `TimePeriod`
        and "<start>-<end>" becomes `Time_Detail`
    i (int): counter; `i + 1` is written to `SeriesID`. The increment is local
        to this call.

    Returns the report restricted to (and ordered by) the module-level
    `columns` list. If anything fails, the iso_code is printed and the
    exception is raised again.
    """

    i += 1
    try:
        report_df = get_pdma(df, iso_code)

        report_df["SeriesID"] = i
        report_df["Value"] = report_df["sum"]
        report_df["GeoAreaName"] = report_df["iso_code"]

        # Translate the country code into its m49 code
        area_name = report_df["GeoAreaName"].unique()[0]
        m49_codes = iso_table.loc[iso_table["iso31661"] == area_name, "m49"]
        report_df["GeoAreaCode"] = m49_codes.iloc[0]

        report_df["TimePeriod"] = years[1]
        report_df["Time_Detail"] = f"{years[0]}-{years[1]}"

        # Fields holding the same value on every row
        constant_fields = {
            "SeriesDescription": "Proportion of degraded mountain land",
            "Source": "Food and Agriculture Organisation of United Nations (FAO)",
            "FootNote": "FAO estimate",
            "Nature": "G",
            "Units": "KM2_PA",
            "Reporting Type": "G",
            "Observation Status": "A",
            "ISOalpha3": np.nan,
            "SeriesCode": "XXXX",
            "Type": "Region",
        }
        for field_name, field_value in constant_fields.items():
            report_df[field_name] = field_value

        # Human readable labels; subtotal rows fall back to "Total" / "All"
        report_df["Bioclimatic Belt"] = report_df.apply(get_belt_desc, axis=1)
        report_df["Impact Type"] = report_df.apply(get_impact_desc, axis=1)

        return report_df[columns]

    except Exception:
        # Show which country failed before letting the error propagate
        print(iso_code)
        raise

In [None]:
def report_by_dataset(csv_path):
    """Create the subindicator report for every country found in a grouped csv.

    csv_path: Has to be created using the reduce_results notebook. It has the
        formatted data. The path must contain the start and end year as
        "<4 digits>_<4 digits>" (for example "..._2000_2015.csv").

    Returns a single DataFrame with the reports of all countries concatenated.

    Raises ValueError when the path does not contain a year pair.
    """

    csv_path = Path(csv_path)

    # Extract years from the CSV path; fail with a clear message instead of an
    # attribute error on None when the pattern is missing
    year_match = re.search(r"\d{4}_\d{4}", str(csv_path))
    if year_match is None:
        raise ValueError(
            f"Cannot find a '<start>_<end>' year pair in the path: {csv_path}"
        )
    years = [int(year) for year in year_match.group().split("_")]

    raw_df = pd.read_csv(csv_path).fillna(0)
    raw_df["impact"] = raw_df.apply(get_impact, axis=1)

    # Area per (country, belt, impact); only `sum` is aggregated and only
    # strictly positive totals are reported
    grouped_df = raw_df.groupby(
        ["iso_code", "belt_class", "impact"], as_index=False
    )["sum"].sum()
    grouped_df = grouped_df[grouped_df["sum"] > 0]

    # Kept local: the counter is handed to create_report by value, so there is
    # no need to rebind a notebook-level variable from inside this function.
    # NOTE(review): every country receives the same start value, therefore
    # SeriesID is 1 on all rows - confirm this is the intended identifier.
    series_counter = 0

    report_by_country = [
        create_report(grouped_df, iso_code, years, series_counter)
        for iso_code in grouped_df.iso_code.unique()
    ]

    return pd.concat(report_by_country)

In [None]:
# One report per grouped dataset; files whose name contains "2005" are skipped
dataset_paths = Path("data/global_group_1_2_3").glob("global*.csv")
reports_by_year = [
    report_by_dataset(dataset_path)
    for dataset_path in dataset_paths
    if "2005" not in dataset_path.name
]

In [None]:
# Stack the per-dataset reports and order them by country, period and belt
reports_by_year_df = pd.concat(reports_by_year).sort_values(
    by=["GeoAreaName", "TimePeriod", "Bioclimatic Belt"]
)

In [None]:
# Write the combined report, creating the output folder when it is missing
out_path = Path("data/reporting/subB_pdma_TransitionType/")
out_path.mkdir(parents=True, exist_ok=True)
reports_by_year_df.to_csv(out_path / "SubIndB_pdma_TransitionType.csv")

## Test with a single country

In [None]:
csv_path = Path("data/global_group_1_2_3/global_grouped_data_2000_2015.csv")

# Start and end year, taken from the "<year>_<year>" part of the path
years = [
    int(year) for year in re.search(r"\d{4}_\d{4}", str(csv_path)).group().split("_")
]

# Same preparation steps as report_by_dataset: fill gaps with 0, classify the
# impact of each row, total per (country, belt, impact), keep positive totals
raw_df = pd.read_csv(csv_path).fillna(0)
raw_df["impact"] = raw_df.apply(get_impact, axis=1)
raw_df = raw_df.groupby(["iso_code", "belt_class", "impact"], as_index=False).sum()
raw_df = raw_df.loc[raw_df["sum"] > 0, ["iso_code", "belt_class", "impact", "sum"]]
i = 0

In [None]:
# Build and display the report for the "COL" rows only, using the frame, years
# and counter prepared in the previous cell.
create_report(raw_df, "COL", years, i)

In [None]:
test_data = [
    ["COL", 1.0, -1.0, 0.377574],
    ["COL", 1.0, 0.0, 251.801384],
    ["COL", 2.0, -1.0, 15.024332],
    ["COL", 2.0, 0.0, 5565.770271],
    ["COL", 2.0, 1.0, 4.730281],
    ["COL", 3.0, -1.0, 603.914079],
    ["COL", 3.0, 0.0, 65314.962632],
    ["COL", 3.0, 1.0, 913.755952],
    ["COL", 4.0, -1.0, 3475.691478],
    ["COL", 4.0, 0.0, 223495.762720],
    ["COL", 4.0, 1.0, 4316.702990],
]
columns = ["iso_code", "belt_class", "impact", "sum"]

In [None]:
test_df = pd.DataFrame(test_data, columns=columns)
test_df