In [None]:
from pathlib import Path
import pandas as pd
import re
from numpy import nan

In [None]:
# Country lookup table; the `iso31661` and `m49` columns are read further down
# to translate a country iso code into its M49 area code.
iso_table = pd.read_csv(
    "https://raw.githubusercontent.com/sepal-contrib/sepal_mgci/master/component/parameter/m49_countries.csv",
    sep=";",
)

# Bioclimatic belt lookup; the `code` and `desc` columns are used downstream.
belt_table = pd.read_csv("data/sources/5_belts_translation.csv")

# Columns of the final report, in output order (used to select and order the
# columns of the concatenated per-country frames).
mgci_columns = [
    "SeriesID",
    "SeriesDescription",
    "GeoAreaCode",
    "GeoAreaName",
    "TimePeriod",
    "Value",
    "Time_Detail",
    "Source",
    "FootNote",
    "Nature",
    "Units",
    "Reporting Type",
    "Observation Status",
    "Bioclimatic Belt",
    "ISOalpha3",
    "Type",
    "SeriesCode",
]

# Land-cover code -> green flag lookup: keep the two relevant columns, drop
# repeated pairs (first occurrence wins) and renumber the rows.
lcluc_matrix = (
    pd.read_csv("data/sources/cci_map_matrix.csv")[["target_code", "green"]]
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)

In [None]:
def return_green(row):
    """Look up the green flag of a row's land-cover class.

    row: mapping-like object exposing an ``lc_class`` entry (e.g. a DataFrame
        row). Its value is matched against ``lcluc_matrix.target_code``.

    Returns the ``green`` value of the first matching entry in the
    module-level ``lcluc_matrix``; an ``IndexError`` is raised when the class
    has no entry there.
    """
    matching_flags = lcluc_matrix.loc[
        lcluc_matrix.target_code == row["lc_class"], "green"
    ]
    return matching_flags.values[0]

In [None]:
def get_mgci_report(df, iso_code):
    """
    Get the MGCI report table for the given iso_code.

    For every belt of the country the index is computed as
    ``green / (green + non_green) * 100`` from the summed ``sum`` column, and
    an extra row labelled "Total" aggregates all the belts of the country.

    df (DataFrame): grouped dataframe from raw data. The ``iso_code``,
        ``belt_class``, ``lc_class`` (through ``return_green``) and ``sum``
        columns are read.
    iso_code (str): country iso code

    Returns a DataFrame with the columns ``iso_code``, ``belt_class``,
    ``sumgreen``, ``sumnon_green`` and ``mgci``: one row per belt followed by
    the "Total" row, whose two area columns are left as NaN. The original row
    labels are kept, so the index is not unique.

    Raises ValueError when ``df`` has no rows for ``iso_code``.
    """

    # subset dataframe to the one we are working on
    df_ = df[df.iso_code == iso_code].copy(deep=True)
    if df_.empty:
        # Fail with an explicit message instead of an opaque pandas error
        # raised further down by the row-wise apply/assignment.
        raise ValueError(f"No rows found for iso_code {iso_code!r}")
    df_["is_green"] = df_.apply(return_green, axis=1)

    # Get the green and non green area for each belt. Only the area column is
    # aggregated: summing the land-cover codes has no meaning and the result
    # was discarded anyway.
    tmp_df = (
        df_.groupby(["iso_code", "belt_class", "is_green"])[["sum"]]
        .sum()
        .reset_index()
    )

    # Split by green and non-green
    green_df = tmp_df[tmp_df.is_green == 1]
    non_green_df = tmp_df[tmp_df.is_green == 0]

    # Merge and calculate mgci. The outer join keeps belts that only have
    # green or only non-green area; the missing side becomes 0.
    green_non_green = pd.merge(
        green_df,
        non_green_df,
        on=["belt_class", "iso_code"],
        suffixes=("green", "non_green"),
        how="outer",
    ).fillna(0)

    # A belt whose two areas are both 0 yields NaN here (0 / 0).
    green_non_green["mgci"] = (
        green_non_green["sumgreen"]
        / (green_non_green["sumgreen"] + green_non_green["sumnon_green"])
        * 100
    )
    green_non_green = green_non_green[
        ["iso_code", "belt_class", "sumgreen", "sumnon_green", "mgci"]
    ]

    # Country totals, computed once and only on the two area columns.
    total_green = green_non_green["sumgreen"].sum()
    total_non_green = green_non_green["sumnon_green"].sum()

    # Unlike the per-belt values, an undefined total (0 / 0) is reported as 0.
    total_mgci = pd.DataFrame(
        [
            [
                iso_code,
                "Total",
                total_green / (total_green + total_non_green) * 100,
            ]
        ],
        columns=["iso_code", "belt_class", "mgci"],
    ).fillna(0)

    # Return a dataframe with all the columns
    result = pd.concat([green_non_green, total_mgci])
    return result

In [None]:
def get_belt_desc(row):
    """Translate a row's ``belt_class`` into its bioclimatic belt description.

    The description is the ``desc`` value of the first ``belt_table`` entry
    whose ``code`` equals ``row["belt_class"]``. When no entry matches (as for
    the aggregated row) the literal "Total" is returned.
    """
    matches = belt_table.loc[belt_table.code == row["belt_class"], "desc"]

    if len(matches) == 0:
        return "Total"
    return matches.values[0]

In [None]:
# Running counter shared by every create_report call; its value is written to
# the SeriesID column. Re-run this cell to restart the numbering.
i = 0


def create_report(df, iso_code, target_year):
    """Build the reporting table of one country for one year.

    The MGCI table returned by ``get_mgci_report`` is extended in place with
    the reporting columns: the computed index goes to ``Value``, the M49 code
    of the country (looked up in ``iso_table``) to ``GeoAreaCode``, the belt
    description to ``Bioclimatic Belt`` and the remaining columns are filled
    with constant metadata.

    df (DataFrame): grouped dataframe passed through to ``get_mgci_report``
    iso_code (str): country iso code
    target_year (int): year written to ``TimePeriod`` and ``Time_Detail``

    Returns the extended DataFrame (the columns produced by
    ``get_mgci_report`` are still present).

    Side effects: the module-level counter ``i`` is incremented on every call,
    including calls that fail. On failure the iso code is printed and the
    original exception is re-raised; a country missing from ``iso_table``
    raises IndexError.
    """

    global i
    i += 1
    try:
        report_df = get_mgci_report(df, iso_code)
        report_df["SeriesID"] = i
        report_df["Value"] = report_df.mgci
        report_df["SeriesDescription"] = "Mountain Green Cover Index"
        report_df["GeoAreaName"] = report_df.iso_code

        # Every row belongs to `iso_code`, so a single lookup is enough.
        m49_codes = iso_table.loc[iso_table.iso31661 == iso_code, "m49"]
        if m49_codes.empty:
            # Same exception type as the bare positional lookup would raise,
            # but with a message that names the country.
            raise IndexError(f"No M49 code found for iso_code {iso_code!r}")
        report_df["GeoAreaCode"] = m49_codes.iloc[0]

        report_df["TimePeriod"] = target_year
        report_df["Time_Detail"] = target_year
        report_df[
            "Source"
        ] = "Food and Agriculture Organisation of United Nations (FAO)"
        report_df["FootNote"] = "FAO estimate"
        report_df["Nature"] = "G"
        report_df["Units"] = "PERCENT"
        report_df["Reporting Type"] = "G"
        report_df["Observation Status"] = "A"
        report_df["Bioclimatic Belt"] = report_df.apply(get_belt_desc, axis=1)
        report_df["ISOalpha3"] = nan
        report_df["Type"] = "Region"
        report_df["SeriesCode"] = "ER_MTN_GRNCVI"
        return report_df

    except Exception:
        # Tell which country failed, then propagate the original error.
        print(iso_code)
        raise

## Loop over years

In [None]:
class Progress:
    """Keeps track of the years that have already been reported."""

    def __init__(self):
        # Years handled so far, in the order they were processed.
        self.processed_years = list()

In [None]:
def report_by_dataset(csv_path):
    """Create the subindicator report for each of the dates in the given csv path

    csv_path: Has to be created using the reduce_results notebook. It has the
        formatted data. Its path must contain the two years as ``YYYY_YYYY``:
        the first year is read from the ``from_class`` column, the second one
        from ``to_class``.

    Years already listed in the module-level ``progress.processed_years`` are
    skipped, and every year processed here is appended to that list, so
    ``progress`` must exist before this function is called.

    Returns a DataFrame restricted to ``mgci_columns`` with the reports of the
    years processed by this call. When every year of the file was already
    processed, an empty DataFrame with those columns is returned. Nothing is
    written to disk.

    Raises ValueError when no ``YYYY_YYYY`` pattern is found in ``csv_path``.
    """

    # Extract years from the CSV file path before reading the (possibly large)
    # file, so that a badly named file fails fast with a clear message.
    years_match = re.search(r"\d{4}_\d{4}", str(csv_path))
    if years_match is None:
        raise ValueError(
            f"Cannot find a 'YYYY_YYYY' year pair in the path: {csv_path}"
        )
    years = [int(x) for x in years_match.group().split("_")]

    raw_df = pd.read_csv(csv_path)

    def report_date(year):
        """Build the report of every country of the dataset for one year."""

        progress.processed_years.append(year)

        target = "from_class" if years.index(year) == 0 else "to_class"
        print(f"Processing {year}, {target} for {csv_path}")

        # Aggregate over the land cover class of the target year and expose it
        # under the generic name used by the report functions.
        df = (
            raw_df.groupby(by=["belt_class", target, "iso_code"])
            .sum()
            .reset_index()[["iso_code", "belt_class", target, "sum"]]
            .rename(columns={target: "lc_class"})
        )

        countries_df = [
            create_report(df, iso_code, year) for iso_code in df.iso_code.unique()
        ]

        return pd.concat(countries_df)[mgci_columns]

    # Do this for the two years. The check is done right before processing
    # each year because report_date itself updates progress.processed_years.
    reports = []
    for year in years:
        if year in progress.processed_years:
            continue
        reports.append(report_date(year))

    if not reports:
        # pd.concat raises on an empty list; return an empty report instead.
        return pd.DataFrame(columns=mgci_columns)

    return pd.concat(reports)

In [None]:
# Now let's do this for each of the datasets

In [None]:
# Path.glob yields entries in arbitrary order; sort them so the listing is
# stable from one run (and one machine) to the next.
sorted(Path("data/global_group_1_2_3").glob("global*.csv"))

In [None]:
# Fresh tracker: each year is reported once, by the first dataset containing it.
progress = Progress()

# Path.glob yields files in arbitrary order. Because the first dataset that
# contains a year is the one used for it, the files are sorted so that the
# same dataset is picked for a shared year on every run.
reports_by_year = [
    report_by_dataset(csv_path)
    for csv_path in sorted(Path("data/global_group_1_2_3").glob("global*.csv"))
]

In [None]:
# Stack the per-dataset reports and order the rows by country, year and belt.
reports_by_year_df = pd.concat(reports_by_year).sort_values(
    by=["GeoAreaName", "TimePeriod", "Bioclimatic Belt"]
)

In [None]:
# Make sure the destination folder exists, then export the consolidated report
# (the frame's row index is written as the first column).
out_path = Path("data/reporting/subA_mgci/")
out_path.mkdir(parents=True, exist_ok=True)
reports_by_year_df.to_csv(out_path / "SubIndA_MGCI.csv")

## Test with a single country

In [None]:
# Manual run of the per-year preparation for a single dataset and country.
csv_path = Path("data/global_group_1_2_3/global_grouped_data_2000_2015.csv")
raw_df = pd.read_csv(csv_path)

# The two years are encoded in the file name as "YYYY_YYYY".
years = list(map(int, re.search(r"\d{4}_\d{4}", str(csv_path)).group().split("_")))

target_year = 2000

# The first year of the pair is read from "from_class", the other from "to_class".
target = "to_class" if years.index(target_year) != 0 else "from_class"

# Aggregate by belt, land cover class and country, then expose the class
# column under the generic "lc_class" name expected by create_report.
df = (
    raw_df.groupby(by=["belt_class", target, "iso_code"])
    .sum()
    .reset_index()[["iso_code", "belt_class", target, "sum"]]
    .rename(columns={target: "lc_class"})
)

create_report(df, "AFG", target_year)