In [None]:
from pathlib import Path

import geopandas as gpd
import pandas as pd

In [None]:
# Ask for the location of the metadata CSV, then load it into a DataFrame.
meta_input = input("Enter the file path: ")
metadata = pd.read_csv(meta_input)

In [None]:
# Preview the first 40 rows of the metadata table.
metadata.head(40)

In [None]:
# Find the first row whose SHP/* code is "acoewrig" and keep that row plus
# every row after it.
shp_matches = (metadata["SHP/*"] == "acoewrig").to_numpy().nonzero()[0]
if len(shp_matches) == 0:
    raise ValueError('no metadata row has SHP/* equal to "acoewrig"')

# `row` is a positional offset, not an index label, so it can be handed to
# .iloc safely even if the frame's index is not the default 0..n-1 range.
row = int(shp_matches[0])

sliice = metadata.iloc[row:, :]

sliice

In [None]:
# Narrow metadata down to the three columns used further on: the Latin name,
# the common name, and the SHP/* code.
name_columns = ["Latin Name", "Common Name", "SHP/*"]
metadata = metadata.loc[:, name_columns]

metadata

In [None]:
# Read the "alnumari" GeoJSON layer into a GeoDataFrame and display it.
little_path = "/media/muskrat/T7 Shield/eco_data/v3/native/little/7445016/wpetry/USTreeAtlas-v1.0/wpetry-USTreeAtlas-4999258/geojson/alnumari.geojson"
little = gpd.read_file(little_path)

little

In [None]:
# Show the coordinate reference system the layer was loaded with.
little.crs

In [None]:
# Reproject the layer to EPSG:4326 and display the resulting CRS to confirm.
little = little.to_crs(crs="EPSG:4326")

little.crs

In [None]:
# Plot the layer, then flip the direction of the horizontal axis.
# NOTE(review): with longitude on the x-axis a single flip draws the layer
# mirrored east-west -- confirm that this is intended.
range_ax = little.plot()
range_ax.invert_xaxis()

In [None]:
# Absolute, machine-specific location of the eco map GeoJSON.
ecomap_loc = "/media/muskrat/T7 Shield/eco_data/ecomap_final/eco_map.geojson"

# Load the eco map layer into a GeoDataFrame.
eco_map = gpd.read_file(ecomap_loc)

In [None]:
# Draw the eco map as a white/black outline basemap, overlay the range layer
# in translucent red, and zoom to a fixed window.
base = eco_map.plot(color="white", edgecolor="black")

# Window bounds. xmin is the smaller (more negative) x value, so the limits can
# be applied directly; the previous version stored them swapped and relied on
# invert_xaxis() to undo the reversal. The rendered view is unchanged.
xmin, ymin, xmax, ymax = (-100, 25, -60, 45)

ax = little.plot(ax=base, color="red", alpha=0.4)

# set the x and y limits of the plot to the specified bounding box coordinates
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax);

In [None]:
# Spatially join the range layer (left) to the eco map (right) using sjoin's
# default settings, and display the joined rows.
intersects = gpd.sjoin(left_df=little, right_df=eco_map)

intersects

In [None]:
# Collect the distinct unique_id values found in the join result as a list.

id_values = intersects["unique_id"].unique()
unique_ids = list(id_values)
unique_ids

In [None]:
# Keep only the eco_map rows whose unique_id appears in unique_ids.

in_unique_ids = eco_map["unique_id"].isin(unique_ids)
eco_map_unique = eco_map.loc[in_unique_ids]
eco_map_unique

In [None]:
# Remove rows without a usable unique_id. Real missing values (None / NaN /
# pd.NA) never compare equal to the text "<NA>", so they are tested for
# explicitly in addition to the literal placeholder string.
has_id = eco_map_unique["unique_id"].notna() & (eco_map_unique["unique_id"] != "<NA>")

# Remove rows whose TYPE is MEOW or PPOW.
keep_type = ~eco_map_unique["TYPE"].isin(["MEOW", "PPOW"])

# .copy() yields an independent frame so that adding columns to it later does
# not raise SettingWithCopyWarning.
eco_map_unique = eco_map_unique.loc[has_id & keep_type].copy()

eco_map_unique

In [None]:
# Plot the filtered eco_map_unique rows.
eco_map_unique.plot()

In [None]:
# Compute the geometric intersection of the range layer with the eco map and
# display the resulting pieces.
overlay = gpd.overlay(df1=little, df2=eco_map, how="intersection")

overlay

In [None]:
# Remove rows without a usable unique_id. Real missing values (None / NaN /
# pd.NA) never compare equal to the text "<NA>", so they are tested for
# explicitly in addition to the literal placeholder string.
has_id = overlay["unique_id"].notna() & (overlay["unique_id"] != "<NA>")

# Remove rows whose TYPE is MEOW or PPOW.
keep_type = ~overlay["TYPE"].isin(["MEOW", "PPOW"])

# .copy() yields an independent frame so that adding columns to it later does
# not raise SettingWithCopyWarning.
overlay = overlay.loc[has_id & keep_type].copy()

overlay

In [None]:
# Plot the intersection pieces half-transparent with black edges, using the
# "tab10" colormap.
overlay.plot(alpha=0.5, edgecolor="k", cmap="tab10")

In [None]:
# Add the area of each intersection piece as an "area" column.
# assign() returns a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
# NOTE(review): the range layer was reprojected to EPSG:4326 earlier, so these
# areas are presumably in squared degrees rather than a true surface area;
# they are only meaningful relative to areas computed the same way.
overlay = overlay.assign(area=overlay.geometry.area)

overlay

In [None]:
# Total the intersection area per unique_id. The result is indexed by
# unique_id and has a single "area" column.

overlay_areas = overlay.groupby("unique_id")[["area"]].sum()

overlay_areas

In [None]:
# Add the area of each eco_map_unique geometry as an "area" column.
# assign() returns a new frame instead of writing into a filtered slice of
# eco_map, which avoids SettingWithCopyWarning and keeps the cell safe to
# re-run.
# NOTE(review): these areas are computed in the layer's own CRS (presumably
# EPSG:4326, i.e. squared degrees); they are only meaningful relative to areas
# computed the same way.
eco_map_unique = eco_map_unique.assign(area=eco_map_unique.geometry.area)

eco_map_unique

In [None]:
# Total the eco_map_unique area per unique_id. The result is indexed by
# unique_id and has a single "area" column.

eco_map_unique_areas = eco_map_unique.groupby("unique_id")[["area"]].sum()

eco_map_unique_areas

In [None]:
# Put the two per-id area totals side by side, indexed by unique_id:
# "overlay_area" comes from overlay_areas and "eco_map_unique_area" from
# eco_map_unique_areas.

combined_areas = pd.concat(
    [
        overlay_areas["area"].rename("overlay_area"),
        eco_map_unique_areas["area"].rename("eco_map_unique_area"),
    ],
    axis=1,
)

combined_areas

In [None]:
# Share of each eco map area that the overlay covers. Despite the column name
# this is a plain ratio (overlay_area / eco_map_unique_area), not multiplied
# by 100.
covered_share = combined_areas["overlay_area"].div(combined_areas["eco_map_unique_area"])
combined_areas["percentage"] = covered_share

combined_areas

In [None]:
# Choose the unique_ids to treat as native from the per-id area table.
#
# Rules, tried in order until one of them yields at least one id:
#   1. combined_areas holds a single id      -> take that id;
#   2. overlay covers more than 20% of the eco map area;
#   3. eco map area is at most 2 (in the area units of combined_areas);
#   4. overlay covers more than 10% of the eco map area.

# Covered share per id, computed once and reused by rules 2 and 4.
coverage = combined_areas["overlay_area"] / combined_areas["eco_map_unique_area"]

if len(combined_areas) == 1:
    # Take the id from the filtered table rather than from unique_ids:
    # unique_ids comes straight from the spatial join and can still contain
    # ids that the "<NA>" / MEOW / PPOW filters removed afterwards.
    native = combined_areas.index.tolist()
else:
    native = combined_areas[coverage > 0.2].index.tolist()
    print(len(native))

    if len(native) == 0:
        # Nothing passed the 20% rule: fall back to the small eco map areas.
        native = combined_areas[
            combined_areas["eco_map_unique_area"] <= 2
        ].index.tolist()
        print(native)

    if len(native) == 0:
        # Still nothing: relax the coverage rule to 10%.
        native = combined_areas[coverage > 0.1].index.tolist()
        print(native)

# Rows of the full eco map that belong to the selected ids.
native_df = eco_map[eco_map["unique_id"].isin(native)]

native_df

In [None]:
# Select the eco_map rows whose unique_id is in the native list.
is_native = eco_map["unique_id"].isin(native)
native_df = eco_map.loc[is_native]

native_df

In [None]:
# Plot the eco_map rows selected as native.
native_df.plot()

In [None]:
# Build the one-row result for the species whose range was analysed above.

# SHP/* code of that species. It must match the GeoJSON file read into
# `little` (alnumari.geojson); looking up a different code would pair this
# species' ids with another species' names.
species_code = "alnumari"

# Look the metadata row up once and fail with a clear message if it is absent.
species_rows = metadata.loc[metadata["SHP/*"] == species_code]
if species_rows.empty:
    raise ValueError(f"no metadata row has SHP/* equal to {species_code!r}")

scientific_name = species_rows["Latin Name"].values[0]
common_name = species_rows["Common Name"].values[0]

# final dataframe: one row per species, with all native unique_ids gathered in
# a single list cell. Building the row directly (instead of expanding to one
# row per id and grouping back) still yields a row when `native` is empty or
# the common name is missing.
final = pd.DataFrame(
    {
        "scientific_name": [scientific_name],
        "common_name": [common_name],
        "unique_id": [list(native)],
    },
)

final

In [None]:
# Write the result to <base path>/<Scientific_name>.parquet.

# File stem: the scientific name from `final`, with spaces replaced by
# underscores.
string_name = str(final["scientific_name"].values[0]).replace(" ", "_")

base_path = input("Enter the base path: ").strip()

# Join with pathlib so the file lands inside the base directory whether or not
# the entered path ends with a separator.
path = Path(base_path) / f"{string_name}.parquet"

final.to_parquet(path)

In [None]:
def _drop_missing_ids(frame):
    """Return the rows of ``frame`` that carry a usable ``unique_id``.

    Rows are dropped when the id is a real missing value or the literal text
    "<NA>".
    """
    ids = frame["unique_id"]
    return frame.loc[ids.notna() & (ids != "<NA>")]


def _native_ids(little, eco_map, threshold=0.2):
    """Return the eco map ``unique_id`` values covered by ``little``.

    An id is returned when the summed area of its intersection with ``little``
    divided by the summed area of its eco map geometries is greater than
    ``threshold``.

    NOTE(review): areas are computed in the CRS of the inputs (EPSG:4326 as
    called below, i.e. presumably squared degrees); only their ratio is used.
    NOTE(review): unlike the exploratory cells earlier in the notebook, this
    applies neither the MEOW/PPOW filter nor the fallback rules -- confirm
    which behaviour is wanted for the batch run.
    """
    # ids of every eco map geometry that the range intersects
    intersects = gpd.sjoin(little, eco_map)
    unique_ids = list(intersects["unique_id"].unique())

    # full geometries of those ids, and the pieces actually covered by the range
    eco_map_unique = _drop_missing_ids(eco_map[eco_map["unique_id"].isin(unique_ids)])
    overlay = _drop_missing_ids(gpd.overlay(little, eco_map, how="intersection"))

    # assign() works on a new frame, so no SettingWithCopyWarning is raised
    overlay_areas = (
        overlay.assign(area=overlay.geometry.area).groupby("unique_id")["area"].sum()
    )
    eco_map_unique_areas = (
        eco_map_unique.assign(area=eco_map_unique.geometry.area)
        .groupby("unique_id")["area"]
        .sum()
    )

    combined_areas = pd.concat(
        [
            overlay_areas.rename("overlay_area"),
            eco_map_unique_areas.rename("eco_map_unique_area"),
        ],
        axis=1,
    )

    coverage = combined_areas["overlay_area"] / combined_areas["eco_map_unique_area"]
    return combined_areas[coverage > threshold].index.tolist()


little_base = input("Enter the little base file path: ").strip()

final_base_path = input("Enter the final base path: ").strip()

# Rows without a SHP/* code have no range file to read, so skip them.
for shp in metadata["SHP/*"].dropna():
    print(shp)

    # read the range file for this code and convert its crs
    little = gpd.read_file(Path(little_base) / f"{shp}.geojson")
    little = little.to_crs("EPSG:4326")

    native = _native_ids(little, eco_map)
    if len(native) == 0:
        print(f"no unique_id above the coverage threshold for {shp}")

    # names come from the first metadata row carrying this SHP/* code
    species_rows = metadata.loc[metadata["SHP/*"] == shp]
    scientific_name = species_rows["Latin Name"].values[0]
    print(scientific_name)

    common_name = species_rows["Common Name"].values[0]
    print(common_name)

    # final dataframe: one row per species with all native ids in a list cell.
    # Built directly so that an empty `native` list still produces a row
    # instead of an empty frame that cannot be named or written.
    final = pd.DataFrame(
        {
            "scientific_name": [scientific_name],
            "common_name": [common_name],
            "unique_id": [list(native)],
        },
    )

    print(final)

    string_name = str(scientific_name).replace(" ", "_")

    # pathlib join: works with or without a trailing separator on the base path
    path = Path(final_base_path) / f"{string_name}.parquet"
    print(path)

    final.to_parquet(path)