In [1]:
import pandas as pd
from IPython.display import display
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Polygon, MultiPolygon

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def is_simple_polygon(geom):
    """Return True when ``geom`` is a polygonal geometry without holes.

    Parameters
    ----------
    geom : shapely geometry
        Geometry to inspect.

    Returns
    -------
    bool
        True for a ``Polygon`` that has no interior rings, or for a
        ``MultiPolygon`` whose parts all have no interior rings.
        False for every other geometry type.
    """
    if isinstance(geom, Polygon):
        return not geom.interiors
    if isinstance(geom, MultiPolygon):
        # Go through ``.geoms``: multi-part geometries are not directly
        # iterable in Shapely 2.x, so ``for poly in geom`` raises TypeError.
        return all(not part.interiors for part in geom.geoms)
    return False

In [3]:
def filter_dataset(dataset_path, column_names, output_path, batch_size=100000):
    """Stream a tab-separated dump and keep only the hole-free POLYGON shapes.

    The input is read in chunks of ``batch_size`` rows. Rows with a missing
    ``shape`` value, rows whose ``shape`` text does not start with ``POLYGON``
    and shapes rejected by ``is_simple_polygon`` are dropped. The surviving
    WKT strings are written to ``output_path`` as a single column, without
    header or index. The first chunk overwrites the file; later chunks append.
    Progress and the final row count are printed.

    Parameters
    ----------
    dataset_path : str
        Path of the headerless, tab-delimited input file. Malformed lines
        are skipped with a warning (``on_bad_lines="warn"``).
    column_names : list of str
        Names assigned to the input columns; must include ``"shape"``.
    output_path : str
        Destination file for the filtered WKT strings.
    batch_size : int, optional
        Number of rows read per chunk. Defaults to 100000.
    """
    reader = pd.read_csv(
        dataset_path,
        chunksize=batch_size,
        delimiter="\t",
        header=None,
        on_bad_lines="warn",
        names=column_names,
    )

    total_rows = 0
    for i, chunk in enumerate(reader):
        # Cast after dropping NaN so the ``.str`` accessor works even when a
        # chunk's column was not inferred as text (e.g. it held no strings).
        shapes = chunk["shape"].dropna().astype(str)
        shapes = shapes[shapes.str.startswith("POLYGON")]

        # ``apply`` on an empty Series yields an object-dtype result that is
        # not a valid boolean mask; the cast keeps empty chunks working.
        keep = shapes.apply(lambda text: is_simple_polygon(wkt.loads(text))).astype(
            bool
        )

        chunk_df = shapes[keep].to_frame().reset_index(drop=True)
        chunk_df.to_csv(
            output_path, mode=("w" if i == 0 else "a"), header=False, index=False
        )
        total_rows += len(chunk_df)
        print(f"Finished chunk {i}")

    print(f"TOTAL ROWS: {total_rows}")

# Filter lakes dataset

In [10]:
# Reduce the raw lakes dump to its hole-free POLYGON shapes.
lakes_filter_args = {
    "dataset_path": "../../dataset_files/lakes",
    "column_names": ["way_id", "shape", "tags"],
    "output_path": "../../dataset_files/lakes_filtered",
}
filter_dataset(**lakes_filter_args)

Finished chunk 0
Finished chunk 1
Finished chunk 2
Finished chunk 3
Finished chunk 4
Finished chunk 5
Finished chunk 6
Finished chunk 7
Finished chunk 8
Finished chunk 9
Finished chunk 10
Finished chunk 11
Finished chunk 12
Finished chunk 13
Finished chunk 14
Finished chunk 15
Finished chunk 16
Finished chunk 17
Finished chunk 18
Finished chunk 19
Finished chunk 20
Finished chunk 21
Finished chunk 22
Finished chunk 23
Finished chunk 24
Finished chunk 25
Finished chunk 26
Finished chunk 27
Finished chunk 28
Finished chunk 29
Finished chunk 30
Finished chunk 31
Finished chunk 32
Finished chunk 33
Finished chunk 34
Finished chunk 35
Finished chunk 36
Finished chunk 37
Finished chunk 38
Finished chunk 39
Finished chunk 40
Finished chunk 41
Finished chunk 42
Finished chunk 43
Finished chunk 44
Finished chunk 45
Finished chunk 46
Finished chunk 47
Finished chunk 48
Finished chunk 49
Finished chunk 50
Finished chunk 51
Finished chunk 52
Finished chunk 53
Finished chunk 54
Finished chunk 55
Fi

# Filter parks dataset

In [11]:
# Reduce the raw parks dump to its hole-free POLYGON shapes.
parks_filter_args = {
    "dataset_path": "../../dataset_files/parks",
    "column_names": ["way_id", "shape", "tags"],
    "output_path": "../../dataset_files/parks_filtered",
}
filter_dataset(**parks_filter_args)

Finished chunk 0
Finished chunk 1
Finished chunk 2
Finished chunk 3
Finished chunk 4
Finished chunk 5
Finished chunk 6
Finished chunk 7
Finished chunk 8
Finished chunk 9
Finished chunk 10
Finished chunk 11
Finished chunk 12
Finished chunk 13
Finished chunk 14
Finished chunk 15
Finished chunk 16
Finished chunk 17
Finished chunk 18
Finished chunk 19
Finished chunk 20
Finished chunk 21
Finished chunk 22
Finished chunk 23
Finished chunk 24
Finished chunk 25
Finished chunk 26
Finished chunk 27
Finished chunk 28
Finished chunk 29
Finished chunk 30
Finished chunk 31
Finished chunk 32
Finished chunk 33
Finished chunk 34
Finished chunk 35
Finished chunk 36
Finished chunk 37
Finished chunk 38
Finished chunk 39
Finished chunk 40
Finished chunk 41
Finished chunk 42
Finished chunk 43
Finished chunk 44
Finished chunk 45
Finished chunk 46
Finished chunk 47
Finished chunk 48
Finished chunk 49
Finished chunk 50
Finished chunk 51
Finished chunk 52
Finished chunk 53
Finished chunk 54
Finished chunk 55
Fi

# Create sub-datasets

## Read filtered data

In [3]:
# Load the world-continents shapefile, keep only the name and geometry
# columns, and retain the rows for the six continents listed below.
continents = ["North America", "South America", "Oceania", "Europe", "Asia", "Africa"]
shapefile_path = "continents_geometries/World_Continents"
continents_df = gpd.read_file(shapefile_path)[["CONTINENT", "geometry"]].loc[
    lambda frame: frame["CONTINENT"].isin(continents)
]

In [4]:
# Preview the retained continent rows (this cell runs before the
# reprojection to EPSG:4326 in the following cell).
continents_df

Unnamed: 0,CONTINENT,geometry
0,Africa,"MULTIPOLYGON (((3950542.075 -2473747.938, 3946..."
1,Asia,"MULTIPOLYGON (((-20037507.067 10744605.176, -2..."
3,Oceania,"MULTIPOLYGON (((20037507.067 -1916837.495, 200..."
4,South America,"MULTIPOLYGON (((-7481659.964 -7536755.913, -74..."
6,Europe,"MULTIPOLYGON (((2654806.574 4235181.553, 26684..."
7,North America,"MULTIPOLYGON (((-9092406.120 824784.893, -9089..."


In [5]:
# Reproject the continent outlines to EPSG:4326 (WGS84 longitude/latitude);
# presumably this matches the coordinates of the dataset WKT shapes - confirm.
# Rebind the returned frame instead of using inplace=True: continents_df was
# produced by boolean filtering, and mutating such a derived frame in place
# is discouraged.
continents_df = continents_df.to_crs(epsg=4326)

In [6]:
# Two-letter code -> continent name as spelled in the CONTINENT column.
# Insertion order is significant: continent_polygons is iterated in this
# order when a shape is assigned to the first continent it intersects.
continent_codes = {
    "OC": "Oceania",
    "AF": "Africa",
    "NA": "North America",
    "SA": "South America",
    "EU": "Europe",
    "AS": "Asia",
}

# Two-letter code -> geometry of the first row matching that continent name.
continent_polygons = {
    code: continents_df.loc[continents_df["CONTINENT"] == name, "geometry"].values[0]
    for code, name in continent_codes.items()
}

# Dataset key -> nickname used as the prefix of the per-continent file names.
datasets = {"parks_filtered": "O6"}

# Dataset key -> location of the filtered WKT file to split.
dataset_paths = {"parks_filtered": "../../dataset_files/parks_filtered"}

In [7]:
def group_poly(polygon, dataset_nickname):
    """Name the sub-dataset a shape belongs to, based on the continent it touches.

    Continents are tested in the iteration order of ``continent_polygons`` and
    the first one that ``polygon`` intersects wins.

    Parameters
    ----------
    polygon : shapely geometry
        Shape to classify.
    dataset_nickname : str
        Prefix placed in front of the continent code.

    Returns
    -------
    str or None
        ``dataset_nickname`` followed by the continent code, or None when the
        shape intersects none of the continents.
    """
    # The generator is lazy, so testing stops at the first intersecting continent.
    matching_codes = (
        code
        for code, region in continent_polygons.items()
        if polygon.intersects(region)
    )
    first_code = next(matching_codes, None)
    if first_code is None:
        return None
    return dataset_nickname + first_code

In [9]:
# Split every filtered dataset into one file per continent.
#
# Output files that have already been (re)created during this run. A file is
# truncated the first time it is written and appended to afterwards. Keying
# the write mode on the chunk index instead would append to a stale file from
# a previous run whenever a continent first shows up after chunk 0.
written_paths = set()

for dataset_name, dataset_nickname in datasets.items():
    print(f"Processing {dataset_name}")
    dataset_path = dataset_paths[dataset_name]
    batch_size = 100000
    reader = pd.read_csv(
        dataset_path,
        chunksize=batch_size,
        on_bad_lines="warn",
        header=None,
        names=["shape"],
    )
    for i, chunk in enumerate(reader):
        # Parse the WKT strings into geometries.
        geo_df = gpd.GeoDataFrame(
            pd.DataFrame(chunk["shape"].apply(wkt.loads)), geometry="shape"
        )
        # Label each shape with "<nickname><continent code>", or None when it
        # intersects no continent.
        geo_df["dataset_name"] = geo_df["shape"].apply(
            group_poly, args=(dataset_nickname,)
        )
        # Shapes that matched no continent are discarded.
        geo_df = geo_df.dropna()
        for combined_dataset_name in geo_df["dataset_name"].unique():
            save_path = (
                f"../../dataset_files/OSM_filtered_datasets/{combined_dataset_name}"
            )

            cur_dataset_polygons = geo_df[
                geo_df["dataset_name"] == combined_dataset_name
            ].drop(columns=["dataset_name"])

            cur_dataset_polygons.to_csv(
                save_path,
                mode=("a" if save_path in written_paths else "w"),
                header=False,
                index=False,
            )
            written_paths.add(save_path)

        print(f"Finished chunk {i}")

Processing parks_filtered
Finished chunk 0
Finished chunk 1
Finished chunk 2
Finished chunk 3
Finished chunk 4
Finished chunk 5
Finished chunk 6
Finished chunk 7
Finished chunk 8
Finished chunk 9
Finished chunk 10
Finished chunk 11
Finished chunk 12
Finished chunk 13
Finished chunk 14
Finished chunk 15
Finished chunk 16
Finished chunk 17
