diff --git a/docker/import_mode.py b/docker/import_mode.py index 905ed25..1e93d65 100644 --- a/docker/import_mode.py +++ b/docker/import_mode.py @@ -68,11 +68,11 @@ def okay_to_run(self, prior_import: dict) -> bool: Only the replication key is specifically used """ self.logger.debug(f'Checking if it is okay to run...') - # If no prior imports, do not require force if self.force: self.logger.warn(f'Using --force, kiss existing data goodbye') return True + # If no prior imports, do not require force if len(prior_import) == 0: self.logger.debug(f'No prior import found, okay to proceed.') return True @@ -86,8 +86,8 @@ def okay_to_run(self, prior_import: dict) -> bool: self.logger.debug('Okay to proceed with replication') return True - msg = 'A prior import exists.' - self.logger.warn(msg) + msg = 'Prior data exists in the osm schema and --force was not used.' + self.logger.error(msg) return False def set_append_first_run(self): diff --git a/docker/pgosm_flex.py b/docker/pgosm_flex.py index eb0fdc7..6b10a9f 100644 --- a/docker/pgosm_flex.py +++ b/docker/pgosm_flex.py @@ -35,7 +35,7 @@ @click.option('--debug', is_flag=True, help='Enables additional log output') @click.option('--force', is_flag=True, - help='Danger! Forces PgOSM Flex to load the data even if this will overwrite pre-existing data. This only impacts usage when connecting to an external Postgres connection, not when using the internal-Docker Postgres instances.') + help='Danger! Forces PgOSM Flex to load the data even if this will overwrite pre-existing data. 
See https://pgosm-flex.com/force-load.html for more.') @click.option('--input-file', required=False, default=None, @@ -99,7 +99,10 @@ def run_pgosm_flex(ram, region, subregion, debug, force, layerset, layerset_path, sp_gist, replication) db.wait_for_postgres() if force and db.pg_conn_parts()['pg_host'] == 'localhost': - logger.warning('Using --force with the built-in database is unnecessary.') + msg = 'Using --force with the built-in database is unnecessary.' + msg += ' The pgosm database is always dropped and recreated when' + msg += ' running on localhost (in Docker).' + logger.warning(msg) if replication: replication_update = check_replication_exists() @@ -128,6 +131,7 @@ def run_pgosm_flex(ram, region, subregion, debug, force, if not import_mode.okay_to_run(prior_import): msg = 'Not okay to run PgOSM Flex. Exiting' + logger.error(msg) sys.exit(msg) # There's probably a better way to get this data out, but this worked right diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 9e8c450..2e2815c 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -20,13 +20,15 @@ - [Postgres Permissions](./postgres-permissions.md) - [Using External Postgres Connection](./postgres-external.md) -- [Stay Updated with Replication](./replication.md) -- [Using Update Mode](./update-mode.md) -- [Force Load](./force-load.md) +- [Data updates](./data-updates.md) + - [Using Replication](./replication.md) + - [Relocate Data](./relocate-data.md) + - [Using Update Mode](./update-mode.md) - [QGIS Styles](./qgis-styles.md) # Developers +- [Force Load](./force-load.md) - [Projects using PgOSM Flex](./projects.md) - [Build and Push Docker Images](./docker-build.md) - [Testing PgOSM Flex](./tests.md) diff --git a/docs/src/data-updates.md b/docs/src/data-updates.md new file mode 100644 index 0000000..94294c2 --- /dev/null +++ b/docs/src/data-updates.md @@ -0,0 +1,63 @@ +# Data updates + +Keeping OpenStreetMap data recent and up-to-date is important to many projects. 
+However, this concept can mean very different things depending on the needs at +hand. + +There are three (3) main ways to run subsequent imports using PgOSM Flex. + +* [Replication](replication.md) +* [Relocate data](relocate-data.md) +* [Manual Updates](update-mode.md) + +## Replication + +[Replication](replication.md) should be the default first choice to consider. +Replication is best used when you only want to load one region of data and want +to keep the region's data recent. + +Pros: + +* Fast updates after the first import +* Easy + +Cons: + +* Increased database size +* Little flexibility after initial import + +## Relocate data + +[Relocating data](relocate-data.md) involves renaming the `osm` schema. +This allows PgOSM Flex to run in single-import mode, and to import any number +of different regions. + +Pros: + +* Simple +* Smaller database size per region +* Very customizable + +Cons: + +* Always single-import +* Duplicates a lot of data if using for snapshots over time on one region + +## Manual Updates + +[Manual Updates](update-mode.md) provide significant flexibility with a tradeoff +in import performance. + +Pros: + +* Very customizable + +Cons: + +* Very slow updates +* Poorly documented in PgOSM Flex + + + + + diff --git a/docs/src/force-load.md b/docs/src/force-load.md index 7804622..cad1b41 100644 --- a/docs/src/force-load.md +++ b/docs/src/force-load.md @@ -1,32 +1,40 @@ # Force Load -PgOSM Flex attempts to avoid accidentally overwriting existing data -when using a database -[external to](./postgres-external.md) the PgOSM Flex Docker container. > Added in PgOSM Flex 0.8.1. -## PgOSM Tries to be Safe +---- -Assumes you have followed the instructions on the -[Postgres External section](./postgres-external.md). +## ⚠️ Danger ahead ⚠️ +The examples in this section can do bad things in production setups. +The `--force` feature exists for development use cases. 
+**Most users should consider moving old data out of the way using the methods +described in the [relocate data](./relocate-data.md) section.** -```bash -source ~/.pgosm-db-myproject - -docker run --name pgosm -d --rm \ - -v ~/pgosm-data:/app/output \ - -v /etc/localtime:/etc/localtime:ro \ - -e POSTGRES_USER=$POSTGRES_USER \ - -e POSTGRES_PASSWORD=$POSTGRES_PASSWORD \ - -e POSTGRES_HOST=$POSTGRES_HOST \ - -e POSTGRES_DB=$POSTGRES_DB \ - -e POSTGRES_PORT=$POSTGRES_PORT \ - -p 5433:5432 -d rustprooflabs/pgosm-flex +---- + +## PgOSM Flex *Tries* to be Safe + +PgOSM Flex **attempts** to avoid accidentally overwriting existing data +when using a database +[external to](./postgres-external.md) the PgOSM Flex Docker container. +It does this by checking the data stored in the `osm.pgosm_flex` table. + +> The `--force` feature only applies to external database connections. The internal database is always dropped and recreated when using the built-in database. + +This section assumes you have followed the instructions on the +[Postgres External section](./postgres-external.md) including +[setting up permissions](postgres-permissions.md). +The protection against overwriting data is built into the `pgosm_flex.py` logic +run via `docker exec`. With PgOSM Flex 0.8.1 and later, running the following +command twice in a row will result in an error. + + +```bash docker exec -it \ pgosm python3 docker/pgosm_flex.py \ --ram=8 \ @@ -38,11 +46,15 @@ Running the `docker exec` step a second time would result in the following error. ```bash -2023-05-14 14:59:33,145:WARNING:pgosm-flex:import_mode:A prior import exists. -Not okay to run PgOSM Flex. Exiting +2023-05-29 08:08:19,495:ERROR:pgosm-flex:import_mode:Prior data exists in the osm schema and --force was not used. +2023-05-29 08:08:19,495:ERROR:pgosm-flex:pgosm_flex:Not okay to run PgOSM Flex. Exiting ``` -To overwrite and reload data, use the `--force` option. 
+ +## Using `--force` + +To overwrite and reload data, use the `--force` option with the `docker exec` +command. ```bash @@ -54,13 +66,56 @@ docker exec -it \ --force ``` -## Using `--force` -This outputs the following message during import. +Using `--force` outputs the following message during import when prior data exists. ``` 2023-05-14 15:09:12,457:WARNING:pgosm-flex:import_mode:Using --force, kiss existing data goodbye ``` +### Only overwrites tables in new `--layerset` + +Using `--force` can cause unexpected mismatches between tables when different +[layersets](layersets.md) are used. This section illustrates this problem. + +First run `docker exec` as shown in the [quick start](./quick-start.md) guide. +This loads the District of Columbia subregion with the default layerset. + +```bash +docker exec -it \ + pgosm python3 docker/pgosm_flex.py \ + --ram=8 \ + --region=north-america/us \ + --subregion=district-of-columbia +``` + + +Now run again with `--force` and `--layerset=minimal`. A different region +(Rhode Island) is also used to help illustrate the problem. + +```bash +docker exec -it \ + pgosm python3 docker/pgosm_flex.py \ + --ram=8 \ + --region=north-america/us \ + --subregion=rhode-island \ + --force \ + --layerset=minimal +``` + +The following image shows that while the `osm.place_polygon` data is correctly loaded +with the Rhode Island region's data, the `osm.building_point` retained the +data from Washington D.C. This happens because `--force` only allows PgOSM Flex +to overwrite data as defined by the `--layerset` option. If tables were created +by layers not used in the latest `--layerset`, they will be left in the database +as-is. + +![Image showing a map of the northeast region of the U.S. containing Washington D.C. to Rhode Island. In the upper right corner (NE on the map) the Rhode Island region shows the place polygon data loaded as expected. 
The lower left corner (SW on the map) shows the building data, not included in the minimal layerset, is still displaying in the D.C. area.](./pgosm-flex-with-force-inconsistent-region.jpg) + + + +While this problem is most apparent when using different regions, it can also be +a problem with the same region if a user querying the data assumes all tables +were updated at the same time. diff --git a/docs/src/pgosm-flex-with-force-inconsistent-region.jpg b/docs/src/pgosm-flex-with-force-inconsistent-region.jpg new file mode 100644 index 0000000..b66c470 Binary files /dev/null and b/docs/src/pgosm-flex-with-force-inconsistent-region.jpg differ diff --git a/docs/src/relocate-data.md b/docs/src/relocate-data.md new file mode 100644 index 0000000..1283346 --- /dev/null +++ b/docs/src/relocate-data.md @@ -0,0 +1,36 @@ +# Relocate Data + + + +This section describes how to relocate OpenStreetMap data loaded using PgOSM Flex. +These instructions apply to using an external Postgres database in single-import +mode. + + +> Do not use these instructions when using `--append`, `--update`, or `--replication`. Something will most likely break. + + +## Why relocate data + +There are two common reasons you may want to relocate data. The same approach +works for both of these scenarios. + +* Snapshots over time +* Different regions + +If your goal is to have the latest data always available, consider using +[replication](replication.md) instead. + + +## Rename Schema + +PgOSM Flex always uses the `osm` schema. +The best way to relocate data is to simply rename the schema. This quickly moves +existing data out of the way for future PgOSM Flex use. The following query +renames `osm` to `osm_2023_05`. 
+ + +```sql +ALTER SCHEMA osm RENAME TO osm_2023_05; +``` + diff --git a/docs/src/replication.md b/docs/src/replication.md index 7238f07..6c7d65b 100644 --- a/docs/src/replication.md +++ b/docs/src/replication.md @@ -1,12 +1,9 @@ -# Stay Updated with Replication +# Using Replication The `--replication` option of PgOSM Flex enables `osm2pgsql-replication` to provide an easy and quick way to keep your OpenStreetMap data refreshed. -> The `--replication` mode is stable as of 0.7.0. It was added as an experimental feature in 0.4, originally under the `--append` option. - - PgOSM Flex's `--replication` mode wraps around the `osm2pgsql-replication` package included with `osm2pgsql`. The first time running an import with `--replication` mode runs osm2pgsql normally, with `--slim` mode and without `--drop`. @@ -18,8 +15,9 @@ tables (`--slim`) must be left in the database (no `--drop`). > Important: The original `--append` option is now under `--replication`. The `--append` option was removed in PgOSM Flex 0.7.0. See [#275](https://github.com/rustprooflabs/pgosm-flex/issues/275) for context. +## Use tagged version -When using replication you need to pin your process to a specific PgOSM Flex version +When using replication you should pin your process to a specific PgOSM Flex version in the `docker run` command. When upgrading to new versions, be sure to check the release notes for manual upgrade steps for `--replication`. The release notes for @@ -36,6 +34,7 @@ your specific database and process.** ---- +## Max connections The other important change when using replication is to increase Postgres' `max_connections`. See [this discussion on osm2pgsql](https://github.com/openstreetmap/osm2pgsql/discussions/1650) @@ -59,6 +58,8 @@ docker run --name pgosm -d --rm \ -c max_connections=300 ``` +## Using `--replication` + Run the `docker exec` step with `--replication`. 
@@ -68,7 +69,6 @@ docker exec -it \ --ram=8 \ --region=north-america/us \ --subregion=district-of-columbia \ - --pgosm-date 2022-12-30 \ --replication ``` @@ -76,5 +76,5 @@ Running the above command a second time will detect that the target database has `osm2pgsql-replication` setup and load data via the defined replication service. -> Note: The `--pgosm-date` parameter is ignored during subsequent imports using `--replication`. + diff --git a/docs/src/update-mode.md b/docs/src/update-mode.md index 921af36..b7540d6 100644 --- a/docs/src/update-mode.md +++ b/docs/src/update-mode.md @@ -1,4 +1,4 @@ -# PgOSM Flex Update Mode +# Using Update Mode Running in experimental Update mode enables using osm2pgsql's `--append` option.