Skip to content

Commit

Permalink
sqlite-utils extract, closes #42
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Sep 22, 2020
1 parent 2db6c5b commit 55cf928
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 0 deletions.
91 changes: 91 additions & 0 deletions docs/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,97 @@ If you want to see the SQL that will be executed to make the change without actu
DROP TABLE [roadside_attractions];
ALTER TABLE [roadside_attractions_new_4033a60276b9] RENAME TO [roadside_attractions];

.. _cli_extract:

Extracting columns into a separate table
========================================

The ``sqlite-utils extract`` command can be used to extract specified columns into a separate table.

Take a look at the Python API documentation for :ref:`python_api_extract` for a detailed description of how this works, including examples of table schemas before and after running an extraction operation.

The command takes a database, table and one or more columns that should be extracted. To extract the ``species`` column from the ``trees`` table you would run::

$ sqlite-utils extract my.db trees species

This would produce the following schema:

.. code-block:: sql
CREATE TABLE "trees" (
[id] INTEGER PRIMARY KEY,
[TreeAddress] TEXT,
[species_id] INTEGER,
FOREIGN KEY(species_id) REFERENCES species(id)
)
CREATE TABLE [species] (
[id] INTEGER PRIMARY KEY,
[species] TEXT
)
The command takes the following options:

``--table TEXT``
The name of the lookup to extract columns to. This defaults to using the name of the columns that are being extracted.

``--fk-column TEXT``
The name of the foreign key column to add to the table. Defaults to ``columnname_id``.

``--rename <TEXT TEXT>``
Use this option to rename the columns created in the new lookup table.

Here's a more complex example that makes use of these options. It converts `this CSV file <https://github.com/wri/global-power-plant-database/blob/232a666653e14d803ab02717efc01cdd437e7601/output_database/global_power_plant_database.csv>`__ full of global power plants into SQLite, then extracts the ``country`` and ``country_long`` columns into a separate ``countries`` table::

wget 'https://github.com/wri/global-power-plant-database/blob/232a6666/output_database/global_power_plant_database.csv?raw=true'
sqlite-utils insert global.db power_plants \
'global_power_plant_database.csv?raw=true' --csv
# Extract those columns:
sqlite-utils extract global.db power_plants country country_long \
--table countries \
--fk-column country_id \
--rename country_long name

After running the above, the command ``sqlite3 global.db .schema`` reveals the following schema:

.. code-block:: sql
CREATE TABLE [countries] (
[id] INTEGER PRIMARY KEY,
[country] TEXT,
[name] TEXT
);
CREATE UNIQUE INDEX [idx_countries_country_name]
ON [countries] ([country], [name]);
CREATE TABLE IF NOT EXISTS "power_plants" (
[rowid] INTEGER PRIMARY KEY,
[country_id] INTEGER,
[name] TEXT,
[gppd_idnr] TEXT,
[capacity_mw] TEXT,
[latitude] TEXT,
[longitude] TEXT,
[primary_fuel] TEXT,
[other_fuel1] TEXT,
[other_fuel2] TEXT,
[other_fuel3] TEXT,
[commissioning_year] TEXT,
[owner] TEXT,
[source] TEXT,
[url] TEXT,
[geolocation_source] TEXT,
[wepp_id] TEXT,
[year_of_capacity_data] TEXT,
[generation_gwh_2013] TEXT,
[generation_gwh_2014] TEXT,
[generation_gwh_2015] TEXT,
[generation_gwh_2016] TEXT,
[generation_gwh_2017] TEXT,
[generation_data_source] TEXT,
[estimated_generation_gwh] TEXT,
FOREIGN KEY(country_id) REFERENCES countries(id)
);
.. _cli_create_view:

Creating views
Expand Down
33 changes: 33 additions & 0 deletions sqlite_utils/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,39 @@ def transform(
db[table].transform(**kwargs)


@cli.command()
@click.argument(
"path",
type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
required=True,
)
@click.argument("table")
@click.argument("columns", nargs=-1, required=True)
@click.option(
"--table", "other_table", help="Name of the other table to extract columns to"
)
@click.option("--fk-column", help="Name of the foreign key column to add to the table")
@click.option(
"--rename",
type=(str, str),
multiple=True,
help="Rename this column in extracted table",
)
def extract(
path,
table,
columns,
other_table,
fk_column,
rename,
):
"Extract one or more columns into a separate table"
db = sqlite_utils.Database(path)
db[table].extract(
columns, table=other_table, fk_column=fk_column, rename=dict(rename)
)


@cli.command(name="insert-files")
@click.argument(
"path",
Expand Down
50 changes: 50 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1549,3 +1549,53 @@ def test_transform_drop_foreign_key(db_path):
schema
== 'CREATE TABLE "places" (\n [id] INTEGER PRIMARY KEY,\n [name] TEXT,\n [country] INTEGER,\n [city] INTEGER REFERENCES [city]([id])\n)'
)


_common_other_schema = (
"CREATE TABLE [species] (\n [id] INTEGER PRIMARY KEY,\n [species] TEXT\n)"
)


@pytest.mark.parametrize(
"args,expected_table_schema,expected_other_schema",
[
(
[],
'CREATE TABLE "trees" (\n [id] INTEGER PRIMARY KEY,\n [address] TEXT,\n [species_id] INTEGER,\n FOREIGN KEY(species_id) REFERENCES species(id)\n)',
_common_other_schema,
),
(
["--table", "custom_table"],
'CREATE TABLE "trees" (\n [id] INTEGER PRIMARY KEY,\n [address] TEXT,\n [custom_table_id] INTEGER,\n FOREIGN KEY(custom_table_id) REFERENCES custom_table(id)\n)',
"CREATE TABLE [custom_table] (\n [id] INTEGER PRIMARY KEY,\n [species] TEXT\n)",
),
(
["--fk-column", "custom_fk"],
'CREATE TABLE "trees" (\n [id] INTEGER PRIMARY KEY,\n [address] TEXT,\n [custom_fk] INTEGER,\n FOREIGN KEY(custom_fk) REFERENCES species(id)\n)',
_common_other_schema,
),
(
["--rename", "name", "name2"],
'CREATE TABLE "trees" (\n [id] INTEGER PRIMARY KEY,\n [address] TEXT,\n [species_id] INTEGER,\n FOREIGN KEY(species_id) REFERENCES species(id)\n)',
"CREATE TABLE [species] (\n [id] INTEGER PRIMARY KEY,\n [species] TEXT\n)",
),
],
)
def test_extract(db_path, args, expected_table_schema, expected_other_schema):
db = Database(db_path)
with db.conn:
db["trees"].insert(
{"id": 1, "address": "4 Park Ave", "species": "Palm"},
pk="id",
)
result = CliRunner().invoke(
cli.cli, ["extract", db_path, "trees", "species"] + args
)
print(result.output)
assert result.exit_code == 0
schema = db["trees"].schema
assert schema == expected_table_schema
other_schema = [t for t in db.tables if t.name not in ("trees", "Gosh", "Gosh2")][
0
].schema
assert other_schema == expected_other_schema

0 comments on commit 55cf928

Please sign in to comment.