# SQLite to PostgreSQL via Polars Dataframe

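Install the required packages: `python-wget`, `sqlalchemy`, `polars`, `connectorx`, `pyarrow`. The SQL magic cells (`%sql` / `%%sql`) additionally need the `sql` IPython extension, e.g. from `ipython-sql` or `jupysql`.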

## Imports

In [10]:
import pathlib
import zipfile

import wget

#import pandas as pd
import polars as pl
from sqlalchemy import create_engine

## Source URL and Download Directory 

In [2]:
# Download location of the zipped SQLite export (served by opengeodata.nrw.de).
zipfile_url = "https://www.opengeodata.nrw.de/produkte/umwelt_klima/wasser/grundwasser/hygrisc/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite.zip"

# Local folder that receives the archive and its extracted content.
# Keep the trailing slash: other cells append file names to this string directly.
datapath = "../data/OpenGeodata.NRW/OpenHygrisC/"


## Download zip file if necessary ...

In [3]:
# The archive's file name is the last path segment of the URL.
zipfilename = zipfile_url.split(r"/")[-1]

print(f"Downloading {zipfilename} to directory {datapath}")

# Create the complete target directory tree. Without parents=True, mkdir()
# raises FileNotFoundError when an intermediate directory is still missing
# (e.g. on a fresh checkout).
p = pathlib.Path(datapath)
p.mkdir(parents=True, exist_ok=True)

# Local path of the archive; joining via pathlib does not depend on
# `datapath` ending with a path separator.
f = p / zipfilename

# Only download when there is no local copy yet.
if not f.is_file():
    wget.download(zipfile_url, out=datapath)
else:
    print(f"Warning: {f} already exists. Skip download.")

Downloading OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite.zip to directory ../data/OpenGeodata.NRW/OpenHygrisC/


## Unzip if necessary ...

In [4]:


print(f"unzip {zipfilename}")

# Extraction target: a folder next to the archive, named like the archive
# without its ".zip" suffix.
sqlitepathname = datapath + f.stem

# An existing target folder is taken as "already extracted".
if pathlib.Path(sqlitepathname).exists():
    print(f"Warning: directory {sqlitepathname} already exists. Skip unzip.")
else:
    with zipfile.ZipFile(f, 'r') as archive:
        archive.extractall(sqlitepathname)

unzip OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite.zip


## What is in the unzipped folder ...

In [5]:
# Point `p` at the extraction folder and show every entry directly inside it.
p = pathlib.Path(sqlitepathname)
[*p.glob("*")]

[WindowsPath('../data/OpenGeodata.NRW/OpenHygrisC/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite/katalog_gemeinde.sqlite'),
 WindowsPath('../data/OpenGeodata.NRW/OpenHygrisC/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite/katalog_stoff.sqlite'),
 WindowsPath('../data/OpenGeodata.NRW/OpenHygrisC/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite/opendata.gw_chemischer_messwert.sqlite'),
 WindowsPath('../data/OpenGeodata.NRW/OpenHygrisC/OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite/opendata.gw_messstelle.sqlite')]

## Assign file names to variables.

In [6]:
# Pick each SQLite file by a distinctive fragment of its file name.
# next() takes the first entry the glob yields for that pattern.
messstelle_path = next(p.glob("*messstelle*"))  # groundwater well / station
chem_path = next(p.glob("*chem*"))              # chemistry measurements
stoff_path = next(p.glob("*stoff*"))            # substance / physico-chemical quantity / parameter
gemeinde_path = next(p.glob("*gemeinde*"))      # municipality

# Echo the selected file names for a quick visual check.
print(f"{messstelle_path.name = }")
print(f"{chem_path.name       = }")
print(f"{stoff_path.name      = }")
print(f"{gemeinde_path.name   = }")


messstelle_path.name = 'opendata.gw_messstelle.sqlite'
chem_path.name       = 'opendata.gw_chemischer_messwert.sqlite'
stoff_path.name      = 'katalog_stoff.sqlite'
gemeinde_path.name   = 'katalog_gemeinde.sqlite'


## Focus on groundwater quality data (= chemistry) only.

In [7]:
# Database URI for the chemistry file: the "sqlite:///" prefix followed by
# the path rendered as a string.
sqlite_uri = f"sqlite:///{chem_path}"
print(f"{sqlite_uri = }")

sqlite_uri = 'sqlite:///..\\data\\OpenGeodata.NRW\\OpenHygrisC\\OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite\\opendata.gw_chemischer_messwert.sqlite'


## Use Jupyter SQL Magic for a quick look into the database.

In [None]:
# Load (or reload) the `sql` IPython extension that provides %sql / %%sql.
%reload_ext sql
# Set the SqlMagic `autolimit` option to 10.
%config SqlMagic.autolimit = 10
# Connect the SQL magic to the database; {sqlite_uri} is filled in from the
# Python variable of the same name.
%sql {sqlite_uri}

In [None]:
%%sql
SELECT * FROM sqlite_master
-- sqlite_master is the schema table of SQLite; this lists the objects stored in the database file.

In [None]:
# Set the SqlMagic `autolimit` option to 10 (same value as configured before).
%config SqlMagic.autolimit = 10
# Count the rows of the chemistry measurement table.
%sql select count(*) from opendata_gw_chemischer_messwert

## Use Polars to read the data into a dataframe.

In [8]:
# Table to load and the statement that selects all of its rows and columns.
table_name = "opendata_gw_chemischer_messwert"
query = f"select * from {table_name}"
print(f"{query = }")

query = 'select * from opendata_gw_chemischer_messwert'


In [17]:
# Database connection ...
sqlite_engine = create_engine(sqlite_uri)
sqlite_engine

Engine(sqlite:///..\data\OpenGeodata.NRW\OpenHygrisC\OpenHygrisC_gw-messstellen-messwerte_EPSG25832_Sqlite\opendata.gw_chemischer_messwert.sqlite)

In [18]:
%%time
df = pl.read_database(query=query, connection=sqlite_engine.connect())

ComputeError: could not append value: -25.0 of type: f64 to the builder; make sure that all rows have the same schema or consider increasing `infer_schema_length`

it might also be that a value overflows the data-type's capacity

In [None]:
# SQLite connection: an SQLAlchemy engine built from the SQLite URI.
# (The original passed the not-yet-defined name `sqlite_conn` to create_engine.)
sqlite_conn = create_engine(sqlite_uri)

# Read data from SQLite into a Polars DataFrame.
# Polars has no `read_sql_table`; `read_database` runs the query on an open
# connection. infer_schema_length=None lets Polars inspect all rows when
# inferring column types, which avoids failures on columns whose first rows
# look like integers but later contain floats.
query = 'SELECT * FROM ' + table_name
with sqlite_conn.connect() as sqlite_connection:
    df = pl.read_database(
        query=query,
        connection=sqlite_connection,
        infer_schema_length=None,
    )

In [None]:


# PostgreSQL connection URI.
# NOTE: every `your_...` part is a placeholder and must be replaced before
# running. Do not commit a real password; load it from the environment or a
# prompt instead.
postgres_uri = 'postgresql://user:your_password@your_postgresql_host:your_postgresql_port/your_postgresql_database'

# create_engine parses the URI, so a malformed URI fails here, before any
# data is written.
postgres_engine = create_engine(postgres_uri)

# Write the Polars DataFrame to table "messungen" in schema "gw".
# Polars has no `DataFrame.write_sql`; the writer is `write_database`.
# The schema is given as part of the table name, and `if_table_exists`
# controls what happens when the table is already present. The connection is
# passed as a URI string.
df.write_database(
    table_name='gw.messungen',
    connection=postgres_uri,
    if_table_exists='replace',
)

# Close connections
sqlite_conn.dispose()
postgres_engine.dispose()
