# Database Constructor

The data ingest route is designed to be tolerant of a certain amount of sloppiness in the way metadata describing documents is provided, but it still has certain expectations...

In [None]:
from sqlite_utils import Database

# Filename of the SQLite database passed to Database() in this notebook.
db_name = "demo.db"

# Uncomment the following lines to connect to a pre-existing database
#db = Database(db_name)

In [None]:
# Do not run this cell if your database already exists!
# (recreate=True discards any existing database file at db_name.)

# While developing the script, recreate database each time...
db = Database(db_name, recreate=True)

# This schema has been evolved iteratively as I have identified structure
# that can be usefully mined...

db["sources"].create({
    "url": str,
    "publication": str,
    "published_date": str, # this may range from year to actual date
    "volume": str, # or volume like...
    "title": str, # Title of section; free text, so TEXT rather than INTEGER
    "date": str, # optional; the second date field; may be eg correspondence date
    "author": str, # attempt at provenance
    "pages": str, # or pages like
    "text": str,
},# pk=("url", "title") # Need an autoincrement; no natural key?
)

# Enable full text search
# This creates an extra virtual table (sources_fts) to support the full text search;
# create_triggers=True keeps that index in step with later changes to "sources".
db["sources"].enable_fts(["title", "text"], create_triggers=True)

## Parse Data Files

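The parser attempts to be forgiving about which metadata lines are present at the top of each section, but it still expects sections to be separated by `---` and the page reference (for example `p239` or `pp. 23-34`) to be the last metadata line before the title and text.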

In [95]:
# Example data file to load and parse in the cells below.
fn = "notes-and-queries.md"

def get_file_contents(fn, encoding="utf-8"):
    """Open file from filename and get file contents.

    Args:
        fn: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file content as a string, with leading and trailing whitespace
        removed.
    """
    with open(fn, encoding=encoding) as f:
        return f.read().strip()

# Read the example file and preview its first 100 characters.
txt = get_file_contents(fn)
txt[:100]

'---\n\n\nhttps://archive.org/details/notesqueriesmedi03note/page/24/mode/2up?q=mazer+wood\n\nNotes and Qu'

In [96]:
def get_sections(txt):
    """Split file text into its non-empty sections.

    The text is cut at every "---"; each piece then has surrounding dashes
    and surrounding whitespace removed. Pieces that end up empty are dropped.
    """
    txt_sections = []
    for chunk in txt.split("---"):
        cleaned = chunk.strip('-').strip()
        if cleaned:
            txt_sections.append(cleaned)
    return txt_sections

# Split the example file into sections and display them.
txt_sections = get_sections(txt)
txt_sections

['https://archive.org/details/notesqueriesmedi03note/page/24/mode/2up?q=mazer+wood\n\nNotes and Queries\nNo. 74\nMarch 29, 1851\np239\nMAZER WOOD : GUTTA PERCHA. \n\nIn the Musaeum Trade.icantimium, or a Collectian \nof liarities preserved at South Lambeth, near Lon- \ndon, by John Tradescant, 1656, I find, amongst \n"other variety of rarities," "the plyable IMazer \nwood, which, being warmed in water, will work to \nany form;" and a little farther on, in the list of \n"utensils and household stufFe," I also find "Mazer \ndishes." In my opinion, it is more than a coin- \ncidence that Doctor Montgomery, who, in 1843, \nreceived the gold medal of the Society of Arts \nfor bringing giitta percha and its useful properties \nunder tiie notice of that bodv, describes it in \nalmost the same words that Tradescant uses when \nspeaking of the jiliable jNlazer wood: the Doctor \nsay.s, "it coidd be moulded into any form by \nmerely dipping it into boiling water." It is \nworthy of remark that Tr

In [87]:
#%pip install dateparser
# https://dateparser.readthedocs.io/en/latest/usage.html
from dateparser.date import DateDataParser

# Date parser restricted to English; parse_sections() uses it to spot date lines.
ddp = DateDataParser(languages=['en'])

In [105]:
import re

# Example page search: drop the leading "p"/"pp" marker (with optional "." and
# space) and keep the page reference that follows it.
m = re.search(r"^pp?\.?\s?([0-9\?].*)", "pp.23-34")
if m is not None:
    print(m.group(1))

23-34


In [190]:
import re

def parse_sections(txt_sections, fn=None):
    """Parse text sections into records of metadata, title and text.

    Each section is read line by line (blank lines are ignored). Leading
    lines are treated as metadata until a page reference such as "p239" or
    "pp. 23-34" is met; the page reference is taken as the last metadata
    line. The next line is the title and the remaining lines are the text.
    If that first line is longer than 200 characters it is assumed to be
    body text: its first 100 characters become the title and the rest of
    the content becomes the text.

    Args:
        txt_sections: Iterable of section strings, e.g. from get_sections().
        fn: Optional source filename, stored on every record under "fn".

    Returns:
        A list with one dict per non-empty section. Every record has "fn",
        "title" and "text". The keys "url", "publication", "published_date",
        "date" and "pages" appear only when a matching line was found.
        Date values are stored exactly as returned by ddp.get_date_data(),
        not as strings.
    """
    page_pattern = re.compile(r"^pp?\.?\s?([0-9\?].*)")
    records = []
    for section in txt_sections:
        txt_lines = [l.strip() for l in section.split('\n') if l.strip()]
        if not txt_lines:
            # Nothing to parse. Skipping also avoids indexing with a loop
            # variable that is unset or left over from the previous section.
            continue

        record = {"fn": fn}
        # Index of the last line recognised as metadata. The body starts on
        # the line after it.
        last_meta = -1
        for i, line in enumerate(txt_lines):
            if line.startswith("http"):
                record["url"] = line
                last_meta = i
                continue

            # The page pattern is specific, so it is tested before the far
            # more permissive date parser. A page line can then never be
            # swallowed as a date.
            try_pages = page_pattern.search(line)
            if try_pages:
                record["pages"] = try_pages.group(1)
                last_meta = i
                # We take pages as the last item of metadata...
                break

            # Date parsing is the expensive test, so it only runs as a
            # fallback for lines that are neither a URL nor a page reference.
            try_date = ddp.get_date_data(line)
            if try_date["date_obj"]:
                if "published_date" in record:
                    record["date"] = try_date
                else:
                    record["published_date"] = try_date
                last_meta = i
            elif "publication" not in record:
                record["publication"] = line
                last_meta = i

        # With a page line present this is everything after it. Without one,
        # fall back to everything after the last recognised metadata line.
        # The slice may be empty, in which case title and text are empty.
        body_lines = txt_lines[last_meta + 1:]
        first_line = body_lines[0] if body_lines else ""
        if len(first_line) > 200:
            record["title"] = first_line[:100]
            record["text"] = "\n".join(body_lines)[100:]
        else:
            record["title"] = first_line
            record["text"] = "\n".join(body_lines[1:])
        records.append(record)

    return records

In [191]:
# Attempt at full run

import os

all_records = []

# Markdown data files in the working directory, leaving out README files and
# anything whose name starts with an underscore.
md_files = [
    name for name in os.listdir()
    if name.endswith(".md") and not name.startswith(("_", "README"))
]

for fn in md_files:
    print(fn)
    txt = get_file_contents(fn)
    txt_sections = get_sections(txt)
    records = parse_sections(txt_sections, fn)
    all_records.extend(records)

glamorgan-monmouth-brecon-gazette-merthyr-guardian.md
weekly-mail.md
leland-collectanea.md
folklore.md
bye-gones.md
carnarvon-denbigh-herald.md
notes-and-queries.md
aberystwith-observer.md
phillips-beyond-border-sin-eater.md
english-chronicle-whitehall-eveing-post.md
golden-bough-frazer.md
evening-express.md
yorkshire-evening-press.md
the-welshman.md
roscoe-wanderings-south-wales.md
south-wales-daily-post.md
folklore-west-mid-wales-davies.md
llangollen-advertiser-denbighshire-merionethshire-north-wales-journal.md
pontypridd-chronicle-workmans-news.md
south-wales-echo.md
cambrian-news-merionethshire-standard.md
ethnology-in-folklore-gomme.md
inverness-courier.md
sin-eater-macleod.md
carmarthen-journal-south-wales-weekly-advertiser.md
contemporary-review.md
wrexham-guardian.md
hampshire-independent.md
shields-daily-news.md
tarian-gweithiwr.md
hereford-journal.md
amman-valley-chronicle-east-carmarthen-news.md
merthyr-times-sowlais-times-aberdare-echo.md
journal-anthropological-institute.m

In [192]:
# Print each record's publication date; flag records where none was found.
for r in all_records:
    if "published_date" not in r:
        print("no date?", r["fn"])
        continue
    print(r["published_date"])

DateData(date_obj=datetime.datetime(1836, 6, 4, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1836, 6, 11, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1836, 6, 18, 0, 0), period='day', locale='en')
no date? glamorgan-monmouth-brecon-gazette-merthyr-guardian.md
no date? glamorgan-monmouth-brecon-gazette-merthyr-guardian.md
DateData(date_obj=datetime.datetime(1852, 10, 30, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1883, 5, 5, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1883, 6, 23, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1895, 9, 21, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1902, 8, 16, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1895, 10, 19, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1909, 5, 15, 0, 0), period='day', locale='en')
DateData(date_obj=datetime.datetime(1909, 9,