neo4juser!

In [1]:
from datetime import datetime

import pandas as pd
import pandera as pa

In [150]:
rdf_metadata_url = "https://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2"
csv_metadata_url = "https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv.gz"


def download_metadata() -> pd.DataFrame:
    """ """
    dtypes = {
        "Text#": int,
        "Type": str,
        "Language": str,
        "Authors": str,
        "Subjects": str,
        "LoCC": str,
        "Bookshelves": str,
    }
    return pd.read_csv(
        rdf_metadata_url,
        dtype=dtypes,
        parse_dates=["Issued"],
        header=0,
        engine="c",
        keep_default_na=False,
    )


def validate_raw_metadata(df: pd.DataFrame) -> None:
    """ """
    gutenberg_start_date = datetime(year=1971, month=1, day=1)
    today = datetime.now()

    schema = pa.DataFrameSchema(
        {
            "Text#": pa.Column(pa.dtypes.Int),
            "Type": pa.Column(
                pa.dtypes.String,
                checks=pa.Check(lambda s: (s.str.lower() == "text").any()),
            ),
            "Issued": pa.Column(
                pa.dtypes.Timestamp,
                checks=[pa.Check.ge(gutenberg_start_date), pa.Check.le(today)],
            ),
            "Language": pa.Column(
                pa.dtypes.String,
                checks=pa.Check(lambda s: (s.str.lower() == "en").any()),
            ),
            "Authors": pa.Column(pa.dtypes.String),
            "Subjects": pa.Column(pa.dtypes.String),
            "LoCC": pa.Column(
                pa.dtypes.String,
                checks=pa.Check(lambda s: (s.str.upper() == "e838").any()),
            ),
            "Bookshelves": pa.Column(pa.dtypes.String),
        }
    )

    schema.validate(df)


def preprocess_metadata(raw_metadata_df: pd.DataFrame) -> pd.DataFrame:
    """ """
    pass

In [153]:
raw_metadata_df = download_metadata()

try:
    validate_raw_metadata(raw_metadata_df)
except pa.errors.SchemaError as e:
    print(e)

preprocessed_metadata_df = preprocess_metadata(raw_metadata_df)

<Schema Column(name=LoCC, type=DataType(str))> failed series or dataframe validator 0:
<Check <lambda>>


In [161]:
df = raw_metadata_df.rename(
    columns={
        "Text#": "id",
        "Issued": "issued_date",
        "Authors": "authorship",
        "LoCC": "loc_classification",
    }
).rename(columns=str.lower)

df["type"] = df["type"].str.lower()
df["language"] = df["language"].str.lower()
df["loc_classification"] = df["loc_classification"].str.upper()

In [162]:
df

Unnamed: 0,id,type,issued_date,title,language,authorship,subjects,loc_classification,bookshelves
0,1,text,1971-12-01,The Declaration of Independence of the United ...,en,"Jefferson, Thomas, 1743-1826","United States -- History -- Revolution, 1775-1...",E201; JK,Politics; American Revolutionary War; United S...
1,2,text,1972-12-01,The United States Bill of Rights\r\nThe Ten Or...,en,United States,Civil rights -- United States -- Sources; Unit...,JK; KF,Politics; American Revolutionary War; United S...
2,3,text,1973-11-01,John F. Kennedy's Inaugural Address,en,"Kennedy, John F. (John Fitzgerald), 1917-1963",United States -- Foreign relations -- 1961-196...,E838,
3,4,text,1973-11-01,Lincoln's Gettysburg Address\r\nGiven November...,en,"Lincoln, Abraham, 1809-1865",Consecration of cemeteries -- Pennsylvania -- ...,E456,US Civil War
4,5,text,1975-12-01,The United States Constitution,en,United States,United States -- Politics and government -- 17...,JK; KF,United States; Politics; American Revolutionar...
...,...,...,...,...,...,...,...,...,...
69595,69675,text,2023-01-01,Allworth Abbey,en,"Southworth, Emma Dorothy Eliza Nevitte, 1819-1899",,,
69596,69676,text,2023-01-01,The Italian Alp-bee,en,"Hermann, H. C.",,,
69597,69677,text,2023-01-01,The man in the street: Papers on American topics,en,"Nicholson, Meredith, 1866-1947",,,
69598,69678,text,2023-01-01,Memoirs of a millionaire,en,"Mead, Lucia True Ames, 1856-1936",,,
