In [1]:
%cd /home/thenodev/Projects/GutenbergOnomasticsExplorer

/home/thenodev/Projects/GutenbergOnomasticsExplorer


## LangChain Gutenbergloader

In [2]:
import langchain

In [3]:
from langchain.document_loaders import GutenbergLoader

In [4]:
loader = GutenbergLoader("https://www.gutenberg.org/cache/epub/71057/pg71057.txt")

In [5]:
data = loader.load()

## Prototype MetadataLibrary Construction

In [6]:
import tarfile
import os

def constructLibrary(libraryPath:str="./", rdf_path:str="./rdf-files.tar"):
    print(f"Constructing Library this may take some time")

    # Unzip Metadata Lake
    buildMetadata(rdf_path)


    print("Library Constructed Succesfully in {libraryPath}")
    return

def buildMetadata(rdf_path):
    print(f"Building metadata Lake using the following rdf-file: {rdf_path}")
    with tarfile.open(rdf_path, "r") as tar:
        tar.extractall()
    print("Metadata Lake build Sucesfully")
    return

In [7]:
import xml.etree.ElementTree as ET

def parseMetaData(directory='./notebooks/cache/epub/'):
    metadata = []

    for root, dirs, files in os.walk(directory):
        for filename in files:
            print(filename)
            if filename.endswith(".rdf"):  # Process only RDF files
                filepath = os.path.join(root, filename)
                # Recursively iterate through the directory and its subdirectories
                # Open and parse the RDF file
                tree = ET.parse(filepath)
                root = tree.getroot()
                
                # Extract the desired attributes from the RDF XML
                for element in root.findall(".//{http://www.gutenberg.org/2009/pgterms/}file"):
                    url = element.attrib['{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about']
                    if url.endswith(".txt.utf-8"):  # Replace with the actual tag you want to extract
                        # Do something with the attribute value
                        break


                for element in root.find(".//{http://purl.org/dc/elements/1.1/}title"):
                    title = element.text
                        
                for element in root.find(".//{http://purl.org/dc/terms/}issued"):
                    issued = element.text
                        
                for element in root.find('.//{http://www.gutenberg.org/2009/pgterms/}name'):
                    # extract author name alias
                    author = element[0].text

                for element in root.find('.//{http://www.gutenberg.org/2009/pgterms/}birthdate'):
                    # extract author name alias
                    birthday = element[0].text

                for element in root.find('.//http://www.gutenberg.org/2009/pgterms/}deathdate'):
                    # extract author name alias
                    deathdate = element[0].text

                for element in root.find('.//{http://www.gutenberg.org/2009/pgterms/}alias'):
                    # extract author name alias
                    alias = element[0].text

                element = root.find('.//{http://purl.org/dc/terms/}language')
                element = element.find('.//{http://www.w3.org/1999/02/22-rdf-syntax-ns#}value')
                language = element.text

                metadata.append(
                    {
                        'title': title,
                        'issued': issued,
                        'author': author,
                        'birthday': birthday,
                        'deathdate': deathdate,
                        'alias':alias,
                        'url':url,
                        'language':language
                    }
                )            
    
    return metadata

## Parsing RDF FIleusing rdflib

In [8]:
from rdflib import Graph, Namespace
import pandas as pd

def parsemetadataNEW(path):
    rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    dcterms = Namespace("http://purl.org/dc/terms/")
    pgterms = Namespace("http://www.gutenberg.org/2009/pgterms/")


    books_metadata = []

    for root, dirs, files in os.walk(path):
        for filename in files:
            if filename.endswith(".rdf"):  # Process only RDF files
                filepath = os.path.join(root, filename)
                # Recursively iterate through the directory and its subdirectories
                # Open and parse the RDF file
                # Define the namespaces used in the RDF document

                g = Graph()
                g.parse(filepath)

                # Retrieve the relevant information from the RDF document and add it to the dataframe
                for ebook in g.subjects(rdf.type, pgterms.ebook):
                    row = {}
                    row["publisher"] = g.value(ebook, dcterms.publisher)
                    row["license"] = g.value(ebook, dcterms.license)
                    row["issued"] = g.value(ebook, dcterms.issued)
                    row["rights"] = g.value(ebook, dcterms.rights)
                    row["downloads"] = g.value(ebook, pgterms.downloads)
                    
                    creator = g.value(ebook, dcterms.creator)
                    row["creator_name"] = g.value(creator, pgterms.name)
                    row["creator_birthdate"] = g.value(creator, pgterms.birthdate)
                    row["creator_deathdate"] = g.value(creator, pgterms.deathdate)
                    row["creator_alias"] = g.value(creator, pgterms.alias)
                    row["creator_webpage"] = g.value(creator, pgterms.webpage)
                    
                    row["title"] = g.value(ebook, dcterms.title)
                    row["description"] = g.value(ebook, dcterms.description)
                    
                    language = g.value(ebook, dcterms.language)
                    row["language"] = g.value(language, rdf.value)
                    
                    subjects = g.objects(ebook, dcterms.subject)
                    subjects = [g.value(s, rdf.value) for s in subjects]
                    row["subject1"] = subjects[0] if len(subjects) > 0 else None
                    row["subject2"] = subjects[1] if len(subjects) > 1 else None
                    row["subject3"] = subjects[2] if len(subjects) > 2 else None
                    row["subject4"] = subjects[3] if len(subjects) > 3 else None
                    
                    bookshelves = g.objects(ebook, pgterms.bookshelf)
                    bookshelves = [g.value(b, rdf.value) for b in bookshelves]
                    row["bookshelf1"] = bookshelves[0] if len(bookshelves) > 0 else None
                    row["bookshelf2"] = bookshelves[1] if len(bookshelves) > 1 else None
                    row["bookshelf3"] = bookshelves[2] if len(bookshelves) > 2 else None
                    
                    has_formats = g.objects(ebook, dcterms.hasFormat)
                    has_formats = [str(f) for f in has_formats]
                    row["hasFormat"] = has_formats

                    df = pd.DataFrame(row,columns=["publisher", "license", "issued", "rights", "downloads", "creator_name",
                                        "creator_birthdate", "creator_deathdate", "creator_alias", "creator_webpage",
                                        "title", "description", "language", "subject1", "subject2", "subject3",
                                        "subject4", "bookshelf1", "bookshelf2", "bookshelf3", "hasFormat"])
                    books_metadata.append(df)
    return books_metadata

In [9]:
#df_list = parsemetadataNEW("./notebooks/cache/epub/")

In [10]:
#df = pd.concat(df_list)

In [11]:
#df_multindex = df.set_index(['title', 'hasFormat'])

In [12]:
#df_multindex.to_parquet('metadata_multindex.parquet')

In [13]:
#df_multindex.to_json('metadata_multindex.json', orient='records')

In [84]:
df = pd.read_parquet("./metadata_multindex.parquet")

In [17]:
import urllib3

In [None]:
data = urllib3.urlopen(df.at[])

In [85]:
df = df.reset_index()

In [86]:
df = df.groupby("title").agg(lambda x: x.tolist()[0] if len(set(x.tolist()))== 1 else x.tolist()).reset_index()

In [88]:
df

Unnamed: 0,title,hasFormat,publisher,license,issued,rights,downloads,creator_name,creator_birthdate,creator_deathdate,...,creator_webpage,description,language,subject1,subject2,subject3,subject4,bookshelf1,bookshelf2,bookshelf3
0,!Tention: A Story of Boy-Life during the Penin...,[https://www.gutenberg.org/ebooks/21374.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2007-05-08,Public domain in the USA.,641,"Fenn, George Manville",1831,1909,...,https://en.wikipedia.org/wiki/George_Manville_...,,en,"Peninsular War, 1807-1814 -- Juvenile fiction",PZ,,,,,
1,"""'Tis Sixty Years Since""\r\nAddress of Charles...",[https://www.gutenberg.org/ebooks/9996.html.im...,Project Gutenberg,http://www.gutenberg.org/license,2006-02-01,Public domain in the USA.,43,"Adams, Charles Francis",1835,1915,...,https://en.wikipedia.org/wiki/Charles_Francis_...,,en,"Philosophy, Modern",B,,,,,
2,"""1683-1920""\r\nThe Fourteen Points and What Be...",[https://www.gutenberg.org/ebooks/50075.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2015-09-29,Public domain in the USA.,84,"Schrader, Frederick Franklin",1857,,...,,,en,Germans -- United States,"World War, 1914-1918 -- Miscellanea",E151,,,,
3,"""1812"" Napoleon I in Russia",[https://www.gutenberg.org/ebooks/51418.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2016-03-12,Public domain in the USA.,46,"Vereshchagin, Vasilïĭ Vasilʹevich",1842,1904,...,http://en.wikipedia.org/wiki/Vasily_Vereshchagin,,en,"Napoleonic Wars, 1800-1815 -- Campaigns -- Russia",Russia -- History -- 1801-1917,DC,,,,
4,"""1914""",[https://www.gutenberg.org/ebooks/66846.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2021-11-30,Public domain in the USA.,42,"Oxenham, John",1852,1941,...,https://en.wikipedia.org/wiki/William_Arthur_D...,,en,"World War, 1914-1918 -- Fiction",PR,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69377,鹽鐵論,[https://www.gutenberg.org/ebooks/26920.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2008-10-14,Public domain in the USA.,47,"Huan, Kuan, active 1st century B.C.",,,...,,,zh,Government monopolies -- China,Salt industry and trade -- China,Iron industry and trade -- China,HD,,,
69378,麟兒報,[https://www.gutenberg.org/ebooks/27399.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2008-12-03,Public domain in the USA.,46,Anonymous,,,...,https://en.wikipedia.org/wiki/Anonymous_work,,zh,Chinese fiction,PL,,,,,
69379,黃繡球,[https://www.gutenberg.org/ebooks/25147.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2008-04-23,Public domain in the USA.,31,,,,...,,,zh,PL,,,,,,
69380,黄帝宅經,[https://www.gutenberg.org/ebooks/27858.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2009-01-21,Public domain in the USA.,26,Unknown,,,...,http://www.gutenberg.org/ebooks/author/216,,zh,Feng shui,PL,,,,,


In [89]:
df.to_parquet("./data/metadata.parquet")

ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column hasFormat with type object')

In [90]:
df_new =df.reset_index()

In [91]:
df_new.rename({'index':'ID'},axis=1, inplace=True)

In [92]:
df = df_new

In [93]:
df

Unnamed: 0,ID,title,hasFormat,publisher,license,issued,rights,downloads,creator_name,creator_birthdate,...,creator_webpage,description,language,subject1,subject2,subject3,subject4,bookshelf1,bookshelf2,bookshelf3
0,0,!Tention: A Story of Boy-Life during the Penin...,[https://www.gutenberg.org/ebooks/21374.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2007-05-08,Public domain in the USA.,641,"Fenn, George Manville",1831,...,https://en.wikipedia.org/wiki/George_Manville_...,,en,"Peninsular War, 1807-1814 -- Juvenile fiction",PZ,,,,,
1,1,"""'Tis Sixty Years Since""\r\nAddress of Charles...",[https://www.gutenberg.org/ebooks/9996.html.im...,Project Gutenberg,http://www.gutenberg.org/license,2006-02-01,Public domain in the USA.,43,"Adams, Charles Francis",1835,...,https://en.wikipedia.org/wiki/Charles_Francis_...,,en,"Philosophy, Modern",B,,,,,
2,2,"""1683-1920""\r\nThe Fourteen Points and What Be...",[https://www.gutenberg.org/ebooks/50075.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2015-09-29,Public domain in the USA.,84,"Schrader, Frederick Franklin",1857,...,,,en,Germans -- United States,"World War, 1914-1918 -- Miscellanea",E151,,,,
3,3,"""1812"" Napoleon I in Russia",[https://www.gutenberg.org/ebooks/51418.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2016-03-12,Public domain in the USA.,46,"Vereshchagin, Vasilïĭ Vasilʹevich",1842,...,http://en.wikipedia.org/wiki/Vasily_Vereshchagin,,en,"Napoleonic Wars, 1800-1815 -- Campaigns -- Russia",Russia -- History -- 1801-1917,DC,,,,
4,4,"""1914""",[https://www.gutenberg.org/ebooks/66846.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2021-11-30,Public domain in the USA.,42,"Oxenham, John",1852,...,https://en.wikipedia.org/wiki/William_Arthur_D...,,en,"World War, 1914-1918 -- Fiction",PR,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69377,69377,鹽鐵論,[https://www.gutenberg.org/ebooks/26920.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2008-10-14,Public domain in the USA.,47,"Huan, Kuan, active 1st century B.C.",,...,,,zh,Government monopolies -- China,Salt industry and trade -- China,Iron industry and trade -- China,HD,,,
69378,69378,麟兒報,[https://www.gutenberg.org/ebooks/27399.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2008-12-03,Public domain in the USA.,46,Anonymous,,...,https://en.wikipedia.org/wiki/Anonymous_work,,zh,Chinese fiction,PL,,,,,
69379,69379,黃繡球,[https://www.gutenberg.org/ebooks/25147.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2008-04-23,Public domain in the USA.,31,,,...,,,zh,PL,,,,,,
69380,69380,黄帝宅經,[https://www.gutenberg.org/ebooks/27858.html.i...,Project Gutenberg,http://www.gutenberg.org/license,2009-01-21,Public domain in the USA.,26,Unknown,,...,http://www.gutenberg.org/ebooks/author/216,,zh,Feng shui,PL,,,,,


In [95]:
import requests
df_new = []
for i,row in df.iterrows():
    try:
        txt_url = next(filter(lambda x: x.endswith(".txt.utf-8"),row.hasFormat))
        format_=".txt"
        encoding="utf-8"
    except StopIteration:
        txt_url = next(filter(lambda x: x.endswith(".txt"), row.hasFormat))
        format_=".txt"
        encoding="UNKNOWN"
    except:
        txt_url = ""
        format=""
        encoding=""
    
    if txt_url != "":
        res = requests.get(txt_url)
        if res.status_code == 200:
            text = res.text

    df_new.append({
        'metadataID':row.ID,
        'title':row.title,
        'format':format_,
        'encoding': encoding,
        'text' : text,
        'language':row.language,
        })

KeyboardInterrupt: 

In [96]:
len(df_new)

405

In [None]:
df_books = pd.DataFrame(df_new)

In [None]:
df_books.to_parquet("./data/gutenberg")