In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('../src/ml_indie_tools')  # Point to local module source

In [4]:
from env_tools import MLEnv

In [5]:
# Set up the environment helper for the 'tf' platform with a GPU accelerator.
# describe() returns a one-line summary of OS, Python, Jupyter and the detected
# framework/accelerator (see the cell output).
ml=MLEnv(platform='tf', accelerator='gpu')
ml.describe()

'OS: Darwin, Python: 3.10.5 (Conda), Jupyter Notebook Tensorflow: 2.9.1, GPU: METAL'

In [6]:
import logging
logging.basicConfig(encoding='utf-8', level=logging.INFO)
from Gutenberg_Dataset import Gutenberg_Dataset

In [7]:
# No root_url is passed here, so no local Gutenberg mirror is configured; per the
# original note, files are then downloaded on demand.
# load_index() has to be called once before search() can be used (see the search docstring).
gd=Gutenberg_Dataset()
gd.load_index()

In [8]:
gd.search?

Signature: gd.search(search_dict)
Docstring:
Search for book record with key specific key values
For a list of valid keys, use `get_record_keys()`
Standard keys are: `ebook_id`, `author`, `language`, `title`

*Note:* :func:`~Gutenberg_Dataset.Gutenberg_Dataset.load_index` needs to be called once before this function can be used.

Example: `search({"title": ["philosoph","phenomen","physic","hermeneu","logic"], "language":"english"})`
Find all books whose titles contain at least one of the keywords, language english. Search keys can either be
search for a single keyword (e.g. english), or an array of keywords. 

:returns: list of records 
File:      ~/gith/domschl/ml-indie-tools/src/ml_indie_tools/Gutenberg_Dataset.py
Type:      method


In [9]:
# Search the index for books whose title contains at least one of the keywords
# 'prole' or 'hermen' (keyword-list semantics as described in the search docstring).
sl=gd.search({'title': ['prole', 'hermen']})
print(f"{len(sl)} books found.")

8 books found.


In [10]:
# NOTE(review): presumably fetches each book's text and returns the records with it
# attached (a later cell shows a 'text' key on sl2 records) — confirm against Gutenberg_Dataset.
sl2=gd.insert_book_texts(sl)

In [11]:
# NOTE(review): wildcard import — of the names it brings in, the visible cells only use
# Text_Dataset; consider importing it explicitly once all later cells have been checked.
from Text_Dataset import *

In [12]:
tl = Text_Dataset(sl2)

INFO:Datasets:Loaded 8 texts


In [13]:
len(tl.text_list)

8

In [14]:
# Draw three unweighted random samples (requested length 80) and show each one under
# the title of the text it was taken from, followed by a blank line.
# `idx` and `txt` deliberately keep their names: a later cell reads the values left
# over from the final iteration.
for i in range(3):
    idx, txt = tl.get_random_sample(80, weighted=False)
    print(tl.text_list[idx]['title'], txt, '', sep='\n')

Legge Prolegomena
hed scholar and officer, f the reign of Kwang-wu [2], the first emperor of the E

Achter de schermen
itmaal alles behalve vriendelijk.
Bij zijn rit van de Gare du Nord door de Rue L

Prolegomena to the Study of Hegel's Philosophy
_: if each atom seems complete, they are subject to a
necessity which forces the



In [15]:
len(gd.records)

55518

In [16]:
# Tally the number of index records per language. Records that carry no
# 'language' key are counted under the placeholder '<undefined>'.
lang = {}
for r in gd.records:
    l = r.get('language', '<undefined>')
    lang[l] = lang.get(l, 0) + 1

In [17]:
lang

{'English': 44884,
 'Dutch': 648,
 'Finnish': 2122,
 'French': 2599,
 'Hungarian': 419,
 'German': 1771,
 'Spanish': 630,
 'Portuguese': 414,
 'Italian': 783,
 'Chinese': 391,
 'Welsh': 10,
 'Swedish': 191,
 'Latin': 77,
 'Afrikaans': 10,
 'Danish': 60,
 'Esperanto': 89,
 'Galician': 2,
 'Romanian': 4,
 'Englilsh': 1,
 'Low German': 1,
 'Catalan': 25,
 'Frisian': 4,
 'Hebrew': 5,
 'Czech': 5,
 'Slovenian': 2,
 'Norwegian': 14,
 'Greek': 193,
 'English and French': 2,
 'Pennsylvania Dutch': 1,
 'Irish': 2,
 'Telugu': 6,
 'Scots': 1,
 'Cebuano': 2,
 'Tagalog': 39,
 'Latin and English side by side': 1,
 'Estonian': 1,
 'Arabic': 1,
 'Japanese': 19,
 'Ojibwa': 1,
 'Scots Gaelic': 1,
 'Polish': 17,
 'Russian': 5,
 'Friulian': 6,
 'Portuguese & French': 1,
 'English English': 1,
 'Arapaho': 1,
 'Bagobo and Spanish': 1,
 'Inuktitut': 1,
 'English, Latin, Spanish, and Italian': 1,
 'Zh (Chinese)': 2,
 'Zh': 1,
 'Czech and Esperanto': 1,
 'Icelandic': 6,
 'Venetian': 1,
 'Kamilaroi and English'

In [18]:
# Re-display the last sample drawn by the random-sampling loop earlier in the notebook:
# `idx` and `txt` are that loop's variables, still holding the values of its final iteration.
# NOTE(review): this cell fails on a fresh kernel unless the sampling cell has run first.
print(tl.text_list[idx]['title'])
print(txt)
    

Prolegomena to the Study of Hegel's Philosophy
_: if each atom seems complete, they are subject to a
necessity which forces the


In [19]:
tl.tcum[:10]

[0.08188722144927345,
 0.31788877028439744,
 0.3979209146719631,
 0.4255873400904846,
 0.45566459457643327,
 0.508855955028702,
 0.9110163376769016,
 1.0]

In [20]:
tl.source_highlight("Wenn wir irgendwas schreiben, daß Teil aus der Source enthält, so wie: Das ist ihr Wahlspruch, dann erfolgt ein Highlight")

In [23]:
tl.source_highlight("If we write anything that contains parts of the sources, like: that is their motto, then a highlight will be applied.")

In [22]:
# Tokenizer smoke test: show a sample sentence and its length, select the 'ngram'
# tokenizer on the text dataset, then tokenize the sentence and show the token count
# and token ids. `test_text` is reused by a later cell.
test_text="That would be a valid argument if we hadn't defeated it's assumptions way before."
print(f"Text length {len(test_text)}, {test_text}")
tokenizer='ngram'
tl.init_tokenizer(tokenizer=tokenizer)
st = tl.tokenize(test_text)
print(f"Token-count: {len(st)}, {st}")

Text length 81, That would be a valid argument if we hadn't defeated it's assumptions way before.
Token-count: 28, [1509, 3890, 1843, 1492, 1144, 2673, 2656, 1117, 3186, 2080, 3993, 2077, 1918, 46, 109, 2826, 1882, 2011, 1837, 47, 3078, 2299, 2370, 3327, 1608, 1988, 3363, 1898]


In [35]:
# Prefix the test sentence with two characters that come back as '<unk>' in the
# recorded output, then encode it and show the token count and tokens. In that
# output, encode() also emits '<wsep>' markers between words.
# NOTE(review): execution counts (In [35] here, In [34] for the decode cell that
# follows) show these cells were run out of order; `el` must exist before decoding.
test2="ðƒ "+test_text
print(f"Text length {len(test2)}, {test2}")
el=tl.encode(test2)
print(f"Token-count: {len(el)}, {el}")

Text length 84, ðƒ That would be a valid argument if we hadn't defeated it's assumptions way before.
Token-count: 44, ['<unk>', '<unk>', '<wsep>', 1237, 1059, '<wsep>', 1919, '<wsep>', 1101, '<wsep>', 44, '<wsep>', 1765, 1215, '<wsep>', 4409, 2934, 1077, '<wsep>', 1298, '<wsep>', 1180, '<wsep>', 1420, 45, 108, 46, '<wsep>', 4360, 3065, 55, '<wsep>', 1064, 2275, '<wsep>', 1593, 1633, 2059, 57, '<wsep>', 1873, '<wsep>', 2081, 1409]


In [34]:
tl.decode(el, mark_separator=True)

"<unk><unk> Th_at_ would_ be_ a_ val_id_ arg_ume_nt_ if_ we_ had_n_'_t_ def_eate_d_ it_'s_ ass_um_ption_s_ way_ befor_e._"

## Pandas and Huggingface datasets

`!pip install transformers` or `conda install transformers`
`!pip install datasets` or `conda install datasets`

In [32]:
sl2[1].keys()

dict_keys(['title', 'ebook_id', 'author', 'subtitle', 'language', 'text', 'index', 'probability_weight'])

In [22]:
import pandas as pd

In [23]:
df = pd.DataFrame(sl2)

In [24]:
df

Unnamed: 0,title,ebook_id,author,subtitle,language,text,index,probability_weight,illustrator,editor,translator,author a.k.a.,editors
0,First notions of logic,67017,Augustus De Morgan,(preparatory to the study of geometry),English,FIRST NOTIONS\n ...,1.0,0.000278,,,,,
1,Researches Chemical and Philosophical,66955,Humphry Davy,Chiefly concerning nitrous oxide or dephlogist...,English,"RESEARCHES,\n ...",2.0,0.002587,,,,,
2,"The origins of art, a psychological & sociolog...",66869,Yrjö Hirn,,English,THE ORIGINS OF ART\n...,3.0,0.002853,,,,,
3,"The Story of a Boulder; or, Gleanings from the...",66703,Archibald Geikie,,English,THE STORY OF A BOULDER....,4.0,0.002410,,,,,
4,Astronomy and General Physics Considered with ...,66406,,,English,BRIDGEWATER TREATISES....,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,"Philosophy 4, A Story of Harvard University",862,Owen Wister,,English,PHILOSOPHY 4\n\nA STORY OF HARVARD UNIVERSITY\...,471.0,0.000287,,,,,
489,Philoktetes,806C,Sophocles,,English,SOPHOKLES\n\nPHILOKTETES\n\nTranslated by Greg...,472.0,0.000318,,,,,
490,Psychological Counter-current in Recent Fictio...,726,,,English,A PSYCHOLOGICAL COUNTER-CURRENT IN RECENT FICT...,,,,,,,
491,The Philobiblon of Richard de Bury,626,Richard de Bury,,English,THE LOVE OF BOOKS\n\nTHE PHILOBIBLON OF RICHAR...,473.0,0.000630,,,,,
