In [21]:
from gutenbergpy.gutenbergcache import GutenbergCache, GutenbergCacheTypes
import os
import json
import pandas as pd
import numpy as np
import pickle as pkl

In [22]:
cache  = GutenbergCache.get_cache()

# from IPython.display import Image
# Image(filename='sqlitescheme.png') 

In [23]:
# book +title
books = [a for a in cache.native_query("SELECT * FROM books")]
df = pd.DataFrame(books)
df.columns = ["book_id", "publish_id", "date", "right_id", "num_downloads", "language_id", "book_shelve_id", "guten_id", "type_id"]

titles = [a for a in cache.native_query("SELECT * FROM titles")]
df2 = pd.DataFrame(titles)
df2.columns = ["title_id", "title", "book_id"]
df = df.merge(df2, on="book_id")
 
# type
book_types =  [a for a in cache.native_query("SELECT * FROM types")]
book_types_dic =  {a:b for a, b in book_types}
df["book_type"] = df["type_id"].map(book_types_dic)

# language
languages =  [a for a in cache.native_query("SELECT * FROM languages")]
lang_dic = {a:b for a, b in languages}
df["language"] = df["language_id"].map(lang_dic)

# author_id
author = [a for a in cache.native_query("SELECT * FROM book_authors")]
author_df = pd.DataFrame(author)
author_df.columns = ["book_id", "author_id"]
df = df.merge(author_df, on="book_id")

# author name
author_name = [a for a in cache.native_query("SELECT * FROM authors")]
author_name_df = pd.DataFrame(author_name)
author_name_df.columns = ["author_id", "author_name"]
df = df.merge(author_name_df, on="author_id")

# df.head()

# book_subjects
book_subjects = [a for a in cache.native_query("SELECT * FROM book_subjects")]
book_subjects_df = pd.DataFrame(book_subjects)
book_subjects_df.columns = ["book_id", "subjects_id"]
df = df.merge(book_subjects_df, on="book_id")

# subjects_name
subjects = [a for a in cache.native_query("SELECT * FROM subjects")]
subjects_df = pd.DataFrame(subjects)
subjects_df.columns = ["subjects_id", "subjects_name"]
df = df.merge(subjects_df, on="subjects_id")


# download_links
download_links = [a for a in cache.native_query("SELECT * FROM downloadlinks")]
download_links_df = pd.DataFrame(download_links)
download_links_df.columns = ["download_id", "download_name", "download_type_id", "book_id"]
df = df.merge(download_links_df, on="book_id")

df.head()

Unnamed: 0,book_id,publish_id,date,right_id,num_downloads,language_id,book_shelve_id,guten_id,type_id,title_id,title,book_type,language,author_id,author_name,subjects_id,subjects_name,download_id,download_name,download_type_id
0,1,1,2005-06-02,1,4,1,1,15970,-1,1,The Great Round World and What Is Going On In ...,,en,1,Verschillende,1,"History, Modern -- 19th century -- Juvenile li...",1,https://www.gutenberg.org/ebooks/15970.html.im...,1
1,1,1,2005-06-02,1,4,1,1,15970,-1,1,The Great Round World and What Is Going On In ...,,en,1,Verschillende,1,"History, Modern -- 19th century -- Juvenile li...",2,https://www.gutenberg.org/files/15970/15970-h/...,2
2,1,1,2005-06-02,1,4,1,1,15970,-1,1,The Great Round World and What Is Going On In ...,,en,1,Verschillende,1,"History, Modern -- 19th century -- Juvenile li...",3,https://www.gutenberg.org/files/15970/15970-h.zip,2
3,1,1,2005-06-02,1,4,1,1,15970,-1,1,The Great Round World and What Is Going On In ...,,en,1,Verschillende,1,"History, Modern -- 19th century -- Juvenile li...",4,https://www.gutenberg.org/ebooks/15970.epub3.i...,3
4,1,1,2005-06-02,1,4,1,1,15970,-1,1,The Great Round World and What Is Going On In ...,,en,1,Verschillende,1,"History, Modern -- 19th century -- Juvenile li...",5,https://www.gutenberg.org/ebooks/15970.epub.im...,3


In [24]:
# only eng
df = df[df["language"] == "en"]

# No anonymous / (Various) / etc.
df = df[~df["author_name"].str.lower().str.contains("anonymous")]
df = df[~df["author_name"].str.lower().str.contains("anonyme")]
df = df[~df["author_name"].str.lower().str.contains("various")]
df = df[~df["author_name"].str.lower().str.contains("verschillende")]


# Key identifiers such as character names e.g. Captain Ahab
df = df[~df["author_name"].str.lower().str.contains("captain ahab")]

In [25]:
# each author how many books
author_books_count = df.groupby("author_name")["title"].nunique().to_frame().reset_index().rename({"title": "author_books_count"}, axis=1)
author_books_count

Unnamed: 0,author_name,author_books_count
0,'Colored Quartet' (name unknown),3
1,".מלמד, ש. מ",1
2,"1, A-No.",1
3,6th King of Babylon,1
4,7-Up Company,2
...,...,...
38116,陶潛,1
38117,陶潜,1
38118,高木庭次郎,1
38119,鹽谷榮,1


In [26]:
df = df.merge(author_books_count, on="author_name")
df.head()

Unnamed: 0,book_id,publish_id,date,right_id,num_downloads,language_id,book_shelve_id,guten_id,type_id,title_id,...,book_type,language,author_id,author_name,subjects_id,subjects_name,download_id,download_name,download_type_id,author_books_count
0,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526794,https://www.gutenberg.org/ebooks/17207.html.im...,1,7
1,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526795,https://www.gutenberg.org/files/17207/17207-h/...,2,7
2,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526796,https://www.gutenberg.org/files/17207/17207-h.zip,2,7
3,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526797,https://www.gutenberg.org/ebooks/17207.epub3.i...,3,7
4,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526798,https://www.gutenberg.org/ebooks/17207.epub.im...,3,7


In [27]:
#If the total amount of works of an author is less than 5, then filter out

df = df[df["author_books_count"] >=5]
df.head()

Unnamed: 0,book_id,publish_id,date,right_id,num_downloads,language_id,book_shelve_id,guten_id,type_id,title_id,...,book_type,language,author_id,author_name,subjects_id,subjects_name,download_id,download_name,download_type_id,author_books_count
0,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526794,https://www.gutenberg.org/ebooks/17207.html.im...,1,7
1,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526795,https://www.gutenberg.org/files/17207/17207-h/...,2,7
2,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526796,https://www.gutenberg.org/files/17207/17207-h.zip,2,7
3,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526797,https://www.gutenberg.org/ebooks/17207.epub3.i...,3,7
4,32306,1,2005-12-03,1,6,1,6,17207,-1,34287,...,,en,12876,"Chambers, R. (Robert)",2,AP,526798,https://www.gutenberg.org/ebooks/17207.epub.im...,3,7


In [28]:
df["author_name"].nunique()

5037

In [15]:
g = df.drop_duplicates(subset=['title', 'author_name'], keep='last')

In [16]:
import gutenbergpy.textget

def get_500_lines(book_id):
    raw_book = gutenbergpy.textget.get_text_by_id(book_id) # with headers
    clean_book = gutenbergpy.textget.strip_headers(raw_book).decode("utf-8") # without headers
    
    lines = clean_book.strip().split("\n")
    tot_lines = len(lines)
    
    # select radomly 500 lines 
    line_numbers = np.random.choice(np.arange(tot_lines), 500)
    select_500_lines = [lines[i] for i in line_numbers]
    return line_numbers, select_500_lines

# test get_500_lines() 
# a, b = get_500_lines(2701)

In [17]:
# time

# writer = g["author_name"].unique()
#
# # 3 ) Combine and consolidate text: Half a day
# # Topics
# # Authors
# # Book Title
# # Line Numbers
# # And Text
#
# h = {}
#
# Topics = []
# Authors = []
# Book_id = []
# Line_Numbers = []
# Line_Text = []
#
# for w in writer:
#
#     book_ids = g[g["author_name"] == w]["book_id"]
#
#     select_ids = np.random.choice(book_ids,5)
#     print(select_ids)
#
#
#     for book_id in select_ids:
#         line_nums, lines = get_500_lines(book_id)
#         for a, b in zip(line_nums, lines):
#
#             Authors.append(w)
#             Book_id.append(book_id)
#             Line_Numbers.append(a)
#             Line_Text.append(b)
import numpy as np
writer = g["author_name"].unique()

# 3 ) Combine and consolidate text: Half a day
# Topics
# Authors
# Book Title
# Line Numbers
# And Text

Topics = []
Authors = []
Book_id = []
Line_Numbers = []
Line_Text = []

for w in writer:
    # For an author, randomly select 5 books
    book_ids = g[g["author_name"] == w]["book_id"]

    select_ids = np.random.choice(book_ids,5)
    print(select_ids)

    # For each book, randomly select 500 lines
    for book_id in select_ids:
        try:
            line_nums, lines = get_500_lines(book_id)
            for a, b in zip(line_nums, lines):

                Authors.append(w)
                Book_id.append(book_id)
                Line_Numbers.append(a)
                Line_Text.append(b)
        except:
            pass


[16582 32306 12433 16582 32306]
[15557 10205 32306 15557 10205]
[23445 15221 55485 58634 66207]
[47816 44897 44897 63243 51786]
[63243 63243 66607 55291 20520]
[33790 20520 55746 69117 53851]
[45213 45213 34094 60931 60931]
[60931 15709  1719 29691 29691]
[15709 34094 34094  1719 22992]
[62304 19179 20524 35809  7358]
[25931 24659 25931  2383  2383]
[35809 35809 33365 37344 20524]
[40645 55697  2383 26892 18813]
[35809  6732 19179 64150  7358]
[32466 36179 30292 48974 43158]
[13619 23611  1508 59201 33691]
[30898 37845  8231 48558  5734]
[43545   917 39099 50676 63074]
[33981 42447 64803  3685 40374]
[ 1210  7912 16455 25784 65710]
[57694 34658   857 25977 25784]
[ 2150 37617 16139 58758 14729]
[55735 17765 33453 55927 39474]
[ 8596 61853 58937 38757 55735]
[  149 61853 33453 33453   149]
[39474 54683 61853  8596 55735]
[41066  2966  7220  2966 41214]
[43392 54440   478 38999 59740]
[  184 34984 54440 67817 38999]
[34984 43392  9453 52835 67817]
[ 6378 36578 52835 38999 36713]
[ 9453 5

In [20]:
import pandas as pd
m = {"Authors":  Authors,
        "Book_id": Book_id, 
         "Line_Numbers": Line_Numbers, 
         "Line_Text":Line_Text, 
    }

ss = pd.DataFrame(m)
# Write the file first here to prevent repeated operations later.
ss.to_csv("a1.csv", index=False)
ss.head()

NameError: name 'Authors' is not defined

# re-read the file, then merge

In [17]:
import pandas as pd
d2 = pd.read_csv("a1.csv")
d2 = d2.rename({"Book_id": "book_id"}, axis=1)
d2.head()

Unnamed: 0,Authors,book_id,Line_Numbers,Line_Text
0,"Chambers, R. (Robert)",16582,344,
1,"Chambers, R. (Robert)",16582,166,laitoit sille eri suitsua juuri kuin pyydyksek...
2,"Chambers, R. (Robert)",16582,4419,
3,"Chambers, R. (Robert)",16582,2785,"sanoi: ""Tuo kanalja, minkä petoksen teki tuo v..."
4,"Chambers, R. (Robert)",16582,2913,Tapanin tullessa huoneeseen olivat kumpasetkin...


In [18]:
# merge  ss and  titles
d3 = d2.merge(df2, on="book_id")
d3.head()

NameError: name 'df2' is not defined

In [None]:
# merge  ss and subjects_name
d4 = d2.merge(book_subjects_df, on="book_id")
d4.head()

In [15]:
d5 = d3.merge(subjects_df, on="subjects_id")
d5.head()

NameError: name 'd3' is not defined