# Fetch guternberg books from a category

## Step 1, get book ids

- go to http://m.gutenberg.org/ebooks/search.mobile/?query=bsxHorror&sort_order=downloads

- scroll to the bottom and click "show more" a few times
- enter the javascript below in the browsers js console
- it should have copied the ids to your clipboard, you can paste it into "ids" below


```js
// to get all book ids shown on page, paste this javascript into js console in browser when on the page above
a_elems = document.getElementsByClassName("table link")
hrefs = Array.from(a_elems)
  .map(e=>e.href) // get link
  .filter(e=>e) // remove empty links
ids = hrefs.map(e=>/(\d+)\.mobile/.exec(e)) // regular expression match
  .filter(e=>e) // remove ones not found
  .map(e=>e[1]) // get just id
copy(ids) // copy to clipboard
```

In [7]:
import requests
import os
import re
import bs4
import time
import json
from tqdm import tqdm_notebook as tqdm

dest_dir = '../data/input/horror_gutenberg'
if not os.path.isdir(dest_dir):
    os.makedirs(dest_dir)
    
raw_dir = os.path.join(dest_dir, 'raw')
if not os.path.isdir(raw_dir):
    os.makedirs(raw_dir)

In [8]:
# urls to download text inputs
ids = [
  "345",
  "43",
  "5200",
  "209",
  "2148",
  "375",
  "42",
  "10007",
  "2147",
  "8492",
  "7849",
  "6087",
  "389",
  "11438",
  "10897",
  "2149",
  "10150",
  "31469",
  "2151",
  "10002",
  "10662",
  "8486",
  "2150",
  "14833",
  "23172",
  "53419",
  "25016",
  "16726",
  "3781",
  "1188",
  "20387",
  "11074",
  "9629",
  "20034",
  "19797",
  "3095",
  "17144",
  "14082",
  "14168",
  "10542",
  "14317",
  "14107",
  "2520",
  "18233",
  "10053",
  "6534",
  "14154",
  "18089",
  "12562"
]


In [9]:
# download/cache raw files
for bid in tqdm(ids):
    
    # first download index
    index_url = "http://www.gutenberg.org/files/{bid:}".format(bid=bid)
    r = requests.get(index_url)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html5lib")
    hrefs = [e.attrs['href'] for e in soup.findAll('a')]
    links = [h for h in hrefs if h.endswith('.txt')]
    
    # download text
    for link in links:
        txt_url = index_url + '/' + link
        outfile = os.path.join(dest_dir, 'raw', link)
        if not os.path.isfile(outfile):
            r = requests.get(txt_url)
            r.raise_for_status()
            open(outfile, 'w').write(r.text)
            
    time.sleep(0.15) # avoid ddos/ban

HBox(children=(IntProgress(value=0, max=49), HTML(value='')))




In [10]:
# from https://github.com/motoom/gutenberg-ebook-scraping/blob/master/gutenberg.py

# Repetitive stuff I don't want to read a 1000 times on my eBook reader.
remove = ["Produced by","End of the Project Gutenberg","End of Project Gutenberg"]

def beautify(text):
    ''' Reads a raw Project Gutenberg etext, reformat paragraphs,
    and removes fluff.  Determines the title of the book'''
    FIRST_CONTENT_LINE_LENGTH=300
    lines = [line.strip() for line in text.split('\n')]
    collect = False
    lookforsubtitle = False
    outlines = []
    startseen = endseen = False
    found_first_paragraph = False
    title=""
    author=""
    language=""
    extra=[]
    for line in lines:
        if line.startswith("Author: "):
            author = line[8:]
        if line.startswith("Language: "):
            language = line[10:]
        if line.startswith("Title: "):
            title = line[7:]
            lookforsubtitle = True
            continue
        if lookforsubtitle:
            if not line.strip():
                lookforsubtitle = False
            else:
                subtitle = line.strip()
                subtitle = subtitle.strip(".")
                title += ", " + subtitle
        if ("*** START" in line) or ("***START" in line) or (line.startswith("*END THE SMALL PRINT!")):
            collect = startseen = True
            paragraph = ""
            extra.append(line)
            continue
            
        
        if ("*** END" in line) or ("***END" in line):
            endseen = True
            extra.append(line)
            break           
        if not collect:
            extra.append(line)
            continue
        if not line:
            paragraph = paragraph.strip().replace('_', '')
            for term in remove:
                if paragraph.startswith(term):
                    extra.append(line)
                    paragraph = ""
                    break
            # Skip all the misc paragraphs, and only start at the first long paragraphs
            if not found_first_paragraph:
                if len(paragraph)>FIRST_CONTENT_LINE_LENGTH:
                    found_first_paragraph = True
            if paragraph and found_first_paragraph:
                outlines.append(paragraph)
                outlines.append("")
            paragraph = ""
        else:
            paragraph += " " + line

    # Report on anomalous situations, but don't make it a showstopper.
    if not title:
#         print (ofn)
        print ("    Problem: No title found\n")
    if not startseen:
#         print (ofn)
        print ("    Problem: No '*** START' seen\n")
    if not endseen:
#         print (ofn)
        print ("    Problem: No '*** END' seen\n")
        
    return dict(
        content='\n'.join(outlines),
        title=title,
        author=author,
        language=language,
        extra=extra
    )
        


# 2. turn into cleaned(ish) txt

In [11]:
from sklearn.model_selection import train_test_split

In [15]:
# convert raw text into one long csv

max_len = 400
num_sent = 6
data=[]
for infile in os.listdir(raw_dir):
    path = os.path.join(raw_dir, infile)
    info = beautify(open(path).read())
    if info['language']=='English':
        print(info['title'])
        data.append(info['content'])

Animal Ghosts, Or, Animal Hauntings and the Hereafter
The Sorcery Club
The Works of Edgar Allan Poe, Volume 5 (of 5) of the Raven Edition
Dracula
Ghost Stories of an Antiquary, Part 2: More Ghost Stories
The House of the Vampire
An Occurrence at Owl Creek
The Tale of Terror
The King in Yellow
The Trial
Dracula
The Lair of the White Worm
The House of the Vampire
The Trial
The Vampyre; A Tale
A Thin Ghost and Others
A Thin Ghost and Others
Widdershins
The Sorcery Club
The Works of Edgar Allan Poe, Volume 3 (of 5) of the Raven Edition
Metamorphosis
The Great God Pan
The Boats of the "Glen Carrig"
The Works of Edgar Allan Poe, Volume 5 (of 5) of the Raven Edition
The Night Land
Ghost Stories of an Antiquary, Part 2: More Ghost Stories
The Damned
Ghost Stories of an Antiquary
Twenty-Five Ghost Stories
Animal Ghosts, Or, Animal Hauntings and the Hereafter
Carmilla
The Works of Edgar Allan Poe, Volume 2 (of 5) of the Raven Edition
The Vampyre; A Tale
Scottish Ghost Stories
The Turn of the Scr

In [13]:
x_train, x_test = train_test_split(data)
x_val, x_test = train_test_split(x_test)

In [14]:
open(os.path.join(dest_dir, "train.txt"), "w").write("\n\n".join(x_train))
open(os.path.join(dest_dir, "val.txt"), "w").write("\n\n".join(x_val))
open(os.path.join(dest_dir, "test.txt"), "w").write("\n\n".join(x_test))

1677913