# Fetch Gutenberg books from a category

## Step 1, get book ids

- go to http://m.gutenberg.org/ebooks/search.mobile/?query=bsxHorror&sort_order=downloads

- scroll to the bottom and click "show more" a few times
- enter the JavaScript below in the browser's JS console
- it should copy the ids to your clipboard; you can then paste them into "ids" below


```js
// Collects the id of every book listed on the page and copies them to the clipboard.
// Paste this into the browser's JS console while on the search page above.
a_elems = document.getElementsByClassName("table link")
hrefs = []
for (const elem of Array.from(a_elems)) {
  const target = elem.href // get link
  if (target) {
    hrefs.push(target) // keep only non-empty links
  }
}
ids = []
for (const target of hrefs) {
  const found = /(\d+)\.mobile/.exec(target) // the id is the number just before ".mobile"
  if (found) {
    ids.push(found[1]) // keep just the captured id; links without one are skipped
  }
}
copy(ids) // copy to clipboard
```

In [2]:
import os
import sys

# Add the parent directory to the import search path (presumably so that
# project modules living there can be imported by later cells).
# `sys` is imported directly rather than reached through `os.sys`, which is
# an undocumented side effect of how the `os` module is implemented.
sys.path.append('..')

In [3]:
import requests
import os
import re
import bs4
import time
import json


# Output locations: dest_dir receives the final dataset files, raw_dir caches
# the downloaded book files.
dest_dir = '../data/input/horror_gutenberg'
raw_dir = os.path.join(dest_dir, 'raw')

# raw_dir is nested inside dest_dir, so a single call creates both.
# exist_ok=True makes re-running the cell a no-op and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(raw_dir, exist_ok=True)

In [4]:
from tqdm import tqdm as tqdm

In [5]:
from dataset import parse_gutenberg

In [6]:
# Gutenberg book ids to download (pasted from the browser console snippet above).
# Each id is later substituted into the per-book index url.
ids = [
  "345",
  "43",
  "5200",
  "209",
  "2148",
  "375",
  "42",
  "10007",
  "2147",
  "8492",
  "7849",
  "6087",
  "389",
  "11438",
  "10897",
  "2149",
  "10150",
  "31469",
  "2151",
  "10002",
  "10662",
  "8486",
  "2150",
  "14833",
  "23172",
  "53419",
  "25016",
  "16726",
  "3781",
  "1188",
  "20387",
  "11074",
  "9629",
  "20034",
  "19797",
  "3095",
  "17144",
  "14082",
  "14168",
  "10542",
  "14317",
  "14107",
  "2520",
  "18233",
  "10053",
  "6534",
  "14154",
  "18089",
  "12562"
]


In [None]:
# download/cache raw files
# For every book id: fetch the book's index page, collect the links ending in
# ".txt", and save each one under <dest_dir>/raw unless it is already there.
for bid in ids:

    # first download index
    index_url = "http://www.gutenberg.org/files/{bid:}".format(bid=bid)
    # timeout so a stalled connection cannot hang the whole loop
    r = requests.get(index_url, timeout=30)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html5lib")
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise a KeyError
    hrefs = [e['href'] for e in soup.find_all('a', href=True)]
    links = [h for h in hrefs if h.endswith('.txt')]
    time.sleep(0.1) # avoid ddos/ban

    # download text
    for link in links:
        txt_url = index_url + '/' + link
        outfile = os.path.join(dest_dir, 'raw', link)
        # an existing file acts as the cache: it is never downloaded again
        if not os.path.isfile(outfile):
            r = requests.get(txt_url, timeout=30)
            r.raise_for_status()
            # context manager flushes and closes the file deterministically
            with open(outfile, 'w') as f:
                f.write(r.text)

            time.sleep(0.1) # avoid ddos/ban

In [None]:
# download/cache raw files (with a progress bar)
# For every book id: fetch the book's index page, collect the links ending in
# ".txt", and save each one under <dest_dir>/raw unless it is already there.
for bid in tqdm(ids, mininterval=60):

    # first download index
    index_url = "http://www.gutenberg.org/files/{bid:}".format(bid=bid)
    # timeout so a stalled connection cannot hang the whole loop
    r = requests.get(index_url, timeout=30)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html5lib")
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise a KeyError
    hrefs = [e['href'] for e in soup.find_all('a', href=True)]
    links = [h for h in hrefs if h.endswith('.txt')]
    time.sleep(0.1) # avoid ddos/ban

    # download text
    for link in links:
        txt_url = index_url + '/' + link
        outfile = os.path.join(dest_dir, 'raw', link)
        # an existing file acts as the cache: it is never downloaded again
        if not os.path.isfile(outfile):
            r = requests.get(txt_url, timeout=30)
            r.raise_for_status()
            # context manager flushes and closes the file deterministically
            with open(outfile, 'w') as f:
                f.write(r.text)

            time.sleep(0.1) # avoid ddos/ban

## Step 2, turn into cleaned(ish) txt

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# parse every cached raw file and collect the content of the English books

# NOTE(review): max_len and num_sent are not used in this cell; presumably
# they are consumed elsewhere - confirm before removing.
max_len = 400
num_sent = 6
data = []
for infile in os.listdir(raw_dir):
    path = os.path.join(raw_dir, infile)
    # context manager closes the file as soon as it has been read
    with open(path) as f:
        info = parse_gutenberg(f.read())
    # keep only books whose parsed language is English
    if info['language'] == 'English':
        print(info['title'])
        data.append(info['content'])

In [None]:
# Two-stage split using train_test_split's default proportions: first carve a
# hold-out set off the training data, then divide that hold-out set into the
# validation and test sets.
x_train, holdout = train_test_split(data)
x_val, x_test = train_test_split(holdout)

In [None]:
# Write each split to <dest_dir>/<name>.txt, with the books separated by a
# blank line. The context manager guarantees each file is flushed and closed.
for split_name, books in (("train", x_train), ("val", x_val), ("test", x_test)):
    with open(os.path.join(dest_dir, split_name + ".txt"), "w") as f:
        f.write("\n\n".join(books))