# Fetch guternberg books from a category

## Step 1, get book ids

- go to http://m.gutenberg.org/ebooks/search.mobile/?query=Erotic+%21+bsxErotic&sort_order=downloads

- scroll to the bottom and click "show more" a few times
- enter the javascript below in the browsers js console
- it should have copied the ids to your clipboard, you can paste it into "ids" below


```js
// to get all book ids shown on page, paste this javascript into js console in browser when on the page above
a_elems = document.getElementsByClassName("table link")
hrefs = Array.from(a_elems)
  .map(e=>e.href) // get link
  .filter(e=>e) // remove empty links
ids = hrefs.map(e=>/(\d+)\.mobile/.exec(e)) // regular expression match
  .filter(e=>e) // remove ones not found
  .map(e=>e[1]) // get just id
copy(ids) // copy to clipboard
```

In [14]:
import requests
import os
import re
import bs4
import time
import json
from tqdm import tqdm_notebook as tqdm

dest_dir = '../data/input/erotic_gutenberg'

In [15]:
# urls to download text inputs
ids = [
  "30254",
  "30360",
  "28520",
  "25305",
  "14005",
  "28522",
  "31284",
  "28521",
  "29827",
  "52059",
  "14323",
  "13610",
  "57284",
  "13972",
  "52205",
  "54672",
  "13614",
  "28718",
  "44877",
  "26804",
  "45150",
  "37491",
  "43438",
  "48943",
  "53807",
  "26456",
  "26808",
  "13971",
  "42406",
  "43823",
  "39220",
  "56779",
  "26809",
  "18610",
  "44181",
  "42212",
  "26806",
  "42586",
  "47892",
  "43822",
  "49855",
  "26562",
  "26739",
  "26807",
  "20568",
  "40877",
  "54419",
  "53944",
  "40557",
  "29049",
  "25543",
  "40902",
  "41301",
  "56491",
  "28789",
  "40496"
]

In [73]:
# download/cache raw files
for bid in tqdm(ids):
    
    # first download index
    index_url = "http://www.gutenberg.org/files/{bid:}".format(bid=bid)
    r = requests.get(index_url)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html5lib")
    hrefs = [e.attrs['href'] for e in soup.findAll('a')]
    links = [h for h in hrefs if h.endswith('.txt')]
    
    # download text
    for link in links:
        txt_url = index_url + '/' + link
        outfile = os.path.join(dest_dir, 'raw', link)
        if not os.path.isfile(outfile):
            r = requests.get(txt_url)
            r.raise_for_status()
            open(outfile, 'w').write(r.text)
            
        
    time.sleep(0.5) # avoid ddos/ban

HBox(children=(IntProgress(value=0, max=56), HTML(value='')))

In [74]:
# from https://github.com/motoom/gutenberg-ebook-scraping/blob/master/gutenberg.py

# Repetitive stuff I don't want to read a 1000 times on my eBook reader.
remove = ["Produced by","End of the Project Gutenberg","End of Project Gutenberg"]

def beautify(text):
    ''' Reads a raw Project Gutenberg etext, reformat paragraphs,
    and removes fluff.  Determines the title of the book'''
    FIRST_CONTENT_LINE_LENGTH=300
    lines = [line.strip() for line in text.split('\n')]
    collect = False
    lookforsubtitle = False
    outlines = []
    startseen = endseen = False
    found_first_paragraph = False
    title=""
    author=""
    language=""
    extra=[]
    for line in lines:
        if line.startswith("Author: "):
            author = line[8:]
        if line.startswith("Language: "):
            language = line[10:]
        if line.startswith("Title: "):
            title = line[7:]
            lookforsubtitle = True
            continue
        if lookforsubtitle:
            if not line.strip():
                lookforsubtitle = False
            else:
                subtitle = line.strip()
                subtitle = subtitle.strip(".")
                title += ", " + subtitle
        if ("*** START" in line) or ("***START" in line) or (line.startswith("*END THE SMALL PRINT!")):
            collect = startseen = True
            paragraph = ""
            extra.append(line)
            continue
            
        
        if ("*** END" in line) or ("***END" in line):
            endseen = True
            extra.append(line)
            break           
        if not collect:
            extra.append(line)
            continue
        if not line:
            paragraph = paragraph.strip().replace('_', '')
            for term in remove:
                if paragraph.startswith(term):
                    extra.append(line)
                    paragraph = ""
                    break
            # Skip all the misc paragraphs, and only start at the first long paragraphs
            if not found_first_paragraph:
                if len(paragraph)>FIRST_CONTENT_LINE_LENGTH:
                    found_first_paragraph = True
            if paragraph and found_first_paragraph:
                outlines.append(paragraph)
                outlines.append("")
            paragraph = ""
        else:
            paragraph += " " + line

    # Report on anomalous situations, but don't make it a showstopper.
    if not title:
#         print (ofn)
        print ("    Problem: No title found\n")
    if not startseen:
#         print (ofn)
        print ("    Problem: No '*** START' seen\n")
    if not endseen:
#         print (ofn)
        print ("    Problem: No '*** END' seen\n")
        
    return dict(
        content='\n'.join(outlines),
        title=title,
        author=author,
        language=language,
        extra=extra
    )
        


# 2. turn into csv, like rocstories

In [75]:
from sklearn.model_selection import train_test_split

In [94]:
# convert raw text into one long csv
raw_dir = os.path.join(dest_dir, 'raw')
max_len = 400
num_sent = 6
data=[]
for infile in os.listdir(raw_dir)[:16]:
    path = os.path.join(raw_dir, infile)
    info = beautify(open(path).read())
    if info['language']=='English':
        data.append(info['content'])

In [95]:
x_train, x_test = train_test_split(data)
x_val, x_test = train_test_split(x_test)

In [96]:
open(os.path.join(dest_dir, "train.txt"), "w").write("\n\n".join(x_train))
open(os.path.join(dest_dir, "val.txt"), "w").write("\n\n".join(x_val))
open(os.path.join(dest_dir, "test.txt"), "w").write("\n\n".join(x_test))

287192

'French'