In [2]:

import requests
import nltk
import re

from bs4 import BeautifulSoup
from itertools import groupby

# Download the NLTK resources requested by this script: the perceptron tagger
# model (nltk.pos_tag is called further down) and the 'words' corpus
# (NOTE(review): 'words' is not referenced in the visible code — confirm it is
# still needed).
for resource_name in ('averaged_perceptron_tagger', 'words'):
    nltk.download(resource_name)

# ---
def get_tag_that_is_chapter(tag):
    """Return True when *tag* is a chapter heading.

    A chapter heading is an ``<h2>`` element whose first child is an ``<a>``
    element. The function is passed as a filter to ``find_all``.

    Args:
        tag: Element exposing ``name`` and ``contents``.

    Returns:
        bool: True for an ``<h2>`` whose first child is an ``<a>``,
        False otherwise (including an ``<h2>`` with no children).
    """
    if tag.name != "h2":
        return False
    # An empty <h2> has no children; indexing contents[0] would raise
    # IndexError and abort the whole search.
    if not tag.contents:
        return False
    # getattr tolerates a first child that exposes no ``name`` attribute.
    return getattr(tag.contents[0], "name", None) == "a"

def tokenize_text(book_text):
    """Split *book_text* into tokens separated by runs of whitespace.

    Args:
        book_text: The text to tokenize.

    Returns:
        The list of tokens produced by the NLTK regexp tokenizer.
    """
    # gaps=True: the pattern describes the separators between tokens,
    # not the tokens themselves.
    whitespace_splitter = nltk.RegexpTokenizer(pattern=r'\s+', gaps=True)
    return whitespace_splitter.tokenize(book_text)
# ---

# Use the HTML edition instead of the raw text: chapters and paragraphs are
# located through the markup below.
book_num = 17866
book_link = f"http://www.gutenberg.org/files/{book_num}/{book_num}-h/{book_num}-h.htm"
# A timeout keeps the script from hanging forever on an unresponsive server.
book_request = requests.get(book_link, timeout=30)

# Explicit check instead of `assert`, which is stripped when Python runs
# with -O and would let a failed download go unnoticed.
if book_request.status_code != 200:
    raise RuntimeError(f"Request of the book failed. Status: {book_request.status_code}")

book_soup = BeautifulSoup(book_request.content, "html.parser")
book_chapters = book_soup.find_all(get_tag_that_is_chapter)

# Matches runs of punctuation and whitespace; compiled once rather than on
# every paragraph.
_PUNCTUATION_RUN = re.compile(r"[\"\?\!\-\.\,\;\:\(\)\s]+")

# Maps the `id` attribute of each chapter heading's first child to the list
# of that chapter's cleaned paragraphs.
book_chapters_paragraphs = dict()
for chapter in book_chapters:
    chapter_paragraphs = list()
    for element in chapter.next_siblings:
        # The chapter stops at the next horizontal rule or heading.
        if element.name in ("hr", "h2"):
            break
        if element.name == "p":
            # Each punctuation/whitespace run becomes a single space.
            chapter_paragraphs.append(_PUNCTUATION_RUN.sub(' ', element.get_text()))
    book_chapters_paragraphs[chapter.contents[0]['id']] = chapter_paragraphs

# ---
# All the paragraphs of a chapter (as a list) are accessible via:
# book_chapters_paragraphs.get("CHAPTER_#")

# Maps a chapter id to the set of multi-word proper names detected in it.
tree_by_chapter = dict()

for chapter, paragraphs in book_chapters_paragraphs.items():
    # Join with a space: with '' the last word of a paragraph fuses with the
    # first word of the next one whenever no whitespace separates them,
    # producing a bogus token for the tagger.
    tokens = tokenize_text(' '.join(paragraphs))
    tagged_tokens = nltk.pos_tag(tokens)

    # groupby yields runs of consecutive tokens that share the same POS tag;
    # a run tagged NNP is a candidate name.
    names = set()
    for tag, run in groupby(tagged_tokens, key=lambda pair: pair[1]):
        if tag != "NNP":
            continue
        words = [word for word, _ in run]
        # Only runs of two or more words are kept.
        if len(words) >= 2:
            names.add(" ".join(words))

    tree_by_chapter[chapter] = names

# ---
# Get the proper names detected in a chapter:
# tree_by_chapter.get("CHAPTER_#")

print(tree_by_chapter.get("CHAPTER_4"))

# TODO
# The detected characters now need to be checked manually (see the professor's email).
# I will look into how to implement the Louvain algorithm.

{'Sam Browne', 'Fred Hines', "'Davy Crockett'", 'Maginot Line', 'Yes Gresham', 'Whitneyville Walker', "Haven's Colt", "Jeff It's", 'Rifle Association', 'Lane Fleming', 'Virginia Manufactory', "Colonel Walker's Texas Rangers—you", 'World War', 'Ordinarily Lane', 'Walker Colt', 'Stephen Rand', 'Rappahannock Forge', 'Premix Company', 'Paterson Colts', 'Germany A', 'Pierre Jarrett', "Lane Fleming's", 'Karen Lawrence Pierre', 'March Yes', 'Elisha Collier', 'Cabot Joyner', 'Stephen Gresham', 'Colt Navy Models', 'Walpole Galleries', 'Humphrey Goode', 'Edison Public Power', 'U S North', 'Philip Cabot', 'Scott County', 'N R A', 'Model Colt Dragoons Rand', 'A Hall', 'Whitneyville Walker Colts', 'S North', 'Elmer Umholtz', 'Cheney Navy', 'U S Martials', 'Civil War Rand', 'National Rifle Association', 'Jeff Lane Fleming', 'Oh Lord', 'Adam Trehearne', "Jeff I'm", 'Arnold Rivers', 'Colin MacBride', 'My God Jeff Twenty', 'Confederate Leech'}
