In [1]:
from bs4 import BeautifulSoup
import numpy
import pandas as pd
import csv
import requests


In [2]:
class BookIndexGenerator:
    def __init__(self, pages, exclude_words_file):
        self.pages = pages
        self.exclude_words = self.load_exclude_words(exclude_words_file)
        self.index = {}

    def load_exclude_words(self, exclude_words_file):
        with open(exclude_words_file, 'r') as file:
            return set(word.strip() for word in file.readlines())

    def process_pages(self):
        for page_number, page_file in self.pages.items():
            self.process_page(page_number, page_file)

    def process_page(self, page_number, page_file):
        with open(page_file, 'r') as file:
            words = file.read().split()
            unique_words = set(word for word in words if word not in self.exclude_words)
            for word in unique_words:
                if word in self.index:
                    self.index[word].add(page_number)
                else:
                    self.index[word] = {page_number}

    def generate_index_file(self, index_file):
        sorted_words = sorted(self.index.keys())
        with open(index_file, 'w') as file:
            for word in sorted_words:
                pages = ','.join(str(page) for page in sorted(self.index[word]))
                file.write(f"{word} : {pages}\n")


# Usage
pages = {
    1: 'Page1.txt',
    2: 'Page2.txt',
    3: 'Page3.txt'
}
exclude_words_file = 'exclude-words.txt'
index_file = 'index.txt'

index_generator = BookIndexGenerator(pages, exclude_words_file)
index_generator.process_pages()
index_generator.generate_index_file(index_file)


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 930: character maps to <undefined>

In [3]:
class BookIndexGenerator:
    def __init__(self, pages, exclude_words_file):
        self.pages = pages
        self.exclude_words = self.load_exclude_words(exclude_words_file)
        self.index = {}

    def load_exclude_words(self, exclude_words_file):
        with open(exclude_words_file, 'r', encoding='utf-8') as file:
            return set(word.strip() for word in file.readlines())

    def process_pages(self):
        for page_number, page_file in self.pages.items():
            self.process_page(page_number, page_file)

    def process_page(self, page_number, page_file):
        with open(page_file, 'r', encoding='utf-8') as file:
            words = file.read().split()
            unique_words = set(word for word in words if word not in self.exclude_words)
            for word in unique_words:
                if word in self.index:
                    self.index[word].add(page_number)
                else:
                    self.index[word] = {page_number}

    def generate_index_file(self, index_file):
        sorted_words = sorted(self.index.keys())
        with open(index_file, 'w', encoding='utf-8') as file:
            for word in sorted_words:
                pages = ','.join(str(page) for page in sorted(self.index[word]))
                file.write(f"{word} : {pages}\n")


# Usage
pages = {
    1: 'Page1.txt',
    2: 'Page2.txt',
    3: 'Page3.txt'
}
exclude_words_file = 'exclude-words.txt'
index_file = 'index.txt'

index_generator = BookIndexGenerator(pages, exclude_words_file)
index_generator.process_pages()
index_generator.generate_index_file(index_file)
