In [2]:
# Turn off the Jedi-based completer for this IPython session.
%config Completer.use_jedi = False

In [3]:
import requests
import re
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
import os

In [4]:
# Root URL of the ICCS archive; archive pages and paper links are built relative to it.
ICCS_LINK = 'https://www.iccs-meeting.org/archive'
# Local directory that downloaded PDFs are written to.
target_dir = 'pdfs'

In [5]:
# Fetch the raw bytes of the archive index page.
html = requests.get(ICCS_LINK).content

In [6]:
# Parse the archive index and collect the relative link of every archive,
# i.e. every anchor whose visible text starts with "iccs".
soup = BeautifulSoup(html, 'html.parser')
archive_names = [
    # Drop only a trailing slash (if any) instead of blindly cutting the last
    # character, so hrefs without a trailing slash are not truncated.
    link['href'].rstrip('/')
    # href=True skips anchors that have no href attribute at all.
    for link in soup.find_all('a', href=True)
    if link.get_text().startswith('iccs')
]

In [7]:
# Use the first archive as a worked example while exploring the page layout.
ex_link = archive_names[0]

In [8]:
# Fetch the raw bytes of the example archive's page.
ex_page = requests.get(f'{ICCS_LINK}/{ex_link}').content

```html
<li class="paper">
    <h3>
        <a href="papers/<number>.pdf">
```

In [9]:
# Parse the example archive page and take its first <li class="paper"> entry.
soup_page = BeautifulSoup(ex_page, 'html.parser')
ex = soup_page.find_all('li', {'class': 'paper'})[0]

In [10]:
# Display the example paper entry.
ex

<li class="paper"><h3><a href="papers/108600001.pdf">Science at the Intersection of Data, Modelling and Computation. Preface for ICCS 2018</a></h3><p>Yong Shi, Haohuan Fu, Yingjie Tian, Valeria V. Krzhizhanovskaya, Michael Lees, Jack Dongarra, Peter M.A. Sloot</p></li>

In [11]:
# Build the PDF URL: archive root + archive name + the entry's first link href.
url = ICCS_LINK + '/' + ex_link + '/' + ex.find('a').get('href')

In [12]:
# Display the href of the entry's first link.
ex.find('a').get('href')

'papers/108600001.pdf'

In [13]:
# Download a single paper as a smoke test before the bulk download.
# The target directory is created here because this cell runs before
# download_content, which is the only other place that creates it.
os.makedirs(target_dir, exist_ok=True)

# Fetch first and fail loudly on an HTTP error, so that a failed request never
# leaves an empty or HTML-error "PDF" on disk.
response = requests.get(url)
response.raise_for_status()
content = response.content

# NOTE(review): this file lands in the same directory that concat_all_docs
# later processes in bulk, so the example paper is processed twice (once as
# example.pdf, once under its own name) — confirm whether that is intended.
with open(f'{target_dir}/example.pdf', 'wb') as fh:
    fh.write(content)

### ALL PDFS

In [14]:
def download_pdf(url):
    """Download the file at `url` into `target_dir` unless it is already there.

    The local file name is the last path segment of `url`.

    Args:
        url: Full URL of the file to download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    filename = url.split('/')[-1]
    destination = f'{target_dir}/{filename}'
    if os.path.exists(destination):
        return

    # Fetch before opening the file: if the request fails, no empty or partial
    # file is left behind that the existence check above would skip next time.
    response = requests.get(url)
    response.raise_for_status()
    with open(destination, 'wb') as fh:
        fh.write(response.content)

In [15]:
def get_urls(archive, LIMIT=None):
    """Return absolute URLs of the papers listed on one archive page.

    Args:
        archive: Archive name appended to `ICCS_LINK` to form the page URL.
        LIMIT: Maximum number of paper entries to use; any falsy value
            (None, 0) means "all entries".

    Returns:
        A list of URLs, one per `<li class="paper">` entry, built from the
        href of the entry's first link.
    """
    listing_html = requests.get(f'{ICCS_LINK}/{archive}').content
    listing = BeautifulSoup(listing_html, 'html.parser')
    # `LIMIT or None` keeps the original semantics: a falsy LIMIT selects everything.
    entries = listing.find_all('li', {'class': 'paper'})[:LIMIT or None]

    urls = []
    for entry in entries:
        relative_href = entry.find('a').get('href')
        urls.append(f"{ICCS_LINK}/{archive}/{relative_href}")
    return urls

In [16]:
def download_content(archives, limit=20):
    """Download the papers of every archive in `archives` into `target_dir`.

    Args:
        archives: Iterable of archive names, as accepted by `get_urls`.
        limit: Maximum number of papers fetched per archive (passed to
            `get_urls` as `LIMIT`). Defaults to 20.
    """
    # Create the same directory that download_pdf writes to, rather than a
    # hard-coded name that could drift from `target_dir`.
    os.makedirs(target_dir, exist_ok=True)
    for archive in tqdm(archives):
        archive_links = get_urls(archive, LIMIT=limit)
        for url in tqdm(archive_links):
            download_pdf(url)

In [17]:
# Run the bulk download for every archive found on the index page.
download_content(archive_names)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [18]:
import fitz

def extract_text_from_doc(path):
    """Extract the plain text of every page of a document.

    Args:
        path: Path of the document to open with fitz.

    Returns:
        A list with one string per page, in page order.
    """
    # Open as a context manager so the document is closed even if text
    # extraction raises part-way through.
    with fitz.open(path) as doc:
        return [page.get_text("text") for page in doc]

In [19]:
# Extract the per-page text of one downloaded paper as a sample.
raw_doc = extract_text_from_doc('pdfs/108600007.pdf')

In [20]:
from typing import List

def concat_page(text: str, page_num: int) -> str:
    '''
    Join the lines of one page (fitz text output) into a single string.

    Lines are the non-empty pieces of `text` split on newlines. The last three
    lines are treated as the footer on every page; on every page except the
    first (`page_num == 0`) the first two lines are additionally treated as
    the header. Header and footer lines are dropped.

    The remaining lines are joined with single spaces. A line whose last word
    ends with a hyphen (and is longer than the hyphen itself) is treated as a
    word break: the hyphen is removed and the next line is appended directly.
    If that happens on the last content line of the page, the hyphen is kept
    so the caller can join the word with the beginning of the next page.

    Args:
        text: Text of one page, lines separated by newlines.
        page_num: Zero-based page index.

    Returns:
        The joined page text with no leading or trailing whitespace.

    Raises:
        TypeError: If `text` is not a string.
    '''
    if not isinstance(text, str):
        raise TypeError('`text` should be a string')

    lines = [line for line in text.split('\n') if line]

    # Header: two lines on every page except the title page. Footer: three lines.
    header_len = 0 if page_num == 0 else 2
    # Clamp so that pages too short to hold header + footer yield no content
    # instead of a negative slice bound picking up stray lines.
    footer_start = max(len(lines) - 3, header_len)

    # Whitespace-only lines carry no text. Stripping each line makes the hyphen
    # check look at the real last character rather than at trailing whitespace.
    content = [line.strip() for line in lines[header_len:footer_start] if line.strip()]

    last_idx = len(content) - 1
    pieces = []
    for idx, line in enumerate(content):
        last_word = line.split()[-1]
        hyphenated = last_word.endswith('-') and len(last_word) > 1
        if not hyphenated:
            pieces.append(line + ' ')
        elif idx == last_idx:
            # Word continues on the next page: keep the hyphen as a marker.
            pieces.append(line)
        else:
            # Word continues on the next line: drop the hyphen and glue.
            pieces.append(line[:-1])
    return ''.join(pieces).strip()

In [21]:
# Newlines are deliberately left untouched: the header and footer are later
# removed line by line, so only spaces and tabs are collapsed here.
mult_ws = re.compile(r'[ \t]+')

def little_preprocessing(text: str) -> str:
    """Lightly normalise page text.

    Replaces every em dash with a plain hyphen, collapses each run of spaces
    and tabs into one space, and strips leading/trailing whitespace.
    """
    with_plain_dashes = text.replace('—', '-')
    single_spaced = mult_ws.sub(' ', with_plain_dashes)
    return single_spaced.strip()

def concat_doc(doc: List[str]) -> str:
    '''
    Represent a whole document as one string, with light preprocessing.

    Every page is passed through `little_preprocessing` and `concat_page`;
    pages that come back empty are skipped. Consecutive pages are separated by
    a single space, except when a page's text ends with a hyphenated token: in
    that case the hyphen is dropped and the next page is appended directly, so
    a word split across the page break is rejoined.

    Args:
        doc: List of raw page texts, in page order.

    Returns:
        The concatenated document text without leading/trailing whitespace.

    Raises:
        TypeError: If `doc` is not a list.
    '''
    if not isinstance(doc, list):
        raise TypeError('`doc` should of type list')

    chunks = []
    for page_num, raw_page in enumerate(doc):
        page_text = concat_page(little_preprocessing(raw_page), page_num)
        if not page_text:
            continue

        last_token_on_page = page_text.split()[-1]
        if last_token_on_page.endswith('-') and len(last_token_on_page) > 1:
            # Word continues on the next page: drop the hyphen, add no separator.
            chunks.append(page_text[:-1])
        else:
            # Separate pages with a space so that the last word of this page
            # does not fuse with the first word of the next one.
            chunks.append(page_text + ' ')
    return ''.join(chunks).strip()

In [22]:
# Sanity check: flatten the sample document and preview its first 100 characters.
doc = concat_doc(raw_doc)
doc[:100]

'Optimizing the Eﬃciency, Vulnerability and Robustness of Road-based Para-transit Networks using Gene'

In [32]:
def concat_all_docs(source_dir: str) -> List:
    """Extract and flatten the text of every file found in `source_dir`.

    Args:
        source_dir: Directory whose entries are each opened as a document.

    Returns:
        A list of document strings, in the order `os.listdir` yields the files.
    """
    def _flatten(file_name):
        # Per-page extraction followed by whole-document concatenation.
        pages = extract_text_from_doc(f'{source_dir}/{file_name}')
        return concat_doc(pages)

    return [_flatten(file_name) for file_name in tqdm(os.listdir(source_dir))]

In [33]:
# Flatten every file in the download directory into one string per document.
docs = concat_all_docs('pdfs')

  0%|          | 0/81 [00:00<?, ?it/s]

### Сохраняем собранные тексты

In [39]:
def save_text(doc: str, filename: str = None, target_dir: str = 'txt') -> None:
    """Write `doc` to `<target_dir>/<filename>.txt`, creating the directory if needed.

    Args:
        doc: Text to write.
        filename: File name without the `.txt` extension.
        target_dir: Output directory; created (including parents) when missing.
    """
    os.makedirs(target_dir, exist_ok=True)
    # Explicit UTF-8: the extracted text contains non-ASCII characters (e.g.
    # ligatures), which the platform default encoding may fail to encode.
    with open(f'{target_dir}/{filename}.txt', 'w', encoding='utf-8') as fh:
        fh.write(doc)

In [40]:
# Save every extracted text under the name of its source PDF.
# NOTE(review): the pairing relies on os.listdir('pdfs') returning entries in the
# same order as when `docs` was built by concat_all_docs — confirm, or carry the
# file names alongside the texts.
for doc, fname in zip(docs, os.listdir('pdfs')):
    fname = fname.split('.')[0]  # keep the part of the file name before the first dot
    save_text(doc, fname)