In [2]:
!pip install beautifulsoup4 requests tqdm



### Scrape the ru-kbd (Russian–Kabardian) parallel text

In [15]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re

# Page fetched once to discover the list of books.
BASE_URL = 'https://ibt.org.ru/ru/text?m=KBDOP'

# CSS selectors for the book dropdown, the chapter dropdown, and the
# per-verse containers of the two-column view.
BOOKS_SELECTOR = 'select#selbook option'
CHAPTER_SELECTOR = 'select#selchap option'
VERSE_SELECTOR = '.text div.interB'

# A single session is shared by every request made in this cell.
session = requests.Session()

# Browser-like User-Agent installed as a default header on the session.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
session.headers.update(headers)

def extract_books():
    """Return the book codes listed in the book dropdown of ``BASE_URL``.

    Fetches ``BASE_URL`` through the shared ``session`` and collects the
    ``value`` attribute of every element matched by ``BOOKS_SELECTOR``.

    Returns:
        list[str]: Book codes in page order. Options whose ``value`` is
        missing or empty are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # requests has no default timeout; bound the wait so one stalled
    # connection cannot hang the whole scrape.
    resp = session.get(BASE_URL, timeout=30)
    # Fail loudly on an error page instead of silently returning no books.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # .get() tolerates placeholder <option> elements that carry no value.
    return [opt['value'] for opt in soup.select(BOOKS_SELECTOR) if opt.get('value')]

def extract_chapters(book):
    """Return the chapter identifiers listed in the chapter dropdown for ``book``.

    Requests the ``m1=KBDOP&m2=RSP`` view of ``{book}.1`` and collects the
    ``value`` attribute of every element matched by ``CHAPTER_SELECTOR``.

    Args:
        book: Book code as returned by ``extract_books``.

    Returns:
        list[str]: Chapter values in page order. Options whose ``value`` is
        missing or empty are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # requests has no default timeout; bound the wait so one stalled
    # connection cannot hang the whole scrape.
    resp = session.get(f'https://ibt.org.ru/ru/text?m1=KBDOP&m2=RSP&l={book}.1', timeout=30)
    # Fail loudly on an error page instead of silently returning no chapters.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # .get() tolerates placeholder <option> elements that carry no value.
    return [opt['value'] for opt in soup.select(CHAPTER_SELECTOR) if opt.get('value')]

def remove_spaces_before_punctuation(text):
    """Strip any whitespace run that directly precedes ``. , ; : ! ?``.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with each matched whitespace run removed; the
        punctuation mark itself is kept.
    """
    whitespace_then_mark = re.compile(r'\s+([.,;:!?])')
    # Replace "<spaces><mark>" with just "<mark>".
    return whitespace_then_mark.sub(r'\1', text)

def _text_without_verse_numbers(verse_node):
    """Return the text of ``verse_node`` as one line, skipping ``<sup>`` content."""
    words = []
    for node in verse_node.strings:
        # Decide by position in the tree rather than by comparing text:
        # text comparison misses <sup> content that is padded or split over
        # several nodes, and would also drop genuine verse text that happens
        # to equal a verse number.
        if node.find_parent('sup') is not None:
            continue
        # split() trims the node and breaks it on any whitespace, so newlines
        # inside a text node cannot leak into the one-pair-per-line output.
        words.extend(node.split())
    return ' '.join(words)


def extract_verses(book, chapter):
    """Scrape one chapter and return its aligned verse pairs.

    Requests the ``m1=KBDOP&m2=RSP`` view of ``{book}.{chapter}`` and walks
    every element matched by ``VERSE_SELECTOR``. Elements that lack either
    the ``.interV1.cs-KBDOP .vs`` or the ``.interV2.cs-RSP .vs`` child are
    skipped.

    Args:
        book: Book code as returned by ``extract_books``.
        chapter: Chapter value as returned by ``extract_chapters``.

    Returns:
        list[tuple[str, str]]: ``(russian, kabardian)`` pairs in page order,
        with ``<sup>`` content removed, whitespace collapsed to single
        spaces, and spaces before punctuation removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # requests has no default timeout; bound the wait so one stalled
    # connection cannot hang the whole scrape.
    resp = session.get(f'https://ibt.org.ru/ru/text?m1=KBDOP&m2=RSP&l={book}.{chapter}', timeout=30)
    # Fail loudly on an error page instead of silently yielding no verses.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    verses = []
    for verse_div in soup.select(VERSE_SELECTOR):
        kabardian_div = verse_div.select_one('.interV1.cs-KBDOP .vs')
        russian_div = verse_div.select_one('.interV2.cs-RSP .vs')

        if kabardian_div is None or russian_div is None:
            continue  # keep the two sides aligned: drop verses missing either side

        kabardian = remove_spaces_before_punctuation(_text_without_verse_numbers(kabardian_div))
        russian = remove_spaces_before_punctuation(_text_without_verse_numbers(russian_div))

        verses.append((russian, kabardian))
    return verses

def save_verses_to_file(book, verses):
    """Write the verse pairs of one book to ``ru-kbd_bible_<book>.txt``.

    The file is created (or overwritten) in the current working directory,
    encoded as UTF-8, with one ``russian😀kabardian`` line per pair.

    Args:
        book: Book code; lower-cased to form the file name.
        verses: Iterable of ``(russian, kabardian)`` pairs.
    """
    out_path = f'ru-kbd_bible_{book.lower()}.txt'
    with open(out_path, 'w', encoding='utf-8') as out:
        out.writelines(f'{russian}😀{kabardian}\n' for russian, kabardian in verses)

def main():
    """Scrape every book chapter by chapter and write one file per book.

    For each book from ``extract_books`` the verse pairs of all its chapters
    are gathered in chapter order and handed to ``save_verses_to_file``.
    Progress is shown with an outer ``Books`` bar and a transient
    ``Chapters`` bar per book.
    """
    for book in tqdm(extract_books(), desc='Books'):
        chapter_bar = tqdm(extract_chapters(book), desc='Chapters', leave=False)
        book_verses = [
            pair
            for chapter in chapter_bar
            for pair in extract_verses(book, chapter)
        ]
        save_verses_to_file(book, book_verses)


if __name__ == '__main__':
    main()

Books:   0%|          | 0/29 [00:00<?, ?it/s]

Chapters:   0%|          | 0/50 [00:00<?, ?it/s]

Chapters:   0%|          | 0/150 [00:00<?, ?it/s]

Chapters:   0%|          | 0/28 [00:00<?, ?it/s]

Chapters:   0%|          | 0/16 [00:00<?, ?it/s]

Chapters:   0%|          | 0/24 [00:00<?, ?it/s]

Chapters:   0%|          | 0/21 [00:00<?, ?it/s]

Chapters:   0%|          | 0/28 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/3 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/16 [00:00<?, ?it/s]

Chapters:   0%|          | 0/16 [00:00<?, ?it/s]

Chapters:   0%|          | 0/13 [00:00<?, ?it/s]

Chapters:   0%|          | 0/6 [00:00<?, ?it/s]

Chapters:   0%|          | 0/6 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/3 [00:00<?, ?it/s]

Chapters:   0%|          | 0/6 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/3 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/13 [00:00<?, ?it/s]

Chapters:   0%|          | 0/22 [00:00<?, ?it/s]

#### Print the total number of lines in the scraped ru-kbd files

In [18]:
# In notebook order this cell comes before the cell that imports os, so
# import it here to keep "Restart & Run All" working.
import os

total_lines = 0

# Count the lines of every scraped ru-kbd text file in the current directory.
for file in os.listdir():
    if file.endswith(".txt") and file.startswith("ru-kbd"):
        # The scraper writes UTF-8; read with the same encoding instead of
        # relying on the platform default.
        with open(file, 'r', encoding='utf-8') as f:
            # Stream the file rather than loading every line into memory.
            total_lines += sum(1 for _ in f)

print(f"Total number of lines: {total_lines}")

Total number of lines: 12014


#### Download the scraped text files as a zip archive

In [16]:
import zipfile
import os
from google.colab import files

# Archive that will hold all scraped ru-kbd text files.
zip_filename = "ru-kbd-files.zip"

# Pick the ru-kbd *.txt files out of the current directory listing.
corpus_files = [
    name for name in os.listdir()
    if name.startswith("ru-kbd") and name.endswith(".txt")
]

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for name in corpus_files:
        zipf.write(name)

# Hand the archive to the browser via the Colab download helper.
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Scrape the ru-ady (Russian–Adyghe) parallel text

In [19]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re

# Page fetched once to discover the list of books.
BASE_URL = 'https://ibt.org.ru/ru/text?m1=ADG'

# CSS selectors for the book dropdown, the chapter dropdown, and the
# per-verse containers of the two-column view.
BOOKS_SELECTOR = 'select#selbook option'
CHAPTER_SELECTOR = 'select#selchap option'
VERSE_SELECTOR = '.text div.interB'

# A single session is shared by every request made in this cell.
session = requests.Session()

# Browser-like User-Agent installed as a default header on the session.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
session.headers.update(headers)

def extract_books():
    """Return the book codes listed in the book dropdown of ``BASE_URL``.

    Fetches ``BASE_URL`` through the shared ``session`` and collects the
    ``value`` attribute of every element matched by ``BOOKS_SELECTOR``.

    Returns:
        list[str]: Book codes in page order. Options whose ``value`` is
        missing or empty are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # requests has no default timeout; bound the wait so one stalled
    # connection cannot hang the whole scrape.
    resp = session.get(BASE_URL, timeout=30)
    # Fail loudly on an error page instead of silently returning no books.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # .get() tolerates placeholder <option> elements that carry no value.
    return [opt['value'] for opt in soup.select(BOOKS_SELECTOR) if opt.get('value')]

def extract_chapters(book):
    """Return the chapter identifiers listed in the chapter dropdown for ``book``.

    Requests the ``m1=ADG&m2=RSP`` view of ``{book}.1`` and collects the
    ``value`` attribute of every element matched by ``CHAPTER_SELECTOR``.

    Args:
        book: Book code as returned by ``extract_books``.

    Returns:
        list[str]: Chapter values in page order. Options whose ``value`` is
        missing or empty are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # requests has no default timeout; bound the wait so one stalled
    # connection cannot hang the whole scrape.
    resp = session.get(f'https://ibt.org.ru/ru/text?m1=ADG&m2=RSP&l={book}.1', timeout=30)
    # Fail loudly on an error page instead of silently returning no chapters.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    # .get() tolerates placeholder <option> elements that carry no value.
    return [opt['value'] for opt in soup.select(CHAPTER_SELECTOR) if opt.get('value')]

def remove_spaces_before_punctuation(text):
    """Strip any whitespace run that directly precedes ``. , ; : ! ?``.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with each matched whitespace run removed; the
        punctuation mark itself is kept.
    """
    whitespace_then_mark = re.compile(r'\s+([.,;:!?])')
    # Replace "<spaces><mark>" with just "<mark>".
    return whitespace_then_mark.sub(r'\1', text)

def _text_without_verse_numbers(verse_node):
    """Return the text of ``verse_node`` as one line, skipping ``<sup>`` content."""
    words = []
    for node in verse_node.strings:
        # Decide by position in the tree rather than by comparing text:
        # text comparison misses <sup> content that is padded or split over
        # several nodes, and would also drop genuine verse text that happens
        # to equal a verse number.
        if node.find_parent('sup') is not None:
            continue
        # split() trims the node and breaks it on any whitespace, so newlines
        # inside a text node cannot leak into the one-pair-per-line output.
        words.extend(node.split())
    return ' '.join(words)


def extract_verses(book, chapter):
    """Scrape one chapter and return its aligned verse pairs.

    Requests the ``m1=ADG&m2=RSP`` view of ``{book}.{chapter}`` and walks
    every element matched by ``VERSE_SELECTOR``. Elements that lack either
    the ``.interV1.cs-ADG .vs`` or the ``.interV2.cs-RSP .vs`` child are
    skipped.

    Args:
        book: Book code as returned by ``extract_books``.
        chapter: Chapter value as returned by ``extract_chapters``.

    Returns:
        list[tuple[str, str]]: ``(russian, adyghe)`` pairs in page order,
        with ``<sup>`` content removed, whitespace collapsed to single
        spaces, and spaces before punctuation removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # requests has no default timeout; bound the wait so one stalled
    # connection cannot hang the whole scrape.
    resp = session.get(f'https://ibt.org.ru/ru/text?m1=ADG&m2=RSP&l={book}.{chapter}', timeout=30)
    # Fail loudly on an error page instead of silently yielding no verses.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    verses = []
    for verse_div in soup.select(VERSE_SELECTOR):
        adyghe_div = verse_div.select_one('.interV1.cs-ADG .vs')
        russian_div = verse_div.select_one('.interV2.cs-RSP .vs')

        if adyghe_div is None or russian_div is None:
            continue  # keep the two sides aligned: drop verses missing either side

        adyghe = remove_spaces_before_punctuation(_text_without_verse_numbers(adyghe_div))
        russian = remove_spaces_before_punctuation(_text_without_verse_numbers(russian_div))

        verses.append((russian, adyghe))
    return verses

def save_verses_to_file(book, verses):
    """Write the verse pairs of one book to ``ru-ady_bible_<book>.txt``.

    The file is created (or overwritten) in the current working directory,
    encoded as UTF-8, with one ``russian😀adyghe`` line per pair.

    Args:
        book: Book code; lower-cased to form the file name.
        verses: Iterable of ``(russian, adyghe)`` pairs.
    """
    out_path = f'ru-ady_bible_{book.lower()}.txt'
    with open(out_path, 'w', encoding='utf-8') as out:
        out.writelines(f'{russian}😀{adyghe}\n' for russian, adyghe in verses)

def main():
    """Scrape every book chapter by chapter and write one file per book.

    For each book from ``extract_books`` the verse pairs of all its chapters
    are gathered in chapter order and handed to ``save_verses_to_file``.
    Progress is shown with an outer ``Books`` bar and a transient
    ``Chapters`` bar per book.
    """
    for book in tqdm(extract_books(), desc='Books'):
        chapter_bar = tqdm(extract_chapters(book), desc='Chapters', leave=False)
        book_verses = [
            pair
            for chapter in chapter_bar
            for pair in extract_verses(book, chapter)
        ]
        save_verses_to_file(book, book_verses)


if __name__ == '__main__':
    main()

Books:   0%|          | 0/41 [00:00<?, ?it/s]

Chapters:   0%|          | 0/50 [00:00<?, ?it/s]

Chapters:   0%|          | 0/40 [00:00<?, ?it/s]

Chapters:   0%|          | 0/34 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/31 [00:00<?, ?it/s]

Chapters:   0%|          | 0/24 [00:00<?, ?it/s]

Chapters:   0%|          | 0/22 [00:00<?, ?it/s]

Chapters:   0%|          | 0/25 [00:00<?, ?it/s]

Chapters:   0%|          | 0/10 [00:00<?, ?it/s]

Chapters:   0%|          | 0/150 [00:00<?, ?it/s]

Chapters:   0%|          | 0/31 [00:00<?, ?it/s]

Chapters:   0%|          | 0/12 [00:00<?, ?it/s]

Chapters:   0%|          | 0/12 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/28 [00:00<?, ?it/s]

Chapters:   0%|          | 0/16 [00:00<?, ?it/s]

Chapters:   0%|          | 0/24 [00:00<?, ?it/s]

Chapters:   0%|          | 0/21 [00:00<?, ?it/s]

Chapters:   0%|          | 0/28 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/3 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/16 [00:00<?, ?it/s]

Chapters:   0%|          | 0/16 [00:00<?, ?it/s]

Chapters:   0%|          | 0/13 [00:00<?, ?it/s]

Chapters:   0%|          | 0/6 [00:00<?, ?it/s]

Chapters:   0%|          | 0/6 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/5 [00:00<?, ?it/s]

Chapters:   0%|          | 0/3 [00:00<?, ?it/s]

Chapters:   0%|          | 0/6 [00:00<?, ?it/s]

Chapters:   0%|          | 0/4 [00:00<?, ?it/s]

Chapters:   0%|          | 0/3 [00:00<?, ?it/s]

Chapters:   0%|          | 0/1 [00:00<?, ?it/s]

Chapters:   0%|          | 0/13 [00:00<?, ?it/s]

Chapters:   0%|          | 0/22 [00:00<?, ?it/s]

In [20]:
total_lines = 0

# Count the lines of every scraped ru-ady text file in the current directory.
for file in os.listdir():
    if file.endswith(".txt") and file.startswith("ru-ady"):
        # The scraper writes UTF-8; read with the same encoding instead of
        # relying on the platform default.
        with open(file, 'r', encoding='utf-8') as f:
            # Stream the file rather than loading every line into memory.
            total_lines += sum(1 for _ in f)

print(f"Total number of lines: {total_lines}")

Total number of lines: 19021


In [21]:
import zipfile
import os
from google.colab import files

# Archive that will hold all scraped ru-ady text files.
zip_filename = "ru-ady-files.zip"

# Pick the ru-ady *.txt files out of the current directory listing.
corpus_files = [
    name for name in os.listdir()
    if name.startswith("ru-ady") and name.endswith(".txt")
]

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for name in corpus_files:
        zipf.write(name)

# Hand the archive to the browser via the Colab download helper.
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>