## Load libraries

In [2]:
import pandas as pd
import numpy as np
import requests


# turn off warnings
import warnings
warnings.filterwarnings('ignore')


# set all columns to be displayed
pd.set_option('display.max_columns', None)

from dotenv import load_dotenv
load_dotenv()



from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import re
import time

## Load data

In [3]:
data_test = pd.read_excel('data/Test_check.xlsx', sheet_name='sasha_3')
data_test.head()

Unnamed: 0,URL,lib_text
0,https://expert.ru/ekonomika/vygodna-li-rossii-...,Серьезнее других от введения западных санкций ...
1,https://ria.ru/20250311/klyuchevaya_stavka-196...,"МОСКВА, 14 фев — РИА Новости Ключевая процентн..."
2,https://expert.ru/mnenie/denis-manturov-gosuda...,Со следующего года начнется реализация 12 мега...
3,https://ria.ru/20250304/kredity-2003042476.html,Самозапрет на кредиты - это новая возможность ...
4,https://lenta.ru/brief/2025/03/11/green/,"Заходя в магазин, вы часто замечали зеленые уп..."


In [5]:
url_list = data_test['URL'].tolist()

## Selenium initialize

Start docker container with Selenium before run the code.

In [None]:
def get_chrome_driver(hub_url):
    """Initialize and return a new Chrome WebDriver instance."""
    chrome_options = ChromeOptions()
    chrome_options.page_load_strategy = 'normal'
    
    try:
        driver = webdriver.Remote(command_executor=hub_url, options=chrome_options)
        return driver
    except WebDriverException as e:
        print(f"Error initializing WebDriver: {e}")
        return None

def check_and_renew_driver(driver: webdriver, hub_url: str) -> webdriver:
    """Check if the driver is alive and renew if necessary."""
    try:
        driver.title  # Attempting to access a property to check if it's still active
        return driver  # Driver is still active
    except (WebDriverException, AttributeError):
        print("WebDriver is not active. Reinitializing...")
        return get_chrome_driver(hub_url)
    
hub_url = "http://localhost:4444/wd/hub" # docker container with selenium and chrome
# hub_url = "http://chrome:4444/wd/hub" # docker container with spark and chrome. chrome is a service name in docker-compose.yml


def close_driver(driver):
    """Close the WebDriver instance."""
    try:
        driver.quit()
    except WebDriverException as e:
        print(f"Error closing WebDriver: {e}")

## Functions for parse texts of news from urls

In [None]:
chrome_driver = get_chrome_driver(hub_url)

# chrome_driver = check_and_renew_driver(chrome_driver, hub_url)

# close_driver(chrome_driver)

In [7]:
# find all unique news sources from the URL list
news_source = []
for url in url_list[:]:
    match = re.match(r'https?://([^/]+)/([^/]+)/', url)
    news_source.append(f'{match.group(1)}/{match.group(2)}' if match else 'Unknown')

pd.DataFrame(news_source).value_counts().sort_index()

0                          
expert.ru/ekonomika            2
expert.ru/finance              2
expert.ru/mnenie               4
expert.ru/multimedia           1
expert.ru/news                 7
expert.ru/novosti-partnerov    1
expert.ru/v-mire               2
lenta.ru/articles              4
lenta.ru/brief                 1
lenta.ru/extlink               3
lenta.ru/news                  3
ria.ru/20241220                1
ria.ru/20250109                1
ria.ru/20250116                1
ria.ru/20250123                1
ria.ru/20250304                1
ria.ru/20250310                3
ria.ru/20250311                7
ria.ru/docs                    2
www.vesti.ru/article           3
Name: count, dtype: int64

In [None]:
def get_text_from_expert(url, timeout=10):
    """Fetch the text content from a 'expert.ru' news page using BeautifulSoup."""
    try:
        article_text = []

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses

        soup = BeautifulSoup(response.text, 'html.parser')

        # Headline
        headline = soup.find("h1", itemprop="headline")
        if headline:
            article_text.append(headline.get_text(strip=True))

        # Subtitle
        subtitle = soup.find("h2", class_="subtitle_article")
        if subtitle:
            article_text.append(subtitle.get_text(strip=True))

        # Theme
        theme = soup.select_one(".article-theme")
        if theme:
            article_text.append(theme.get_text(strip=True))

        # Date
        date_span = soup.find("span", class_="article-date")
        if date_span and "content" in date_span.attrs:
            article_text.append(date_span["content"].split('T')[0])

        # Author
        author = soup.select_one(".article-header-author-bl [itemprop='name']")
        if author:
            article_text.append(author.get_text(strip=True))

        # Article body
        article_body = soup.find("div", class_="plain-text")
        if article_body:
            paragraphs = [p.get_text(strip=True) for p in article_body.find_all("p")]
            if paragraphs:
                main_text = "\n\n".join(paragraphs)
                article_text.append(main_text)

        # Join all parts into a single string
        article_text = "\n".join(article_text)
        
        # Clean up whitespace
        article_text = re.sub(r'\s+', ' ', article_text)  # Replace multiple spaces with one
        article_text = article_text.replace('\xa0', '').strip()

        return article_text if article_text else None

    except Exception as e:
        print(f"[Error] Failed to parse article from {url}: {e}")
        return None


In [None]:
def get_text_from_expert_multimedia(url, timeout=10):
    """Fetch the text content from a 'expert.ru/multimedia' news page using BeautifulSoup."""
    try:
        article_text = []

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract date
        date_tag = soup.find('span', class_='article-date')
        if date_tag:
            article_text.append(date_tag.get_text(strip=True))

        # Extract title
        title_tag = soup.find('h1')
        article_text.append(title_tag.get_text(strip=True) if title_tag else 'No title found')
        
        # Extract gallery items
        for item in soup.find_all('div', class_='multimedia_page_gallery_item'):
            text_block = item.find('div', class_='plain-text')
            article_text.append(text_block.get_text(strip=True, separator=' ') if text_block else '')

         # Join all parts into a single string
        article_text = "\n".join(article_text)
            
        return article_text
    
    except Exception as e:
        print(f"[Error] Failed to parse article from {url}: {e}")
        return None


In [None]:
def get_text_from_ria(url, timeout=10):
    """Fetch the text content from a 'ria' news page using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        article_text = []

        # Look for all content blocks with class "article__block"
        for block in soup.find_all('div', class_='article__block'):
            block_type = block.get('data-type')

            # Headers (h1, h2, etc.)
            if block_type == 'h2':
                header = block.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                if header:
                    article_text.append(header.get_text(strip=True))

            # Simple text
            elif block_type == 'text':
                text_block = block.find('div', class_='article__text')
                if text_block:
                    article_text.append(text_block.get_text(strip=True))

            # lists
            elif block_type == 'list':
                for ul in block.find_all('ul', class_='article__list m-circle'):
                    for li in ul.find_all('li', class_='article__list-item'):
                        full_text = li.get_text(strip=True)
                        if full_text:
                            article_text.append(f"  • {full_text}")

        # alternative method "article__body"
        if not article_text:
            article_body = soup.find('div', class_='article__body')
            if article_body:
                for p in article_body.find_all('p'):
                    article_text.append(p.get_text(strip=True))

        # join with line breaks
        full_text = '\n'.join(article_text)

        return full_text
    
    except Exception as e:
        print(f"[Error] Failed to fetch text from {url}: {e}")
        return None


In [None]:
def get_text_from_lenta_brief(url):
    """Fetch the brief content from a 'lenta.ru' news page using BeautifulSoup."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error if request fails
        soup = BeautifulSoup(response.content, 'html.parser')

        article_text = []

        # Extract publication date
        pub_date = soup.select_one('a.topic-header__time')
        if pub_date:
            article_text.append(pub_date.get_text(strip=True))

        # Extract category
        category = soup.select_one('a.topic-header__rubric')
        if category:
            article_text.append(category.get_text(strip=True))

        # Extract full title
        title_parts = soup.select('h1.topic-body__titles span')
        if title_parts:
            full_title = " ".join([part.get_text(strip=True) for part in title_parts])
            article_text.append(full_title)

        # Extract article sections
        brief_cards = soup.select('div.box-brief-card')
        for card in brief_cards:
            # Extract the number
            number = card.select_one('span.box-brief-card__number')
            if number:
                article_text.append(number.get_text(strip=True))

            # Extract subtitle
            title = card.select_one('div.box-brief-card__title')
            if title:
                article_text.append(title.get_text(strip=True))

            # Extract paragraph content
            paragraphs = card.select('p.box-brief-card__content-text')
            if paragraphs:
                content = "\n".join(p.get_text(strip=True) for p in paragraphs)
                article_text.append(content)

        # Join all parts into a single string with line breaks
        return "\n".join(article_text).strip() if article_text else None

    except Exception as e:
        print(f"[Error] Failed to fetch text from {url}: {e}")
        return None

In [None]:
def get_text_from_lenta_article(url, driver):
    """Fetch full article text from 'lenta.ru/article' using Selenium and BeautifulSoup. Scroll until article block is loaded."""
    options = ChromeOptions()
    options.headless = True
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    try:
        driver.get(url)

        # Scroll until main article block is found
        timeout = 15
        end_time = time.time() + timeout
        while time.time() < end_time:
            if driver.find_elements(By.CSS_SELECTOR, "div.topic-body, div.topic-page__wrap.js-topic"):
                break
            driver.execute_script("window.scrollBy(0, 500);")
            time.sleep(0.5)
        else:
            raise TimeoutError("Failed to load article wrapper div")

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.premium-header__titles, h1.topic-body__titles"))
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_text = []

        # Date
        if (tag := soup.select_one('a.topic-header__time')):
            article_text.append(f"Date: {tag.get_text(strip=True)}")

        # Category
        if (tag := soup.select_one('a.topic-header__rubric')):
            article_text.append(f"Category: {tag.get_text(strip=True)}")

        # Title
        if (h1 := soup.select_one('h1.premium-header__titles')):
            title_main = h1.select_one('span.premium-header__title')
            title_sub = h1.select_one('span.premium-header__rightcol')
            full_title = " ".join(filter(None, [
                title_main.get_text(strip=True) if title_main else "",
                title_sub.get_text(strip=True) if title_sub else ""
            ]))
            article_text.append(f"Title: {full_title}")
        elif (h1 := soup.select_one('h1.topic-body__titles')):
            parts = h1.find_all('span')
            article_text.append("Title: " + " ".join(part.get_text(strip=True) for part in parts))

        # Subtitle from <div class="topic-body__title-yandex">
        if (subtitle := soup.select_one('div.topic-body__title-yandex')):
            article_text.append(subtitle.get_text(strip=True))

        # Author and job
        if (tag := soup.select_one('span.topic-authors__name')):
            article_text.append(f"Author: {tag.get_text(strip=True)}")
        if (tag := soup.select_one('span.topic-authors__job')):
            article_text.append(tag.get_text(strip=True))

        article_text.append("\n=== Article Content ===\n")

        # Content block (for premium and standard articles)
        content = soup.select_one('div.topic-body__content') or soup.select_one('div.topic-body__content._premium')
        if content:
            for elem in content.find_all(['h2', 'p', 'div', 'figure', 'blockquote'], recursive=False):
                if elem.name == 'h2':
                    article_text.append(elem.get_text(strip=True).upper())
                elif elem.name == 'p':
                    article_text.append(elem.get_text(strip=True))
                elif elem.name == 'div' and 'box-note' in elem.get('class', []):
                    note = elem.get_text(strip=True)
                    article_text.append(f"- {note}")
                elif elem.name == 'div' and 'box-quote__content' in elem.get('class', []):
                    quote = elem.get_text(strip=True)
                    article_text.append(f"QUOTE: {quote}")
                elif elem.name == 'figure':
                    caption = elem.select_one('figcaption')
                    if caption:
                        article_text.append("[" + caption.get_text(strip=True) + "]")

        return "\n".join(article_text).strip()

    except Exception as e:
        print(f"[Error] Failed to fetch text from {url}: {e}")
        return None


In [None]:
def get_text_from_vesti(url, driver):
    """Fetch full article text from 'vesti.ru' using Selenium and BeautifulSoup. Scroll until article block is loaded."""

    try:    
        # Setup Chrome options
        options = ChromeOptions()
        options.add_argument("--headless")  # Run headlessly
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        # Load the page
        driver.get(url)
        time.sleep(2)  # Let initial JS load

        # Scroll to bottom (multiple times if needed)
        scroll_pause = 1
        last_height = driver.execute_script("return document.body.scrollHeight")

        for _ in range(3):  # Increase iterations if more lazy-loaded content
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Extract page source after JS execution
        html = driver.page_source
       
        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        full_text = []

        # Extract elements
        date_block = soup.find("div", class_="article__date")
        full_text.append(date_block.contents[0].strip() if date_block else None)
        full_text.append(date_block.find("span", class_="article__time").text.strip() if date_block else None)
        full_text.append(soup.find("h1", class_="article__title").text.strip())
        

        text_div = soup.find("div", class_="article__text")
        paragraphs = text_div.find_all("p")
        full_text.append("\n".join(p.get_text(strip=True) for p in paragraphs))
        # Join all parts into a single string
        full_text = "\n".join(filter(None, full_text))  # Remove None values and join

        return full_text
    
    except Exception as e:
        print(f"[Error] Failed to fetch text from {url}: {e}")
        return None


## Get news by url

In [None]:
chrome_driver = check_and_renew_driver(chrome_driver, hub_url)

In [182]:
data_test_web = data_test.copy()

In [None]:
# get text from the pages and add it to the dataframe
for url in url_list[:]:
    match = re.match(r'https?://([^/]+)/([^/]+)/', url)

    # join the domain and page to get the full URL
    news_domain = f"https://{match.group(1)}/{match.group(2)}"   

    if 'expert.ru'in news_domain:
        if 'multimedia' in news_domain:
            text = get_text_from_expert_multimedia(url)
        else:
            text = get_text_from_expert(url)
        # add column with text
        data_test_web.loc[data_test_web['URL'] == url, 'web_text'] = text
    elif 'ria.ru' in news_domain:
        text = get_text_from_ria(url)
        # add column with text
        data_test_web.loc[data_test_web['URL'] == url, 'web_text'] = text
    elif 'lenta.ru' in news_domain:
        if 'briefs' in news_domain:
            text = get_text_from_lenta_brief(url)
        else:
            # for 'lenta.ru/articles' 'lenta.ru/news' 'lenta.ru/extlink'
            text = get_text_from_lenta_article(url, chrome_driver)  
        data_test_web.loc[data_test_web['URL'] == url, 'web_text'] = text
    elif 'vesti.ru' in news_domain:
        text = get_text_from_vesti(url, chrome_driver)
        # add column with text
        data_test_web.loc[data_test_web['URL'] == url, 'web_text'] = text

data_test_web.head()

Unnamed: 0,URL,lib_text,web_text
0,https://expert.ru/ekonomika/vygodna-li-rossii-...,Серьезнее других от введения западных санкций ...,Выгодна ли России отмена санкций США Ослаблени...
1,https://ria.ru/20250311/klyuchevaya_stavka-196...,"МОСКВА, 14 фев — РИА Новости Ключевая процентн...","МОСКВА, 25 апр — РИА Новости.Ключевая процентн..."
2,https://expert.ru/mnenie/denis-manturov-gosuda...,Со следующего года начнется реализация 12 мега...,Денис Мантуров: «Государство в рамках нацпроек...
3,https://ria.ru/20250304/kredity-2003042476.html,Самозапрет на кредиты - это новая возможность ...,"МОСКВА, 4 мар - РИА Новости.Самозапрет на кред..."
4,https://lenta.ru/brief/2025/03/11/green/,"Заходя в магазин, вы часто замечали зеленые уп...","11:43, 11 марта 2025\nСреда обитания\nКомпании..."


In [210]:
data_test_web.iloc[40:49, :]

Unnamed: 0,URL,lib_text,web_text
40,https://expert.ru/v-mire/voyna-eshche-ne-zakon...,«Сирию могут расчленить»\nВедущий научный сотр...,Война еще не закончилась К чему приведет крово...
41,https://expert.ru/multimedia/photos/v-moskve-p...,На NAIS был впервые показан широкой публике от...,Фототека Фото:Евгений Филиппов
42,https://ria.ru/20250311/pridnestrove-200421351...,"Последние события в Румынии, включающие фактич...","Последние события в Румынии, включающие фактич..."
43,https://expert.ru/news/minstroy-rekomendoval-n...,"В Минстрое уточнили, что документ носит рекоме...",Минстрой рекомендовал не строить микроквартиры...
44,https://expert.ru/v-mire/amerika-ispytyvaet-kr...,Победа на выборах Дональда Трампа и разговоры ...,Америка испытывает крипторынок на прочность На...
45,https://expert.ru/novosti-partnerov/transforma...,Цель проекта — «разработать универсальную диза...,Трансформация образа: в чем ценность дизайн-си...
46,https://ria.ru/20250311/matkapital-1733022511....,"МОСКВА, 12 мар - РИА Новости. Материнский капи...","МОСКВА, 12 мар - РИА Новости.Материнский капит..."
47,https://lenta.ru/news/2025/03/10/odin-iz-samyh...,Российские войска начали реализацию одного из ...,"Date: 14:07, 10 марта 2025\nCategory: Россия\n..."
48,https://ria.ru/20250311/aktivy-2004213012.html,"МОСКВА, 11 мар — РИА Новости. Должностные лица...","МОСКВА, 11 мар — РИА Новости.Должностные лица ..."


In [206]:
data_test_web.isnull().sum()

URL         0
lib_text    0
web_text    0
dtype: int64

In [207]:
data_test_web.to_csv('tmp/data_test_web.csv', index=False)