In [1]:
from bs4 import BeautifulSoup
import requests
import lxml
import pandas as pd
import threading
import time

In [2]:
# class for thread

# Lock serialising appends to the result list shared by all worker threads.
threadLock = threading.Lock()


class MyThread(threading.Thread):
    """Worker thread that builds one result row for a single article.

    Each callable in ``processes`` is invoked with ``article``; the returned
    values, followed by ``tag``, form one row that is appended to ``result``.

    Args:
        processes: iterable of one-argument callables applied to ``article``.
        result: shared list-like object that receives the finished row.
        article: object handed to every callable in ``processes``.
        tag: value appended as the last element of the row.
    """

    def __init__(self, processes, result, article, tag):
        super().__init__()
        self.processes = processes
        self.result = result
        self.article = article
        self.tag = tag

    def run(self):
        """Compute the row and append it to the shared result under the lock."""
        row = [process(self.article) for process in self.processes]
        row.append(self.tag)
        # The context manager releases the lock even if the append raises,
        # so one failing worker cannot leave every other worker blocked.
        with threadLock:
            self.result.append(row)

In [3]:
# finding header of the article
def find_header(article):
    """Return the text of the article's ``h2`` element with class ``entry-title``."""
    title_node = article.find("h2", class_="entry-title")
    return title_node.text

# finding full text of the article
def find_article_text(article, timeout=30):
    """Download the article's own page and return its full text.

    The link is taken from the first ``a`` inside the article's
    ``div.np-article-thumb``; the linked page is fetched and the text of every
    ``p`` inside its ``div.entry-content`` is joined with single spaces.

    Args:
        article: parsed listing entry exposing ``find`` (a BeautifulSoup tag
            in this notebook).
        timeout: seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the calling
            worker thread forever.

    Returns:
        The paragraph texts of the article joined by ``" "``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
        AttributeError: presumably, when one of the expected elements is
            missing and a lookup returns ``None`` (no guard is applied here).
    """
    # locate the URL of the full article
    text_url = article.find("div", class_="np-article-thumb").find("a").get("href")

    # fetch and parse the article page
    page = requests.get(text_url, timeout=timeout)
    page_soup = BeautifulSoup(page.text, "lxml")

    # collect the paragraphs of the article body into one string
    content = page_soup.find("div", class_="entry-content")
    return " ".join(paragraph.text for paragraph in content.find_all("p"))

# finding date of the article
def find_date(article):
    """Return the text of the article's first ``time`` element."""
    time_node = article.find("time")
    return time_node.text

# main func for finding articles
def find_information(url, tag, num_of_pages, result, timeout=30):
    """Scrape every article of one tag and append one row per article to ``result``.

    For each listing page from 1 to ``num_of_pages`` the page is downloaded,
    all ``article`` elements are extracted, and one ``MyThread`` per article is
    started. All threads of a page are joined before the next page is fetched.

    Args:
        url: base URL the tag is appended to (expected to end with ``/``).
        tag: category name; also stored as the last element of each row.
        num_of_pages: number of listing pages to walk (pages 1..num_of_pages).
        result: shared list that the worker threads append rows to.
        timeout: seconds to wait for each listing-page request made directly
            by this function, so a stalled connection cannot hang the scrape.

    Raises:
        requests.exceptions.Timeout: if a listing page does not answer in time.
    """
    base_url = url + tag + "/"

    # The extraction steps are identical for every article, so build them once.
    processes = [find_header, find_article_text, find_date]

    for page_number in range(1, num_of_pages + 1):
        # The first page lives at the bare tag URL; later ones under page/<n>.
        if page_number > 1:
            page_url = base_url + "page/{0}".format(page_number)
        else:
            page_url = base_url

        # download and parse the listing page
        response = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(response.text, "lxml")

        # one worker thread per article found on the page
        threads = [
            MyThread(processes, result, article, tag)
            for article in soup.find_all("article")
        ]

        for worker in threads:
            worker.start()

        # wait for the whole page to finish before requesting the next one
        for worker in threads:
            worker.join()
    
    
            

# Base address of the news listing; a tag is appended to it.
url = "https://panorama.pub/category/news/"
# Category tags of the site.
tags = [
    "politics",
    "society",
    "science",
    "economics",
    "books",
]




# Number of listing pages per tag (used as num_of_pages in the scrape below):
# politics 129
# society 316
# science 27
# economics 28
# books 1

In [6]:
result = []
start_time = time.time()
# (tag, number of listing pages) pairs, scraped one after another in this order
for tag_name, page_count in (
    ("politics", 129),
    ("society", 316),
    ("science", 27),
    ("economics", 28),
    ("books", 1),
):
    find_information(url, tag_name, page_count, result)
print("--- {0} seconds ---".format(time.time() - start_time))

--- 1393.9433810710907 seconds ---


In [7]:
# Fixed seed so the row shuffle is reproducible for a given `result`.
SHUFFLE_SEED = 42
# NOTE(review): absolute, machine-specific location; this cell fails wherever
# the directory does not exist. Consider a path relative to the notebook.
OUTPUT_PATH = "C:/Users/semav/Desktop/result.xlsx"

# Build the table, shuffle the rows, and renumber the index from zero.
data = (
    pd.DataFrame(data=result, columns=['title', 'text', 'date', 'tag'])
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
data.to_excel(OUTPUT_PATH, sheet_name="articles_panorama")