In [4]:
import time
import sys
import os
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

In [2]:
def progressBar(value, endvalue, size, bar_length=50):
    percent = float(value + 1) / endvalue
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))
    sys.stdout.write("\r[{0}] {1}/{2} \t Size : {3}".format(arrow + spaces, value+1, endvalue, size))
    sys.stdout.flush()

## Connect URL

In [3]:
url = 'https://www.theguardian.com/world?page=1'
response = requests.get(url)

## Extract Article URL

In [4]:
bs = BeautifulSoup(response.text, 'html.parser')

In [5]:
article_info = bs.findAll('a', {'data-link-name' : 'article'})

In [6]:
article_list = [article['href'] for article in article_info]
article_list = list(set(article_list))

In [7]:
for i in range(5) :
    print('%d th Article' %i)
    print(article_list[i])

0 th Article
https://www.theguardian.com/film/2021/oct/11/i-am-belmaya-review-nepali-dalit-filmmaker
1 th Article
https://www.theguardian.com/commentisfree/2021/oct/16/apples-plan-to-scan-images-will-allow-governments-into-smartphones
2 th Article
https://www.theguardian.com/world/2021/oct/17/brutal-aggression-venezuela-halts-talks-with-opposition-after-envoy-extradited-to-us
3 th Article
https://www.theguardian.com/world/2021/oct/17/macron-and-the-french-trump-trap-gaullisms-heirs-in-a-political-vice
4 th Article
https://www.theguardian.com/world/live/2021/oct/17/covid-news-live-gordon-brown-vaccines-africa-uk-cases-coronavirus-latest


## Crawling Article Data

In [15]:
def crawl_data(base_url : str, size : int) -> list :
    article_data = []
    
    for i in range(1,size) :
        try :
            url = base_url + '?page=' + str(i)
    
            response = requests.get(url)
            bs = BeautifulSoup(response.text, 'html.parser')
            
            page_url = bs.find('link' , {'rel' : 'canonical'})
            page_url = page_url['href']
            
            if url != page_url :
                break
    
            article_info = bs.findAll('a', {'data-link-name' : 'article'})
        
            article_list = [article['href'] for article in article_info]
            article_list = list(set(article_list))
        
            article_data.extend(article_list)
            progressBar(i, size, len(article_data))
        except :
            continue
            
    article_data = list(set(article_data))
    return article_data

In [16]:
DATA_SIZE = 2000

In [None]:
world_articles = crawl_data('https://www.theguardian.com/world', 1800)

[>                                                 ] 8/1800 	 Size : 120

In [None]:
uk_articles = crawl_data('https://www.theguardian.com/uk-news', 1800)

In [9]:
tech_articles = crawl_data('https://www.theguardian.com/technology', 1800)

[------------------------------------------------->] 1800/1800 	 Size : 36035

In [None]:
business_articles = crawl_data('https://www.theguardian.com/business', 1800)

In [None]:
sport_articles = crawl_data('https://www.theguardian.com/sport', 1800)

In [None]:
environment_articles = crawl_data('https://www.theguardian.com/environment', 1800)

In [None]:
culture_articles = crawl_data('https://www.theguardian.com/culture', 1800)

In [18]:
article_data = world_articles + \
    uk_articles + \
    tech_articles + \
    environment_articles + \
    business_articles + \
    sport_articles + \
    culture_articles

In [None]:
article_data = list(set(article_data))

In [None]:
article_df = pd.DataFrame({'ID' : range(1, len(article_data)+1), 'URL' : article_data})

In [None]:
print('Data Size of articles : %d \n' len(article_df))
article_df.head()

In [None]:
article_df.to_csv('theguardinas.com_articles.csv')

## Article Data Crawling

In [26]:
class ArticleCrawler :
    
    def __init__(self, ) :
        pass
    
    def connect(self, url) :
        assert isinstance(url, str)
        response = requests.get(url)
        bs = BeautifulSoup(response.text, 'html.parser')
        return bs
    
    def get_title(self, bs) :
        title_info = bs.title.text
        index = title_info.find(' | The Guardian')
        return title_info[:index]
    
    def get_date(self, bs) :
        date_info = bs.find('meta', {'property' : 'article:published_time'})
        date = date_info['content']
        
        date_str = date.split('T')[0]
        return date_str
            
    def get_image_url(self, bs) :
        url_info = bs.find('meta', {'property' : 'og:image'})
        image_url = url_info['content']
        return image_url
    
    def get_image_text(self, bs) :
        for tag in bs.findAll('img') :
            if 'alt' in tag.attrs :
                return tag['alt']
            
        raise Exception('There is not img alt Attribute')
        
    def get_text(self, bs) :
        main_content = bs.find('div' , {'class' : "dcr-185kcx9"})
        p_list = main_content.findAll('p')
        
        text_data = [p.text for p in p_list]
        text_data = '\n'.join(text_data)
        return text_data
    
    def __call__(self, url) :
        bs = self.connect(url)
        
        title = self.get_title(bs)
        date = self.get_date(bs)
        image_url = self.get_image_url(bs)
        image_text = self.get_image_text(bs)
        text = self.get_text(bs)
        category = url.split('/')[3]
        
        return {'title' : title, 
                'date' : date, 
                'image_url' : image_url,
                'image_text' : image_text,
                'category' : category,
                'text' : text}
        

## Data Writer

In [27]:
import urllib
from urllib.request import urlopen
from PIL import Image

In [28]:
class Writer :
    def __init__(self, dir_path) :
        self.dir_path = dir_path
        
    def __call__(self, article_data : dict) :
        assert None not in list(article_data.values())
        
        title = article_data['title']
        date = article_data['date']
        category = article_data['category']
        image_url = article_data['image_url']
        image_text = article_data['image_text']
        text = article_data['text']
        
        image_path = self.save_image(image_url, title)
        text_path = self.save_text(text, title)
        
        return title, date, category, text_path, image_path, image_text
        
    def save_image(self, image_url : str, title : str) -> str :
        image_path = os.path.join(self.dir_path, 'image', title)
        image_path = image_path + '.jpg'
        
        urllib.request.urlretrieve(image_url, image_path)
        return image_path
        
    def save_text(self, text : str, title : str) -> str :
        text_path = os.path.join(self.dir_path, 'text', title)
        text_path = text_path + '.txt'
        
        with open(text_path, 'w') as f:
            f.write(text)
        return text_path
                

In [29]:
article_crawler = ArticleCrawler()
writer = Writer('./Data')

In [30]:
isinstance(article_crawler, ArticleCrawler)

True

## Save Data

In [78]:
def save(article_list) :
    title_list = []
    date_list = []
    cate_list = []
    text_paths = []
    img_paths = []
    img_texts = []

    for i, article_url in enumerate(tqdm(article_list)) :
        
        try :
            article_info = article_crawler(article_url)
            title, date, category, text_path, image_path, image_text = writer(article_info)
        except :
            continue
    
        title_list.append(title)
        date_list.append(date)
        cate_list.append(category)
        text_paths.append(text_path)
        img_paths.append(image_path)
        img_texts.append(image_text)
    
    data_df = pd.DataFrame({'title' : title_list, 
                            'date' : date_list,
                            'category' : cate_list, 
                            'text' : text_paths, 
                            'image' : img_paths, 
                            'image_texts' : img_texts})
    
    return data_df