In [1]:
import json
import os
import pathlib
import re
import subprocess
import urllib, urllib.request
from datetime import datetime
from random import randint

import pandas as pd
import pypandoc
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


In [2]:
def url_to_bs4(url):
    """Download ``url`` and parse the response body with BeautifulSoup.

    The request is sent with browser-like headers and a 10 second timeout,
    and the body is decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page parsed by ``BeautifulSoup`` with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
    request = urllib.request.Request(url, headers=hdr)
    # The context manager closes the HTTP response even if read() raises,
    # so connections are not leaked while crawling many pages.
    with urllib.request.urlopen(request, timeout=10) as response:
        data_str = response.read().decode('utf-8')
    return BeautifulSoup(data_str, 'html.parser')

def get_url_list(archive_url):
    """Return the canonical URLs listed in a page's embedded JSON payload.

    The page at ``archive_url`` is fetched with ``url_to_bs4``. The first
    ``<script type="application/json">`` element is decoded, and the entries
    under ``props -> pageProps -> content -> hasPart -> parts`` are read.

    Args:
        archive_url: Address of the page to inspect.

    Returns:
        A list with the ``['url']['canonical']`` value of every entry.
    """
    page = url_to_bs4(archive_url)
    script_tag = page.find('script', attrs={'type': 'application/json'})
    payload = json.loads(script_tag.string)
    parts = payload['props']['pageProps']['content']['hasPart']['parts']
    canonical_urls = []
    for part in parts:
        canonical_urls.append(part['url']['canonical'])
    return canonical_urls



In [3]:
def html_text_piece_to_latex(item):
    """Convert one article body paragraph from HTML to a single LaTeX line.

    Args:
        item: A parsed HTML node. Nodes without an ``attrs`` mapping (for
            example plain strings between tags) are ignored.

    Returns:
        The paragraph as LaTeX with newlines replaced by spaces, prefixed
        with ``\\lettrine`` when the node carries the
        ``article__body-text--dropcap`` class. ``None`` when the node has no
        ``class`` attribute or is not an ``article__body-text`` element.
    """
    # Non-tag children expose no attribute mapping; treat them as "not a paragraph".
    attrs = getattr(item, 'attrs', None)
    if not attrs or 'class' not in attrs:
        return None
    item_class = attrs['class']
    if 'article__body-text' not in item_class:
        return None
    # Hand pandoc the markup as text rather than the parsed node object.
    paragraph = pypandoc.convert_text(str(item), 'tex', format='html').replace('\n', ' ')
    if 'article__body-text--dropcap' in item_class:
        # Escaped backslash: same runtime string, no invalid "\l" escape.
        paragraph = '\\lettrine' + paragraph
    return paragraph


def convert_image_url_to_tex(image_url, tex_path, is_main_image=True):
    """Download an image with ``wget`` and return the LaTeX figure that shows it.

    The file is saved as ``<tex_path>/images/<last URL path segment>``. The
    download is best effort: its outcome does not affect the returned text.

    Args:
        image_url: Address of the image.
        tex_path: Directory of the LaTeX project; its ``images`` subdirectory
            must already exist.
        is_main_image: ``True`` for a figure at 0.8 of the text width,
            ``False`` for one at 0.4 of the text width.

    Returns:
        A ``figure*`` environment that includes ``images/<file name>``.
    """
    image_name = image_url.split('/')[-1]
    image_local_path = os.path.join(tex_path, 'images', image_name)
    # The URL comes from scraped pages: pass it as an argv entry instead of
    # splicing it into a shell command line, so spaces or shell
    # metacharacters in it cannot change the command.
    try:
        subprocess.run(['wget', '-O', image_local_path, image_url], check=False)
    except OSError as err:
        # e.g. wget is not installed; keep going, as the download is best effort
        print("Could not run wget for %s: %r" % (image_url, err))
    width = '0.8' if is_main_image else '0.4'
    image_tex = ("\\begin{figure*}[h]\n\\centering\n"
                 "\\includegraphics[width=%s\\textwidth]{images/%s}\n"
                 "\\end{figure*}\n") % (width, image_name)
    return image_tex


def convert_article_ulr_to_latex(article_url,tex_path):
    """Fetch one article and convert it to LaTeX.

    Metadata is read from the page's embedded ``application/json`` script
    (``props -> pageProps -> content``). The body is read from the children
    of the ``layout-article-body`` div. Images are downloaded through
    ``convert_image_url_to_tex``.

    Args:
        article_url: Address of the article page.
        tex_path: Directory of the LaTeX project that receives the images.

    Returns:
        A tuple ``(final_tex, meta_dict_tex)``. ``final_tex`` is the main
        image figure followed by the body paragraphs, joined by blank lines.
        ``meta_dict_tex`` maps ``article_headline``, ``article_subheadline``,
        ``description``, ``section``, ``subsection`` and ``date`` to LaTeX
        text (an empty string when the source value is missing).

    Raises:
        ValueError: If the page has no article body container.
        KeyError: If the JSON payload lacks an expected field.
    """
    parsed_article = url_to_bs4(article_url)
    # meta info
    json_dict = json.loads(parsed_article.find("script",attrs={'type':"application/json"}).string)['props']['pageProps']['content']
    if isinstance(json_dict, list):
        json_dict = json_dict[0]
    meta_dict_str = {'article_headline':json_dict['headline'],'article_subheadline':json_dict['subheadline'],'description':json_dict['description'], 'section':json_dict['_section']['sectionHeadline'], 'subsection':json_dict['_section']['sectionSubheadline'], "date":json_dict['datePublishedString']}
    # Missing or empty fields become '' instead of being handed to pandoc.
    meta_dict_tex = {
        k: pypandoc.convert_text(v, 'tex', format='html').replace('\n',' ') if v else ''
        for k, v in meta_dict_str.items()
    }
    # main image
    main_image_url = json_dict['image']['main']['url']['canonical']
    main_image_tex = convert_image_url_to_tex(main_image_url,tex_path)
    # article body
    body = parsed_article.find('div',attrs={'class':'ds-layout-grid ds-layout-grid--edged layout-article-body'})
    if body is None:
        raise ValueError("No article body container found in %s" % article_url)
    article = []
    for item in body.contents:
        # Children that are not tags (e.g. text between elements) carry no
        # attribute mapping and cannot be body paragraphs.
        item_attrs = getattr(item, 'attrs', None)
        if not item_attrs or 'class' not in item_attrs:
            continue
        item_class = item_attrs['class']
        if 'article__body-text-image' in item_class:
            image_div = item.find('div',attrs={'itemprop':'image'})
            image_meta = image_div.find("meta",attrs={"itemprop":"url"}) if image_div is not None else None
            # Keep the text of the block even when its image markup is absent.
            if image_meta is not None and image_meta.get('content'):
                article.append(convert_image_url_to_tex(image_meta['content'],tex_path,is_main_image=False))
            for sub_item in item.contents:
                if not hasattr(sub_item, 'attrs'):
                    continue
                paragraph = html_text_piece_to_latex(sub_item)
                if paragraph:
                    article.append(paragraph)
        else:
            paragraph = html_text_piece_to_latex(item)
            if paragraph:
                article.append(paragraph)
    final_tex = main_image_tex + '\n\n'.join(article)
    # Drop the end-of-article mark and use the literal euro sign.
    final_tex = final_tex.replace('■','').replace("\\euro{}","€")
    return final_tex, meta_dict_tex



In [4]:
# Weekly-edition archive page used as the starting point of the crawl.
archive_url = 'https://www.economist.com/weeklyedition/archive'
# Canonical URLs of the entries listed in that page's embedded JSON.
edition_url_list = get_url_list(archive_url)

In [5]:
# First entry of the archive list; presumably the newest edition — confirm the
# archive ordering. The name ("lastest") is kept because later cells use it.
lastest_edition_url = edition_url_list[0]

In [6]:
# The last URL path segment names the edition and its output folder.
edition_name = lastest_edition_url.split('/')[-1]
# All generated files for this edition go under tex/<edition_name>.
tex_path = os.path.join('tex',edition_name)

In [7]:
# get_url_list reads the same JSON path on the edition page; the entries are
# presumably the edition's articles — verify against the page payload.
all_articles_url = get_url_list(lastest_edition_url)

In [8]:
# LaTeX text written at the top of main.tex. read_text() closes the file
# handle, and the explicit encoding matches the UTF-8 used when main.tex is
# written (assumes the template itself is UTF-8 — confirm).
tex_prefix = pathlib.Path('tex/template.tex').read_text(encoding='utf-8')

In [10]:
# Build tex/<edition>/main.tex: the template text, then every article grouped
# under a \section heading that is emitted whenever the section changes.
pathlib.Path(tex_path+'/images').mkdir(parents=True, exist_ok=True)

with open(os.path.join(tex_path,'main.tex'),'w', encoding='utf-8') as f_open:
    f_open.write(tex_prefix)
    current_section = ''
    for article_url in tqdm(all_articles_url):
        try:
            tex, meta_info = convert_article_ulr_to_latex(article_url,tex_path)
        except Exception as err:
            # Still best effort, but report what was dropped instead of
            # hiding it; KeyboardInterrupt is no longer swallowed.
            tqdm.write("Skipping %s: %r" % (article_url, err))
            continue
        section = meta_info['section']
        if not current_section or current_section != section:
            f_open.write("\\section{%s}\n" % section)
            current_section = section
        # The subheadline is written before the headline on purpose of the
        # original layout; order kept as is.
        f_open.write("\\subsubsection{%s}\n" % meta_info['article_subheadline'])
        f_open.write("\\subsection{%s}\n" % meta_info['article_headline'])
        # Backslashes are escaped explicitly; the emitted text is unchanged.
        f_open.write("\\paragraph{Print Edition | %s \\quad \\color{gray}{%s}}\n" % (section, meta_info['date']))

        f_open.write(tex+'\n')
        f_open.write('\\clearpage\n')
    f_open.write('\\end{document}')




100%|██████████| 82/82 [02:50<00:00,  2.08s/it]


In [12]:
# Scratch cell: fetch a single article to inspect its markup by hand.
# NOTE: this rebinds the kernel-level name article_url used by the loop above.
article_url = 'https://www.economist.com/europe/2021/03/27/a-row-over-land-takes-italy-back-to-the-middle-ages'
parsed_article = url_to_bs4(article_url)

In [13]:
# Scratch cell: show the direct children of the article body container, i.e.
# the nodes that convert_article_ulr_to_latex iterates over.
parsed_article.find('div',attrs={'class':'ds-layout-grid ds-layout-grid--edged layout-article-body'}).contents

[<div class="layout-sticky-rail"><div class="layout-sticky-rail-advert-wrapper"><div class="advert right hidden advert--right-rail advert--sticky-rail" id=""><div><div id="econright-r1"></div></div></div></div></div>,
 <aside class="article__aside"><div class="layout-article-meta"><time class="article__dateline-datetime" datetime="2021-03-27T00:00:00Z" itemscope="" itemtype="http://schema.org/DateTime">Mar 27th 2021</time><meta content="The Economist" itemprop="author"/><p class="article__dateline-location" data-test-id="Dateline" itemtype="http://schema.org/dateline">ROME</p></div><div class="layout-article-sharing"><ul class="ds-share-list"><li id="Facebook"><a aria-label="Share on Facebook" class="ds-share-link"><svg height="32" viewbox="-1 -1 34 34" width="32"><title>Facebook</title><g fill="none" fill-rule="evenodd" id="share-facebook"><circle class="path-background" cx="16" cy="16" fill="#333" r="16"></circle><path class="path-foreground" d="M17.49 26v-9.123h2.95l.44-3.555h-3.39v

In [19]:
# Scratch cell: check how pandoc renders a number with a thousands separator.
pypandoc.convert_text("35,000", 'tex', format='html')

'35,000\n'