In [10]:
import trafilatura
import json
from langdetect import detect
import hanzidentifier
from urllib.parse import urlparse
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [11]:
def extract_TUN_XIAO_EDU_AU_link(max_pages=14):
    """Collect links from the paginated student-news listing.

    Requests the un-numbered listing page followed by ``?page=1`` up to
    ``?page=max_pages - 1`` and gathers every anchor whose absolute URL
    starts with the listing's own base URL.

    Args:
        max_pages: Number of listing pages to request, counting the
            un-numbered first page. A value <= 0 requests nothing.

    Returns:
        set[str]: De-duplicated absolute URLs found on the visited pages.

    Raises:
        requests.RequestException: If a listing page cannot be fetched
            (including a timeout).
    """
    base_url = "https://au.oliuxue.com/studentnews/"
    links = set()
    # Honour the caller's page budget; the bound was previously hard-coded,
    # which silently ignored the ``max_pages`` argument.
    for page_number in range(max_pages):
        if page_number == 0:
            url = f"{base_url}?"
        else:
            url = f"{base_url}?page={page_number}"
        print(f'accessing {url}')
        # requests has no default timeout; without one a stalled connection
        # would block the whole scrape indefinitely.
        resp = requests.get(url, headers={"User-Agent": "MyBot/1.0"}, timeout=30)
        time.sleep(2)  # pause between listing requests
        soup = BeautifulSoup(resp.text, "lxml")

        # NOTE(review): the prefix test also accepts the listing URLs
        # themselves (e.g. ".../studentnews/?page=2") if the page links to
        # them -- confirm whether those should be filtered out.
        for a in soup.find_all("a", href=True):
            full_url = urljoin(url, a["href"])  # make relative URLs absolute
            if full_url.startswith(base_url):
                links.add(full_url)
    return links

In [12]:
def load_web(url):
    """Download a page and extract its main content with metadata.

    Args:
        url: Address of the page to fetch.

    Returns:
        dict | None: The decoded JSON produced by ``trafilatura.extract``,
        or ``None`` when the download yields nothing, extraction yields
        nothing, or the extracted title is the literal ``'undefined'``
        (treated here as the site's 404 page).
    """
    print(f'start fetch {url}')
    # Download HTML
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        # Nothing was downloaded, so there is nothing to hand to the extractor.
        return None

    # Extract with metadata
    data_json = trafilatura.extract(
        downloaded,
        output_format="json",   # structured output
        with_metadata=True,     # include title, author, date, etc.
        include_comments=False,
        include_images=False
    )
    if not data_json:
        return None

    data = json.loads(data_json)

    # return none for 404 page
    if data.get('title') == 'undefined':
        return None

    return data

In [13]:
def check_language(text):
    """Label the language of ``text``, distinguishing Chinese script variants.

    Args:
        text: The string to classify. ``None`` or empty input is accepted.

    Returns:
        str | None:
            * ``None`` when ``text`` is empty or language detection fails;
            * ``'simplified-chinese'``, ``'traditional-chinese'`` or
              ``'mixed-chinese'`` for text detected as ``zh*``, depending on
              which of the two hanzidentifier checks pass;
            * the raw detected code (e.g. ``'zh-cn'``) for ``zh*`` text where
              neither check passes;
            * ``'english'`` for text detected as ``en*``;
            * the raw detected code for any other language.
    """
    if not text:
        # Nothing to classify; skip the detector entirely.
        return None

    # detect language
    try:
        language = detect(text)
    except Exception:
        # Detection is best-effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit propagate.
        return None

    # check chinese script type
    if language.startswith('zh'):
        has_simp = hanzidentifier.is_simplified(text)
        has_trad = hanzidentifier.is_traditional(text)

        if has_simp and has_trad:
            return 'mixed-chinese'
        if has_simp:
            return "simplified-chinese"
        if has_trad:
            return "traditional-chinese"
        # Neither check passed (presumably text mixing script-exclusive
        # characters -- confirm against hanzidentifier). Keep the detected
        # code instead of silently falling through to None.
        return language

    # English language
    if language.startswith("en"):
        return 'english'
    return language

In [14]:
def extract_source(url):
    """Return the network-location component (``host[:port]``) of ``url``."""
    return urlparse(url).netloc

In [15]:
def parse_json(data, url, id):
    """Map one extracted-page dict onto the dataset's record layout.

    Args:
        data: Mapping of extracted fields; missing keys become ``None``.
        url: Address the page was fetched from.
        id: Record number; stringified and left-padded with zeros to at
            least five characters.

    Returns:
        dict: The record, with keys in a fixed insertion order.
    """
    lookup = data.get
    body_text = lookup('text')

    # Filled in key by key so the insertion order stays explicit.
    record = {}
    record['id'] = str(id).zfill(5)
    record['question'] = None
    record['raw_text'] = lookup('raw_text')
    record['text'] = body_text
    record['source'] = extract_source(url)
    record['title'] = lookup('title')
    record['author'] = lookup('author')
    record['post_date'] = lookup('date')
    record['language'] = check_language(body_text)
    record['created_at'] = lookup('filedate')
    record['excerpt'] = lookup('excerpt')
    # The tags value is wrapped in a one-element list exactly as received.
    record['tags'] = [lookup('tags')]
    record['link'] = url
    return record

In [16]:
def main(start_id=13406, output_path="../data/YUN_XIAO_EDU_AU.json"):
    """Scrape every collected link and dump the records to a JSON file.

    Args:
        start_id: Id counter value before the first record; the first
            successfully fetched page gets ``start_id + 1``.
        output_path: Destination of the JSON array of records.

    Returns:
        None. Progress is printed and the result is written to
        ``output_path``.
    """
    id = start_id
    json_list = []
    urls = extract_TUN_XIAO_EDU_AU_link()
    print(f'url collection succeed, collectoed {len(urls)} urls')
    # ``urls`` is a set, so it is already de-duplicated. Iterate in sorted
    # order: set iteration order for strings varies between interpreter runs,
    # which would assign different ids to the same page on each run.
    for url in sorted(urls):
        data_json = load_web(url)
        time.sleep(2)  # pause between article requests
        if not data_json:
            print('No result fetched\n')
            continue

        # Ids advance only for pages that produced a record.
        id += 1
        result_json = parse_json(data_json, url, id)
        json_list.append(result_json)
        print('Successfully fetched\n')

    print('fetch finished')
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_list, f, ensure_ascii=False, indent=2)

In [17]:
# Start the full scrape only when this code runs as the main module,
# not when it is imported.
if __name__ == '__main__':
    main()

accessing https://au.oliuxue.com/studentnews/?
accessing https://au.oliuxue.com/studentnews/?page=1
accessing https://au.oliuxue.com/studentnews/?page=2
accessing https://au.oliuxue.com/studentnews/?page=3
accessing https://au.oliuxue.com/studentnews/?page=4
accessing https://au.oliuxue.com/studentnews/?page=5
accessing https://au.oliuxue.com/studentnews/?page=6
accessing https://au.oliuxue.com/studentnews/?page=7
accessing https://au.oliuxue.com/studentnews/?page=8
accessing https://au.oliuxue.com/studentnews/?page=9
accessing https://au.oliuxue.com/studentnews/?page=10
accessing https://au.oliuxue.com/studentnews/?page=11
accessing https://au.oliuxue.com/studentnews/?page=12
accessing https://au.oliuxue.com/studentnews/?page=13
url collection succeed, collectoed 233 urls
start fetch https://au.oliuxue.com/studentnews/1995.html
Successfully fetched

start fetch https://au.oliuxue.com/studentnews/3999.html
Successfully fetched

start fetch https://au.oliuxue.com/studentnews/4658.html
S