In [None]:
import os
import pickle
import json

In [None]:
# Work from /content so that the clone below ends up in /content/refnews,
# which is where the data directory is expected later in this notebook.
os.chdir('/content/')

# clone refnews data repository
!git clone https://github.com/sfschouten/refnews.git

Cloning into 'refnews'...
remote: Enumerating objects: 34, done.[K
remote: Total 34 (delta 0), reused 0 (delta 0), pack-reused 34[K
Unpacking objects: 100% (34/34), done.


In [None]:
# Work from /content so that the clone below ends up in /content/mwep.
os.chdir('/content/')

# Clone the `develop` branch of the mwep repository into the directory `mwep`
# (it contains the class files necessary to unpickle the refnews data).
!git clone https://github.com/sfschouten/multilingual-wiki-event-pipeline.git mwep -b develop

Cloning into 'mwep'...
remote: Enumerating objects: 774, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 774 (delta 22), reused 44 (delta 19), pack-reused 721[K
Receiving objects: 100% (774/774), 84.18 MiB | 18.64 MiB/s, done.
Resolving deltas: 100% (504/504), done.


In [None]:
# dependency
!pip install rdflib news_please

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdflib
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[K     |████████████████████████████████| 500 kB 4.2 MB/s 
[?25hCollecting news_please
  Downloading news_please-1.5.22-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 7.5 MB/s 
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 573 kB/s 
[?25hCollecting Scrapy>=1.1.0
  Downloading Scrapy-2.7.0-py2.py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 56.6 MB/s 
[?25hCollecting readability-lxml>=0.6.2
  Downloading readability_lxml-0.8.1-py3-none-any.whl (20 kB)
Collecting plac>=0.9.6
  Downloading plac-1.3.5-py2.py3-none-any.whl (22 kB)
Collecting warcio>=1.3.3
  Downloading warcio-1.7.4-py2.py3-none-any.whl (40 kB)
[K     |████████████████████████████████| 40 kB 4.4 MB/s 
[?25hCollecting ago

In [None]:
# Topic label for each data file, keyed by the part of the file name that
# starts at its first 'Q'.
# NOTE(review): 'educaction' below looks like a typo of 'education'; the label
# is runtime text and is therefore left untouched here -- confirm and fix.
TOPIC = {
    'Q1241356_en,pilot.bin':  'human interest / record and achievement',
    'Q149086_en,pilot.bin':   'crime, law and justice / crime / homicide',
    'Q167170_en,pilot.bin':   'sport / sport event',
    'Q18515440_en,pilot.bin': 'sport / sports transaction',
    'Q27318_en,pilot.bin':    'educaction / educational testing and examinations',
    'Q350604_en,pilot.bin':   'conflict, war and peace / armed conflict',
    'Q43109_en,pilot.bin':    'politics / election / referenda',
    'Q45382_en,pilot.bin':    'conflict, war and peace / coup d\'etat',
    'Q669262_en,pilot.bin':   'politics / election / primary election',
    'Q7590_en,pilot.bin':     'economy, business and finance / strategy and marketing / transport',
    'Q8065_en,pilot.bin':     'disaster, accident and emergency incident / disaster / natural disaster',
    'Q11822042_en,pilot.bin': 'accident and emergency incident / transportation accident and incident',
}

DATA_DIR = '/content/refnews/data'
MWEP_DIR = '/content/mwep'
SEPARATOR = '------------------------------------------------------------------------------------------------------------------------------'

# Unpickled collection per data file; filled by the loop below.
collections_by_file = {}
total_articles = 0
total_incidents = 0

# switch to code directory so class files can be found to unpickle
os.chdir(MWEP_DIR)

print('Total number of articles and incidents for which we have URIs.')
print("file                     |     #articles / #incidents    |  topic")
print(SEPARATOR)

# sorted() keeps the table (and the insertion order of collections_by_file)
# reproducible; os.listdir() returns entries in arbitrary order.
for f in sorted(os.listdir(DATA_DIR)):
    if not f.endswith(".bin"):
        continue

    # SECURITY: pickle.load can execute arbitrary code; only run this on data
    # files from a source you trust.
    with open(os.path.join(DATA_DIR, f), 'rb') as pickle_file:
        collection = pickle.load(pickle_file)
    collections_by_file[f] = collection

    nr_incidents = len(collection.incidents)
    nr_articles = sum(len(incident.reference_texts) for incident in collection.incidents)

    total_articles += nr_articles
    total_incidents += nr_incidents

    # The TOPIC keys start at the 'Q' of the file name; fall back to the full
    # name when there is no 'Q' (find() returns -1, which would otherwise
    # silently slice off just the last character).
    q_pos = f.find('Q')
    short = f[q_pos:] if q_pos >= 0 else f
    # A data file without a registered topic should not abort the overview.
    topic = TOPIC.get(short, 'unknown topic')

    # Guard against collections without incidents (would divide by zero).
    articles_per_incident = nr_articles / nr_incidents if nr_incidents else float('nan')

    print(f"{short:<23}  |  {nr_articles:<6}  /  {nr_incidents:<5}  =  {articles_per_incident:>6.2f}  |  {topic}")

print(SEPARATOR)
print(f'total                    |  {total_articles:<7} / {total_incidents:<5}')

Total number of articles and incidents for which we have URIs.
file                     |     #articles / #incidents    |  topic
------------------------------------------------------------------------------------------------------------------------------
Q669262_en,pilot.bin     |  2612    /  193    =   13.53  |  politics / election / primary election
Q8065_en,pilot.bin       |  10900   /  1103   =    9.88  |  disaster, accident and emergency incident / disaster / natural disaster
Q11822042_en,pilot.bin   |  16551   /  1822   =    9.08  |  accident and emergency incident / transportation accident and incident
Q45382_en,pilot.bin      |  3416    /  339    =   10.08  |  conflict, war and peace / coup d'etat
Q43109_en,pilot.bin      |  5627    /  722    =    7.79  |  politics / election / referenda
Q350604_en,pilot.bin     |  4155    /  137    =   30.33  |  conflict, war and peace / armed conflict
Q167170_en,pilot.bin     |  7188    /  518    =   13.88  |  sport / sport event
Q18515440_e

In [None]:
# Read the mwep configuration; the context manager makes sure the file handle
# is closed again (a bare json.load(open(...)) leaves it open).
with open('/content/mwep/config/mwep_settings.json') as settings_file:
    mwep_settings = json.load(settings_file)

# settings for crawling Wikipedia sources
newsplease_settings = mwep_settings['newsplease']

excluded_domains = set(newsplease_settings['excluded_domains'])
title_required = newsplease_settings['title_required']
# The configuration stores the accepted text length as a (start, end) pair;
# it is turned into a range object here.
range_start, range_end = newsplease_settings['num_chars_range']
num_chars_range = range(int(range_start), int(range_end))
startswith = newsplease_settings['startswith']
timeout = newsplease_settings['timeout']
illegal_substrings = newsplease_settings['illegal_substrings']
illegal_chars_in_title = newsplease_settings['illegal_chars_in_title']

In [None]:
import crawl_utils
import hashlib


# Re-crawl the reference texts that were found as Wikipedia sources and put
# the crawled text back into the reference text objects, whose `content`
# holds an MD5 hash of the text before this cell runs.
nr_restored = 0       # texts whose content was replaced by crawled text
nr_changed = 0        # restored texts whose MD5 differs from the stored one
nr_not_retrieved = 0  # texts for which the crawl returned nothing

for f, collection in collections_by_file.items():

    for incident_obj in collection.incidents:

        # web archive URI -> reference text object, for this incident only.
        # (Named explicitly instead of `map`, which would shadow the builtin
        # for the rest of the kernel session.)
        wa_uri_to_ref_text_obj = {
            ref_text_obj.web_archive_uri: ref_text_obj
            for ref_text_obj in incident_obj.reference_texts
            if 'Wikipedia source' in ref_text_obj.found_by
        }

        # Nothing to crawl for this incident; skip the crawler call entirely.
        if not wa_uri_to_ref_text_obj:
            continue

        primary_url_to_ref_text_obj = crawl_utils.get_ref_text_obj_of_primary_reference_texts(
            list(wa_uri_to_ref_text_obj),
            timeout,
            startswith=startswith,
            excluded_domains=excluded_domains,
            # use the value from mwep_settings.json rather than a hard-coded True
            title_required=title_required,
            num_chars_range=num_chars_range,
            illegal_substrings=illegal_substrings,
            illegal_chars_in_title=illegal_chars_in_title,
        )

        # URIs that were requested but are missing from the crawl result keep
        # their MD5 hash as content; count them so this does not go unnoticed.
        nr_not_retrieved += len(
            set(wa_uri_to_ref_text_obj) - set(primary_url_to_ref_text_obj)
        )

        for wa_url, new_text_obj in primary_url_to_ref_text_obj.items():
            old_text_obj = wa_uri_to_ref_text_obj[wa_url]

            old_md5_hash = old_text_obj.content
            new_md5_hash = hashlib.md5(new_text_obj.content.encode('utf-8')).hexdigest()

            # replace md5 with content
            old_text_obj.content = new_text_obj.content
            nr_restored += 1

            # A different hash means the archived page no longer yields the
            # text that the stored hash was computed from.
            changed = new_md5_hash != old_md5_hash
            if changed:
                nr_changed += 1
                print(f'WARNING: text for {old_text_obj.web_archive_uri} changed')

print(f'restored {nr_restored} texts ({nr_changed} changed), {nr_not_retrieved} could not be retrieved')

http://web.archive.org/web/201: : 1it [00:00,  1.67it/s]
http://web.archive.org/web/202: : 4it [00:12,  3.17s/it]
http://web.archive.org/web/201: : 2it [00:01,  1.42it/s]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 8it [00:06,  1.16it/s]
http://web.archive.org/web/202: : 4it [00:17,  4.29s/it]
http://web.archive.org/web/202: : 1it [00:00,  1.76it/s]
https://web.archive.org/web/20: : 7it [01:25, 16.86s/it]

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://web.archive.org/web/20120402195749/http://www.indystar.com/apps/pbcs.dll/article?AID=/20080513/NEWS0502/805130391 HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=8)


https://web.archive.org/web/20: : 9it [01:36, 10.69s/it]
http://web.archive.org/web/202: : 11it [00:18,  1.70s/it]
http://web.archive.org/web/201: : 3it [00:53, 17.72s/it]
http://web.archive.org/web/202: : 77it [02:12,  1.42s/it]

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: http://web.archive.org/web/20220101151849/https://www.cnn.com/2016/06/09/politics/president-barack-obama-endorses-hillary-clinton-in-video/index.html HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=8)


http://web.archive.org/web/202: : 110it [03:16,  1.32s/it]



http://web.archive.org/web/202: : 145it [04:24,  1.82s/it]
http://web.archive.org/web/202: : 6it [00:10,  1.70s/it]
http://web.archive.org/web/202: : 13it [00:23,  1.78s/it]
0it [00:00, ?it/s]
https://web.archive.org/web/20: : 2it [00:02,  1.14s/it]
http://web.archive.org/web/202: : 3it [00:04,  1.48s/it]
http://web.archive.org/web/202: : 1it [00:00,  1.18it/s]
http://web.archive.org/web/202: : 3it [00:04,  1.50s/it]
https://web.archive.org/web/20: : 1it [00:00,  1.42it/s]
http://web.archive.org/web/202: : 6it [00:26,  4.40s/it]
http://web.archive.org/web/202: : 16it [08:23, 31.49s/it]
http://web.archive.org/web/202: : 1it [00:01,  1.03s/it]
http://web.archive.org/web/202: : 23it [00:38,  1.68s/it]
http://web.archive.org/web/202: : 1it [00:00,  1.70it/s]
https://web.archive.org/web/20: : 6it [01:26, 14.46s/it]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 1it [00:01,  1.43s/it]
http://web.archive.org/web/202: : 9it [00:25,  2.79s/it]
https://web.archive.org/web/20: : 12it [01:46

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://web.archive.org/web/20080226113250/http://www.voanews.com/english/2008-02-23-voa3.cfm HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=8)


https://web.archive.org/web/20: : 15it [05:29, 21.95s/it]
http://web.archive.org/web/202: : 21it [00:51,  1.87s/it]

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: http://web.archive.org/web/20210425211658/https://nypost.com/2020/02/06/iowa-democratic-party-chair-ignores-dnc-calls-for-recount-of-caucus/ HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=8)


http://web.archive.org/web/202: : 24it [01:02,  2.43s/it]



http://web.archive.org/web/202: : 27it [01:08,  2.23s/it]

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: http://web.archive.org/web/20210922065844/https://apnews.com/6fa91b6c58290a24655c5ae25cdcb184 HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=8)


http://web.archive.org/web/202: : 28it [01:16,  4.01s/it]



http://web.archive.org/web/202: : 85it [03:16,  2.00s/it]

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: http://web.archive.org/web/20210421034554/https://www.theverge.com/2020/2/5/21125449/iowa-recorder-app-democractic-caucus-motherboard-published HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=8)


http://web.archive.org/web/202: : 90it [03:32,  2.36s/it]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 1it [00:01,  1.72s/it]
http://web.archive.org/web/201: : 31it [00:51,  1.57s/it]

  " Skipping tag %s" % (size, len(data), tag)


https://web.archive.org/web/20: : 32it [01:16,  8.69s/it]

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: https://web.archive.org/web/20120306113437/http://firstread.msnbc.msn.com/_news/2011/03/09/6226568-2012-searching-for-the-anti-romney HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=8)


http://web.archive.org/web/201: : 65it [02:41,  2.48s/it]




http://web.archive.org/web/202: : 2it [00:13,  6.59s/it]
https://web.archive.org/web/20: : 8it [00:11,  1.40s/it]
http://web.archive.org/web/202: : 1it [00:01,  1.80s/it]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 1it [00:00,  1.71it/s]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 16it [00:22,  1.38s/it]
0it [00:00, ?it/s]
https://web.archive.org/web/20: : 2it [00:03,  1.79s/it]
https://web.archive.org/web/20: : 13it [01:50,  8.50s/it]
https://web.archive.org/web/20: : 1it [00:53, 53.29s/it]
http://web.archive.org/web/202: : 12it [01:37,  8.14s/it]
http://web.archive.org/web/202: : 5it [00:07,  1.46s/it]
http://web.archive.org/web/202: : 1it [00:00,  1.74it/s]
0it [00:00, ?it/s]
http://web.archive.org/web/201: : 6it [00:14,  1.95s/it]

ERROR:newsplease.crawler.simple_crawler:connection/timeout error: http://web.archive.org/web/20160609211046/https://www.youtube.com/watch?v=4wVm8WjbVJU HTTPConnectionPool(host='web.archive.org', port=80): Read timed out. (read timeout=8)


http://web.archive.org/web/202: : 12it [00:34,  2.86s/it]
http://web.archive.org/web/202: : 6it [00:12,  2.17s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 2it [00:02,  1.05s/it]
http://web.archive.org/web/202: : 3it [00:10,  3.35s/it]
https://web.archive.org/web/20: : 3it [00:29,  9.71s/it]
http://web.archive.org/web/202: : 1it [00:00,  1.00it/s]
http://web.archive.org/web/202: : 6it [00:11,  1.90s/it]
http://web.archive.org/web/202: : 4it [00:11,  2.07s/it]



https://web.archive.org/web/20: : 13it [00:26,  2.05s/it]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 22it [00:51,  1.55s/it]



http://web.archive.org/web/202: : 29it [01:06,  2.28s/it]
http://web.archive.org/web/202: : 31it [01:32,  3.00s/it]
http://web.archive.org/web/202: : 1it [00:00,  1.71it/s]
http://web.archive.org/web/202: : 1it [00:01,  1.19s/it]
https://web.archive.org/web/20: : 2it [00:01,  1.21it/s]
http://web.archive.org/web/201: : 2it [00:03,  1.65s/it]
http://web.archive.org/web/202: : 1it [00:01,  1.98s/it]
https://web.archive.org/web/20: : 1it [00:06,  6.28s/it]
http://web.archive.org/web/202: : 3it [00:10,  3.42s/it]
0it [00:00, ?it/s]
http://web.archive.org/web/202: : 5it [00:08,  1.63s/it]
http://web.archive.org/web/202: : 1it [00:06,  6.09s/it]
https://web.archive.org/web/20: : 1it [00:04,  4.36s/it]

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/newspaper/images.py", line 118, in fetch_url
    p.feed(new_data)
  File "/usr/local/lib/python3.7/dist-packages/PIL/ImageFile.py", line 411, in feed
    im = Image.open(fp)
  File "/usr/local/lib/python3.7/dist-packages/PIL/Image.py", line 2881, in open
    im = _open_core(fp, filename, prefix)
  File "/usr/local/lib/python3.7/dist-packages/PIL/Image.py", line 2867, in _open_core
    im = factory(fp, filename)
  File "/usr/local/lib/python3.7/dist-packages/PIL/ImageFile.py", line 107, in __init__
    self._open()
  File "/usr/local/lib/python3.7/dist-packages/PIL/IcoImagePlugin.py", line 279, in _open
    self.load()
  File "/usr/local/lib/python3.7/dist-packages/PIL/IcoImagePlugin.py", line 295, in load
    im = self.ico.getimage(self.size)
  File "/usr/local/lib/python3.7/dist-packages/PIL/IcoImagePlugin.py", line 161, in getimage
    return self.frame(self.getentryindex(size, bpp))
  File "/usr/local/

https://web.archive.org/web/20: : 17it [02:48,  9.93s/it]
http://web.archive.org/web/202: : 1it [00:01,  1.81s/it]
http://web.archive.org/web/202: : 1it [00:00,  1.69it/s]
http://web.archive.org/web/202: : 4it [01:00, 17.26s/it]