In [8]:
import os
import json

In [21]:
project = 'enwiki'
dump_date = "20220420"
dataset_home = '/home/scai/phd/aiz218323/scratch/XML/wikipedia-data-science/'

data_dir = f"{dataset_home}/data"
partition_dir = f"{dataset_home}/partition"

## helper_code

In [23]:
def read_data(filename):
    with open(filename, 'r') as fin:
        data = json.loads(fin.read())
    return data

In [53]:
def is_node(title):
    return title in title_to_id

def is_redirect(title):
    return title in redirects

In [114]:
def separate_titles(same_titles, diff_titles, path):
    with open(path, encoding='latin-1') as fin:
        for line in fin.readlines():
            page = json.loads(line)
            title = page['title'].replace('_', ' ')
            title = title[0].lower() + title[1:]
            if title:
                if title in title_to_id:
                    same_titles.append(title)
                else:
                    diff_titles.append(title)

## id-to-title

In [24]:
id_title_file = f'{partition_dir}/enwiki-20220420_id-to-title.ndjson'

id_to_title = read_data(id_title_file)

In [28]:
for i, (page_id, title) in enumerate(id_to_title.items()):
    print(f'{i+1}: {page_id} {title}')
    if i > 4:
        break

1: 12 anarchism
2: 25 autism
3: 39 albedo
4: 290 a
5: 303 alabama
6: 305 achilles


In [29]:
title_to_id = {title:page_id for page_id, title in id_to_title.items()}

In [30]:
for i, (title, page_id) in enumerate(title_to_id.items()):
    print(f'{i+1}: {title} {page_id}')
    if i > 4:
        break

1: anarchism 12
2: autism 25
3: albedo 39
4: a 290
5: alabama 303
6: achilles 305


In [33]:
len(id_to_title)

6472830

## Redirect page

In [51]:
redirects_file = f'{partition_dir}/enwiki-20220420_redirects.ndjson'
redirects = read_data(redirects_file)

In [52]:
for i, (title1, title2) in enumerate(redirects.items()):
    print(f'{i+1}: {title1} - {title2}')
    if i > 4:
        break

1: accessibleComputing - computer accessibility
2: afghanistanHistory - history of Afghanistan
3: afghanistanGeography - geography of Afghanistan
4: afghanistanPeople - demographics of Afghanistan
5: afghanistanCommunications - communications in Afghanistan
6: afghanistanTransportations - transport in Afghanistan


## WikiSeeAlsoTitles-350K

In [70]:
data_path = f"{data_dir}/WikiSeeAlsoTItles-350K/"

train_path = f"{data_path}/trn.json"
test_path = f"{data_path}/tst.json"

In [80]:
same_titles, diff_titles = [], []

separate_titles(same_titles, diff_titles, train_path)
separate_titles(same_titles, diff_titles, test_path)

In [81]:
len(same_titles), len(diff_titles), len(same_titles)+len(diff_titles)

(692785, 99124, 791909)

In [82]:
total_pages = len(same_titles)+len(diff_titles)
print(f"Number of similar pages : {len(same_titles)} ({100.*len(same_titles)/total_pages:.4f}%)")
print(f"Number of different pages : {len(diff_titles)} ({100.*len(diff_titles)/total_pages:.4f}%)")

Number of similar pages : 692785 (87.4829%)
Number of different pages : 99124 (12.5171%)


In [90]:
diff_titles

['asociaciÃ³n Alumni',
 'andrÃ© Gide',
 'royal Antigua and Barbuda Defence Force',
 'list of Governors of Alabama',
 'casa BatllÃ³',
 'park GÃ¼ell',
 'casa MilÃ',
 'atanasoffâ\x80\x93Berry computer',
 'anadyr River',
 'abbotsford House',
 'approximant consonant',
 'ã\x86lle of Sussex',
 'aramaic language',
 'albrecht DÃ¼rer',
 'conservation and restoration of cultural heritage',
 'andrÃ© the Giant',
 'ã\x81ed mac CinÃ¡eda',
 'aberration of light',
 'alcobaÃ§a, Portugal',
 'amiga 500 Plus',
 'blindness in literature',
 'books of the Bible',
 'british and Irish Lions',
 'borsukâ\x80\x93Ulam theorem',
 'beerâ\x80\x93Lambert law',
 'braâ\x80\x93ket notation',
 'batman & Robin (film)',
 'brasÃ\xadlia',
 'murmured voice',
 'boer',
 'bathyscaphe Trieste',
 'bjÃ¸rn Lomborg',
 'canadaâ\x80\x93United States relations',
 'republic of Croatia Armed Forces',
 'wu Xing',
 'czesÅ\x82aw MiÅ\x82osz',
 'cleveland Indians',
 'rod (optics)',
 'customer-relationship management',
 'comet Haleâ\x80\x93Bopp',

In [91]:
redirects_titles = []
not_redirect_titles = []

for title in diff_titles: 
    if is_redirect(title):
        redirects_titles.append(title)
    else:
        not_redirect_titles.append(title)

In [92]:
print(f"Number of redirect pages : {len(redirects_titles)} ({100.*len(redirects_titles)/len(diff_titles):.4f}%)")
print(f"Number of not redirect pages : {len(not_redirect_titles)} ({100.*len(not_redirect_titles)/len(diff_titles):.4f}%)")

Number of redirect pages : 36069 (36.3878%)
Number of not redirect pages : 63055 (63.6122%)


In [93]:
not_redirect_titles

['asociaciÃ³n Alumni',
 'andrÃ© Gide',
 'casa BatllÃ³',
 'park GÃ¼ell',
 'casa MilÃ',
 'atanasoffâ\x80\x93Berry computer',
 'ã\x86lle of Sussex',
 'albrecht DÃ¼rer',
 'andrÃ© the Giant',
 'ã\x81ed mac CinÃ¡eda',
 'alcobaÃ§a, Portugal',
 'borsukâ\x80\x93Ulam theorem',
 'beerâ\x80\x93Lambert law',
 'braâ\x80\x93ket notation',
 'batman & Robin (film)',
 'brasÃ\xadlia',
 'bjÃ¸rn Lomborg',
 'canadaâ\x80\x93United States relations',
 'czesÅ\x82aw MiÅ\x82osz',
 'comet Haleâ\x80\x93Bopp',
 'cuitlÃ¡huac',
 'cuauhtÃ©moc',
 'cauchyâ\x80\x93Riemann equations',
 'deutsches Institut fÃ¼r Normung',
 'diogo CÃ£o',
 'domnall mac AilpÃ\xadn',
 'dÃ©jÃ\xa0 vu',
 'helsingÃ¸r',
 'encyclopÃ¦dia Britannica',
 'ã\x89douard Manet',
 'ã\x89variste Galois',
 'elblÄ\x85g',
 'ã\x86thelberht of Kent',
 'ã\x89lisabeth VigÃ©e Le Brun',
 'ã\x86thelred the Unready',
 'emperor ShÅ\x8dmu',
 'ã\x89mile Baudot',
 'evliya Ã\x87elebi',
 'emperor YÅ\x8dmei',
 'empress KÅ\x8dgyoku',
 'emperor KÅ\x8dtoku',
 'emperor KÅ\x8dbun',


In [94]:
is_node('andré Gide')

True

## LF-WikiSeeAlso-320K

In [96]:
data_path = f"{data_dir}/LF-WikiSeeAlso-320K/"

train_path = f"{data_path}/trn.json"
test_path = f"{data_path}/tst.json"

In [97]:
same_titles, diff_titles = [], []

separate_titles(same_titles, diff_titles, train_path)
separate_titles(same_titles, diff_titles, test_path)

In [98]:
len(same_titles), len(diff_titles), len(same_titles)+len(diff_titles)

(765946, 104651, 870597)

In [99]:
total_pages = len(same_titles)+len(diff_titles)
print(f"Number of similar pages : {len(same_titles)} ({100.*len(same_titles)/total_pages:.4f}%)")
print(f"Number of different pages : {len(diff_titles)} ({100.*len(diff_titles)/total_pages:.4f}%)")

Number of similar pages : 765946 (87.9794%)
Number of different pages : 104651 (12.0206%)


In [100]:
redirects_titles = []
not_redirect_titles = []

for title in diff_titles: 
    if is_redirect(title):
        redirects_titles.append(title)
    else:
        not_redirect_titles.append(title)

In [101]:
print(f"Number of redirect pages : {len(redirects_titles)} ({100.*len(redirects_titles)/len(diff_titles):.4f}%)")
print(f"Number of not redirect pages : {len(not_redirect_titles)} ({100.*len(not_redirect_titles)/len(diff_titles):.4f}%)")

Number of redirect pages : 38052 (36.3609%)
Number of not redirect pages : 66599 (63.6391%)


In [102]:
not_redirect_titles

['asociaciÃ³n Alumni',
 'andrÃ© Gide',
 'casa BatllÃ³',
 'park GÃ¼ell',
 'casa MilÃ',
 'atanasoffâ\x80\x93Berry computer',
 'ã\x81lfheimr',
 'ã\x86lle of Sussex',
 'albrecht DÃ¼rer',
 'andrÃ© the Giant',
 'ã\x81ed mac CinÃ¡eda',
 'alcobaÃ§a, Portugal',
 'borsukâ\x80\x93Ulam theorem',
 'beerâ\x80\x93Lambert law',
 'braâ\x80\x93ket notation',
 'batman & Robin (film)',
 'brasÃ\xadlia',
 'bjÃ¸rn Lomborg',
 'canadaâ\x80\x93United States relations',
 'czesÅ\x82aw MiÅ\x82osz',
 'comet Haleâ\x80\x93Bopp',
 'cuitlÃ¡huac',
 'cuauhtÃ©moc',
 'cauchyâ\x80\x93Riemann equations',
 'deutsches Institut fÃ¼r Normung',
 'diogo CÃ£o',
 'domnall mac AilpÃ\xadn',
 'dÃ©jÃ\xa0 vu',
 'helsingÃ¸r',
 'encyclopÃ¦dia Britannica',
 'ã\x89douard Manet',
 'ã\x89variste Galois',
 'elblÄ\x85g',
 'ã\x89lisabeth VigÃ©e Le Brun',
 'ã\x86thelred the Unready',
 'emperor ShÅ\x8dmu',
 'ã\x89mile Baudot',
 'evliya Ã\x87elebi',
 'emperor YÅ\x8dmei',
 'empress KÅ\x8dgyoku',
 'emperor KÅ\x8dtoku',
 'emperor KÅ\x8dbun',
 'empress 

## Wikipedia-500K

In [115]:
data_path = f"{data_dir}/Wikipedia-500K/"

train_path = f"{data_path}/trn.raw.json"
test_path = f"{data_path}/tst.raw.json"

In [116]:
same_titles, diff_titles = [], []

separate_titles(same_titles, diff_titles, train_path)
separate_titles(same_titles, diff_titles, test_path)

In [117]:
len(same_titles), len(diff_titles), len(same_titles)+len(diff_titles)

(2034155, 562979, 2597134)

In [118]:
total_pages = len(same_titles)+len(diff_titles)
print(f"Number of similar pages : {len(same_titles)} ({100.*len(same_titles)/total_pages:.4f}%)")
print(f"Number of different pages : {len(diff_titles)} ({100.*len(diff_titles)/total_pages:.4f}%)")

Number of similar pages : 2034155 (78.3231%)
Number of different pages : 562979 (21.6769%)


In [120]:
diff_titles

['asociaciÃ³n Alumni',
 'abdul Alhazred',
 'the Plague',
 'ancient Pueblo peoples',
 "all Souls\\' Day",
 'andrÃ© Gide',
 'royal Antigua and Barbuda Defence Force',
 'alfonso CuarÃ³n',
 'list of Governors of Alabama',
 'aromatic hydrocarbon',
 'annales School',
 'casa BatllÃ³',
 'park GÃ¼ell',
 'casa MilÃ',
 'apache Software Foundation',
 'atanasoffâ\x80\x93Berry computer',
 'andrÃ©-Marie AmpÃ¨re',
 'abbotsford House',
 'nYSE MKT',
 "dodo (Alice\\'s Adventures in Wonderland)",
 'alexander Emanuel Agassiz',
 'ajax (mythology)',
 'alexander Severus',
 'alhazen',
 'amara Sinha',
 'alfonso XII of Spain',
 'alphonsus a Sancta Maria',
 'st. Ambrose Traversari',
 'ananda',
 'andrÃ© de Longjumeau',
 'ã\x86gir',
 'antibiotics',
 'amalric I of Jerusalem',
 'amalric II of Jerusalem',
 'antibiotic resistance',
 'adobe Systems',
 'anne BrontÃ«',
 'ã\x81satrÃº in the United States',
 'ã\x86lfheah of Canterbury',
 'list of animated television series',
 'african American',
 'cuisine of the United Stat

In [119]:
redirects_titles = []
not_redirect_titles = []

for title in diff_titles: 
    if is_redirect(title):
        redirects_titles.append(title)
    else:
        not_redirect_titles.append(title)

In [121]:
print(f"Number of redirect pages : {len(redirects_titles)} ({100.*len(redirects_titles)/len(diff_titles):.4f}%)")
print(f"Number of not redirect pages : {len(not_redirect_titles)} ({100.*len(not_redirect_titles)/len(diff_titles):.4f}%)")

Number of redirect pages : 239167 (42.4824%)
Number of not redirect pages : 323812 (57.5176%)


In [122]:
not_redirect_titles

['asociaciÃ³n Alumni',
 "all Souls\\' Day",
 'andrÃ© Gide',
 'alfonso CuarÃ³n',
 'casa BatllÃ³',
 'park GÃ¼ell',
 'casa MilÃ',
 'atanasoffâ\x80\x93Berry computer',
 'andrÃ©-Marie AmpÃ¨re',
 "dodo (Alice\\'s Adventures in Wonderland)",
 'andrÃ© de Longjumeau',
 'ã\x86gir',
 'anne BrontÃ«',
 'ã\x81satrÃº in the United States',
 'ã\x86lfheah of Canterbury',
 "sid Meier\\'s Alpha Centauri",
 'ã\x86lle of Sussex',
 "amdahl\\'s law",
 "ardal O\\'Hanlon",
 'albrecht DÃ¼rer',
 'andrÃ© the Giant',
 "antoine Thomson d\\'Abbadie",
 'ã\x81ed mac CinÃ¡eda',
 'andreas SchlÃ¼ter',
 'anaÃ¯s Nin',
 "`Abdu\\'l-BahÃ¡",
 'alcobaÃ§a, Portugal',
 'arithmeticâ\x80\x93geometric mean',
 "banca d\\'Italia",
 'bifrÃ¶st',
 'brÃ\xadsingamen',
 'borsukâ\x80\x93Ulam theorem',
 'boÃ¶tes',
 'boseâ\x80\x93Einstein condensate',
 'beerâ\x80\x93Lambert law',
 'bÃ©la BartÃ³k',
 'blue Ã\x96yster Cult',
 'bÃ©zier curve',
 "bernoulli\\'s inequality",
 'barÄ±Å\x9f ManÃ§o',
 "bÃ©zout\\'s identity",
 'bjÃ¸rn Lomborg',
 'cold War

In [112]:
'atom probe' in title_to_id

True

In [113]:
'atom_probe'.replace('_',' ')

'atom probe'