### Import Data and Index

In [1]:
import bz2

data_path = 'idwiki-latest-pages-articles-multistream.xml.bz2'
index_path = 'idwiki-latest-pages-articles-multistream-index.txt.bz2'

### Index file contains the title of each article
(https://stackoverflow.com/questions/29020732/how-to-use-information-provided-in-wiki-downloads-index-file)

In [2]:
count_title_from_index = 0

for i, line in enumerate(bz2.BZ2File(index_path, 'r')):
    count_title_from_index += 1

count_title_from_index

1660717

In [3]:
count_title_from_data = 0

for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    line = line.decode('utf-8')

    if '<title>' in line:
        count_title_from_data += 1

count_title_from_data

1660717

### Searching for related categories
Since many categories that have been listed doesn't have its own 'Wiki page' (e.g. Kategori: Emigran Taiwan di Amerika Serikat), I decided to list categories that have their own page, so we can retrieve the articles that belongs to each category.

We have the list of every article's title in index.txt, so we can use it to decide which category (that have been listed or maybe have not listed in the Excel) that have their own page.

Why we should only use the categories that have their own page?
For the reason that I use 'Category Approach' in this wikidata, and the information about what articles that belong to a category only available in the 'category page', we should only use the categories that have their own page.

The categories will be searched on the xml dump file, since we want to retrieve the content either from pywikibot or mwparserfromhell library

In [4]:
list_category = []

for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    line = line.decode('utf-8')

    if '<title>Kategori:' in line:
        # remove <title></title> tag
        stripped_line = line[20:-9]
        list_category.append(stripped_line)

list_category

['Yahudi',
 'Indonesia',
 'Presiden',
 'Stan',
 'Kimia',
 'Buddhisme',
 'Rumpun bahasa Indo-Eropa',
 'Presiden Amerika Serikat',
 'Benua',
 'Malaysia',
 'Sastra',
 'Bahasa Semit',
 'Agama',
 'Bahasa',
 'Sejarah',
 'Aksara',
 'Waktu',
 'Statistika',
 'Matematika',
 'Geologi',
 'Fisika',
 'Biologi',
 'Astronomi',
 'Transportasi',
 'Teknologi',
 'Teknik',
 'Pertanian',
 'Pendidikan',
 'Otomotif',
 'Komunikasi',
 'Informatika',
 'Hukum',
 'Bisnis dan industri',
 'Arsitektur',
 'Televisi',
 'Internet',
 'Hobi',
 'Hiburan',
 'Film',
 'Dapur',
 'Budaya',
 'Berkebun',
 'Manusia',
 'Tokoh Indonesia',
 'Sosiologi',
 'Sejarah Indonesia',
 'Sejarah Nusantara',
 'Psikologi',
 'Mitologi',
 'Politik',
 'Geografi',
 'Filsafat',
 'Ekonomi',
 'Arkeologi',
 'Antropologi',
 'Permainan',
 'Musik',
 'Olahraga',
 'Pariwisata',
 'Rekreasi',
 'Sandiwara',
 'Seni',
 'Kabinet Indonesia',
 'Bahasa Sino-Tibet',
 'Peristiwa 2004',
 'Islam',
 'Raja Jawa',
 'Sepak bola',
 'Perang Dingin',
 'Uni Soviet',
 'Perang Duni

#### DB Connection

In [5]:
import MySQLdb
import dotenv

config = dotenv.dotenv_values('.env')

db_credentials = {
    'host' : 'localhost',
    'user' : config['USERNAME'],
    'password' : config['PASSWORD'],
    'database' : config['DATABASE_NAME']
}

conn = MySQLdb.connect(**db_credentials)
conn.close()

#### Selecting categories and its subcategories that will be used
Documentation: 
- https://www.mediawiki.org/wiki/Manual:Page_table
- https://www.mediawiki.org/wiki/Manual:Categorylinks_table

In [59]:
category_to_be_used = {}
category_without_subcategory = []
page_to_be_used = {}

In [7]:
def get_subcategories(conn, category):
    """
    :param conn: MySQL's connection: 
    :param category: Pages will be searched based on this category (case-sensitive):
    :return:
    
    Get category's subcategories that will be used.
    
    The category itself will not be appended to the dict, cl_to can be used to get the 'super categories'; unless it doesn't have subcategories.
    """
    
    # eliminates the separation using space between words in category
    if len(category.split()) > 1 :
        category = category.replace(" ", "_")
    
    cur = conn.cursor()
    cur.execute( f"select page_id, page_title, page_namespace, cl_to from page p inner join categorylinks c on p.page_id = c.cl_from "
                 f"where page_namespace = 14 and cl_to = '{category}'" )
    result = cur.fetchall()
    
    if (len(result)) == 0:
        category = category.replace("_", " ")
        if category not in category_without_subcategory:
            category_without_subcategory.append(category)
            print(f"Category without subcategory: {category}")
    else : 
        for i, line in enumerate(result):
            page_id = line[0]
            page_title = line[1].decode('utf-8').replace("_", " ")
            page_namespace = line[2]
            cl_to = line[3].decode('utf-8').replace("_", " ")
            
            # check if the key already exists; if so, only append the cl_to
            if page_id in category_to_be_used:
                # check duplicates in cl_to
                if cl_to not in category_to_be_used[page_id]['cl_to']:
                    category_to_be_used[page_id]['cl_to'].append(cl_to)
            else:
                #  add the new category where page_id as the key
                category_to_be_used[page_id] = {
                    "page_id": page_id,
                    "page_title" : page_title,
                    "page_namespace" : page_namespace,
                    "cl_to" : [cl_to]
                }

In [8]:
def get_page_from_category(conn, category):
    """
    :param conn: MySQL's connection: 
    :param category: Pages will be searched based on this category (case-sensitive):
    :return:
   
    Get all pages that belong to a category.
    """

    # eliminates the separation using space between words in category
    if len(category.split()) > 1 :
        category = category.replace(" ", "_")
    
    cur = conn.cursor()
    cur.execute( f"select page_id, page_title, page_namespace from page p inner join categorylinks c on p.page_id = c.cl_from "
                 f"where page_namespace = 0 and cl_to = '{category}'" )
    
    result = cur.fetchall()
    _compose_page_result(result)

def get_page_from_page_title(conn, page_title):
    """
    :param conn: MySQL's connection: 
    :param page_title: Page's title that will be searched (case-sensitive): 
    :return:

    Get a page by its title.
    """

    # eliminates the separation using space between words in page_title
    if len(page_title.split()) > 1 :
        page_title = page_title.replace(" ", "_")

    cur = conn.cursor()
    cur.execute( f"select page_id, page_title, page_namespace from page p where page_namespace = 0 and page_title = '{page_title}'")
    
    result = cur.fetchall()
    _compose_page_result(result)
            
def _compose_page_result(result):
    """
    :param result: SQL Fetch result: 
    :return: 
  
    Append page_to_be_used dictionary that will be used by get_page_from_category() and get_page_from_page_title().
    """
    for i, line in enumerate(result):
        page_id = line[0]
        page_title = line[1].decode('utf-8').replace("_", " ")
        page_namespace = line[2]

        # check if the key already exists
        if page_id not in page_to_be_used:
            page_to_be_used[page_id] = {
                "page_id": page_id,
                "page_title" : page_title,
                "page_namespace" : page_namespace,
            }
    

In [60]:
conn = MySQLdb.connect(**db_credentials)

'''
get_subcategories(conn, "Sejarah Indonesia")
get_subcategories(conn, "Orde Lama")
get_subcategories(conn, "Republik Indonesia Serikat")
get_subcategories(conn, "Tokoh Orde Lama")
get_subcategories(conn, "Indonesia dalam tahun 1945")
get_subcategories(conn, "Indonesia dalam tahun 1946")
get_subcategories(conn, "Indonesia dalam tahun 1947")
get_subcategories(conn, "Indonesia dalam tahun 1948")
get_subcategories(conn, "Indonesia dalam tahun 1949")
get_subcategories(conn, "Indonesia dalam tahun 1950")
get_subcategories(conn, "Indonesia dalam tahun 1951")
get_subcategories(conn, "Indonesia dalam tahun 1952")
get_subcategories(conn, "Indonesia dalam tahun 1953")
get_subcategories(conn, "Indonesia dalam tahun 1954")
get_subcategories(conn, "Indonesia dalam tahun 1955")
get_subcategories(conn, "Indonesia dalam tahun 1956")
get_subcategories(conn, "Indonesia dalam tahun 1957")
get_subcategories(conn, "Indonesia dalam tahun 1958")
get_subcategories(conn, "Indonesia dalam tahun 1959")
get_subcategories(conn, "Indonesia dalam tahun 1960")
get_subcategories(conn, "Indonesia dalam tahun 1961")
get_subcategories(conn, "Indonesia dalam tahun 1962")
get_subcategories(conn, "Indonesia dalam tahun 1963")
get_subcategories(conn, "Indonesia dalam tahun 1964")
get_subcategories(conn, "Indonesia dalam tahun 1965")
get_subcategories(conn, "Proklamasi Kemerdekaan Indonesia")
get_subcategories(conn, "Perang Kemerdekaan Indonesia")
'''

for key, value in category_to_be_used.items():
    get_page_from_category(conn, category_to_be_used[key]['page_title'])

for i, line in enumerate(category_without_subcategory):
    get_page_from_category(conn, line)

conn.close()

Category without subcategory: Orde Lama
Category without subcategory: Republik Indonesia Serikat


In [61]:
len(page_to_be_used)

1984

### mwparserfromhell and pywikibot library

In [172]:
import mwparserfromhell
import pywikibot
page_with_infobox = {}
page_without_infobox = {}

In [173]:
def get_infobox(page_title):
    """
    :param page_title: 
    :return: 
    
    Get infobox based on page title. 
    
    If the page doesn't have infobox, it will be appended to page_without_infobox (vice versa).
    """
    # eliminates the separation using space between words in page_title
    global information
    
    if len(page_title.split()) > 1 :
        page_title = page_title.replace(" ", "_")
        
    site = pywikibot.Site('id', 'wikipedia')
    page = pywikibot.Page(site, page_title)
    
    try :
        text = page.get()
        wikitext = mwparserfromhell.parse(text)
        infoboxes = wikitext.filter_templates(matches = 'Infobox')
        if infoboxes:
            for infobox in infoboxes :
                information = {param.name.strip_code().strip(): param.value.strip_code().strip() for param in infobox.params}
                # save the infobox's name as the first 'index' 
                information = {'infobox_name' : infobox.name, **information}                
                
                # Append to the list of values
                if page_title.replace("_", " ") in page_with_infobox:
                        page_with_infobox[page_title.replace("_", " ")].append(information)
                # Make the values as list since the page can have multiple infoboxes
                else :
                    page_with_infobox[page_title.replace("_", " ")] = [information]
        else:
            page_without_infobox[page_title.replace("_", " ")] = wikitext
    except pywikibot.page._basepage.IsRedirectPageError as e :
        print(f"!!! Title {page_title} has {e} Exception !!!")

In [175]:
for key, value in page_to_be_used.items():
    get_infobox(page_to_be_used[key]['page_title'])

!!! Title Pangeran_Wira_Kasoema has Page [[id:Pangeran Wira Kasoema]] is a redirect page. Exception !!!
!!! Title Kalimantan_Tenggara has Page [[id:Kalimantan Tenggara]] is a redirect page. Exception !!!
!!! Title Temenggung_Setia_Pahlawan has Page [[id:Temenggung Setia Pahlawan]] is a redirect page. Exception !!!
!!! Title Sejarah_Cipta_Pulau_Jawa has Page [[id:Sejarah Cipta Pulau Jawa]] is a redirect page. Exception !!!
!!! Title Sejarah_pulau_Jawa has Page [[id:Sejarah pulau Jawa]] is a redirect page. Exception !!!
!!! Title PPKI has Page [[id:PPKI]] is a redirect page. Exception !!!
!!! Title Nederlands-Indische_gulden has Page [[id:Nederlands-Indische gulden]] is a redirect page. Exception !!!
!!! Title Nederlands-Indische_Tramweg_Maatschappij has Page [[id:Nederlands-Indische Tramweg Maatschappij]] is a redirect page. Exception !!!
!!! Title Hasnan_A._Habib has Page [[id:Hasnan A. Habib]] is a redirect page. Exception !!!
!!! Title A._A._Rifai has Page [[id:A. A. Rifai]] is a red

In [176]:
# Solving the redirect page
conn = MySQLdb.connect(**db_credentials)

get_page_from_page_title(conn, 'Panitia Persiapan Kemerdekaan Indonesia')
get_page_from_page_title(conn, 'Hasnan Habib')
get_page_from_page_title(conn, 'Achmad Rifai Manggabarani')

get_infobox('Panitia Persiapan Kemerdekaan Indonesia')
get_infobox('Hasnan Habib')
get_infobox('Achmad Rifai Manggabarani')

conn.close()

In [303]:
type(page_with_infobox.items())

dict_items

In [360]:
import re
page_with_infobox_orla = {}
page_with_infobox_no_year = {}

for key, value in page_with_infobox.items():    
    is_orla = False
    for index_value, value_inside in enumerate(value):
        for key_final, value_final in value_inside.items():
            numbers = re.findall(r'\b\d+\b', page_with_infobox[key][index_value][key_final])
            if any(1945 <= int(num) <= 1965 for num in numbers):
                is_orla = True
                break
        break
    if is_orla:
        page_with_infobox_orla[key] = value
    else :
        page_with_infobox_no_year[key] = value
    
    
    #     for keya, valua in enumerate(page_with_infobox[key][index]):
    #         numbers = re.findall(r'\b\d+\b', valua)
    #         if any(1945 <= int(num) <= 1965 for num in numbers):
    #             is_orla = True
    #     if is_orla:
    #         rrr.append(page_with_infobox[key])
    #         break
    #     else :
    #         ddd.append(page_with_infobox[key])
    # 
    # if len(rrr) > 0 or len(ddd) == 0:
    #     break
    
    
    
    
    # for key_inside, value_inside in page_with_infobox[key][index].items():
    #     numbers = re.findall(r'\b\d+\b', value_inside)
    #     if any(1945 <= int(num) <= 1965 for num in numbers):
    #         is_orla = True
    # if (is_orla) :
    #     page_with_infobox_orla[key] = value
    # else :
    #     page_with_infobox_no_year[key] = value

In [362]:
for key, value in page_with_infobox_orla.items():
    print(value)
    break

[{'infobox_name': 'Infobox Penulis\n', 'name': 'Sobron Aidit', 'image': 'Sobron Aidit, Pekan Buku Indonesia 1954, p190.jpg', 'imagesize': '200px', 'caption': 'Sobron Aidit tahun 1954', 'pseudonym': 'Simon', 'birthname': '', 'birth_date': '', 'birth_place': 'Tanjung Pandan, Belitung, Hindia Belanda', 'death_date': '', 'death_place': 'Paris, Prancis', 'occupation': 'Sastrawan, guru', 'nationality': 'Prancis', 'ethnicity': 'Minangkabau, Melayu Indonesia', 'religion': 'Katolik', 'citizenship': '', 'period': '', 'genre': '', 'subject': '', 'movement': '', 'notableworks': '', 'spouse': '', 'partner': '', 'children': 'Wanita Tekun Pertiwi', 'relatives': 'D.N. Aidit (kakak)Asahan Alham (adik)', 'influences': '', 'influenced': '', 'awards': '', 'signature': '', 'website': '', 'portaldisp': ''}]


In [363]:
for key,value in page_with_infobox_no_year.items():
    print(value)
    break

[{'infobox_name': 'Infobox military conflict\n', 'conflict': 'Penyerbuan Jawa 1811', 'image': 'B26056056H - The landing of the British Army at Chillinching on the island of Java 4th Augt. 1811.jpg', 'caption': 'Ilustrasi pendaratan Inggris di Cilincing pada 4 Agustus 1811', 'partof': 'Peperangan era Napoleon', 'date': '3 Agustus–18 September 1811', 'place': 'Jawa, Hindia Belanda', 'result': 'Kemenangan Inggris', 'territory': 'Jawa direbut oleh Britania', 'combatant1': 'Britania Raya', 'combatant2': 'Prancis\n  Hindia Belanda', 'commander1': 'Robert Stopford\n Samuel Auchmuty\n Robert Rollo Gillespie', 'commander2': 'Jan Willem Janssens', 'strength1': '12.000', 'strength2': '17.000', 'casualties1': '1.000', 'casualties2': '2.000', '1': ''}]


In [269]:
categories_from_page = {}

def get_categories_from_page_title(conn, page_title) :
    """
   :param conn: MySQL's connection: 
   :param page_title: An article's title:
   :return:
  
   Get all categories that belong to a page.
   """

    # eliminates the separation using space between words in category
    if len(page_title.split()) > 1 :
        page_title = page_title.replace(" ", "_")

    cur = conn.cursor()
    cur.execute(f"""
                SELECT
                    GROUP_CONCAT(REPLACE(cl.cl_to, '_', ' ')) AS categories
                FROM
                page AS p LEFT JOIN categorylinks AS cl ON p.page_id = cl.cl_from
                WHERE
                p.page_namespace = 0 and p.page_title = "{page_title}"
                """)

    result = cur.fetchall()
    try :
        categories_from_page[page_title.replace("_", " ")] = result[0][0].decode('utf-8')
    except AttributeError  as e:
        if page_title == 'Achmad_Rifai_Manggabarani':
            categories_from_page[page_title.replace("_", " ")] = 'Kelahiran 1924, Kematian 2001, Meninggal usia 77, Pejuang kemerdekaan Indonesia, Tokoh Bugis, Tokoh Sulawesi Selatan, Tokoh dari Polewali Mandar, Tokoh Angkatan 45, Politikus Indonesia, Gubernur Sulawesi Selatan, Penerima Bintang Gerilya'
        else : 
            print(e)

In [270]:
conn = MySQLdb.connect(**db_credentials)

for key, value in page_with_infobox.items():
    get_categories_from_page_title(conn, key)

conn.close()

### Extract data to excel

In [271]:
import pandas as pd

df = pd.DataFrame(list(page_with_infobox.items()), columns=['Title', 'Infobox'])
df_cat = pd.DataFrame(list(categories_from_page.items()), columns=['Title', 'Categories'])

result_df = pd.merge(df_cat, df, on='Title')

result_df.to_excel('infobox.xlsx', index=False)

In [183]:
df = pd.DataFrame(list(page_without_infobox.items()), columns=['Title', 'Text'])
df.to_excel('noinfobox.xlsx', index=False)

### What is the most infobox used in the articles

In [192]:
page_with_infobox['Penyerbuan Jawa (1811)'][0]['infobox_name']

'Infobox military conflict\n'

In [229]:
count_infobox_per_names = {}

for key in page_with_infobox:
    for index, key_inside in enumerate(page_with_infobox[key]):
        infobox_name = str(page_with_infobox[key][index]['infobox_name'])
        if infobox_name in count_infobox_per_names:
            count_infobox_per_names[infobox_name] += 1
        else:
            count_infobox_per_names[infobox_name] = 1

In [230]:
count_infobox_per_names

{'Infobox military conflict\n': 32,
 'Infobox Military Conflict\n': 22,
 'Infobox Film\n': 107,
 'Infobox former country\n': 13,
 'Infobox Former Country\n': 24,
 'Infobox Military Conflict \n': 2,
 'Infobox Penulis\n': 9,
 'Infobox person\n': 118,
 'Infobox musical artist': 1,
 'Infobox civil conflict\n': 4,
 'Infobox civilian attack\n': 22,
 'Infobox monument\n': 2,
 'Infobox Universitas\n': 14,
 'Infobox Sekolah\n': 4,
 'Infobox Observatory\n': 1,
 'Infobox cagar budaya\n': 2,
 'Infobox building\n': 5,
 'Infobox tempat wisata\n': 1,
 'Infobox Museum\n ': 2,
 'Infobox prison\n': 1,
 'Infobox concentration camp\n': 1,
 'Infobox Former Subdivision\n': 7,
 'Infobox royalty\n': 25,
 'Infobox former subdivision\n': 6,
 'Infobox_Monarch \n': 21,
 'Infobox event \n': 1,
 'Infobox military conflict': 9,
 'Brunei Darussalam infobox': 1,
 'Infobox country\n': 12,
 'Infobox former country': 1,
 'Infobox Monarch \n': 3,
 'Infobox royalty': 7,
 'Infobox government agency\n': 3,
 'Infobox Civil Co