### Import Data and Index

In [174]:
import bz2

# bz2-compressed multistream XML dump and its companion index file.
# Both paths are relative, so the files are looked up in the current working directory.
data_path = 'idwiki-latest-pages-articles-multistream.xml.bz2'
index_path = 'idwiki-latest-pages-articles-multistream-index.txt.bz2'

### Index file contains the title of each article
(https://stackoverflow.com/questions/29020732/how-to-use-information-provided-in-wiki-downloads-index-file)

In [175]:
# Count the lines of the compressed index file; every line is taken to be one title.
# The `with` block guarantees the file handle is closed once counting is done.
with bz2.BZ2File(index_path, 'r') as index_file:
    count_title_from_index = sum(1 for _line in index_file)

count_title_from_index

1660717

In [176]:
# Count the lines of the XML dump that contain a <title> tag.
# The marker is pure ASCII, so it can be matched on the raw bytes; this avoids
# decoding every line of the dump just to test for a substring.
# The `with` block guarantees the file handle is closed once counting is done.
with bz2.BZ2File(data_path, 'r') as dump_file:
    count_title_from_data = sum(1 for raw_line in dump_file if b'<title>' in raw_line)

count_title_from_data

1660717

### Searching for related categories
Since many of the listed categories don't have their own 'Wiki page' (e.g. Kategori: Emigran Taiwan di Amerika Serikat), I decided to list only the categories that do have their own page, so we can retrieve the articles that belong to each category.

We have the title of every article in index.txt, so we can use it to determine which categories (whether or not they are already listed in the Excel file) have their own page.

Why should we only use the categories that have their own page?
Because I use the 'Category Approach' on this wiki data, and the information about which articles belong to a category is only available on the 'category page', we should only use the categories that have their own page.

The categories will be searched in the XML dump file, since we want to retrieve the content using either the pywikibot or the mwparserfromhell library.

In [205]:
# Markers that surround a category name inside a <title> line of the dump.
TITLE_OPEN = b'<title>Kategori:'
TITLE_CLOSE = b'</title>'

list_category = []

# The `with` block guarantees the dump file handle is closed after the scan.
with bz2.BZ2File(data_path, 'r') as dump_file:
    for raw_line in dump_file:
        if TITLE_OPEN in raw_line:
            # Take the text between 'Kategori:' and the closing tag instead of
            # slicing at fixed offsets, so the result does not depend on the
            # line's indentation or on its line ending.
            raw_name = raw_line.split(TITLE_OPEN, 1)[1].split(TITLE_CLOSE, 1)[0]
            list_category.append(raw_name.decode('utf-8'))

list_category

['Yahudi',
 'Indonesia',
 'Presiden',
 'Stan',
 'Kimia',
 'Buddhisme',
 'Rumpun bahasa Indo-Eropa',
 'Presiden Amerika Serikat',
 'Benua',
 'Malaysia',
 'Sastra',
 'Bahasa Semit',
 'Agama',
 'Bahasa',
 'Sejarah',
 'Aksara',
 'Waktu',
 'Statistika',
 'Matematika',
 'Geologi',
 'Fisika',
 'Biologi',
 'Astronomi',
 'Transportasi',
 'Teknologi',
 'Teknik',
 'Pertanian',
 'Pendidikan',
 'Otomotif',
 'Komunikasi',
 'Informatika',
 'Hukum',
 'Bisnis dan industri',
 'Arsitektur',
 'Televisi',
 'Internet',
 'Hobi',
 'Hiburan',
 'Film',
 'Dapur',
 'Budaya',
 'Berkebun',
 'Manusia',
 'Tokoh Indonesia',
 'Sosiologi',
 'Sejarah Indonesia',
 'Sejarah Nusantara',
 'Psikologi',
 'Mitologi',
 'Politik',
 'Geografi',
 'Filsafat',
 'Ekonomi',
 'Arkeologi',
 'Antropologi',
 'Permainan',
 'Musik',
 'Olahraga',
 'Pariwisata',
 'Rekreasi',
 'Sandiwara',
 'Seni',
 'Kabinet Indonesia',
 'Bahasa Sino-Tibet',
 'Peristiwa 2004',
 'Islam',
 'Raja Jawa',
 'Sepak bola',
 'Perang Dingin',
 'Uni Soviet',
 'Perang Duni

#### DB Connection

In [179]:
import MySQLdb
import dotenv

# Read the database settings from the local .env file.
config = dotenv.dotenv_values('.env')

# Keyword arguments handed to MySQLdb.connect(); the host is fixed to localhost.
db_credentials = dict(
    host='localhost',
    user=config['USERNAME'],
    password=config['PASSWORD'],
    database=config['DATABASE_NAME'],
)

# Open a connection and close it again straight away
# (presumably a check that the credentials are accepted).
conn = MySQLdb.connect(**db_credentials)
conn.close()

#### Selecting the categories and their subcategories that will be used
Documentation: 
- https://www.mediawiki.org/wiki/Manual:Page_table
- https://www.mediawiki.org/wiki/Manual:Categorylinks_table

In [573]:
# Subcategories collected by get_subcategories(), keyed by page_id.
category_to_be_used = {}
# Category names for which get_subcategories() got no rows back.
category_without_subcategory = []
# Pages collected through _compose_page_result(), keyed by page_id.
page_to_be_used = {}

In [574]:
def get_subcategories(conn, category):
    """
    Collect the direct subcategories of a category into ``category_to_be_used``.

    Parameters:
    - conn: MySQL connection
    - category: Subcategories will be searched based on this category (case-sensitive);
      spaces are converted to underscores before the lookup

    Behaviour:
    - Every page in namespace 14 that is linked to ``category`` through categorylinks is
      stored in the module-level dict ``category_to_be_used``, keyed by page_id
    - The category itself will not be appended to the dict; the 'cl_to' list of each entry
      records the 'super categories' seen so far (without duplicates)
    - If the query returns no rows, the category (underscores turned back into spaces) is
      appended once to ``category_without_subcategory`` and a message is printed

    Returns:
    - None; the results are written to the module-level containers
    """

    # the lookup uses underscores instead of spaces between words
    category = category.replace(" ", "_")

    # The category is passed as a query parameter so that the driver escapes it:
    # names containing quotes (e.g. "St. Patrick's") neither break the statement
    # nor allow SQL injection.
    query = (
        "select page_id, page_title, page_namespace, cl_to "
        "from page p inner join categorylinks c on p.page_id = c.cl_from "
        "where page_namespace = 14 and cl_to = %s"
    )

    cur = conn.cursor()
    try:
        cur.execute(query, (category,))
        result = cur.fetchall()
    finally:
        # release the cursor even when the query fails
        cur.close()

    if len(result) == 0:
        category = category.replace("_", " ")
        if category not in category_without_subcategory:
            category_without_subcategory.append(category)
            print(f"Category without subcategory: {category}")
        return

    for row in result:
        page_id = row[0]
        # titles come back as bytes and use underscores between words
        page_title = row[1].decode('utf-8').replace("_", " ")
        page_namespace = row[2]
        cl_to = row[3].decode('utf-8').replace("_", " ")

        entry = category_to_be_used.get(page_id)
        if entry is None:
            # add the new category with page_id as the key
            category_to_be_used[page_id] = {
                "page_id": page_id,
                "page_title" : page_title,
                "page_namespace" : page_namespace,
                "cl_to" : [cl_to]
            }
        elif cl_to not in entry['cl_to']:
            # the key already exists: only record the additional parent category
            entry['cl_to'].append(cl_to)

In [575]:
def get_page_from_category(conn, category):
    """
    Collect all pages that belong to a category.

    Parameters:
    - conn: MySQL connection
    - category: Pages will be searched based on this category (case-sensitive);
      spaces are converted to underscores before the lookup

    Behaviour:
    - Selects every page in namespace 0 that is linked to ``category`` through
      categorylinks and passes the rows to _compose_page_result()

    Returns:
    - None
    """

    # the lookup uses underscores instead of spaces between words
    category = category.replace(" ", "_")

    # The category is passed as a query parameter so that the driver escapes it:
    # names containing quotes neither break the statement nor allow SQL injection.
    query = (
        "select page_id, page_title, page_namespace "
        "from page p inner join categorylinks c on p.page_id = c.cl_from "
        "where page_namespace = 0 and cl_to = %s"
    )

    cur = conn.cursor()
    try:
        cur.execute(query, (category,))
        result = cur.fetchall()
    finally:
        # release the cursor even when the query fails
        cur.close()

    _compose_page_result(result)

def get_page_from_page_title(conn, page_title):
    """
    Collect the page that has the given title.

    Parameters:
    - conn: MySQL connection
    - page_title: Page's title that will be searched (case-sensitive);
      spaces are converted to underscores before the lookup

    Behaviour:
    - Selects the pages in namespace 0 whose page_title equals ``page_title`` and
      passes the rows to _compose_page_result()

    Returns:
    - None
    """

    # the lookup uses underscores instead of spaces between words
    page_title = page_title.replace(" ", "_")

    # The title is passed as a query parameter so that the driver escapes it:
    # titles containing quotes neither break the statement nor allow SQL injection.
    query = (
        "select page_id, page_title, page_namespace "
        "from page p where page_namespace = 0 and page_title = %s"
    )

    cur = conn.cursor()
    try:
        cur.execute(query, (page_title,))
        result = cur.fetchall()
    finally:
        # release the cursor even when the query fails
        cur.close()

    _compose_page_result(result)
            
def _compose_page_result(result):
    """
    Merge fetched page rows into the module-level ``page_to_be_used`` dictionary.

    Parameters:
    - result: SQL Fetch result; each row is read as (page_id, page_title, page_namespace),
      with page_title given as UTF-8 bytes

    - Shared by get_page_from_category() and get_page_from_page_title()
    - A page_id that is already present keeps its existing entry
    """
    for row in result:
        key = row[0]
        entry = {
            "page_id": key,
            # titles use underscores between words; show them with spaces
            "page_title" : row[1].decode('utf-8').replace("_", " "),
            "page_namespace" : row[2],
        }
        # setdefault() only stores the entry when the key is not there yet
        page_to_be_used.setdefault(key, entry)
    

In [576]:
conn = MySQLdb.connect(**db_credentials)

try:
    # Alternative lookups, kept for reference:
    # get_subcategories(conn, "Orde Lama")
    # get_subcategories(conn, "Republik Indonesia Serikat")

    # fills page_to_be_used with the pages that belong to the category
    get_page_from_category(conn, 'Orde Lama')
finally:
    # always release the connection, even when the lookup raises
    conn.close()


In [578]:
# Display the pages collected so far (keyed by page_id).
page_to_be_used

{85082: {'page_id': 85082,
  'page_title': 'Demokrasi Terpimpin (1959–1965)',
  'page_namespace': 0},
 1061635: {'page_id': 1061635,
  'page_title': 'Front Nasional (Orde Lama)',
  'page_namespace': 0},
 1151450: {'page_id': 1151450,
  'page_title': 'Sejarah Indonesia (1965–1966)',
  'page_namespace': 0},
 1884981: {'page_id': 1884981,
  'page_title': 'Partai Persatuan Indonesia Raya',
  'page_namespace': 0},
 2177254: {'page_id': 2177254,
  'page_title': 'Hiperinflasi Indonesia 1963-1965',
  'page_namespace': 0},
 68557: {'page_id': 68557,
  'page_title': 'Tri Tuntutan Rakyat',
  'page_namespace': 0},
 610391: {'page_id': 610391,
  'page_title': 'Sinterklas Hitam',
  'page_namespace': 0},
 13029: {'page_id': 13029,
  'page_title': 'Gunting Syafruddin',
  'page_namespace': 0},
 1602997: {'page_id': 1602997,
  'page_title': 'Sengketa Irian Barat',
  'page_namespace': 0}}