In [1]:
# Example: Former Subdivision

import pandas as pd

# Merge the two "Former Subdivision" sheets of the infobox workbook into one CSV.
csv_file_path = 'former-subdivision-final.csv'

# Raw string: the Windows path is full of backslashes (\K, \S, \i, ...) that a
# normal literal treats as invalid escape sequences (SyntaxWarning on Python 3.12+).
# The context manager closes the workbook handle once both sheets are loaded.
with pd.ExcelFile(r'D:\Kuliah\Skripsi\Ontologi-Sejarah-Indonesia\MichaelDawBalma\Code\infoboxes\infobox-orla.xlsx') as excel_path:
    sheet_name = pd.read_excel(excel_path, 'Former Subdivision')
    sheet_name2 = pd.read_excel(excel_path, 'former subdivision1')

# Stack the rows of both sheets; the index is not written to the CSV.
merged_df = pd.concat([sheet_name, sheet_name2])

merged_df.to_csv(csv_file_path, index=False)

In [69]:
# Clean the merged Former Subdivision data
import csv
# Load the merged CSV back into a list with one plain dict per row.
former_subdivision_clean = []

with open(csv_file_path, newline='', encoding='utf-8') as merged_csv:
    former_subdivision_clean.extend(
        dict(record) for record in csv.DictReader(merged_csv)
    )

### Cleaning lat-long data

In [3]:
'''
Searching for longitude and latitude based on the docs:
https://id.wikipedia.org/wiki/Templat:Infobox_former_subdivision (common_name)
'''
import geopy

def get_coordinates(place, geolocator=None):
    """Look up the latitude and longitude of ``place``.

    Args:
        place: Query handed to the geocoder's ``geocode`` method.
        geolocator: Optional object exposing ``geocode(query)``. When omitted,
            a ``geopy.Nominatim`` client is created for this call. Passing one
            in lets callers reuse a single client across many lookups.

    Returns:
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when
        ``place`` is blank or the geocoder finds nothing.
    """
    # A blank cell cannot be geocoded, so skip the request entirely.
    if place is None or (isinstance(place, str) and not place.strip()):
        return None, None

    if geolocator is None:
        geolocator = geopy.Nominatim(user_agent='place_coordinates_locator')

    location = geolocator.geocode(place)
    if location:
        return location.latitude, location.longitude
    return None, None


# Geocode every record by its common name, retrying with the nation
# when the first lookup yields no latitude.
for record in former_subdivision_clean:
    coordinates = get_coordinates(record['common_name'])
    if coordinates[0] is None:
        coordinates = get_coordinates(record['nation'])
    # Store the result on the record (both stay None when nothing was found)
    record['latitude'], record['longitude'] = coordinates

### Cleaning date data

In [34]:
'''
Merge column date_start-year_start and date_end-year_end into merged_date_start and merged_date_end
https://raw.githubusercontent.com/w3c/sdw/gh-pages/time/rdf/time.ttl (time:inXSDDate)
'''

# Indonesian month names in calendar order; the position gives the month number.
_MONTH_NAMES = (
    'Januari', 'Februari', 'Maret', 'April', 'Mei', 'Juni',
    'Juli', 'Agustus', 'September', 'Oktober', 'November', 'Desember',
)

# Month name -> zero-padded month number ('01' .. '12').
month_list = {
    name: f'{number:02d}' for number, name in enumerate(_MONTH_NAMES, start=1)
}

# Fallbacks used when a record has no day/month or no year.
defaultDateStart, defaultDateEnd = '01-01', '12-31'
defaultYear = 2000

def _to_month_day(raw_date, fallback):
    """Convert '<day> <Indonesian month name> ...' into 'mm-dd'.

    Args:
        raw_date: Cell text such as '17 Agustus'; may be blank.
        fallback: 'mm-dd' string returned when ``raw_date`` is blank.

    Returns:
        The month looked up in ``month_list`` and the day zero-padded to two
        characters, joined as 'mm-dd'.

    Raises:
        IndexError: ``raw_date`` has a day but no month token.
        KeyError: the month token is not a key of ``month_list``.
    """
    # split() without a separator tolerates repeated or padding spaces and
    # returns [] for blank / whitespace-only cells.
    tokens = (raw_date or '').split()
    if not tokens:
        return fallback
    day, month_name = tokens[0], tokens[1]
    return f'{month_list[month_name]}-{day.zfill(2)}'


def _to_year(raw_year):
    """Return the year as text, or ``defaultYear`` when the cell is blank."""
    year = str(raw_year).strip() if raw_year else ''
    if not year:
        return str(defaultYear)
    # NOTE(review): the CSV is written by pandas, which may store a year column
    # containing blanks as floats ('1950.0'); drop that suffix so the merged
    # value stays in yyyy-mm-dd form - confirm against the actual CSV.
    if year.endswith('.0') and year[:-2].isdigit():
        year = year[:-2]
    return year


# Build merged_date_start / merged_date_end (yyyy-mm-dd) for every record.
# Missing day/month falls back to 01-01 (start) or 12-31 (end); a missing
# year falls back to defaultYear.
for value in former_subdivision_clean:
    dateStartUsed = _to_month_day(value['date_start'], defaultDateStart)
    dateEndUsed = _to_month_day(value['date_end'], defaultDateEnd)

    yearStartUsed = _to_year(value['year_start'])
    yearEndUsed = _to_year(value['year_end'])

    # Merge to yyyy-mm-dd format
    value['merged_date_start'] = f"{yearStartUsed}-{dateStartUsed}"
    value['merged_date_end'] = f"{yearEndUsed}-{dateEndUsed}"


### Extract cleaned data to csv

In [61]:
# Column order of the export follows the keys of the first cleaned record.
fieldnames = former_subdivision_clean[0].keys()

# Write the header followed by one CSV row per cleaned record.
with open('former-subdivision-final-cleaned.csv', 'w', newline='', encoding='utf-8') as cleaned_csv:
    csv_writer = csv.DictWriter(cleaned_csv, fieldnames=fieldnames)
    csv_writer.writeheader()
    for record in former_subdivision_clean:
        csv_writer.writerow(record)

#### Add Page Description and Wiki URL for each title

In [59]:
import wikipedia

def retrieve_paragraph(article_title):
    """Return a five-sentence summary of ``article_title`` with the language set to "id".

    Args:
        article_title: Exact title of the article to summarise.

    Returns:
        Whatever ``wikipedia.summary`` returns for that title.
    """
    wikipedia.set_lang("id")
    # auto_suggest is disabled because it can return a page that differs from
    # the requested page title.
    summary_text = wikipedia.summary(article_title, sentences=5, auto_suggest=False)
    return summary_text

def get_url_wikipedia(article_title):
    """Return the URL of the article ``article_title`` with the language set to "id".

    Args:
        article_title: Exact title of the article to look up.

    Returns:
        The ``url`` attribute of the page returned by ``wikipedia.page``.
    """
    wikipedia.set_lang("id")
    # auto_suggest is disabled because it can return a page that differs from
    # the requested page title.
    return wikipedia.page(article_title, auto_suggest=False).url


In [60]:
# Attach the article summary and URL to every record, keyed by its page title.
for record in former_subdivision_clean:
    # Both lookups complete before the record is modified.
    record.update(
        summary=retrieve_paragraph(record['page_title']),
        wikiurl=get_url_wikipedia(record['page_title']),
    )

### Add article flag images

In [70]:
# Exploratory lookup: fetch one article by its exact title (auto_suggest off)
# and display the image URLs the `wikipedia` package reports for that page.
# NOTE(review): no set_lang call here - presumably this relies on the "id"
# language already set by the helper functions run earlier; confirm.
page = wikipedia.page('Daerah Federal Jakarta', auto_suggest=False)
page.images

['https://upload.wikimedia.org/wikipedia/commons/9/9f/Flag_of_Indonesia.svg',
 'https://upload.wikimedia.org/wikipedia/commons/9/90/National_emblem_of_Indonesia_Garuda_Pancasila.svg']