# Process XML Wiki Voyage Dump
https://github.com/aaronteoh/tyeoh/blob/master/destinations_graph_from_wikivoyage.ipynb

In [1]:
import os
from pprint import pprint
import urllib
import json

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## 1) Download data from WikiVoyage

In [3]:
# https://en.wikivoyage.org/wiki/Wikivoyage:Database_dump
url = 'https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-pages-articles.xml.bz2'
if not os.path.isfile(url.split('/')[-1]):
    urllib.request.urlretrieve(url, url.split('/')[-1])
    
    # https://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python
    

## 2) Decompress Data

In [4]:
from bz2 import BZ2Decompressor

decompressed = ('.').join(url.split('/')[-1].split('.')[:-1])

if not os.path.isfile(decompressed):
    with open(decompressed, 'wb') as new_file, open(url.split('/')[-1], 'rb') as file:
        decompressor = BZ2Decompressor()
        for data in iter(lambda : file.read(100 *1024), b''):
            ##new_file.write(decompressor.decompress(data))
            print(decompressor.decompress(data))
print("hello")

hello


## 3) Convert to XML Dict

In [5]:
if not os.path.isfile('wikivoyage_latest_articles_text.json'):
    import sys
    !{sys.executable} -m pip install xmltodict
    # https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
    
    import xmltodict
    
    with open('enwikivoyage-latest-pages-articles.xml', encoding='utf8') as fd:
        doc = xmltodict.parse(fd.read())
        
    data = doc['mediawiki']['page']
    print('To process %s records' %len(data))
    del doc

## 4) Ignore Redirects and Retrieve Articles Only

In [6]:
if not os.path.isfile('wikivoyage_latest_articles_text.json'):
    from collections import defaultdict
    completed = 0 
    articles = defaultdict(list)
    
    for item in data:
        if 'redirect' not in item:
            try:
                articles[item['title']].append(item['revision']['text']['#text'])
                completed += 1
            except KeyError:
                continue
                

            if completed%10000==0 or completed==len(data):
                print('Completed %s' %completed)
                
    print('Found %s articles' %len(articles))
    for article, text in articles.items():
        articles[article] = "".join(text)
        
    with open('wikivoyage_latest_articles_text.json', 'w') as f:
        json.dump(articles, f)
        
    del articles
    del data
    

## 5.practice) Extracting parent article, lat/long, and destination category from tags in text

In [9]:
import unicodedata
import re
with open('wikivoyage_latest_articles_text.json', 'r') as f:
    consolidated = json.load(f)

print(len(consolidated))

cleaned = {}
completed = 0
issues = 0

for article_name in consolidated:
#   1.a ignore articles which are not destinations (from article name and article tags)
    if not article_name.startswith('Module') and not article_name.startswith('Template:') and not article_name.startswith('Category:')\
    and not article_name.startswith('File:') and not article_name.startswith('Wikivoyage:') and not article_name.startswith('MediaWiki:') and not article_name in ['Moon', 'Space']\
        and len(re.findall('{{outlinetopic}}', consolidated[article_name].lower()))==0 and len(re.findall('{{usabletopic}}', consolidated[article_name].lower()))==0\
        and len(re.findall('{{guidetopic}}', consolidated[article_name].lower())) == 0 and len(re.findall('{{startopic}}', consolidated[article_name].lower()))==0\
        and len(re.findall('{{disamb}}', consolidated[article_name].lower()))==0 and len(re.findall('{{disambig}}', consolidated[article_name].lower()))==0 and len(re.findall('{{disambiguation}}', consolidated[article_name].lower()))==0\
        and len(re.findall('{{itinerary}}', consolidated[article_name].lower()))==0 \
        and len(re.findall('{{usablephrasebook}}', consolidated[article_name].lower()))==0 and len(re.findall('{{phrasebookguide}}', consolidated[article_name].lower()))==0 \
        and len(re.findall('{{Title-Index page}}', consolidated[article_name]))==0 \
        and len(re.findall('{{GalleryPageOf.*}}', consolidated[article_name]))==0 \
        and len(re.findall('{{stub}}', consolidated[article_name].lower())) == 0 \
        and len(re.findall('{{historical}}', consolidated[article_name].lower())) == 0:

        #1.b get 'id' tags
        #page_id = re.findall('', consolidated[article_name])
        
        # 2. get 'ispartof' tags
        IsPartOf = re.findall('{{IsPartOf.*}}', consolidated[article_name]) + re.findall('{{isPartOf.*}}', consolidated[article_name])

        # 3. get geo tags
        geo = re.findall('{{geo.*}}', consolidated[article_name].lower())

        # 4. get page type tags
        rating = re.findall('{{usablecity}}', consolidated[article_name].lower()) + re.findall('{{outlinecity}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecity}}', consolidated[article_name].lower()) + re.findall('{{starcity}}', consolidated[article_name].lower()) \
                    + re.findall('{{ussblecity}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablecountry}}', consolidated[article_name].lower()) + re.findall('{{outlinecountry}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecountry}}', consolidated[article_name].lower()) + re.findall('{{starcountry}}', consolidated[article_name].lower()) \
                    + re.findall('{{usabledistrict}}', consolidated[article_name].lower()) + re.findall('{{outlinedistrict}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidedistrict}}', consolidated[article_name].lower())+ re.findall('{{stardistrict}}', consolidated[article_name].lower()) \
                    + re.findall('{{usableregion}}', consolidated[article_name].lower()) + re.findall('{{outlineregion}}', consolidated[article_name].lower()) \
                    + re.findall('{{guideregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=yes}}', consolidated[article_name].lower()) \
                    + re.findall('{{starregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=no}}', consolidated[article_name].lower()) \
                    + re.findall('{{extraregion}}', consolidated[article_name].lower()) \
                    + re.findall('{{usableairport}}', consolidated[article_name].lower()) + re.findall('{{outlineairport}}', consolidated[article_name].lower()) \
                    + re.findall('{{guideairport}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablepark}}', consolidated[article_name].lower()) + re.findall('{{outlinepark}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidepark}}', consolidated[article_name].lower()) + re.findall('{{starpark}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablediveguide}}', consolidated[article_name].lower()) + re.findall('{{outlinediveguide}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidediveguide}}', consolidated[article_name].lower()) + re.findall('{{stardiveguide}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablecontinent}}', consolidated[article_name].lower()) + re.findall('{{outlinecontinent}}', consolidated[article_name].lower())
        
        
        city = re.findall('{{usablecity}}', consolidated[article_name].lower()) + re.findall('{{outlinecity}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecity}}', consolidated[article_name].lower()) + re.findall('{{starcity}}', consolidated[article_name].lower()) \
                    + re.findall('{{ussblecity}}', consolidated[article_name].lower())


        country = re.findall('{{usablecountry}}', consolidated[article_name].lower()) + re.findall('{{outlinecountry}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecountry}}', consolidated[article_name].lower()) + re.findall('{{starcountry}}', consolidated[article_name].lower())

        district = re.findall('{{usabledistrict}}', consolidated[article_name].lower()) + re.findall('{{outlinedistrict}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidedistrict}}', consolidated[article_name].lower())+ re.findall('{{stardistrict}}', consolidated[article_name].lower())

        region = re.findall('{{usableregion}}', consolidated[article_name].lower()) + re.findall('{{outlineregion}}', consolidated[article_name].lower()) \
                    + re.findall('{{guideregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=yes}}', consolidated[article_name].lower()) \
                    + re.findall('{{starregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=no}}', consolidated[article_name].lower()) \
                    + re.findall('{{extraregion}}', consolidated[article_name].lower())

        airport = re.findall('{{usableairport}}', consolidated[article_name].lower()) + re.findall('{{outlineairport}}', consolidated[article_name].lower())\
                    + re.findall('{{guideairport}}', consolidated[article_name].lower())

        park = re.findall('{{usablepark}}', consolidated[article_name].lower()) + re.findall('{{outlinepark}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidepark}}', consolidated[article_name].lower()) + re.findall('{{starpark}}', consolidated[article_name].lower())

        diveguide = re.findall('{{usablediveguide}}', consolidated[article_name].lower()) + re.findall('{{outlinediveguide}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidediveguide}}', consolidated[article_name].lower()) + re.findall('{{stardiveguide}}', consolidated[article_name].lower())

        continent = re.findall('{{usablecontinent}}', consolidated[article_name].lower()) + re.findall('{{outlinecontinent}}', consolidated[article_name].lower())

        
        # 5. clean naming before saving
        if len(geo)>0 and len(diveguide)==0 and article_name not in ['Commonwealth of Independent States']: #skip dive guides
            article_name = article_name.replace('_', ' ').split('{{')[0].strip().lower()
            
            if unicodedata.normalize('NFKD', article_name).encode('ascii', 'ignore') == 'brac':
                article_name = 'brac'
            elif unicodedata.normalize('NFKD', article_name).encode('ascii', 'ignore') == 'rugen':
                article_name = 'rugen'
            
            cleaned[article_name] = {}
            
            #cleaned[article_name]['article_id'] = str(page_id)
            cleaned[article_name]['rating'] = str(rating)

            # get lat long
            if len(geo)>0:
                cleaned[article_name]['latitude'] = geo[-1].split('|')[1]
                print(geo[-1])
                print(cleaned[article_name]['latitude'])
                
                cleaned[article_name]['longitude'] = geo[-1].split('|')[2]
                print(cleaned[article_name]['longitude'])

            # get parents
            cleaned[article_name]['ispartof'] = []
            for parts in IsPartOf:
                parent = parts.split('|')[1].replace('}','').replace('_', ' ').split('{{')[0].strip().lower()

                
                #fixes for inconsistent data
                if parent == 'ko pha ngan':
                    parent = 'ko pha-ngan'
                elif parent in ['lowland shandong', 'highland shandong', 'coastal shandong']:
                    parent = 'shandong'
                elif parent in ['southern delaware', 'northern delaware', 'central delaware']:
                    parent = 'delaware'
                elif parent in ['burgraviate', 'puster valley', 'eisack valley']:
                    parent = 'south tyrol'
                elif parent == 'bohemian-moravian highlands':
                    parent = 'highlands (czech republic)'
                elif parent == 'brahmanbaria district':
                    parent = 'chittagong division'
                elif parent == 'eastern desert':
                    parent = 'eastern desert (jordan)'
                elif parent == 'caribbean coast':
                    parent = 'caribbean coast (guatemala)'
                elif parent == 'santander (colombia)':
                    parent = 'santander (department, colombia)'
                elif parent == 'tripolitania':
                    parent = 'libya'
                elif parent == 'wooster area ohio':
                    parent = 'wooster area'
                elif parent == 'tatra mountains (poland)':
                    parent = 'tatra national park (poland)'
                elif parent == 'salcette':
                    parent = 'salcete'
                elif parent == 'eastern barbados':
                    parent = 'central eastern barbados'
                elif parent == 'east khasi hills':
                    parent = 'meghalaya'
                elif parent == 'samar':
                    parent = 'samar (philippines)'
                elif parent == 'chikmagalur (district)' and article_name != 'chikmagalur' :
                    parent = 'chikmagalur'
                elif unicodedata.normalize('NFKD', parent).encode('ascii', 'ignore') == 'rugen':
                    parent = 'rugen'
                elif article_name == 'chikmagalur':
                    parent = 'karnataka'
        
                cleaned[article_name]['ispartof'].append(parent)
                
                
            # 7. get destination type
            if len(airport)>0:
                cleaned[article_name]['type']='airport'
            elif len(city)>0:
                cleaned[article_name]['type']='city'
            elif len(continent)>0:
                cleaned[article_name]['type']='continent'
            elif len(country)>0:
                cleaned[article_name]['type']='country'
            elif len(district)>0:
                cleaned[article_name]['type']='district'
            elif len(park)>0:
                cleaned[article_name]['type']='park'
            elif len(region)>0:
                cleaned[article_name]['type']='region'

    completed +=1
    if completed%1000==0 or completed==len(consolidated):
        print('Completed: %s' %completed)


print('Total sorted: %s' %len(cleaned))

with open('destination_details_original.json', 'w') as f:
    json.dump(cleaned, f)

del consolidated

40693
{{geo|51.69014|5.29897|zoom=15}}
51.69014
5.29897
{{geo|50.7753|6.0828|zoom=14}}
50.7753
6.0828
{{geo|55.0694214|14.9204372}}
55.0694214
14.9204372}}
{{geo|57.05|9.93}}
57.05
9.93}}
{{geo|50.93809|4.03919|zoom=15}}
50.93809
4.03919
{{geo|47.4000|8.0500}}
47.4000
8.0500}}
{{geo|56.15|10.217}}
56.15
10.217}}
{{geo|31.901|102.22055|zoom=12}}
31.901
102.22055
{{geo|26.466666666667|-77.083333333333}}
26.466666666667
-77.083333333333}}
{{geo|30.36310|48.25925|zoom=15}}
30.36310
48.25925
{{geo|31.15883|52.65541|zoom=15}}
31.15883
52.65541
{{geo|1.83335|173.01158|zoom=11|layer=m}}
1.83335
173.01158
{{geo|53.7167|91.4167}}
53.7167
91.4167}}
{{geo|-13.6333|-72.8833}}
-13.6333
-72.8833}}
{{geo|44.02043|144.27330|zoom=15}}
44.02043
144.27330
{{geo|41.70651|42.84120|zoom=15}}
41.70651
42.84120
{{geo|50.10584|1.83437|zoom=15}}
50.10584
1.83437
{{geo|52.9000|-7.3500}}
52.9000
-7.3500}}
{{geo|49.04994|-122.29998|zoom=15}}
49.04994
-122.29998
{{geo|34.155833|73.219444}}
34.155833
73.219444}}
{{ge

KeyboardInterrupt: 

## 5.clean) Extracting parent article, lat/long, and destination category from tags in text

In [6]:
import unicodedata
import re
with open('wikivoyage_latest_articles_text.json', 'r') as f:
    consolidated = json.load(f)

print(len(consolidated))

cleaned = {}
completed = 0
issues = 0

for article_name in consolidated:
#   1. ignore articles which are not destinations (from article name and article tags)
    if not article_name.startswith('Module') and not article_name.startswith('Template:') and not article_name.startswith('Category:')\
    and not article_name.startswith('File:') and not article_name.startswith('Wikivoyage:') and not article_name.startswith('MediaWiki:') and not article_name in ['Moon', 'Space']\
        and len(re.findall('{{outlinetopic}}', consolidated[article_name].lower()))==0 and len(re.findall('{{usabletopic}}', consolidated[article_name].lower()))==0\
        and len(re.findall('{{guidetopic}}', consolidated[article_name].lower())) == 0 and len(re.findall('{{startopic}}', consolidated[article_name].lower()))==0\
        and len(re.findall('{{disamb}}', consolidated[article_name].lower()))==0 and len(re.findall('{{disambig}}', consolidated[article_name].lower()))==0 and len(re.findall('{{disambiguation}}', consolidated[article_name].lower()))==0\
        and len(re.findall('{{itinerary}}', consolidated[article_name].lower()))==0 \
        and len(re.findall('{{usablephrasebook}}', consolidated[article_name].lower()))==0 and len(re.findall('{{phrasebookguide}}', consolidated[article_name].lower()))==0 \
        and len(re.findall('{{Title-Index page}}', consolidated[article_name]))==0 \
        and len(re.findall('{{GalleryPageOf.*}}', consolidated[article_name]))==0 \
        and len(re.findall('{{stub}}', consolidated[article_name].lower())) == 0 \
        and len(re.findall('{{historical}}', consolidated[article_name].lower())) == 0:

        # 2. get 'ispartof' tags
        IsPartOf = re.findall('{{IsPartOf.*}}', consolidated[article_name]) + re.findall('{{isPartOf.*}}', consolidated[article_name])

        # 3. get geo tags
        geo = re.findall('{{geo.*}}', consolidated[article_name].lower())

        # 4. get page type tags
        rating = re.findall('{{usablecity}}', consolidated[article_name].lower()) + re.findall('{{outlinecity}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecity}}', consolidated[article_name].lower()) + re.findall('{{starcity}}', consolidated[article_name].lower()) \
                    + re.findall('{{ussblecity}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablecountry}}', consolidated[article_name].lower()) + re.findall('{{outlinecountry}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecountry}}', consolidated[article_name].lower()) + re.findall('{{starcountry}}', consolidated[article_name].lower()) \
                    + re.findall('{{usabledistrict}}', consolidated[article_name].lower()) + re.findall('{{outlinedistrict}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidedistrict}}', consolidated[article_name].lower())+ re.findall('{{stardistrict}}', consolidated[article_name].lower()) \
                    + re.findall('{{usableregion}}', consolidated[article_name].lower()) + re.findall('{{outlineregion}}', consolidated[article_name].lower()) \
                    + re.findall('{{guideregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=yes}}', consolidated[article_name].lower()) \
                    + re.findall('{{starregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=no}}', consolidated[article_name].lower()) \
                    + re.findall('{{extraregion}}', consolidated[article_name].lower()) \
                    + re.findall('{{usableairport}}', consolidated[article_name].lower()) + re.findall('{{outlineairport}}', consolidated[article_name].lower()) \
                    + re.findall('{{guideairport}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablepark}}', consolidated[article_name].lower()) + re.findall('{{outlinepark}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidepark}}', consolidated[article_name].lower()) + re.findall('{{starpark}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablediveguide}}', consolidated[article_name].lower()) + re.findall('{{outlinediveguide}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidediveguide}}', consolidated[article_name].lower()) + re.findall('{{stardiveguide}}', consolidated[article_name].lower()) \
                    + re.findall('{{usablecontinent}}', consolidated[article_name].lower()) + re.findall('{{outlinecontinent}}', consolidated[article_name].lower())
        
        
        city = re.findall('{{usablecity}}', consolidated[article_name].lower()) + re.findall('{{outlinecity}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecity}}', consolidated[article_name].lower()) + re.findall('{{starcity}}', consolidated[article_name].lower()) \
                    + re.findall('{{ussblecity}}', consolidated[article_name].lower())


        country = re.findall('{{usablecountry}}', consolidated[article_name].lower()) + re.findall('{{outlinecountry}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidecountry}}', consolidated[article_name].lower()) + re.findall('{{starcountry}}', consolidated[article_name].lower())

        district = re.findall('{{usabledistrict}}', consolidated[article_name].lower()) + re.findall('{{outlinedistrict}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidedistrict}}', consolidated[article_name].lower())+ re.findall('{{stardistrict}}', consolidated[article_name].lower())

        region = re.findall('{{usableregion}}', consolidated[article_name].lower()) + re.findall('{{outlineregion}}', consolidated[article_name].lower()) \
                    + re.findall('{{guideregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=yes}}', consolidated[article_name].lower()) \
                    + re.findall('{{starregion}}', consolidated[article_name].lower()) + re.findall('{{extraregion\|subregion=no}}', consolidated[article_name].lower()) \
                    + re.findall('{{extraregion}}', consolidated[article_name].lower())

        airport = re.findall('{{usableairport}}', consolidated[article_name].lower()) + re.findall('{{outlineairport}}', consolidated[article_name].lower())\
                    + re.findall('{{guideairport}}', consolidated[article_name].lower())

        park = re.findall('{{usablepark}}', consolidated[article_name].lower()) + re.findall('{{outlinepark}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidepark}}', consolidated[article_name].lower()) + re.findall('{{starpark}}', consolidated[article_name].lower())

        diveguide = re.findall('{{usablediveguide}}', consolidated[article_name].lower()) + re.findall('{{outlinediveguide}}', consolidated[article_name].lower()) \
                    + re.findall('{{guidediveguide}}', consolidated[article_name].lower()) + re.findall('{{stardiveguide}}', consolidated[article_name].lower())

        continent = re.findall('{{usablecontinent}}', consolidated[article_name].lower()) + re.findall('{{outlinecontinent}}', consolidated[article_name].lower())

        
        # 5. clean naming before saving
        if len(geo)>0 and len(diveguide)==0 and article_name not in ['Commonwealth of Independent States']: #skip dive guides
            article_name = article_name.replace('_', ' ').split('{{')[0].strip().lower()
            
            if unicodedata.normalize('NFKD', article_name).encode('ascii', 'ignore') == 'brac':
                article_name = 'brac'
            elif unicodedata.normalize('NFKD', article_name).encode('ascii', 'ignore') == 'rugen':
                article_name = 'rugen'
            
            cleaned[article_name] = {}
            
            cleaned[article_name]['rating'] = str(rating)

            # get lat long
            if len(geo)>0:
                cleaned[article_name]['latitude'] = geo[-1].split('|')[1]
                print(geo[-1])
                print(cleaned[article_name]['latitude'])
                
                cleaned[article_name]['longitude'] = geo[-1].split('|')[2]
                print(cleaned[article_name]['longitude'])

            # get parents
            cleaned[article_name]['ispartof'] = []
            for parts in IsPartOf:
                parent = parts.split('|')[1].replace('}','').replace('_', ' ').split('{{')[0].strip().lower()

                
                #fixes for inconsistent data
                if parent == 'ko pha ngan':
                    parent = 'ko pha-ngan'
                elif parent in ['lowland shandong', 'highland shandong', 'coastal shandong']:
                    parent = 'shandong'
                elif parent in ['southern delaware', 'northern delaware', 'central delaware']:
                    parent = 'delaware'
                elif parent in ['burgraviate', 'puster valley', 'eisack valley']:
                    parent = 'south tyrol'
                elif parent == 'bohemian-moravian highlands':
                    parent = 'highlands (czech republic)'
                elif parent == 'brahmanbaria district':
                    parent = 'chittagong division'
                elif parent == 'eastern desert':
                    parent = 'eastern desert (jordan)'
                elif parent == 'caribbean coast':
                    parent = 'caribbean coast (guatemala)'
                elif parent == 'santander (colombia)':
                    parent = 'santander (department, colombia)'
                elif parent == 'tripolitania':
                    parent = 'libya'
                elif parent == 'wooster area ohio':
                    parent = 'wooster area'
                elif parent == 'tatra mountains (poland)':
                    parent = 'tatra national park (poland)'
                elif parent == 'salcette':
                    parent = 'salcete'
                elif parent == 'eastern barbados':
                    parent = 'central eastern barbados'
                elif parent == 'east khasi hills':
                    parent = 'meghalaya'
                elif parent == 'samar':
                    parent = 'samar (philippines)'
                elif parent == 'chikmagalur (district)' and article_name != 'chikmagalur' :
                    parent = 'chikmagalur'
                elif unicodedata.normalize('NFKD', parent).encode('ascii', 'ignore') == 'rugen':
                    parent = 'rugen'
                elif article_name == 'chikmagalur':
                    parent = 'karnataka'
        
                cleaned[article_name]['ispartof'].append(parent)
                
                
            # 7. get destination type
            if len(airport)>0:
                cleaned[article_name]['type']='airport'
            elif len(city)>0:
                cleaned[article_name]['type']='city'
            elif len(continent)>0:
                cleaned[article_name]['type']='continent'
            elif len(country)>0:
                cleaned[article_name]['type']='country'
            elif len(district)>0:
                cleaned[article_name]['type']='district'
            elif len(park)>0:
                cleaned[article_name]['type']='park'
            elif len(region)>0:
                cleaned[article_name]['type']='region'

    completed +=1
    if completed%1000==0 or completed==len(consolidated):
        print('Completed: %s' %completed)


print('Total sorted: %s' %len(cleaned))

with open('destination_details_original.json', 'w') as f:
    json.dump(cleaned, f)

del consolidated

40693
{{geo|51.69014|5.29897|zoom=15}}
51.69014
5.29897
{{geo|50.7753|6.0828|zoom=14}}
50.7753
6.0828
{{geo|55.0694214|14.9204372}}
55.0694214
14.9204372}}
{{geo|57.05|9.93}}
57.05
9.93}}
{{geo|50.93809|4.03919|zoom=15}}
50.93809
4.03919
{{geo|47.4000|8.0500}}
47.4000
8.0500}}
{{geo|56.15|10.217}}
56.15
10.217}}
{{geo|31.901|102.22055|zoom=12}}
31.901
102.22055
{{geo|26.466666666667|-77.083333333333}}
26.466666666667
-77.083333333333}}
{{geo|30.36310|48.25925|zoom=15}}
30.36310
48.25925
{{geo|31.15883|52.65541|zoom=15}}
31.15883
52.65541
{{geo|1.83335|173.01158|zoom=11|layer=m}}
1.83335
173.01158
{{geo|53.7167|91.4167}}
53.7167
91.4167}}
{{geo|-13.6333|-72.8833}}
-13.6333
-72.8833}}
{{geo|44.02043|144.27330|zoom=15}}
44.02043
144.27330
{{geo|41.70651|42.84120|zoom=15}}
41.70651
42.84120
{{geo|50.10584|1.83437|zoom=15}}
50.10584
1.83437
{{geo|52.9000|-7.3500}}
52.9000
-7.3500}}
{{geo|49.04994|-122.29998|zoom=15}}
49.04994
-122.29998
{{geo|34.155833|73.219444}}
34.155833
73.219444}}
{{ge

# 6) Map out parent child relationship for all articles into a dictionary

In [7]:
with open('destination_details_original.json', 'r') as f:
    cleaned = json.load(f)


def map_destinations(mapped, current_dict):
    global destination_mapping, parent, destination
    if not mapped:
        if parent in current_dict:
            current_dict[parent].update({destination: {}})
            mapped = True
            
            #find if any of the top level keys match this article
            if destination in destination_mapping:
                current_dict[parent][destination] = destination_mapping.pop(destination)
        
        # if parent not in dict but article in dict. given previous step, this can only happen if destination is at top level
        elif destination in current_dict and current_dict==destination_mapping:
            current_dict[parent] = {}
            current_dict[parent][destination] = current_dict.pop(destination)
            mapped = True
            attached = False
            #find if any of the values match this parent
            step_through_dict(attached, destination_mapping)

        else:
            for next_level in current_dict:
                mapped, current_dict[next_level] = map_destinations(mapped, current_dict[next_level])
    
    return mapped, current_dict


def step_through_dict(attached, current_dict):
    global destination_mapping, destination, parent

    iter_list = list(current_dict)
    for item in iter_list:
        if not attached:
            if item == parent and current_dict!=destination_mapping:
                current_dict[parent][destination] = destination_mapping.pop(parent)[destination]
                attached = True
            elif len(current_dict[item])>0:
                attached = step_through_dict(attached, current_dict[item])
    return attached
        
    
destination_mapping = {}
print('To process %s records' %len(cleaned))
processed = 0
for destination in cleaned:
    for parent in cleaned[destination]['ispartof']:
            
        mapped = False
        mapped, destination_mapping = map_destinations(mapped, destination_mapping)
        
        if not mapped:
            destination_mapping[parent] = {}
            destination_mapping[parent][destination] = {}
    processed+=1
    if processed%1000==0 or processed==len(cleaned):
        print('Completed: %s' %processed)

with open('destination_mapping.json', 'w') as f:
    json.dump(destination_mapping, f)
    
del destination_mapping

To process 27203 records
Completed: 1000
Completed: 2000
Completed: 3000
Completed: 4000
Completed: 5000
Completed: 6000
Completed: 7000
Completed: 8000
Completed: 9000
Completed: 10000
Completed: 11000
Completed: 12000
Completed: 13000
Completed: 14000
Completed: 15000
Completed: 16000
Completed: 17000
Completed: 18000
Completed: 19000
Completed: 20000
Completed: 21000
Completed: 22000
Completed: 23000
Completed: 24000
Completed: 25000
Completed: 26000
Completed: 27000
Completed: 27203


# 7) Examine data and return to previous step to fix inconsistent spellings

In [8]:
with open('destination_mapping.json', 'r') as f:
    destination_mapping = json.load(f)

fixed = 0
    
# retrieve loose ends and identify inconsistent spellings
for item in destination_mapping:
    print(item)
    fixed =+1
    
print(fixed)
    
del destination_mapping

southwestern iran
europe
south america
sindhudurg
dohuk governorate
antarctica
north america
central western iran
erbil governorate
other destinations
havana (province)
oceania
northern baden-württemberg
sapphire coast
free state<!--
east gippsland
asia
africa
saint john (virgin islands)
zamboanga del norte
ratnagiri (district)
river valley (maine)
york (ontario)
akershus
south and west gippsland
central somalia
thabo mofutsanyana
raigad
mt. hood and columbia gorge
tolima
khuzestan
northern togo
udupi (district)
la rioja (province, argentina)
maritime togo
annapurna circuit
pinar del rio (province)
west fujian
xhariep
zamboanga del sur
zeravshan
sulemania governorate
central gippsland
nyingchi (prefecture)
central togo
upper moldavia
northern coast (fujian)
northern free state
east central alberta
lejweleputswa
guantánamo (province)
akwa ibom state
akwa ibom
capitale nationale
sault-au-cochon
nationale-capitale
côte-nord

1


# 8) Retrieve parent chain for destination input

In [9]:
destination = 'singapore'

with open('destination_details_original.json', 'r') as f:
    details =json.load(f)
    
def get_parent(current, chain = ''):
    if chain is '':
        chain = current.lower()
        current = current.lower()
    try:
        for parent in details[current]['ispartof']:
            chain = '%s|$s' %(parent, chain)
            chain = get_parent(parent, chain)
    except KeyError:
        return chain
    else:
        return chain
print(get_parent)#(destination)

del details

<function get_parent at 0x7fa363f7d950>


# 9) Get child articles from input

In [10]:
desination = 'singapore'

with open('destination_details_original.json', 'r') as f:
    details = json.load(f)
    
def get_child(search):
    child_articles = []
    for article in details:
        for parent in details[article]['ispartof']:
            if parent == search.lower():
                child_articles.append(article)
    return child_articles

for item in get_child(destination):
    print(item)
    
del details

singapore/balestier
singapore/bugis
singapore/chinatown
singapore/east coast
singapore/little india
singapore/marina bay
singapore/north and west
singapore/orchard
singapore/riverside
singapore/sentosa and harbourfront
singapore changi airport


# 10) Search destinations containing input

In [13]:
destination = 'singapore'

with open ('destination_details_original.json', 'r') as f:
    details = json.load(f)
    
def search(input):
    results =[]
    for item in details:
        if input.lower() in item:
            results.append(item)
    return results

for result in search(destination):
    print(result)
    print(get_parent)#(result)
    print(get_child)#(result)
    print
    
del details

singapore
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/balestier
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/bugis
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/chinatown
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/east coast
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/little india
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/marina bay
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/north and west
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/orchard
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/riverside
<function get_parent at 0x7fa363f7d950>
<function get_child at 0x7fa363fb2710>
singapore/sentos