# Create network of neighboring countries

This Jupyter notebook fetches an HTML page from Wikipedia and parses its border table to map each country to its neighboring countries.

## References

- Wikipedia:
    - [List of countries and territories by land borders](https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_borders)
    - [List of countries and territories by land and maritime borders](https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_and_maritime_borders)

In [None]:
import json
import os

import requests
from bs4 import BeautifulSoup

## Land and maritime borders

### Get HTML text from Wikipedia

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_and_maritime_borders'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of handing an error page to the parser.
response.raise_for_status()
html_text = response.text

### Parse HTML text

Find the table, iterate over the rows and extract information from each.

In [None]:
soup = BeautifulSoup(html_text, 'html.parser')
# Locate the table whose class attribute is exactly "wikitable sortable".
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Without this check the next line fails with an opaque AttributeError.
    raise ValueError('Could not find a table with class "wikitable sortable" in the fetched HTML.')
rows = table.find_all('tr')


# Special cases, inspected manually: number of leading neighbor links to keep
# for rows whose neighbor cell contains extra links.
_TARGET_LIMITS = {
    'Mauritania': 5,        # remove 2 extra links from a comment about Western Sahara
    'Norway': 7,            # remove 1 extra link that specifies a part of a country
    'Pitcairn Islands': 1,  # remove 1 extra link that specifies a part of a country
    'Russia': 20,           # remove 1 extra link that specifies a part of a country
    'Serbia': 8,            # remove 2 extra links from a comment about Kosovo
    'Turkey': 12,           # remove 1 extra link from a comment about Cyprus
}

# Source names that carry a specifying comment, mapped to the plain name.
_SOURCE_RENAMES = {
    'Australia (excluding outlying islands)': 'Australia',
}

# Linked entries that are not counted as countries.
_NON_COUNTRY_TARGETS = {'United Nations Buffer Zone in Cyprus'}


def parse_row(row):
    """Extract a country and its neighbors from one table row.

    The row is expected to provide its ``td`` cells via ``find_all('td')``:
    cell 0 holds the source country, cell 3 the number of neighbors and
    cell 4 the links to the neighboring countries.

    Returns:
        ``(source_name, target_names)`` for a usable row, or ``False`` when
        the row is excluded (the source text contains "includes", or no
        neighbor links were found).

    Raises:
        IndexError: if the row has fewer than five ``td`` cells.
        ValueError: if the neighbor count in cell 3 cannot be parsed as int.
    """
    columns = row.find_all('td')

    # Source in column 0. Prefer the text of the first link; fall back to the
    # cell text (without footnote markers) when there is no link or the link
    # has no text.
    source = columns[0]
    source_link = source.a
    source_name = source_link.text if source_link is not None else ''
    if not source_name:
        source_name = source.text.strip().split('[')[0].strip()

    # Number of neighbor countries in column 3; when a parenthesized value is
    # present, that value is the one used.
    num_targets = columns[3].text.strip()
    if '(' in num_targets:
        num_targets = num_targets.split('(')[1].split(')')[0].strip()
    num_targets = int(num_targets)

    # Neighbor countries in column 4: keep links that have text, point to a
    # /wiki/ page and whose parent text does not start with "(". Anchors
    # without an href attribute are skipped instead of raising KeyError.
    targets = columns[4]
    target_names = [
        link.text for link in targets.find_all('a')
        if link.text
        and (link.get('href') or '').startswith('/wiki/')
        and not link.parent.text.startswith('(')
    ]

    # Excluding cases ("Kingdom of ...", "Realm of ...")
    if 'includes' in source.text:
        return False
    if len(target_names) == 0:
        return False

    # Cleaning special cases, inspected manually
    source_name = _SOURCE_RENAMES.get(source_name, source_name)
    if source_name in _TARGET_LIMITS:
        target_names = target_names[:_TARGET_LIMITS[source_name]]
    target_names = [name for name in target_names if name not in _NON_COUNTRY_TARGETS]

    if num_targets != len(target_names):
        # Include the country so the mismatch can actually be tracked down.
        print('Caution: Given number of targets ({}) does not fit to length of target list ({}) for {}'.format(
            num_targets, len(target_names), source_name))
    return source_name, target_names


# Adjacency list: country name -> list of neighboring country names.
data = {}
# The first three rows are skipped (presumably header rows - verify against the page).
for i, row in enumerate(rows[3:]):
    try:
        result = parse_row(row)
        if not result:
            continue  # Skip it, either has no neighbors or is a group of countries
        source_name, target_names = result
        data[source_name] = target_names
    except Exception as e:
        # Best effort: report the row and keep going. A row may have no <td>
        # cells at all, so guard the lookup to avoid raising a second
        # exception inside the handler, which would abort the whole loop.
        columns = row.find_all('td')
        first_cell_text = columns[0].text if columns else ''
        print(i, 'error:', e, len(columns), first_cell_text)

print('Parsed border data, found {} countries and their neighbors.'.format(len(data)))

### Export data as adjacency list to a JSON file

In [None]:
filepath = os.path.join('..', 'networks', 'neighboring_countries.json')
# Create the output directory if it is missing, so the export does not fail
# after the data has already been fetched and parsed.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
with open(filepath, 'w', encoding='utf-8') as file_handle:
    json.dump(data, file_handle)