In [1]:
import re
import spacy
import requests
import multiprocessing as mp
from ast import literal_eval
from bs4 import BeautifulSoup
from unidecode import unidecode
from pandas import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
df = pd.read_csv('../data/countries.tsv', names=['wiki', 'country', 'capitals'], sep='\t', converters={'capitals': literal_eval}, index_col=[0])
df.capitals = df.capitals.apply(lambda capitals: {unidecode(capital) for capital in capitals})
df.capitals = df.capitals.apply(set)
countries = set(df['country'].apply(unidecode))

In [4]:
def extract_text(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    [s.extract() for s in soup.find_all(id='toc')]
    [s.extract() for s in soup.find_all(['table', 'script', 'sup'])]
    [s.extract() for s in soup.find_all(class_=['portal', 'navigation-not-searchable', 'reflist', 'noprint'])]
    text = soup.find(class_='mw-parser-output').get_text()
    text = re.sub(r' \([^)]*\)', '', text).strip()

    # articles with no mention of the capital in the article body
    no_capital = any(capital not in text for capital in df.at[url, 'capitals'])

    return '\n'.join(text.split('\n')[:15]), no_capital

In [5]:
def concat_compound(token):
    res = []
    for child in token.children:
        if child.dep_ == 'compound':
            res.append(unidecode(child.text))
            res.extend(concat_compound(child))
    return res

In [6]:
def bfs(root):
  queue = list(root.children)
  while len(queue):
    head = queue.pop(0)
    if head.pos_ == 'PROPN':
      compound = concat_compound(head)
      compound.insert(0, unidecode(head.text))
      capital = ' '.join(reversed(compound))
      if capital in countries or len(set.intersection(countries, set(compound))):
        return None
      return capital
    queue.extend(list(head.children))

In [7]:
def get_parent_verb(token):
    if token.pos_ != 'VERB':
        if token.dep_ == 'ROOT':
            return None
        return get_parent_verb(token.head)
    return token

In [8]:
def extract_capitals(url):
#     print(url)
    text, no_capital = extract_text(url)
    doc = nlp(text, disable=['ner', 'textcat'])

    result = set()
    for sentence in doc.sents:
        if 'capital' in sentence.text:
            for token in sentence:
                if token.text == 'capital':
                    verb = get_parent_verb(token)
                    if verb:
                        propn = bfs(verb)
                        if propn:
                            result.add(propn)
                            break

    return url, result, no_capital, get_jaccard_similarily(url, (df.at[url, 'capitals'], result))

In [9]:
def addToDataFrame(data):
    url, result, no_capital, similarity = data
    df.at[url, 'capitals_found'] = result
    df.at[url, 'no_capital'] = no_capital
    df.at[url, 'similarity'] = similarity

In [10]:
def get_jaccard_similarily(url, data):
    capitals, capitals_found = data    
    return len(set.intersection(capitals, capitals_found)) / len(set.union(capitals, capitals_found))

In [11]:
def get_df_jaccard_similarily(dataframe):
    df = dataframe.copy()
    df['similarity'] = df[['capitals', 'capitals_found']].apply(lambda x: get_jaccard_similarily(x.name, x), axis=1)
    return df['similarity'].mean()

In [12]:
def process():
    pool = mp.Pool()

    for idx in df.index:
        pool.apply_async(extract_capitals, args=[idx], callback=addToDataFrame)

    pool.close()
    pool.join()

    return df['similarity'].mean()

In [13]:
process()

0.6478696741854636

In [14]:
#similarity for countries with countries without capitals in body text filtered out
get_df_jaccard_similarily(df[df['no_capital'] == False])

0.66005291005291

In [15]:
#similarity for countries without capitals in body text and capitals amount greater than 2
get_df_jaccard_similarily(df[(df['no_capital'] == False) & (df['capitals'].map(len) == 1)])

0.6805555555555556

In [16]:
# similarity for countries, where number of capitals equals number of found capitals
get_df_jaccard_similarily(df[df['capitals_found'].map(len) == df['capitals'].map(len)])

0.9195402298850575

In [17]:
# share of countries with redundant capitals found
len(df[df['capitals_found'].map(len) > df['capitals'].map(len)]) / len(df)

0.06766917293233082

In [18]:
# share of countries with with not all capitals found (countries without found capitals are not considered)
len(df[(df['capitals_found'].map(len) < df['capitals'].map(len)) & (df['capitals_found'].map(len) > 0)]) / len(df)

0.03007518796992481

In [19]:
# share of countries without a single capital found
len(df[df['capitals_found'].map(len) == 0]) / len(df)

0.24812030075187969

In [20]:
df.head()

Unnamed: 0_level_0,country,capitals,capitals_found,no_capital,similarity
wiki,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
http://en.wikipedia.org/wiki/Egypt,Egypt,{Cairo},{},False,0.0
http://en.wikipedia.org/wiki/Afghanistan,Afghanistan,{Kabul},{Kabul},False,1.0
http://en.wikipedia.org/wiki/Algeria,Algeria,{Algiers},{Algiers},False,1.0
http://en.wikipedia.org/wiki/Angola,Angola,{Luanda},Luanda,False,1.0
http://en.wikipedia.org/wiki/Germany,Germany,{Berlin},{Berlin},False,1.0
