In [3]:
import re
import spacy
import requests
import multiprocessing as mp
from ast import literal_eval
from bs4 import BeautifulSoup
from unidecode import unidecode
from pandas import pandas as pd

In [4]:
nlp = spacy.load("en_core_web_lg")

In [5]:
df = pd.read_csv('../data/countries.tsv', names=['wiki', 'country', 'capitals'], sep='\t', converters={'capitals': literal_eval}, index_col=[0])
df.capitals = df.capitals.apply(lambda capitals: {unidecode(capital) for capital in capitals})
df.capitals = df.capitals.apply(set)
countries = set(df['country'].apply(unidecode))

In [6]:
def extract_text(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    [s.extract() for s in soup.find_all(id='toc')]
    [s.extract() for s in soup.find_all(['table', 'script', 'sup'])]
    [s.extract() for s in soup.find_all(class_=['portal', 'navigation-not-searchable', 'reflist', 'noprint'])]
    text = soup.find(class_='mw-parser-output').get_text()
    text = re.sub(r' \([^)]*\)', '', text).strip()

    # articles with no mention of the capital in the article body
    no_capital = any(capital not in text for capital in df.at[url, 'capitals'])

    return '\n'.join(text.split('\n')[:15]), no_capital

In [26]:
def concat_compound(token):
    res = []
    for child in token.children:
        if child.dep_ == 'compound':
            res.extend(concat_compound(child))
            res.append(unidecode(child.text))
    return res

In [27]:
def bfs(root):
    queue = list(root.children)
    while len(queue):
        head = queue.pop(0)
        if head.pos_ == 'PROPN':
            compound = concat_compound(head)
            compound.append(unidecode(head.text))
            capital = ' '.join(compound)
            if capital in countries or len(set.intersection(countries, set(compound))):
                return None
            return capital
        queue.extend(list(head.children))

In [28]:
def get_parent_verb(token):
    if token.pos_ != 'VERB':
        if token.dep_ == 'ROOT':
            return None
        return get_parent_verb(token.head)
    return token

In [29]:
def extract_capitals(url):
#     print(url)
    text, no_capital = extract_text(url)
    doc = nlp(text, disable=['ner', 'textcat'])

    result = set()
    for sentence in doc.sents:
        if 'capital' in sentence.text:
            for token in sentence:
                if token.text == 'capital':
                    verb = get_parent_verb(token)
                    if verb:
                        propn = bfs(verb)
                        if propn:
                            result.add(propn)
                            break
    return url, result, no_capital

In [30]:
def addToDataFrame(data):
    url, result, no_capital = data
    df.at[url, 'capitals_found'] = result
    df.at[url, 'no_capital'] = no_capital

In [31]:
def getRecall(tp, fn):
    if (not tp and not fn):
        return 0
    return tp / (tp + fn)

def getPrecision(tp, fp):
    if (not tp and not fp):
        return 0
    return tp / (tp + fp)

def getFScore(row):
    truth, prediction = row
    #dirty hack for 'Luanda' being string instead of set for no reason
    if isinstance(prediction, str):
        prediction = set([prediction])
    tp = [val for val in truth if val in prediction]
    fp = [val for val in prediction if val not in tp]
    fn = [val for val in truth if val not in prediction]
    
    precision = getPrecision(len(tp), len(fp))
    recall = getRecall(len(tp), len(fn))

    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

def getFScoreDf(dataframe):
    df = dataframe.copy()
    df['f1'] = df[['capitals', 'capitals_found']].apply(getFScore, axis=1)
    return df

In [32]:
def get_jaccard_similarily(data):
    capitals, capitals_found = data    
    return len(set.intersection(capitals, capitals_found)) / len(set.union(capitals, capitals_found))

def get_df_jaccard_similarily(dataframe):
    df = dataframe.copy()
    df['similarity'] = df[['capitals', 'capitals_found']].apply(get_jaccard_similarily, axis=1)
    return df['similarity'].mean()

In [33]:
def process():
    pool = mp.Pool()

    for idx in df.index:
        pool.apply_async(extract_capitals, args=[idx], callback=addToDataFrame)

    pool.close()
    pool.join()

In [34]:
process()

### F1

In [36]:
df_f1 = getFScoreDf(df)
df_f1.f1.mean()

0.6616541353383458

In [38]:
#f1 for countries with countries without capitals in body text filtered out
getFScoreDf(df[df['no_capital'] == False]).f1.mean()

0.6825396825396826

In [239]:
# similarity for countries, where number of capitals equals number of found capitals
getFScoreDf(df[df['capitals_found'].map(len) == df['capitals'].map(len)]).f1.mean()

0.9090909090909091

In [242]:
# share of countries with with not all capitals found (countries without found capitals are not considered)
len(df[(df['capitals_found'].map(len) < df['capitals'].map(len)) & (df['capitals_found'].map(len) > 0)]) / len(df)

0.03007518796992481

In [243]:
# share of countries without a single capital found
len(df[df['capitals_found'].map(len) == 0]) / len(df)

0.24060150375939848

In [248]:
df_f1.head()

Unnamed: 0_level_0,country,capitals,capitals_found,no_capital,f1
wiki,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
http://en.wikipedia.org/wiki/Egypt,Egypt,{Cairo},{},False,0.0
http://en.wikipedia.org/wiki/Afghanistan,Afghanistan,{Kabul},{Kabul},False,1.0
http://en.wikipedia.org/wiki/Algeria,Algeria,{Algiers},{Algiers},False,1.0
http://en.wikipedia.org/wiki/Angola,Angola,{Luanda},Luanda,False,1.0
http://en.wikipedia.org/wiki/Germany,Germany,{Berlin},{Berlin},False,1.0


### Here are Jaccard similarity scores, which I no longer consider as an appropriate metrics

In [69]:
get_df_jaccard_similarily(df)

0.6478696741854636

In [70]:
#similarity for countries with countries without capitals in body text filtered out
get_df_jaccard_similarily(df[df['no_capital'] == False])

0.6679894179894179

In [71]:
#similarity for countries without capitals in body text and capitals amount greater than 2
get_df_jaccard_similarily(df[(df['no_capital'] == False) & (df['capitals'].map(len) == 1)])

0.6888888888888888

In [72]:
# similarity for countries, where number of capitals equals number of found capitals
get_df_jaccard_similarily(df[df['capitals_found'].map(len) == df['capitals'].map(len)])

0.9101123595505618

In [73]:
# share of countries with redundant capitals found
len(df[df['capitals_found'].map(len) > df['capitals'].map(len)]) / len(df)

0.06015037593984962