# Initial Loading

In [6]:
from json import load
DATAP = "C:\Programmierung\Repos\WikiOnto\data"
ad = load(open(DATAP+"/articledict.json", "r"))
print("Loaded article dictionary with a total number of "+str(len(ad)) + " articles.")

Loaded article dictionary with a total number of 134480 articles.


In [7]:
seed = [a for a in ad if ad[a]["Seed"]]
print("There are " + str(len(seed)) + " seed articles from TIOBE and GitHub.")

There are 321 seed articles from TIOBE and GitHub.


## Set Up - Infobox Indicator

### Seed Exploration

In [17]:
# Infobox templates are saved at the key "DbpediaInfoboxTemplate"
seed_infobox = [a for a in ad if "DbpediaInfoboxTemplate" in ad[a] and ad[a]["Seed"]]
print("  "+str(len(seed_infobox)) + " seed articles have an infobox")

  264 seed articles have an infobox


In [18]:
# We count Infobox template frequencies
from collections import Counter
templates = [template for a in ad if "DbpediaInfoboxTemplate" in ad[a] for template in set(ad[a]["DbpediaInfoboxTemplate"])
             if ad[a]["Seed"]]
counter = Counter(templates)
for infobox, count in counter.items():
    print("*  "+infobox + " : " + str(count))

*  infobox_programming_language : 213
*  infobox_software : 31
*  infobox_file_format : 33
*  infobox_software_license : 1
*  infobox_technology_standard : 3


### Initial Configuration

In [19]:
# We configured programming_language and file_format as positive infoboxes!
seed_infobox_positive = [a for a in ad if ad[a]["PositiveInfobox"] and ad[a]["Seed"]]
print(str(len(seed_infobox_positive)) + " seed articles have a positive infobox")
print("The recall for the seed is " + str(100 * (len(seed_infobox_positive) / len(seed))) + "%")

238 seed articles have a positive infobox
The recall for the seed is 74.14330218068535%


### Unseen Data Statistics

In [20]:
# First estimate for unseen data
articles_infobox = [a for a in ad if "DbpediaInfoboxTemplate" in ad[a]]
print(str(len(articles_infobox)) + " articles in the scope have an infobox")
articles_infobox_positive = [a for a in ad if ad[a]["PositiveInfobox"]]
print(str(len(articles_infobox_positive)) + " articles in the scope have a positive infobox")

53122 articles in the scope have an infobox
885 articles in the scope have a positive infobox


## Set Up - URL Pattern Indicator

### Seed Exploration

In [21]:
# URL Patterns have to be mined from titles
seed_urlpattern = [a for a in ad if "(" in a and ad[a]["Seed"]]
print("  "+str(len(infoboxavailable)) + " seed articles have a URL pattern")

  264 seed articles have a URL pattern


In [32]:
# We count words inside of braces in titles
urlwords = [urlword for a in ad if '(' in a for urlword in a.split('(')[1].split(')')[0].split('_') if ad[a]["Seed"]]
counter = Counter(urlwords)
print("Word frequencies in title braces with frequency > 5.")
for urlword, count in counter.items():
    if count > 5:
        print("  "+urlword + " : " + str(count))
print()
programming_languages = [a for a in ad if '(programming_language)' in a if ad[a]["Seed"]]
print(str(len(programming_languages)) + " seed articles have (programming_language)")

Word frequencies in title braces with frequency > 5.
  programming : 115
  language : 126
  software : 6

113 seed articles have (programming_language)


### Initial Configuration

In [31]:
# We only consider language to be positive and programming as irrelevant.
seed_with_languagetitle = [a for a in ad if '(' in a 
                               and 'language' in a.split('(')[1].split(')')[0].split('_')
                               and ad[a]["Seed"]]
# We want to check what we gain from URLs
exclusive = [a for a in seed_with_languagetitle if a not in seed_infobox_positive]
print("We recalled "+str(len(exclusive)) + " seed articles that have no positive Infobox!")

We recalled 19 seed articles that have no positive Infobox!


In [33]:
# We compute the combined recall for the seed
seed_ibup = [a for a in ad if a in seed_with_languagetitle or a in seed_infobox_positive]
print("The combined recall for the seed is " + str(100 * (len(seed_ibup) / len(seed))) + "%")

The combined recall for the seed is 80.06230529595015%


### Unseen Data Statistics

In [39]:
# We explore how many language candidates are identified in the complete data set.
articles_urlpattern_positive = [a for a in ad if '(' in a and 
                       'language' in a.split('(')[1].split(')')[0].split('_')]
print(str(len(articles_urlpattern_positive)) + " articles have 'language' in braces.")
articles_ibup = [a for a in ad if ad[a]["PositiveInfobox"] or a in articles_urlpattern_positive]
print(str(len(articles_ibup)) + " are indicated as positive by infobox and URL pattern so far.")

375 articles have 'language' in braces.
1045 are indicated as positive by infobox and URL pattern so far.


### Indicator Contradiction

In [43]:
# Infobox indicator says yes while URL pattern says no
contradict_ibup = [a for a in ad if ad[a]["PositiveInfobox"] and a not in articles_urlpattern_positive
                  and '(' in a]
print(str(len(contradict_ibup)) + " contradictions!")
print()
# We search for missed URL words by frequency analysis.
urlwords = [urlword for a in contradict_ibup for urlword in a.split('(')[1].split(')')[0].split('_')]
counter = Counter(urlwords)
print("Word frequencies in title braces with frequency > 5.")
for urlword, count in counter.items():
    if count > 5:
        print("  "+urlword + " : " + str(count))

59 contradictions!

Word frequencies in title braces with frequency > 5.
  format : 25
  software : 10
  file : 18
