In [1]:
import json

In [2]:
# Open the 2017-03-13 Wikidata JSON dump from HDFS; the cells below treat it as
# a collection of text lines (one dump line per element).
# NOTE(review): `sc` is never defined in this notebook -- presumably the
# SparkContext injected by the PySpark kernel; confirm before a fresh run.
wikidata = sc.textFile('hdfs://hdfs-mesos/data/wikidata-20170313-all.json')

In [4]:
# Borrowed from @arnes: parser function to turn lines into JSON objects.
def parseLine(line):
    """Decode one line of the Wikidata JSON dump into a Python object.

    The dump is one big JSON list written line by line: the first line is
    "[", the last is "]", and every element line except the final one ends
    with a comma.

    Args:
        line: One raw text line of the dump.

    Returns:
        The decoded JSON value, or None for lines of at most one character
        (the "[" / "]" bracket lines and empty lines).

    Raises:
        ValueError: If a longer line is not valid JSON once the trailing
            comma has been removed.
    """
    # Lines of length <= 1 are the opening "[" / closing "]" (or empty).
    if len(line) <= 1:
        return None

    # Strip the trailing comma; the last element of the list has none.
    if line[-1:] == ',':
        line = line[:-1]

    try:
        return json.loads(line)
    except (TypeError, ValueError):
        # Only real decode failures are translated. A bare `except:` would
        # also trap KeyboardInterrupt / SystemExit and report them as bad data.
        raise ValueError('Unable to decode line: {}'.format(line))

In [5]:
# Borrowed from @arnes: parse every dump line and keep only entities of type 'item'.
# parseLine returns None for the "[" / "]" bracket lines and raises on malformed
# JSON, so the filter only has to drop results that are not entity dicts.
# isinstance + .get() replaces the `!= False` / `!= None` comparisons and keeps a
# record without a 'type' key (or a non-dict JSON value) from raising inside the job.
data = (wikidata.map(parseLine)
        .filter(lambda entry: isinstance(entry, dict) and entry.get('type') == 'item')
        .cache())

print('Number of entries: {}'.format(data.count()))

Number of entries: 25324233


In [6]:
# Peek at one parsed entity to inspect its structure before extracting fields.
data.take(1)

[{u'aliases': {u'ar': [{u'language': u'ar',
     u'value': u'\u0625\u0633\u0643\u062a\u0644\u0646\u062f\u0627'}],
   u'be-tarask': [{u'language': u'be-tarask',
     u'value': u'\u0428\u043a\u043e\u0446\u044b\u044f'}],
   u'ca': [{u'language': u'ca', u'value': u'Scotland'}],
   u'en': [{u'language': u'en', u'value': u'Alba'},
    {u'language': u'en', u'value': u'Scotland, United Kingdom'},
    {u'language': u'en', u'value': u'SCT'},
    {u'language': u'en', u'value': u'Caledonia'},
    {u'language': u'en', u'value': u'scot'}],
   u'en-gb': [{u'language': u'en-gb', u'value': u'Caledonia'}],
   u'eo': [{u'language': u'eo', u'value': u'Skotio'},
    {u'language': u'eo', u'value': u'Skotujo'}],
   u'hr': [{u'language': u'hr', u'value': u'Kaledonija'}],
   u'kn': [{u'language': u'kn', u'value': u'\u0c86\u0cb2\u0ccd\u0cac\u0cbe'}],
   u'ta': [{u'language': u'ta', u'value': u'\u0b86\u0bb2\u0bcd\u0baa\u0bbe'}],
   u'th': [{u'language': u'th',
     u'value': u'\u0e41\u0e04\u0e27\u0e49\u0e19\u0e2

# Instance of

In [7]:
def parseInstanceOf(i):
    """Extract the P31 ("instance of") edges of one entity as JSON strings.

    Args:
        i: Parsed Wikidata entity (dict) with an 'id' and a 'claims' mapping
           from property id to a list of claims.

    Returns:
        List of JSON strings, one per usable P31 claim, each of the form
        {"child": <entity id>, "parent": <target entity id>}. Empty when the
        entity has no id, no claims, or no P31 claims.
    """
    try:
        child = i['id']
        claims = i['claims']['P31']
    except (KeyError, TypeError):
        # No id / no claims / no P31 (or 'claims' is not a mapping): no edges.
        return []

    if not isinstance(claims, (list, tuple)):
        return []

    res = []
    for c in claims:
        try:
            parent = c['mainsnak']['datavalue']['value']['id']
        except (KeyError, TypeError):
            # A claim without an entity-valued datavalue (e.g. a Wikibase
            # "novalue"/"somevalue" snak) is skipped on its own. Previously the
            # function-wide bare `except` returned early here and silently
            # dropped every later valid claim of the same entity.
            continue
        res.append(json.dumps({'child': child, 'parent': parent}))
    return res

In [8]:
# One JSON edge string per P31 claim across all items; cached because the
# result is counted, sampled and saved in the cells below.
instances = data.flatMap(parseInstanceOf).cache()

In [16]:
# Total number of extracted P31 (child, parent) edges.
instances.count()

20547613

In [9]:
# Sample a few edges to sanity-check the serialized format.
instances.take(10)

['{"parent": "Q3336843", "child": "Q22"}',
 '{"parent": "Q3624078", "child": "Q31"}',
 '{"parent": "Q43702", "child": "Q31"}',
 '{"parent": "Q185441", "child": "Q31"}',
 '{"parent": "Q6256", "child": "Q31"}',
 '{"parent": "Q160016", "child": "Q31"}',
 '{"parent": "Q6505795", "child": "Q31"}',
 '{"parent": "Q1454986", "child": "Q1"}',
 '{"parent": "Q175854", "child": "Q13"}',
 '{"parent": "Q5", "child": "Q23"}']

In [11]:
# Persist the P31 edges to HDFS as text.
# NOTE(review): the target path is hard-coded to a user directory, and Spark
# normally refuses to write into an existing output directory -- confirm
# before re-running this cell.
instances.saveAsTextFile("hdfs://hdfs-mesos/user/simonj/data/wikidata-20170313_instances")

# Subclass of

In [12]:
def parseSubclassOf(i):
    """Extract the P279 ("subclass of") edges of one entity as JSON strings.

    Args:
        i: Parsed Wikidata entity (dict) with an 'id' and a 'claims' mapping
           from property id to a list of claims.

    Returns:
        List of JSON strings, one per usable P279 claim, each of the form
        {"child": <entity id>, "parent": <target entity id>}. Empty when the
        entity has no id, no claims, or no P279 claims.
    """
    try:
        child = i['id']
        claims = i['claims']['P279']
    except (KeyError, TypeError):
        # No id / no claims / no P279 (or 'claims' is not a mapping): no edges.
        return []

    if not isinstance(claims, (list, tuple)):
        return []

    res = []
    for c in claims:
        try:
            parent = c['mainsnak']['datavalue']['value']['id']
        except (KeyError, TypeError):
            # Skip only the claim that lacks an entity-valued datavalue. The
            # former function-wide bare `except` returned early and discarded
            # all later valid claims of the same entity.
            continue
        res.append(json.dumps({'child': child, 'parent': parent}))
    return res

In [13]:
# One JSON edge string per P279 claim across all items; cached because the
# result is counted, sampled and saved in the cells below.
subclasses = data.flatMap(parseSubclassOf).cache()

In [17]:
# Total number of extracted P279 (child, parent) edges.
subclasses.count()

2160029

In [14]:
# Sample a few edges to sanity-check the serialized format.
subclasses.take(10)

['{"parent": "Q37141", "child": "Q13"}',
 '{"parent": "Q154", "child": "Q44"}',
 '{"parent": "Q11019", "child": "Q82"}',
 '{"parent": "Q4135602", "child": "Q102"}',
 '{"parent": "Q41825", "child": "Q105"}',
 '{"parent": "Q18602249", "child": "Q109"}',
 '{"parent": "Q82955", "child": "Q116"}',
 '{"parent": "Q18602249", "child": "Q124"}',
 '{"parent": "Q18602249", "child": "Q126"}',
 '{"parent": "Q41825", "child": "Q127"}']

In [15]:
# Persist the P279 edges to HDFS as text.
# NOTE(review): hard-coded user path; Spark normally refuses to write into an
# existing output directory -- confirm before re-running this cell.
subclasses.saveAsTextFile("hdfs://hdfs-mesos/user/simonj/data/wikidata-20170313_subclasses")

# Labels

In [18]:
def parseLabelOf(i):
    """Serialize every label of an entity as a JSON string.

    Args:
        i: Parsed Wikidata entity (dict). Its 'labels' entry, when present,
           is a mapping whose values are {'language': ..., 'value': ...} dicts.

    Returns:
        List of JSON strings, one per label, each with the keys 'id',
        'language' and 'value'. Empty when the entity has no 'labels' entry.
    """
    # Guard the lookup the same way parseAliasOf guards 'aliases', so an
    # entity without labels yields no rows instead of raising KeyError.
    if 'labels' not in i:
        return []

    return [
        json.dumps({'id': i['id'], 'language': label['language'], 'value': label['value']})
        for label in i['labels'].values()
    ]

In [19]:
# One JSON string per (item, language) label; cached because the result is
# counted, sampled and saved in the cells below.
labels = data.flatMap(parseLabelOf).cache()

In [20]:
# Total number of extracted label rows.
labels.count()

134489309

In [21]:
# Sample a few label rows to sanity-check the serialized format.
labels.take(10)

['{"value": "Nalbin", "id": "Q22", "language": "gv"}',
 '{"value": "Scotland", "id": "Q22", "language": "sco"}',
 '{"value": "Scozzia", "id": "Q22", "language": "scn"}',
 '{"value": "\\u82cf\\u683c\\u5170", "id": "Q22", "language": "wuu"}',
 '{"value": "S\\u016d-g\\u00e1ik-l\\u00e0ng", "id": "Q22", "language": "cdo"}',
 '{"value": "\\u8607\\u683c\\u862d", "id": "Q22", "language": "zh-hk"}',
 '{"value": "Alba", "id": "Q22", "language": "gd"}',
 '{"value": "Esc\\u00f3cia", "id": "Q22", "language": "pt-br"}',
 '{"value": "Albain", "id": "Q22", "language": "ga"}',
 '{"value": "Ekosia", "id": "Q22", "language": "gn"}']

In [22]:
# Persist the label rows to HDFS as text.
# NOTE(review): hard-coded user path; Spark normally refuses to write into an
# existing output directory -- confirm before re-running this cell.
labels.saveAsTextFile("hdfs://hdfs-mesos/user/simonj/data/wikidata-20170313_labels")

# Aliases

In [23]:
def parseAliasOf(i):
    """Serialize every alias of an entity as a JSON string.

    The entity's 'aliases' entry, when present, is a mapping whose values are
    lists of {'language': ..., 'value': ...} dicts; each such dict becomes one
    JSON string with the keys 'id', 'language' and 'value'.

    Returns an empty list when the entity has no 'aliases' entry.
    """
    alias_groups = i['aliases'].values() if 'aliases' in i else []
    return [
        json.dumps({'id': i['id'], 'language': entry['language'], 'value': entry['value']})
        for group in alias_groups
        for entry in group
    ]

In [24]:
# One JSON string per (item, language, alias); cached because the result is
# counted, sampled and saved in the cells below.
aliases = data.flatMap(parseAliasOf).cache()

In [26]:
# Total number of extracted alias rows.
aliases.count()

13588187

In [25]:
# Sample a few alias rows to sanity-check the serialized format.
aliases.take(10)

['{"value": "Caledonia", "id": "Q22", "language": "en-gb"}',
 '{"value": "Skotio", "id": "Q22", "language": "eo"}',
 '{"value": "Skotujo", "id": "Q22", "language": "eo"}',
 '{"value": "Alba", "id": "Q22", "language": "en"}',
 '{"value": "Scotland, United Kingdom", "id": "Q22", "language": "en"}',
 '{"value": "SCT", "id": "Q22", "language": "en"}',
 '{"value": "Caledonia", "id": "Q22", "language": "en"}',
 '{"value": "scot", "id": "Q22", "language": "en"}',
 '{"value": "Kaledonija", "id": "Q22", "language": "hr"}',
 '{"value": "Scotland", "id": "Q22", "language": "ca"}']

In [27]:
# Persist the alias rows to HDFS as text.
# NOTE(review): hard-coded user path; Spark normally refuses to write into an
# existing output directory -- confirm before re-running this cell.
aliases.saveAsTextFile("hdfs://hdfs-mesos/user/simonj/data/wikidata-20170313_aliases")