**Fetch metadata and properties on the Wiki Loves Monument photos dataset**

API Source:
https://commons.wikimedia.org/wiki/Commons:API/MediaWiki
https://commons.wikimedia.org/w/api.php?action=help&modules=query

In [1]:
import pymongo
import pandas as pd
import requests

**Initialize the MongoDB instance**

The script requires the presence of a database *wikilm* and a collection *photos*

In [2]:
db_client = pymongo.MongoClient("mongodb://localhost:27017/")
db_name = db_client["wikilm"]
db_collection = db_name["photos"]

Define auxiliary functions

In [19]:
API_URL_BASEURL = "https://commons.wikimedia.org/w/api.php?"
API_QUERY_PARAMS = {
    'action':'query',
    "pageids":"",
    "prop":"categories|imageinfo|coordinates|templates|linkshere|links|globalusage",
    "format":"json",
    "iiprop":"metadata"
}

def get_title(title):
    if not title.startswith("File:"):
        return "File:"+title
    return title

def retrieve_info(pids):
    r_json = None
    new_collection = []
    if len(pids)>0:
        r_json = get_json_response(pids)
    if r_json:
        new_collection = parse_json_response(r_json)
    save_json_response(new_collection)

def get_json_response(pids):
    API_QUERY_PARAMS['pageids'] = "|".join([str(x) for x in pids])
    r =requests.get(API_URL_BASEURL, params=API_QUERY_PARAMS)
    r_json = r.json()
    return r_json

def parse_json_response(r_json):
    photos = []
    if 'query' in r_json:    
        for current in r_json['query']['pages']:
            r_json['query']['pages'][current]['_id']=int(current)
            photos.append(r_json['query']['pages'][current])
    return photos
    
def save_json_response(new_collection):
    if len(new_collection)>0:
        db_collection.insert_many(new_collection, ordered=False)
        

Run the crawler

In [None]:
from random import randint
from time import sleep

DATA_FOLDER = '../data/'
BATCH_N_ITEMS = 50
inputfile = "wlm_data_%d.tsv.bz2"

years = range(2010, 2012)

for year in years:
    
    ids_already_fetched = set(db_collection.find().distinct('_id'))
    current = DATA_FOLDER+inputfile %year
    
    print('loading %s' %current)
    photos = pd.read_csv(current, sep="\t", error_bad_lines=False) \
        .drop_duplicates(subset=['page_id']) \
        .reset_index()
    print('retrieved %d photos' %photos.shape[0])
    candidates = []
    count = 0
    
    for i, p in photos.iterrows():
        try:
            pid = int(p['page_id'])
            if not (pid in ids_already_fetched):
                
                candidates.append(pid)
                count+=1
                
                if count%BATCH_N_ITEMS==0:
                    sleep(randint(2, 4))
                    retrieve_info(candidates)
                    candidates.clear()
        
            if i>0 and i%10000==0:
                print('fetched: ', i)
                
        except Exception as ex:
            print(ex)
            
    if len(candidates)>0:
        retrieve_info(candidates)
    
    print()
    
print('done')

loading ../data/wlm_data_2010.tsv.bz2
retrieved 12581 photos
10000

loading ../data/wlm_data_2011.tsv.bz2
retrieved 86722 photos
10000
20000
30000
40000
50000
60000
cannot convert float NaN to integer
