In [1]:
import re
import requests
import pandas as pd
import numpy as np

In [2]:
from pandas.io.json import json_normalize

#### Below is the wikipedia api call for a category search:

`http://en.wikipedia.org/w/api.php?action=query&format=json&list=categorymembers&cmtitle=Category%3A+machine+learning&cmlimit=max`

`action=query`: query the wikipedia api

`format=json`: return a json format

`list=categorymembers`: List of pages that belong to a given category, ordered by page sort title

`cmtitle=Category%3A+machine+learning`: title of category

`climit=max`: return up to the maximum amount of responses (500)

You may use this to get page titles from the wikipedia API. Things to watch out for:
* The responses contain categories
* You will want to fetch articles in those subcategories

The API's detailed documentation can be found [here](https://www.mediawiki.org/wiki/API:Main_page)

#### Make a function that formats a request for pages of a category

In [3]:
def categ_format(category):
    get_name = re.sub('\s', '+', category)
    return get_name

In [4]:
categ_format('machine learning')

'machine+learning'

In [5]:
def get_wikipedia_cat(category):
    
    atts = {'action': 'query', 
            'format': 'json', 
            'list': 'categorymembers', 
            'cmtitle': category, 
            'cmlimit': 'max'}

    resp = requests.get('http://en.wikipedia.org/w/api.php', params = atts)
    cats = resp.json()['query']['categorymembers']

    subcats = list(filter(lambda x: x['ns'] == 14, cats))
    pages = list(filter(lambda x: x['ns'] == 0, cats))

    if subcats: 
        for row in subcats:
            pages += get_wikipedia_cat(row['title'])        
    else:
        return pages
        
    return pages
   

In [6]:
max_depth = 3

In [7]:
def get_wikipedia_cat_2(category, depth=0):
    
    atts = {'action': 'query', 
            'format': 'json', 
            'list': 'categorymembers', 
            'cmtitle': category, 
            'cmlimit': 'max'}

    resp = requests.get('http://en.wikipedia.org/w/api.php', params = atts)
    cats = resp.json()['query']['categorymembers']

    subcats = list(filter(lambda x: x['ns'] == 14, cats))
    pages = list(filter(lambda x: x['ns'] == 0, cats))

    if subcats: 
        while depth < max_depth:
            for row in subcats:
                pages += get_wikipedia_cat_2(row['title'], max_depth)
            depth += 1
    else:
        return pages
        
    return pages

In [8]:
def get_wiki_page(pageid):
    r = requests.get("http://en.wikipedia.org/w/api.php?action=query&\
              format=json&prop=extracts&pageids={}&rvprop=content".format(pageid))
    return r.json()

In [9]:
pages = get_wikipedia_cat('Category:Machine learning')

In [10]:
pages_2 = get_wikipedia_cat_2('Category:Business software')

In [None]:
len(pages_2)

In [None]:
pages[0]['pageid']

In [None]:
var = get_wiki_page(pageid)['query']['pages']

In [None]:
df = json_normalize(var)

In [None]:
df

In [None]:
df['43385931.extract']

In [11]:
!pip install pymongo



In [12]:
from pymongo import MongoClient
client = MongoClient('52.27.26.161', 27016)
db_ref = 'my_database'
client.drop_database(db_ref)
db_ref = client.my_database
mongo_coll = db_ref.my_collection

In [13]:
client.database_names()

['admin', 'local', 'pages', 'test']

db_ref = client.my_database

In [14]:
db_ref.collection_names()

[]

mongo_coll = db_ref.my_collection

pages[0]['pageid']

In [15]:
for i, page in enumerate(pages):
    pageid = str(pages[i]['pageid'])
    page = get_wiki_page(pageid)

    mongo_coll.insert_one(page['query']['pages'][pageid])

print(mongo_coll.count())

1621


In [16]:
for i, page in enumerate(pages_2):
    pageid = str(pages_2[i]['pageid'])
    page = get_wiki_page(pageid)

    mongo_coll.insert_one(page['query']['pages'][pageid])

print(mongo_coll.count())

6258


In [17]:
all_pages = list(mongo_coll.find())

In [18]:
all_pages_df = pd.DataFrame(all_pages)

In [19]:
all_pages_df

Unnamed: 0,_id,extract,ns,pageid,title
0,5a218fc1e15d6e00515f3004,<p><b>Data exploration</b> is an approach simi...,0,43385931,Data exploration
1,5a218fc1e15d6e00515f3005,<p>These datasets are used for machine-learnin...,0,49082762,List of datasets for machine learning research
2,5a218fc2e15d6e00515f3006,<p><b>Machine learning</b> is a field of compu...,0,233488,Machine learning
3,5a218fc2e15d6e00515f3007,<p>The following outline is provided as an ove...,0,53587467,Outline of machine learning
4,5a218fc2e15d6e00515f3008,<p>The <b>accuracy paradox</b> for predictive ...,0,3771060,Accuracy paradox
5,5a218fc2e15d6e00515f3009,<p><b>Action model learning</b> (sometimes abb...,0,43808044,Action model learning
6,5a218fc3e15d6e00515f300a,<p><b>Active learning</b> is a special case of...,0,28801798,Active learning (machine learning)
7,5a218fc3e15d6e00515f300b,<p><b>Adversarial machine learning</b> is a re...,0,45049676,Adversarial machine learning
8,5a218fc3e15d6e00515f300c,<p><b>AIVA</b> (Artificial Intelligence Virtua...,0,52642349,AIVA
9,5a218fc4e15d6e00515f300d,"<p><b>AIXI</b> <span title=""Representation in ...",0,30511763,AIXI


In [23]:
all_pages_df.to_pickle('all_pages_df.p')