In [2]:
import re
import requests
import pandas as pd
import numpy as np


In [3]:
def format_cat_name(cat_name): 
    cat_name = re.sub('\s','_', cat_name)
    return cat_name

In [4]:
def go_query(cat_name):
    cate_name = format_cat_name(cat_name)
    params = {"action": "query",
            "format": "json",
            "list": "categorymembers",
            "cmtitle": cate_name,
            "cmlimit": "max"}
    query = requests.get("http://en.wikipedia.org/w/api.php?",params=params)
    
    return query.json()

In [5]:
def json_df(cat_name):
    temp_dict = go_query(cat_name)
    df = pd.DataFrame(temp_dict['query']['categorymembers'])
    return df

In [6]:
def cat_pages(cat_name, max_depth=3):
    
    params = {'action':'query',
          'format':'json',
          'list':'categorymembers',
          'cmtitle': format_cat_name(cat_name),
          'cmlimit':'max'}
    
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    
    members = data['query']['categorymembers']

    pages = list(filter(lambda x: x['ns'] == 0, members))
    subpages = list(filter(lambda x: x['ns'] == 14, members))
      
    while max_depth >= 0:
 
        if not subpages:    
            return pages
    
        else:
            max_depth -= 1
            for subpage in subpages:
                pages += cat_pages(subpage['title'], max_depth)
    
    return pages

In [7]:
def page_list(cat_name):
    pages = cat_pages(format_cat_name(cat_name))
    pages_df = pd.DataFrame(pages)
    page_list = list(pages_df['title'])
    return page_list

In [8]:
def get_content_df(cat_name):
    params = {'action':'query',
          'titles':format_cat_name(cat_name),
          'prop':'extracts',
          'rvprop': 'content',
          'format':'json'}
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    return_data = data['query']['pages']
    
    page_id = list(return_data.keys())[0]
    content = return_data[page_id]['extract']
    title = format_cat_name(cat_name)
    
    content_df = pd.DataFrame([page_id, title, content],index=(['page_id', 'title', 'content'])).T
    
    return content

In [15]:
aiva=get_content_df('AIVA')

In [16]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(aiva)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


In [18]:
aiva

'<p><b>AIVA</b> (Artificial Intelligence Virtual Artist) is a deep learning algorithm applied to music composition. In June 2016, it became the first system of algorithmic composition to be registered, as a composer, in an authors\' right Society SACEM.</p>\n<p></p>\n\n<p></p>\n<h2><span id="Description">Description</span></h2>\n<p>Created in February 2016, AIVA specializes in Classical and Symphonic music composition. It became the world’s first virtual composer to be recognized by a music society (SACEM). By reading a large collection of existing works of classical music (written by human composers such as Bach, Beethoven, Mozart) AIVA is capable of understanding concepts of music theory and composing on its own. The algorithm AIVA is based on deep learning and reinforcement learning architectures</p>\n<h2><span id="Discography">Discography</span></h2>\n<p>AIVA is a published composer; its first studio album “Genesis” was released in November 2016 and counts 20 original and 4 orchest

In [17]:
soup.get_text()

'AIVA (Artificial Intelligence Virtual Artist) is a deep learning algorithm applied to music composition. In June 2016, it became the first system of algorithmic composition to be registered, as a composer, in an authors\' right Society SACEM.\n\n\n\nDescription\nCreated in February 2016, AIVA specializes in Classical and Symphonic music composition. It became the world’s first virtual composer to be recognized by a music society (SACEM). By reading a large collection of existing works of classical music (written by human composers such as Bach, Beethoven, Mozart) AIVA is capable of understanding concepts of music theory and composing on its own. The algorithm AIVA is based on deep learning and reinforcement learning architectures\nDiscography\nAIVA is a published composer; its first studio album “Genesis” was released in November 2016 and counts 20 original and 4 orchestrated works composed by AIVA. The tracks were recorded by human musicians: Olivier Hecho as the Conductor of the Aiv

In [38]:
def clean_content(cat_name):
    page = get_content_df(cat_name)
    soup = BeautifulSoup(page)
    temp_list=[]
    for string in soup.stripped_strings:
        temp_list.append(string)
    s=''
    clean = s.join(temp_list)

    return clean 

In [39]:
clean_content('AIVA')



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


'AIVA(Artificial Intelligence Virtual Artist) is a deep learning algorithm applied to music composition. In June 2016, it became the first system of algorithmic composition to be registered, as a composer, in an authors\' right Society SACEM.DescriptionCreated in February 2016, AIVA specializes in Classical and Symphonic music composition. It became the world’s first virtual composer to be recognized by a music society (SACEM). By reading a large collection of existing works of classical music (written by human composers such as Bach, Beethoven, Mozart) AIVA is capable of understanding concepts of music theory and composing on its own. The algorithm AIVA is based on deep learning and reinforcement learning architecturesDiscographyAIVA is a published composer; its first studio album “Genesis” was released in November 2016 and counts 20 original and 4 orchestrated works composed by AIVA. The tracks were recorded by human musicians: Olivier Hecho as the Conductor of the Aiva Sinfonietta O

In [42]:
ml_page_list = page_list("Category:Machine learning")

In [43]:
ml_page_list

['Data exploration',
 'List of datasets for machine learning research',
 'Machine learning',
 'Outline of machine learning',
 'Accuracy paradox',
 'Action model learning',
 'Active learning (machine learning)',
 'Adversarial machine learning',
 'AIVA',
 'AIXI',
 'Algorithm selection',
 'Algorithmic bias',
 'Algorithmic inference',
 'AlphaGo',
 'AlphaGo Zero',
 'Apprenticeship learning',
 'Automated machine learning',
 'Bag-of-words model',
 'Ball tree',
 'Base rate',
 'Bayesian interpretation of kernel regularization',
 'Bayesian optimization',
 'Bayesian structural time series',
 'Bias–variance tradeoff',
 'BigDL',
 'Binary classification',
 'Bing Predicts',
 'Bongard problem',
 'Bradley–Terry model',
 'Caffe (software)',
 'Catastrophic interference',
 'Category utility',
 'CBCL (MIT)',
 'CIML community portal',
 'Cleverbot',
 'Cognitive robotics',
 'Committee machine',
 'Computational learning theory',
 'Concept drift',
 'Concept learning',
 'Conditional random field',
 'Confusion ma

In [44]:
content_list=[]
for title in ml_page_list:
    content_list.append(clean_content(title))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html5lib")

  markup_type=markup_type))


KeyboardInterrupt: 