In [1]:
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pymongo import MongoClient

In [2]:
def format_cat_name(cat_name):
    """Replace every whitespace character in a title with an underscore.

    Parameters
    ----------
    cat_name : str
        A page or category title, e.g. ``"Category:Machine learning"``.

    Returns
    -------
    str
        ``cat_name`` with each whitespace character (space, tab, newline, ...)
        replaced by one ``_``.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\s', '_', cat_name)

In [3]:
def go_query(cat_name):
    """Query the MediaWiki API for the members of a Wikipedia category.

    Parameters
    ----------
    cat_name : str
        Category title, e.g. ``"Category:Machine learning"``. Whitespace is
        converted to underscores before the request is sent.

    Returns
    -------
    dict
        The decoded JSON body of a single ``list=categorymembers`` request.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    params = {"action": "query",
              "format": "json",
              "list": "categorymembers",
              "cmtitle": format_cat_name(cat_name),
              "cmlimit": "max"}
    # https avoids the redirect from the plain-http endpoint; the timeout keeps
    # a stalled connection from hanging the notebook indefinitely.
    response = requests.get("https://en.wikipedia.org/w/api.php",
                            params=params, timeout=30)
    response.raise_for_status()

    return response.json()

In [4]:
def json_df(cat_name):
    """Return the members of a Wikipedia category as a DataFrame.

    Parameters
    ----------
    cat_name : str
        Category title passed straight through to ``go_query``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``['query']['categorymembers']`` in the API
        response.
    """
    members = go_query(cat_name)['query']['categorymembers']
    return pd.DataFrame(members)

In [5]:
def cat_pages(cat_name, max_depth=3):
    """Collect the article pages of a category and of its subcategories.

    Parameters
    ----------
    cat_name : str
        Category title, e.g. ``"Category:Machine learning"``.
    max_depth : int, optional
        How many levels of subcategories to descend into. ``0`` returns only
        the pages listed directly in ``cat_name``.

    Returns
    -------
    list of dict
        The ``categorymembers`` entries whose namespace (``ns``) is 0, for
        this category followed by those of its subcategories. A page that is
        reachable through several subcategories appears once per path, so
        callers that need unique pages must de-duplicate.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    requests.Timeout
        If a request gets no response within 30 seconds.
    """
    params = {'action': 'query',
              'format': 'json',
              'list': 'categorymembers',
              'cmtitle': format_cat_name(cat_name),
              'cmlimit': 'max'}

    # A single response holds only one batch of members. While the API sends a
    # 'continue' object, merge it into the parameters and ask for the next batch.
    members = []
    while True:
        response = requests.get('https://en.wikipedia.org/w/api.php',
                                params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        members.extend(data['query']['categorymembers'])
        if 'continue' not in data:
            break
        params.update(data['continue'])

    # Namespace 0 entries are the pages to keep; namespace 14 entries are the
    # subcategories to descend into.
    pages = [member for member in members if member['ns'] == 0]
    subcategories = [member for member in members if member['ns'] == 14]

    # Descend exactly once per subcategory. Looping here while the depth counts
    # down would re-fetch every subcategory once per remaining level and append
    # the same pages repeatedly.
    if max_depth > 0:
        for subcategory in subcategories:
            pages += cat_pages(subcategory['title'], max_depth - 1)

    return pages

In [6]:
def page_list(cat_name):
    """Return the titles of all pages found under a Wikipedia category.

    Parameters
    ----------
    cat_name : str
        Category title, e.g. ``"Category:Machine learning"``.

    Returns
    -------
    list of str
        The ``title`` of every entry returned by ``cat_pages``, in the same
        order and with any duplicates preserved. Empty when the category
        yields no pages.
    """
    # cat_pages normalises the title itself, so it is passed through as-is.
    pages = cat_pages(cat_name)
    # Reading the titles directly avoids a DataFrame round trip, which raised
    # KeyError('title') when the category had no pages.
    return [page['title'] for page in pages]

In [7]:
def get_content(cat_name):
    """Fetch the extract of a single Wikipedia page.

    Parameters
    ----------
    cat_name : str
        Title of the page to fetch. Despite the parameter name it is sent as
        the ``titles`` value of the query, after whitespace is converted to
        underscores.

    Returns
    -------
    str
        The ``extract`` field of the first page in the ``prop=extracts``
        response.

    Raises
    ------
    KeyError
        If the response has no ``extract`` for the page.
    requests.HTTPError
        If the server answers with an HTTP error status.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    # 'rvprop' belongs to prop=revisions and has no effect on an extracts
    # query, so it is not sent.
    params = {'action': 'query',
              'titles': format_cat_name(cat_name),
              'prop': 'extracts',
              'format': 'json'}

    response = requests.get('https://en.wikipedia.org/w/api.php',
                            params=params, timeout=30)
    response.raise_for_status()

    # 'pages' is keyed by page id; only one title was requested, so the first
    # entry is the page of interest.
    pages = response.json()['query']['pages']
    page = list(pages.values())[0]

    return page['extract']

In [8]:
def get_content_df(cat_name):
    """Fetch one Wikipedia page and return its cleaned text as a one-row frame.

    NOTE: a later cell in this notebook defines ``get_content_df`` again; once
    that cell has run, it replaces this definition.

    Parameters
    ----------
    cat_name : str
        Title of the page to fetch. It is sent as the ``titles`` value of the
        query, after whitespace is converted to underscores.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``page_id``, ``title`` (the
        underscore-formatted title) and ``content`` (the extract with markup
        removed).

    Raises
    ------
    KeyError
        If the response has no ``extract`` for the page.
    requests.HTTPError
        If the server answers with an HTTP error status.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    # 'rvprop' belongs to prop=revisions and has no effect on an extracts
    # query, so it is not sent.
    params = {'action': 'query',
              'titles': format_cat_name(cat_name),
              'prop': 'extracts',
              'format': 'json'}

    response = requests.get('https://en.wikipedia.org/w/api.php',
                            params=params, timeout=30)
    response.raise_for_status()
    return_data = response.json()['query']['pages']

    # 'pages' is keyed by page id; only one title was requested.
    page_id = list(return_data.keys())[0]
    content = return_data[page_id]['extract']
    soup = BeautifulSoup(content, "html5lib")

    # Concatenate the text fragments with no separator, as clean_content does.
    # `clean` was previously never assigned, so the DataFrame line below
    # raised NameError on every call.
    clean = ''.join(soup.stripped_strings)

    title = format_cat_name(cat_name)

    content_df = pd.DataFrame([page_id, title, clean],
                              index=['page_id', 'title', 'content']).T

    return content_df

In [16]:
def clean_content(cat_name):
    """Return the text of a Wikipedia page with its markup stripped.

    Parameters
    ----------
    cat_name : str
        Title of the page, passed straight through to ``get_content``.

    Returns
    -------
    str
        Every whitespace-stripped text fragment of the parsed extract,
        concatenated in document order with no separator between fragments.
    """
    soup = BeautifulSoup(get_content(cat_name), "html5lib")
    return ''.join(soup.stripped_strings)

In [10]:
def get_content_df(cat_name):
    """Fetch one Wikipedia page and return its cleaned text as a one-row frame.

    Parameters
    ----------
    cat_name : str
        Title of the page to fetch. It is sent as the ``titles`` value of the
        query, after whitespace is converted to underscores.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``page_id``, ``title`` (the
        underscore-formatted title) and ``content`` (the extract with markup
        removed, text fragments concatenated without a separator).

    Raises
    ------
    KeyError
        If the response has no ``extract`` for the page.
    requests.HTTPError
        If the server answers with an HTTP error status.
    requests.Timeout
        If the server does not respond within 30 seconds.
    """
    # 'rvprop' belongs to prop=revisions and has no effect on an extracts
    # query, so it is not sent.
    params = {'action': 'query',
              'titles': format_cat_name(cat_name),
              'prop': 'extracts',
              'format': 'json'}

    # https avoids the redirect from the plain-http endpoint; the timeout keeps
    # a stalled connection from hanging the notebook indefinitely.
    response = requests.get('https://en.wikipedia.org/w/api.php',
                            params=params, timeout=30)
    response.raise_for_status()
    return_data = response.json()['query']['pages']

    # 'pages' is keyed by page id; only one title was requested.
    page_id = list(return_data.keys())[0]
    content = return_data[page_id]['extract']
    soup = BeautifulSoup(content, "html5lib")

    # A single join replaces growing the string with += inside a loop.
    temp_str = ''.join(soup.stripped_strings)

    title = format_cat_name(cat_name)

    content_df = pd.DataFrame([page_id, title, temp_str],
                              index=['page_id', 'title', 'content']).T

    return content_df

In [11]:
# Connect to the MongoDB server at the given IP address and port
client = MongoClient(host='54.190.53.213', port=27016)

In [13]:
# Handles to the database and to the 'my_wikipedia' collection inside it
db_ref = client['my_database']
db_wiki_ref = db_ref['my_wikipedia']

In [14]:
# Collection that receives the machine-learning pages
db_wiki_whole_ref = db_ref['my_wikipedia_all']

In [17]:
# Wiki collection process for the "Machine learning" category

# Build a set of titles so duplicates are gone
ml_page_list = set(page_list("Category:Machine learning"))

# Cleaned content of every page, in the set's iteration order
content_list = [clean_content(title) for title in ml_page_list]

# Replace '.' with a space since Mongo doesn't like keys with periods
ml_page_list_2 = [title.replace('.', ' ') for title in ml_page_list]

# One single-entry {title: content} dictionary per page
new_list = [{title: content}
            for title, content in zip(ml_page_list_2, content_list)]

# Store each dictionary as its own Mongo document
for document in new_list:
    db_wiki_whole_ref.insert_one(document)

In [19]:
# Number of documents in the collection. Collection.count() is deprecated and
# no longer exists in PyMongo 4; count_documents with an empty filter counts
# every document.
db_wiki_whole_ref.count_documents({})

1106

In [20]:
# Collection that receives the business-software pages
db_wiki_bs_whole_ref = db_ref['my_wiki_bs_all']

In [21]:
# Wiki collection process for the "Business software" category

# Build a set of titles so duplicates are gone
bs_page_list = set(page_list("Category:Business software"))

# Cleaned content of every page, in the set's iteration order
bs_content_list = [clean_content(title) for title in bs_page_list]

# Periods in a title are replaced by spaces before it is used as a Mongo key
bs_page_list_2 = [title.replace('.', ' ') for title in bs_page_list]

# One single-entry {title: content} dictionary per page
bs_new_list = [{title: content}
               for title, content in zip(bs_page_list_2, bs_content_list)]

# Store each dictionary as its own Mongo document
for document in bs_new_list:
    db_wiki_bs_whole_ref.insert_one(document)