In [1]:
from requests import get
from urllib import quote
import pandas as pd
from bs4 import BeautifulSoup
from database_connections import connect_to_postgres
import unidecode
import wikipedia
import database_module

In [2]:
def create_query_param_string(params):
    """Serialize a dict of parameters into a URL query string.

    Each item becomes ``key=value`` (the value is passed through ``str``),
    the items are joined with ``&`` and the result is prefixed with ``?``.
    No percent-encoding is applied here.
    """
    pieces = []
    for name, val in params.items():
        pieces.append(name + '=' + str(val))
    query = '&'.join(pieces)
    return '?' + query

def parse_pages_from_json(response_json):
    """Return the ``pages`` entry nested under ``query`` in a decoded response."""
    query_section = response_json['query']
    return query_section['pages']

def parse_headings_from_json(response_json):
    """Return the ``sections`` entry nested under ``mobileview`` in a decoded response."""
    mobileview_section = response_json['mobileview']
    return mobileview_section['sections']

#ADDED
def parse_category_pages_from_json(response_json):
    """Return the ``categorymembers`` entry nested under ``query``.

    Args:
        response_json: An already-decoded JSON response (a dict).

    Returns:
        The value stored at ``response_json['query']['categorymembers']``.

    Raises:
        KeyError: if either key is missing.
    """
    # Read from the argument; the previous body called .json() on an
    # undefined global named `response` and ignored its parameter.
    return response_json['query']['categorymembers']

def wikipedia_page_format(page):
    """Normalize a title for use in an API request.

    The whole string is lower-cased, its first character is then
    capitalized, and every space is replaced by an underscore. Note that
    capitals after the first character are therefore not preserved.
    """
    lowered = page.lower()
    capitalized = lowered.capitalize()
    return capitalized.replace(' ', '_')

# Endpoint of the English Wikipedia web API (api.php); the request helpers
# in this cell send their queries to this URL.
base_url = "https://en.wikipedia.org/w/api.php"

def wikipedia_get(title, category=False):
    """Query the extracts API for one title and return the ``pages`` entry.

    Args:
        title: Page title; it is normalized with ``wikipedia_page_format``.
        category: When true, the request is made for ``Category:<title>``.

    Returns:
        The value at ``['query']['pages']`` of the decoded JSON response.

    Raises:
        KeyError: if the decoded response has no ``query``/``pages`` entry.
    """
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        # Was 'maxl', a typo: the limit takes an integer or 'max'.
        'exlimit': 'max',
    }

    # Format first, then prefix, the same order wikipedia_get_pages_for_category
    # uses, so the namespace prefix is not run through the case normalization.
    page_title = wikipedia_page_format(title)
    if category:
        page_title = "Category:" + page_title
    params['titles'] = page_title

    # Let requests build the query string so the title is percent-encoded;
    # plain string concatenation broke on '&', '#', '+' and non-ASCII titles.
    response = get(base_url, params=params)
    return response.json()['query']['pages']

def wikipedia_get_category(category):
    """Query the extracts API for a category page.

    Args:
        category: Category name without the ``Category:`` prefix. It is
            percent-encoded with ``quote`` before being placed in the URL.

    Returns:
        The ``pages`` entry of the decoded response. If the body cannot be
        decoded as JSON or lacks the expected keys, the raw response object
        is returned instead so the caller can inspect it.
    """
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        # Was 'maxl', a typo: the limit takes an integer or 'max'.
        'exlimit': 'max',
    }

    params['titles'] = 'Category:' + quote(category)
    query_param_string = create_query_param_string(params)
    response = get(base_url + query_param_string)
    try:
        return parse_pages_from_json(response.json())
    except (ValueError, KeyError, TypeError):
        # Best-effort fallback kept from the original, but limited to
        # decode/shape failures so KeyboardInterrupt and SystemExit are no
        # longer swallowed by a bare except.
        return response
    
def wikipedia_get_pages_for_category(category):
    """List the members of a category.

    The category name is normalized with ``wikipedia_page_format`` and
    prefixed with ``Category:``. A single ``list=categorymembers`` request
    with ``cmlimit=max`` is sent; no follow-up requests are made.

    Returns the ``categorymembers`` entry of the decoded response.
    """
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmlimit': 'max',
    }
    params['cmtitle'] = 'Category:' + wikipedia_page_format(category)

    url = base_url + create_query_param_string(params)
    payload = get(url).json()
    return payload['query']['categorymembers']

# def wikipedia_get_page_headings(title):
#     params = { 'action' : 'mobileview',
#                'format' : 'json',
#                'prop' : 'sections',
#                'sections' : 'all'
#              }
    
#     params['page'] = quote(title)
#     query_param_string = create_query_param_string(params)
#     response = get(base_url+query_param_string)
#     try:
#         return response.json() #parse_headings_from_json(response.json())
#     except:
#         return response

# https://en.wikipedia.org/w
#     /api.php?
#     action=parse&format=json&pageid={}&prop=text%7Csections&contentmodel=wikitext

def wikipedia_get_sections_by_id(pageid):
    """Request the section listing of the page with the given id.

    Sends an ``action=parse`` request with ``prop=sections`` and returns the
    entire decoded JSON response without extracting anything from it.
    """
    params = {'action': 'parse', 'format': 'json', 'prop': 'sections'}
    params['pageid'] = pageid

    url = base_url + create_query_param_string(params)
    return get(url).json()

In [3]:
# Look up the category page itself. `category` and `category_dict` are read
# again by later cells; the bare last expression displays the lookup result.
category = 'Sandwiches'
category_dict = wikipedia_get_category(category)
category_dict

{u'757471': {u'extract': u'',
  u'ns': 14,
  u'pageid': 757471,
  u'title': u'Category:Sandwiches'}}

In [4]:
# The lookup result is keyed by the category's page id as a string, so the
# first key is converted to an int. next(iter(...)) works on Python 2 and 3
# (dict.keys()[0] is Python-2-only) and int() needs no unidecode round-trip.
category_number = int(next(iter(category_dict)))

# Record the category in the 'local' database target.
database_module.create_or_update_category_in_database(category_number, category, 'local')

Connected to server localhost.


'OK'

In [5]:
# Reuse `category` instead of repeating the literal, so the member list
# always belongs to the same category whose id is stored alongside the pages.
pages_list = wikipedia_get_pages_for_category(category)
pages_list[:8]

[{u'ns': 0, u'pageid': 82425, u'title': u'Sandwich'},
 {u'ns': 0, u'pageid': 33686134, u'title': u'List of sandwiches'},
 {u'ns': 0, u'pageid': 49033306, u'title': u'Afghani burger'},
 {u'ns': 0, u'pageid': 2546911, u'title': u'Al pastor'},
 {u'ns': 0, u'pageid': 31439892, u'title': u'Bagel toast'},
 {u'ns': 0, u'pageid': 26081358, u'title': u'Bake and Shark'},
 {u'ns': 0, u'pageid': 3334477, u'title': u'Baked bean sandwich'},
 {u'ns': 0, u'pageid': 857888, u'title': u'B\xe1nh m\xec'}]

In [6]:
# The lookup result is keyed by the category's page id as a string.
# next(iter(...)) replaces the Python-2-only dict.keys()[0], and int()
# accepts the key directly without the unidecode round-trip.
category_number = int(next(iter(category_dict)))
category_number

757471

In [7]:

# The category lookup result is keyed by the category's page id as a string.
# next(iter(...)) replaces the Python-2-only dict.keys()[0].
category_number = int(next(iter(category_dict)))

# Store the first eight category members in the 'remote' database target.
for member in pages_list[:8]:
    title = member[u'title']
    # Skip entries whose title starts with "Category:" (presumably
    # sub-categories). A prefix test avoids also skipping articles whose
    # title merely contains that text somewhere else.
    if title.startswith('Category:'):
        continue
    page_id = member[u'pageid']
    # NOTE(review): apostrophes are stripped from the title and the text
    # before storing, presumably to work around quoting inside
    # database_module; confirm, and prefer parameterized queries there so
    # the stored text does not have to be altered.
    title = title.replace("'", "")
    content = wikipedia.page(pageid=page_id).content.replace("'", "")
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(u"Adding {}, id={}".format(title, page_id))
    database_module.create_or_update_page_in_database(page_id, category_number, title, content, 'remote')
   

Adding Sandwich, id=82425
Connected to server joshuacook.me.
Adding List of sandwiches, id=33686134
Connected to server joshuacook.me.
Adding Afghani burger, id=49033306
Connected to server joshuacook.me.
Adding Al pastor, id=2546911
Connected to server joshuacook.me.
Adding Bagel toast, id=31439892
Connected to server joshuacook.me.
Adding Bake and Shark, id=26081358
Connected to server joshuacook.me.
Adding Baked bean sandwich, id=3334477
Connected to server joshuacook.me.
Adding Bánh mì, id=857888
Connected to server joshuacook.me.


In [8]:
# Hand-picked page ids to read back; each appears in the "Adding ..." output
# of the storing loop above.
page_ids = [857888,3334477,26081358,49033306]

In [9]:
# Read the chosen pages back from the 'remote' database target; the rows
# displayed below are (page id, title, text) tuples.
database_module.select_pages(page_ids,'remote')

Connected to server joshuacook.me.


[(49033306,
  'Afghani burger',
  'An Afghani burger (also known as the Kabuli burger) is a fast food wrap consisting of a piece of Afghan bread rolled around french fries, along with chutney and other condiments, vegetables, and often sausages or other meat. It borrows influences from Afghan cuisine and was popularized inside Pakistan by Afghan immigrants (especially in Islamabad and Peshawar). It is also found in Delhi, India.\n\n\n== See also ==\nWrap (sandwich)\nShawarma\nGyro (food)\n\n\n== Notes =='),
 (26081358,
  'Bake and Shark',
  'Bake and Shark is a traditional fast food dish of Trinidadian cuisine.\n\n\n== Preparation ==\nBake and Shark is a classic street food dish that is sold at a multitude of food stalls and cookshops all over Trinidad and Tobago. It consists of a fried flatbread ("bake") filled with fried pieces of shark meat and various other ingredients and sauces. Before frying, the shark meat is either seasoned with a herb blend and breaded, or marinated in a mix 

In [10]:
# Ask the 'remote' database target which categories are recorded for page
# 3334477 ("Baked bean sandwich" in the member list above).
database_module.select_categories_for_page(3334477,'remote')

Connected to server joshuacook.me.


[('Sandwiches', 757471)]

In [None]:
import encoding_funcs