# Scraping Data from BGG

In [96]:
import requests
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd

### Writing functions to parse through specific parts of the XML file

In [46]:
def num_players(xml):
    """"
    Returns the community-recommended range of number of players.
    
    Paramaters:
        poll (bytes): The content of an xml file.
        
    Returns:
        list: A list with all the number of players the game is recommended for.
    """
    
    poll_results = dict()    
    for poll in xml.findall(".//poll[@name='suggested_numplayers']"):
        responses = poll.findall('results')
        for resp in responses:
            num_players = resp.attrib['numplayers'] 
            votes_int = [i.attrib for i in resp.findall('result')]
            votes = [int(i['numvotes']) for i in votes_int]
            poll_results[num_players] = votes
    
    rec = list()
    
    for key,item in poll_results.items():
        if sum(item) != 0:
            percentages = [i/sum(item) for i in item]
            if percentages[0] >= 0.5 or percentages[1] >= 0.5:
                rec.append(key)
            
    return rec

In [61]:
def player_age(xml):
    """
    Returns the results of the community poll asking "what is the minimum age you recommend for this game?"
    
    Parameters:
        xml (bytes): The content of an xml file.
    
    Returns:
        string: Minimum age the game is recommended for.
    """
    
    poll_results = dict()
    responses = xml.findall(".//poll[@name='suggested_playerage']")[0].findall('results')[0].findall('result')
    for response in responses:
        poll_results[response.attrib['value']] = response.attrib['numvotes']

    return list(poll_results.keys())[list(poll_results.values()).index(max(poll_results.values()))]

In [76]:
def mechanics(xml):
    all_mechs = dict()
    mechs = xml.findall(".//boardgamemechanic")
    for mech in mechs:
        all_mechs[mech.attrib['objectid']] = mech.text
    return all_mechs

In [114]:
def category(xml):
    all_cats = dict()
    cats = xml.findall(".//boardgamecategory")
    for cat in cats:
        all_cats[cat.attrib['objectid']] = cat.text
    return all_cats

In [77]:
#testing 

root = ET.fromstring(x.content)
print(num_players(root))
print(player_age(root))
print(mechanics(root))

['3', '4', '5']
8
{'2040': 'Hand Management', '2910': 'Investment', '2900': 'Market', '2940': 'Square Grid', '2005': 'Stock Holding', '2002': 'Tile Placement', '2874': 'Victory Points as a Resource'}


### Creating dataset of all games

In [88]:
x = requests.get('https://boardgamegeek.com/xmlapi/boardgame/5')

root = ET.fromstring(x.content)
root.findall(".//error")

root.findall

[]

In [119]:
i = 1
empty = 0
games = pd.DataFrame()

while empty < 10:
    entry = dict()
    x = requests.get('https://boardgamegeek.com/xmlapi/boardgame/' + str(i))
    root = ET.fromstring(x.content)
    if len(root.findall(".//error")) != 0:
        empty += 1
    else:
        empty = 0 
        name = root.findall(".//name[@primary='true']")[0].text
        time = root.findall(".//playingtime")[0].text
        entry = {'name':name, 'playing time':time, 'number players':num_players(root), 'min age':player_age(root), 
                'category':category(root), 'mechanics':mechanics(root)}
        games = games.append(entry, ignore_index=True)
    i += 1
        
        

ParseError: mismatched tag: line 6, column 2 (<string>)

In [120]:
games

Unnamed: 0,category,mechanics,min age,name,number players,playing time
0,"{'1021': 'Economic', '1026': 'Negotiation', '1...","{'2916': 'Alliances', '2080': 'Area Majority /...",12,Die Macher,"[4, 5]",240
1,"{'1002': 'Card Game', '1010': 'Fantasy'}",{'2009': 'Trick-taking'},2,Dragonmaster,"[3, 4]",30
2,"{'1009': 'Abstract Strategy', '1035': 'Medieval'}","{'2080': 'Area Majority / Influence', '2040': ...",6,Samurai,"[2, 3, 4]",60
3,{'1050': 'Ancient'},"{'2001': 'Action Points', '2080': 'Area Majori...",12,Tal der Könige,"[2, 3, 4]",60
4,"{'1021': 'Economic', '1086': 'Territory Buildi...","{'2040': 'Hand Management', '2910': 'Investmen...",8,Acquire,"[3, 4, 5]",90
...,...,...,...,...,...,...
225,"{'1023': 'Bluffing', '1026': 'Negotiation', '1...","{'2080': 'Area Majority / Influence', '2046': ...",12,Colonial Diplomacy,"[6, 7]",360
226,"{'1051': 'Napoleonic', '1019': 'Wargame'}","{'2072': 'Dice Rolling', '2026': 'Hexagon Grid...",2,The Battle of Borodino: Napoleon in Russia 1812,"[1, 2]",180
227,"{'1021': 'Economic', '1051': 'Napoleonic', '10...","{'2046': 'Area Movement', '2072': 'Dice Rollin...",18,Empires in Arms,"[5, 6, 7]",12000
228,"{'1051': 'Napoleonic', '1019': 'Wargame'}",{'2026': 'Hexagon Grid'},8,Jena-Auerstadt: The Battle for Prussia,"[1, 2]",150
