In [None]:
import requests
import json
import sys
import os
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the scrape indefinitely. Defaults to 30 seconds.

    Returns
    -------
    tuple
        ``(error, soup)`` where exactly one element is ``None``:

        * ``({url: status_code}, None)`` for a non-200 response,
        * ``({url: str(exception)}, None)`` when the request itself fails
          (timeout, connection error, ...),
        * ``(None, soup)`` on success, parsed with the ``html5lib`` parser.
    """
    try:
        page = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures through the same error channel as bad
        # status codes so callers can skip the page instead of crashing.
        return {url: str(exc)}, None

    if page.status_code != 200:
        return {url: page.status_code}, None

    return None, BeautifulSoup(page.content, 'html5lib')

In [None]:
def get_bgg_last_page(list_url):
    """Return the number of the last page of the listing at ``list_url``.

    Parameters
    ----------
    list_url : str
        URL of a paginated browse listing.

    Returns
    -------
    int
        Page number read from the anchor whose ``title`` is ``"last page"``.

    Raises
    ------
    Exception
        If the page cannot be fetched, or if it has no "last page" link.
    """
    res, soup = get_soup(list_url)

    if res:
        # get_soup keys its error dict by the URL it was asked to fetch.
        raise Exception("Problem getting base SOUP from %s, response: %s" % (list_url, res.get(list_url, res)))

    last_link = soup.find("a", {"title": "last page"})
    if last_link is None:
        raise Exception("Could not find the 'last page' link on %s" % list_url)

    # The first and last characters of the link text are dropped before the
    # integer conversion (presumably surrounding brackets -- confirm on site).
    return int(last_link.text[1:-1])

In [None]:
def _parse_bgg_row(boardgame, base_url):
    """Extract the fields of one listing row and return ``(info, errors)``.

    Each field is parsed independently so one malformed cell only sets its
    own ``*_error`` flag instead of discarding the whole row.
    """
    info = {}
    errors = {}

    # BGG rank
    try:
        info["bgg_rank"] = boardgame.find("td", {"class": "collection_rank"}).text.strip()
    except Exception:
        errors["rank_error"] = True

    # Name, URL and Year are all read from the same "results_objectname" div
    try:
        nuy = boardgame.select('div[id*="results_objectname"]')[0]
    except Exception:
        errors["nuy_error"] = True
    else:
        # Name
        try:
            info["name"] = nuy.find("a").text.strip()
        except Exception:
            errors["name_error"] = True

        # URL (the href is appended to base_url as-is)
        try:
            info["url"] = base_url + nuy.find("a")['href']
        except Exception:
            errors["url_error"] = True

        # Year (first and last characters of the span text are dropped)
        try:
            info["year"] = nuy.find("span", {"class" : "smallerfont dull"}).text[1:-1].strip()
        except Exception:
            errors["year_error"] = True

    # Ratings: three cells share the same class, read in document order
    try:
        ratings = boardgame.find_all("td", {"class" : "collection_bggrating"})
        info["geek_rating"] = ratings[0].text.strip()
        info["avg_rating"] = ratings[1].text.strip()
        info["num_voters"] = ratings[2].text.strip()
    except Exception:
        errors["ratings_error"] = True

    return info, errors


def get_bgg_list_of_boardgames(base_url, num_pages = float("inf")):
    """Scrape the paginated board game listing under ``base_url``.

    Parameters
    ----------
    base_url : str
        Site root; ``"/browse/boardgame/page/"`` plus the page number is
        appended to build each listing URL.
    num_pages : int or float, optional
        Upper bound on the number of pages to read. The default (infinity)
        reads every page reported by ``get_bgg_last_page``.

    Returns
    -------
    tuple
        ``(dic_of_errors, dic_bg_info)``. Both are keyed by an integer index
        that starts at ``page * 10000`` and increases by one per row.
        ``dic_bg_info[index]`` holds the parsed fields of a game and
        ``dic_of_errors[index]`` the ``*_error`` flags for that game (empty
        when everything parsed). A page that could not be fetched or parsed
        is recorded as ``dic_of_errors[page * 10000] = {"page_error": ...}``.
    """
    list_url = base_url + "/browse/boardgame/page/"

    # min() keeps the integer page count when num_pages is infinity; int()
    # makes a float limit such as 2.0 usable with range().
    last_page = int(min(get_bgg_last_page(list_url), num_pages))

    dic_bg_info = {}
    dic_of_errors = {}

    for page in tqdm(range(1, last_page + 1)):
        index = page * 10000

        res, soup = get_soup(list_url + str(page))

        if res:
            # Record the failure so a skipped page is visible in the output.
            dic_of_errors[index] = {"page_error": res}
            continue

        collection = soup.find("div", {"id": "collection"})
        if collection is None:
            dic_of_errors[index] = {"page_error": "collection table not found"}
            continue

        for boardgame in collection.find_all("tr", {"id": "row_"}):
            dic_bg_info[index], dic_of_errors[index] = _parse_bgg_row(boardgame, base_url)
            index += 1

    return dic_of_errors, dic_bg_info

In [None]:
def get_bgg_details(dic_bg_info, data_loc):
    """Enrich every game in ``dic_bg_info`` with data from its credits page.

    A Chrome browser is driven through Selenium to open ``<url>/credits``
    for each game. Games that already carry a non-empty ``'family'`` entry
    are skipped, so an interrupted run can be resumed.

    Parameters
    ----------
    dic_bg_info : dict
        Maps a game key to a dict with at least a ``'url'`` entry. The inner
        dicts are updated in place.
    data_loc : str
        Directory in which the ``bgg_basic_info`` checkpoint file is
        rewritten (as JSON) on every 100th game that gets scraped.

    Returns
    -------
    dict
        The same ``dic_bg_info`` object, enriched.
    """
    cols = ['player_num', 'play_time', 'min_age', 'weight']
    checkpoint_path = os.path.join(data_loc, "bgg_basic_info")

    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(5)
        num_done = 0
        for key in tqdm(dic_bg_info.keys()):
            num_done += 1
            game = dic_bg_info[key]
            if game.get('family'):
                continue

            driver.get(game['url'] + "/credits")

            # Gameplay items are matched to column names by position; zip()
            # ignores surplus items instead of raising IndexError.
            gameplay_items = driver.find_elements(By.XPATH, '//div[@class="gameplay-item-primary"]')
            for col, item in zip(cols, gameplay_items):
                game[col] = item.text.split("\n")[0]

            # Each credit block: first text line is the label, the remaining
            # lines are its values.
            credit_items = driver.find_elements(By.XPATH, '//li[@class="outline-item ng-scope"]')
            for attribute in credit_items:
                list_of_info = attribute.text.split("\n")
                game[list_of_info[0].strip().lower()] = [x.strip().lower() for x in list_of_info[1:]]

            if num_done % 100 == 0:
                with open(checkpoint_path, 'w') as json_file:
                    json.dump(dic_bg_info, json_file)
    finally:
        # Always end the browser session, even when scraping fails midway.
        driver.quit()

    return dic_bg_info

In [None]:
def bgg_scrape(basic = True, detail = True ,num_page = float("inf")):
    """Run the BoardGameGeek scrape and persist the results as JSON.

    Parameters
    ----------
    basic : bool, optional
        When true, scrape the ranked listing and write ``bgg_basic_info``
        and ``bgg_basic_errors``. When false, load both files from a
        previous run instead.
    detail : bool, optional
        When true, enrich each game via ``get_bgg_details`` and rewrite
        ``bgg_basic_info`` with the result.
    num_page : int or float, optional
        Maximum number of listing pages to scrape; the default is no limit.

    Returns
    -------
    tuple
        ``(dic_of_errors, dic_bg_info)``.

    Notes
    -----
    Files live in ``./data/bgg`` relative to the current working directory.
    """
    base_url = "https://boardgamegeek.com"
    data_loc = os.path.join(".", "data", "bgg")
    info_path = os.path.join(data_loc, "bgg_basic_info")
    errors_path = os.path.join(data_loc, "bgg_basic_errors")

    # Create the output directory up front so a long scrape cannot be lost
    # to a FileNotFoundError when the results are finally written.
    os.makedirs(data_loc, exist_ok=True)

    if basic:
        print("Get BGG basic info")
        print("-----")
        dic_of_errors, dic_bg_info = get_bgg_list_of_boardgames(base_url, num_page)

        with open(info_path, 'w') as json_file:
            json.dump(dic_bg_info, json_file)

        with open(errors_path, 'w') as json_file:
            json.dump(dic_of_errors, json_file)
    else:
        with open(info_path, 'r') as json_file:
            dic_bg_info = json.load(json_file)

        with open(errors_path, 'r') as json_file:
            dic_of_errors = json.load(json_file)

    if detail:
        print("Get BGG details")
        print("-----")
        dic_bg_info = get_bgg_details(dic_bg_info, data_loc)

        with open(info_path, 'w') as json_file:
            json.dump(dic_bg_info, json_file)

    return dic_of_errors, dic_bg_info

In [None]:
# Run both stages: scrape the listing, then enrich every game with details.
dic_of_errors, dic_bg_info = bgg_scrape(basic=True, detail=True)