In [8]:
# Importing packages

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import glob
import os

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None

In [21]:
# Defining functions to scrape from metacritic games

def get_soup(url, timeout=30):
    '''
    Download a page and parse it with BeautifulSoup

    Inputs
    ----
    url (string)
        url of page to parse
    timeout (float or tuple, optional)
        seconds to wait for the server, passed straight to requests.get
        (default 30)

    Outputs
    ----
    soup (BeautifulSoup)
        Soup object built from the response body with 'html.parser'

    Raises
    ----
    requests.exceptions.Timeout
        if the server does not answer within `timeout`

    Notes
    ----
    The HTTP status code is not checked: an error page is parsed like
    any other response body.
    '''

    # Browser-like User-Agent sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # requests waits forever when no timeout is given, so a single stalled
    # connection would hang the whole scrape.
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def genre_scraper(a):
    '''
    Fetch a game's page and return its genres as one string

    Inputs
    ----
    a
        link element whose 'href' is the site-relative path of the game page

    Outputs
    ----
    genres (string or float)
        genre names joined with ', ', or np.nan when the page has no
        'product_genre' list item
    '''

    soup = get_soup(f"https://www.metacritic.com{a['href']}")

    genre_item = soup.find('li', {'class': 'product_genre'})
    if genre_item is None:
        # Page came back without a genre section.
        return np.nan

    labels = genre_item.find_all('span', {'class': 'data'})
    return ', '.join([label.text for label in labels])

def _parse_score(tag):
    '''
    Convert a score element's text to float; np.nan when the element is
    missing or its text is not a number
    '''
    try:
        return float(tag.text.strip())
    except (AttributeError, ValueError):
        # AttributeError: the element was not found (tag is None).
        # ValueError: the text is not numeric.
        return np.nan


def scraper(content, max_genre_tries=10):
    '''
    Extract information from content

    Inputs
    ----
    content
        content from a table in page; every <tr> with at least two <td>
        cells is read as one game entry (details are taken from the
        second cell)
    max_genre_tries (int, optional)
        how many times genre_scraper is called for a game before giving
        up and leaving its genre as np.nan (default 10)

    Outputs
    ----
    df (DataFrame)
        dataframe with details in table, one row per game, with columns
        'name', 'platform', 'release_date', 'genre', 'metascore' and
        'user_score'. Rows come out last-table-row-first. Scores that are
        missing or not numeric are np.nan.

    Raises
    ----
    ValueError
        if a release date is not in "%B %d, %Y" format
    '''

    columns = ['name', 'platform', 'release_date', 'genre', 'metascore', 'user_score']
    records = []

    for tr in content.find_all('tr'):
        # Empty rows carry no game.
        if len(tr) < 1:
            continue
        td = tr.find_all('td')
        # Game details live in the second cell; rows without one are not entries.
        if len(td) < 2:
            continue
        details = td[1]

        #get game name
        a = details.find('a', {"class": "title"})
        name = a.find('h3').text

        #get genre from the game's own page, retrying while the lookup
        #comes back as np.nan (a float)
        genre = np.nan
        tries = 0
        while tries < max_genre_tries and isinstance(genre, float):
            genre = genre_scraper(a)
            tries += 1

        if isinstance(genre, float):
            print(f"Giving up on genre scrape for {name}")

        #get release date (the only <span> with an empty class)
        date = details.find('span', {"class": ""})
        release_date = datetime.strptime(date.text, "%B %d, %Y")

        #get platform
        p1 = details.find('div', {"class": "platform"})
        platform = p1.find('span', {"class": "data"}).text.strip()

        #get userscore
        div_score = details.find('div', {"class": "clamp-userscore"})
        user = div_score.find('div', {"class": "metascore_w"}) if div_score is not None else None

        #get metascore (first 'metascore_w' div in the cell)
        score = details.find('div', {"class": "metascore_w"})

        records.append({
            'name': name,
            'genre': genre,
            'release_date': release_date,
            'platform': platform,
            'user_score': _parse_score(user),
            'metascore': _parse_score(score),
        })

    if not records:
        # No game rows: still hand back a frame with the expected columns.
        return pd.DataFrame(columns=columns)

    # Build the frame once instead of concatenating per row. Reversed because
    # this function has always returned the last table row first; keeping
    # that order leaves downstream output unchanged.
    return pd.DataFrame(records[::-1])

def scrape_page(page):
    '''
    Scrape one page of the metacritic games listing

    Inputs
    ----
    page (int)
        page number, placed in the listing url's `page` query parameter

    Outputs
    ----
    dfs (list)
        one dataframe per <table> found on the page, each produced by scraper
    '''
    # Progress indicator: show which page is being processed.
    print(page)

    listing_url = f'https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&view=detailed&page={page}'

    # Every <table> on the listing page holds game entries.
    tables = get_soup(listing_url).find_all('table')
    return [scraper(table) for table in tables]

def find_no_pages():
    '''
    Find total number of pages to scrape

    Outputs
    ----
    num (int)
        the number shown in the 'last_page' pagination item of listing page 0
    '''

    first_page = 0
    url = f'https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&view=detailed&page={first_page}'

    # The pager's last item links to the final page; its text is the page count.
    last_item = get_soup(url).find('li', {'class': 'last_page'})
    page_link = last_item.find('a', {'class': 'page_num'})
    return int(page_link.text)

def _page_order(path):
    '''
    Sort key for a temp file: the number spelled by the digits in its file
    name (files without digits go last), then the path itself
    '''
    digits = ''.join(ch for ch in os.path.basename(path) if ch in '0123456789')
    return (int(digits) if digits else float('inf'), path)


def merge_data(pattern="./tempdata/*.csv", output="metacritic_data_022023.csv"):
    '''
    Merge all tempdata to final file

    Inputs
    ----
    pattern (string, optional)
        glob pattern selecting the CSV files to merge
        (default "./tempdata/*.csv")
    output (string, optional)
        path of the merged CSV, written without an index column
        (default "metacritic_data_022023.csv")

    Raises
    ----
    ValueError
        if no file matches `pattern`
    '''
    # glob returns files in arbitrary order; sort numerically so that
    # page2.csv comes before page10.csv and the merged file is the same
    # on every run.
    all_files = sorted(glob.glob(pattern), key=_page_order)
    if not all_files:
        raise ValueError(f"No CSV files match {pattern!r}; nothing to merge")

    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    df.to_csv(output, index=False)

def main():
    '''
    Scrape each page and save

    Every listing page is scraped with scrape_page and written to
    ./tempdata/page{i}.csv; once all pages are done, merge_data combines
    the per-page files into the final CSV.
    '''

    n = find_no_pages()

    # to_csv does not create missing directories.
    os.makedirs('./tempdata', exist_ok=True)

    for i in range(n):
        scraped = scrape_page(i)
        if not scraped:
            # pd.concat raises on an empty list; one page without tables
            # should not abort the whole run.
            print(f"No tables found on page {i}; skipping")
            continue
        # index=False: the row index is meaningless here, and writing it
        # would come back from read_csv as an 'Unnamed: 0' column in the
        # merged file.
        pd.concat(scraped).reset_index(drop=True).to_csv(f'./tempdata/page{i}.csv', index=False)
    merge_data()


In [None]:
# Run the full scrape: fetch every listing page, write one CSV per page under ./tempdata, then merge them.
main()