In [1]:
import time

import requests
from bs4 import BeautifulSoup
from parsel import Selector
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# from selenium.common.exceptions import NoSuchElementException
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.service import Service as ChromeService

In [2]:
# Directory prefix (relative, with trailing slash) that the CSV exports are written under.
data_dir = "../data/"

In [3]:
from fake_useragent import UserAgent
# Shared user-agent provider; `ua.chrome` is sent as the User-Agent header
# of every `requests.get` call made by the scraping cells below.
ua = UserAgent()

In [4]:
# Build the Chrome options object handed to the webdriver.
options = Options()

# Command-line switches, applied in this order.
chrome_flags = (
    "--headless",
    "--window-size=1920,1080",  # set window size to native GUI size
    "start-maximized",  # ensure window is full-screen
    "--disable-extensions",
    "--disable-gpu",
)
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# this will disable image loading
image_prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", image_prefs)

In [5]:
# Initialize chrome webdriver.
# Only `options=` is passed: `chrome_options=` is a deprecated alias for the
# same argument, so supplying both was redundant.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# Get list of product urls
url_list = []

try:
    driver.get("https://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?year_selected=2021&view=detailed&sort=desc&page=0")

    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Each result card contributes the href of its first link.
        for game in soup.find_all(class_='clamp-summary-wrap'):
            link = game.find('a', href=True)
            if link is None:
                # Card without a link: nothing to collect, keep going.
                continue
            url_list.append('https://www.metacritic.com' + link['href'])

        # Follow the "next" link until it cannot be found or clicked.
        # WebDriverException is the base class of Selenium's errors (including
        # NoSuchElementException), so pagination still ends on any driver-side
        # failure, but KeyboardInterrupt and programming errors are no longer
        # swallowed the way a bare `except:` would.
        try:
            next_button = driver.find_element(By.LINK_TEXT, 'next')
            next_button.click()
        except WebDriverException:
            break
finally:
    # quit() ends the whole session (browser and driver process), and the
    # `finally` guarantees that happens even if scraping raises.
    driver.quit()



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [C:\Users\siraj\.wdm\drivers\chromedriver\win32\98.0.4758.80\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(), options=options, chrome_options=options)


In [6]:
# Initialize data structures: one list per output column, all kept the same length.
critic_reviews = {'game_id':[], 'game_name': [], 'score':[], 'author':[], 'date':[], 'summary': []}

# Parse review data.
# `game_id` is the position of the URL in `url_list`, so an id always refers
# to the same game no matter which pages fail to download or parse.
for game_id, game in enumerate(url_list):

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(game, headers={'User-Agent': ua.chrome}, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        game_name = soup.find(class_ = 'product_title').find('h1').text.lstrip().replace('\n', '')

        review_section = soup.find(class_= 'body product_reviews')

        # Collect every column for this game locally first; nothing is written
        # to `critic_reviews` until the whole page has parsed successfully.
        scores = [int(elem.text) for elem in review_section.find_all(class_="review_grade")]
        authors = [elem.text for elem in review_section.find_all(class_="source")]
        dates = [elem.text for elem in review_section.find_all(class_="date")]
        summaries = [elem.text.strip() for elem in review_section.find_all(class_="review_body")]

        # The columns are matched up by position, so differing counts would
        # silently misalign rows (or make the DataFrame construction fail).
        if not (len(scores) == len(authors) == len(dates) == len(summaries)):
            raise ValueError(
                'mismatched review fields: '
                f'{len(scores)} scores, {len(authors)} authors, '
                f'{len(dates)} dates, {len(summaries)} summaries'
            )

    except Exception as exc:
        # Best-effort scraping: report the skipped page instead of hiding it.
        print(f'Skipping {game}: {exc!r}')
        continue

    critic_reviews['game_id'].extend([game_id] * len(scores))
    critic_reviews['game_name'].extend([game_name] * len(scores))
    critic_reviews['score'].extend(scores)
    critic_reviews['author'].extend(authors)
    critic_reviews['date'].extend(dates)
    critic_reviews['summary'].extend(summaries)

In [7]:
import pandas as pd

# Turn the per-column review lists into a table and save it as CSV under data_dir.
reviews_csv_path = data_dir + 'reviews.csv'
reviews = pd.DataFrame(data=critic_reviews)
reviews.to_csv(reviews_csv_path)

In [8]:
# Display the reviews table as the cell output (the recorded output below elides the middle rows).
reviews

Unnamed: 0,game_id,game_name,score,author,date,summary
0,0,Disco Elysium: The Final Cut,100,GameGrin,"Oct 15, 2021",Traversing the mind of your character is as mu...
1,0,Disco Elysium: The Final Cut,100,Shindig,"Jun 18, 2021",Disco Elysium – The Final Cut is something rar...
2,0,Disco Elysium: The Final Cut,100,KeenGamer,"Apr 26, 2021","Enhanced with fantastic and fresh content, Dis..."
3,0,Disco Elysium: The Final Cut,100,GameSpot,"Apr 15, 2021",A fully voiced cast and new content of compara...
4,0,Disco Elysium: The Final Cut,100,Wccftech,"Apr 15, 2021",Disco Elysium: The Final Cut doesn't shy away ...
...,...,...,...,...,...,...
5503,320,eFootball 2022,30,PC Gamer,"Oct 13, 2021","In PR terms, it's the world's costliest public..."
5504,320,eFootball 2022,28,GameStar,"Oct 10, 2021",It's the worst-rated game on Steam for a reaso...
5505,320,eFootball 2022,20,Gamer.no,"Oct 10, 2021","The game is straight up horrible, and I can’t ..."
5506,320,eFootball 2022,20,PC Games,"Oct 5, 2021",The Pro Evolution Soccer series was all the ti...


In [10]:
# Initialize data structure: one list per output column, all kept the same length.
game_list = {'id':[], 'name':[], 'developer':[], 'release_date': [], 'metascore':[], 'review_platform': [], 'other_platforms': [], 'genres': [], 'summary': [],
            'online_player_num':[], 'rating': []}

# Generate list of detail specific urls.
# str.replace leaves a URL unchanged when it does not contain 'critic-reviews'.
det_urls = [url.replace('critic-reviews', 'details') for url in url_list]

# `game_id` is the position of the URL in `det_urls` (and therefore in
# `url_list`), so an id always refers to the same game regardless of which
# pages fail to download or parse.
for game_id, url in enumerate(det_urls):

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(url, headers={'User-Agent': ua.chrome}, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # Optional field: fall back to "" when the platforms block is absent.
        try:
            other_platforms = (
                soup.find(class_='summary_detail product_platforms')
                .find(class_='data').text.strip().replace(" ", "")
            )
        except AttributeError:
            other_platforms = ""

        # The second 'product_details' element holds the attribute table;
        # map each row's header text to its whitespace-stripped value.
        table = soup.find_all(class_='product_details')
        table_dict = {}
        for row in table[1].find_all('tr'):
            key = row.find('th').text
            table_dict[key] = row.find('td').text.strip().replace(" ", "")

        # Build the complete row before touching `game_list`, so a failure on
        # any required field cannot leave the columns with different lengths.
        record = {
            'id': game_id,
            'name': soup.find(class_ = 'product_title').find('h1').text.lstrip().replace('\n', ''),
            'developer': table_dict.get('Developer:', ""),
            'release_date': soup.find(class_="summary_detail release_data").find(class_='data').text,
            'metascore': soup.find(class_="metascore_anchor").find('div').text,
            'review_platform': soup.find(class_='platform').text.strip(),
            'other_platforms': other_platforms,
            'genres': table_dict.get('Genre(s):', ""),
            'summary': soup.find(class_="summary_detail product_summary").find(class_='data').text,
            'online_player_num': table_dict.get('Number of Online Players:', ""),
            'rating': table_dict.get('Rating:', ""),
        }

    except Exception as exc:
        # Best-effort scraping: report the skipped page instead of hiding it.
        print(f'Skipping {url}: {exc!r}')
        continue

    for column, value in record.items():
        game_list[column].append(value)

In [11]:
# Convert the per-column lists into a DataFrame (rebinding `game_list`) and save it as CSV under data_dir.
game_list_csv_path = data_dir + 'game_list.csv'
game_list = pd.DataFrame(data=game_list)
game_list.to_csv(game_list_csv_path)