In [1]:
from pathlib import Path
import sys

# Make the parent of the current working directory importable so that sibling
# packages can be imported from this notebook. The membership check keeps the
# cell idempotent: re-running it does not pile duplicate entries onto sys.path.
project_root = str(Path().absolute().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
# sys.path

# Scraping for score
For a video to classify as a game review, searching for the title being reviewed plus "metacritic" should yield Metacritic's website as one of the first search results.

As far as I can see, there are 2 possible approaches to get the score:
1. Look through each frame of the video to find some picture that has the score.
2. Search for the review on IGN's website.

In [2]:
import pandas as pd

def read_video_csv(source="../video/db/video.csv"):
    """Load the video table into a DataFrame.

    Parameters
    ----------
    source : str or file-like
        Location of the CSV to read; forwarded unchanged to `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(source)

# df = read_video_csv()
# df.head()

### Deciding on a method to find the score
Sample a handful of video ids to check which score-finding method works. The scores for these sampled videos are looked up manually.
Better than scrolling through the playlist on youtube.

For each video id in `sample`, manually find the score to see which method is more reliable.

In [10]:
import numpy as np
# Seed numpy's global RNG so the sample below is reproducible.
np.random.seed(42)

df = read_video_csv()

n = df.shape[0]
# Cut the rows into three consecutive tiers and draw 5 ids from each:
# 5 recent ones, 5 semi-recent ones, and 5 old ones.
first_cut, second_cut = int(n/3), int(2*n/3)
video_ids = df['id']

sample = []
for tier in (video_ids[:first_cut], video_ids[first_cut:second_cut], video_ids[second_cut:]):
    sample.extend(tier.sample(5))
sample

['1HiVB_sz1DA',
 'l51UJqOsHzA',
 'caL_qhUIqhk',
 'FAXxQLLIZFI',
 'CP4TFr4TR1k',
 'k-caUhgBPCY',
 'dt2fImlT5xg',
 'Dqbbu2hWSpM',
 'Qae88uZDvtg',
 'R3rpDY0Gw1I',
 'Olzg8RHwXjQ',
 'cRzs5-2whLk',
 'IsUbRNUF6JI',
 '4vwZNtvev8g',
 'ow5Jkxv2Qv0']

#### Implementing Score Scraping

It has been decided that scores will be scraped by the following method:

1. Check if the video is actually reviewing a game. If it is a game, then move on to step 2. Otherwise, skip it.
    - The game should have its own metacritic page. If it does, it's very likely that when searching for the game along with the keyword "metacritic", the very first search result will be its metacritic page.
    - Duck Duck Go will be used just for convenience.
    - The titles will also have to be cleaned before putting them into the search engine (e.g. removing "review" and "commentary" at the end of the titles; making sure the titles are URL-encoded).
    
2. Go through the last 60 seconds of the video and look for the score.
    - Scraping is unreliable. See video id `UWVMHYKv6_4`, with the video title "Broken Age: The Complete Adventure Review Commentary". The video has a score, but IGN's article does not.
    - It is also more consistent with what the audience will experience.

In [77]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` has been loaded.
import urllib.parse

# quote_plus percent-encodes non-ASCII/reserved characters and turns spaces into "+".
safe_string = urllib.parse.quote_plus("Pokémon Sword and Shield: The Crown Tundra DLC Review")
safe_string

'Pok%C3%A9mon+Sword+and+Shield%3A+The+Crown+Tundra+DLC+Review'

In [87]:
import urllib
import requests
from bs4 import BeautifulSoup as soup

def clean_video_title(title):
    """Turn a video title into a url-encoded search phrase.

    The title is lower-cased, the filler words "review" and "commentary"
    (plus a "video" directly next to "review") are dropped, whitespace is
    collapsed, and the result is encoded with `urllib.parse.quote_plus`.

    Parameters
    ----------
    title : object
        The video title; it is converted with `str()` first.

    Returns
    -------
    str
        The cleaned, url-encoded title (spaces become "+").
    """
    import re
    # A bare `import urllib` does not guarantee the `parse` submodule is loaded.
    import urllib.parse

    title = str(title).lower()
    # \b restricts the match to whole words, so titles that merely contain a
    # filler word ("preview", "reviewer") are left intact. Replacing with a
    # space (not "") avoids gluing the neighbouring words together.
    title = re.sub(r"\b(?:video\s+review|review\s+video|review|commentary)\b", " ", title)
    # Collapse the gaps left behind; this also strips leading/trailing blanks.
    title = " ".join(title.split())
    return urllib.parse.quote_plus(title)

def search_metacritic_video_title(title):
    """Search DuckDuckGo's HTML endpoint for "metacritic <cleaned title>".

    Parameters
    ----------
    title : object
        The video title; it is cleaned and url-encoded by `clean_video_title`.

    Returns
    -------
    list
        The `<a class="result__url" href=...>` elements found in the response,
        in page order, or an empty list if the response could not be parsed.

    Notes
    -----
    Errors raised by `requests.get` itself (connection failure, timeout) are
    not caught here and propagate to the caller.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"}
    ddgo_url = "https://duckduckgo.com/html/?q="

    # "+" is the url-encoded space. Without it the keyword and the first word
    # of the title are fused into a single search term ("metacriticbroken ...").
    link = ddgo_url + "metacritic+" + clean_video_title(title)
    # A timeout keeps a stalled connection from hanging the notebook forever.
    result = requests.get(link, headers=headers, timeout=10)

    try:
        return soup(result.text, 'html.parser').find_all("a", class_="result__url", href=True)
    except Exception:
        # Best-effort parsing: an unparseable page counts as "no results".
        # `Exception` (rather than a bare except) lets KeyboardInterrupt through.
        return []
    
def first_result_is_metacritic(links_list):
    """Tell whether the top search result points at Metacritic.

    Parameters
    ----------
    links_list : sequence
        Search results in rank order; only the first element is inspected,
        via its `str()` representation.

    Returns
    -------
    bool
        True when the list is non-empty and "www.metacritic.com" occurs in
        the string form of its first element; False otherwise.
    """
    has_results = len(links_list) >= 1
    return has_results and "www.metacritic.com" in str(links_list[0])

def is_game_page(link):
    """Check whether `link` leads to a page containing a game-reviews section.

    "ps5 review" will very likely bring up all ps5 games on metacritic, so this
    is a second filter: whether or not the "game" has its own metacritic page.
    The page is fetched and searched for a `<div class="c-gameReviews">`.

    Parameters
    ----------
    link : str or element with a `get_text()` method
        A URL, or a parsed search-result element whose text is the URL.
        A URL without an http(s) scheme gets "https://" prepended.

    Returns
    -------
    bool
        True when the div is found and `len()` of it is greater than zero;
        False when it is missing, empty, or the page could not be parsed.

    Notes
    -----
    Errors raised by `requests.get` itself (connection failure, timeout) are
    not caught here and propagate to the caller.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"}
    # Duck-typed check instead of comparing str(type(link)) to a literal, which
    # silently stops matching for subclasses or if the class path ever changes.
    if hasattr(link, "get_text"):
        link = link.get_text().strip()

    # Result texts come without a scheme ("www.metacritic.com/..."); requests
    # rejects such URLs, so add one whenever it is missing, not only for "www".
    if not link.startswith(("http://", "https://")):
        link = "https://" + link

    # A timeout keeps a stalled connection from hanging the notebook forever.
    result = requests.get(link, headers=headers, timeout=10)
    try:
        reviews_div = soup(result.text, 'html.parser').find("div", class_="c-gameReviews")
    except Exception:
        # Best-effort parsing: an unparseable page is treated as "not a game page".
        return False

    # `find` yields None when nothing matches; test that explicitly instead of
    # relying on len(None) raising inside a bare except.
    return reviews_div is not None and len(reviews_div) > 0

# search_links = search_metacritic_video_title("Star Wars Battlefront 2 Review") # video id: 'ow5Jkxv2Qv0'
# first_result_is_metacritic(search_links)
# search_links = search_metacritic_video_title("PlayStation 5 Review") # video id: 'QoAZEK52zRc'
# first_result_is_metacritic(search_links) # although first is metacritic, should filter for whether or not it's a "game" page

In [88]:
# Spot check on a known game title: take the top search hit, then show whether
# it passes the game-page filter, followed by the hit itself.
search_hits = search_metacritic_video_title("Broken Age: The Complete Adventure Review Commentary")
fl = search_hits[0]
print(is_game_page(fl), fl, sep="\n")

True
<a class="result__url" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.metacritic.com%2Fgame%2Fbroken%2Dage%2F&amp;rut=41f7f63386eb806b4f04e2a4f224ec14f8336cb0c9f0a60f96ee89fd0ea3409f">
                  www.metacritic.com/game/broken-age/
                  </a>
