In [2]:
import re
import time
import json
import copy
import random
import string
import urllib
import asyncio
import requests
import unicodedata
import editdistance
from datetime import datetime
from bs4 import BeautifulSoup

In [3]:
json_list = []
alphabet = list(string.ascii_uppercase)

base_moby = 'https://www.mobygames.com/'

soup_headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/50.0.2661.102 Safari/537.36'}

words_subs = {'1': ['i', 'one', '1'], 'one': ['i', 'one', '1'], 'i': ['i', 'one', '1'],
              '2': ['ii', 'two', '2'], 'two': ['ii', 'two', '2'], 'ii': ['ii', 'two', '2'],
              '3': ['iii', 'three', '3'], 'three': ['iii', 'three', '3'], 'iii': ['iii', 'three', '3'],
              '4': ['iv', 'four', '4'], 'four': ['iv', 'four', '4'], 'iv': ['iv', 'four', '4'],
              '5': ['v', 'five', '5'], 'five': ['v', 'five', '5'], 'v': ['v', 'five', '5'],
              '6': ['vi', 'six', '6'], 'six': ['vi', 'six', '6'], 'vi': ['vi', 'six', '6'],
              '7': ['vii', 'seven', '7'], 'seven': ['vii', 'seven', '7'], 'vii': ['vii', 'seven', '7'],
              '8': ['viii', 'eight', '8'], 'eight': ['viii', 'eight', '8'], 'viii': ['viii', 'eight', '8'],
              '9': ['ix', 'nine', '9'], 'nine': ['ix', 'nine', '9'], 'ix': ['ix', 'nine', '9'],
              '10': ['x', 'ten', '10'], 'ten': ['x', 'ten', '10'], 'x': ['x', 'ten', '10']}

In [4]:
def format_string(str_obj):
    str_obj = str_obj.replace('&', 'and')
    title_words = [v.translate(str.maketrans('', '', string.punctuation))
                       .lower().strip() for v in re.sub('/|_|-|:', ' ', str_obj).split(' ')]
    title_words = [unicodedata.normalize('NFKD', v).encode('ASCII', 'ignore').decode('utf-8')
                   for v in title_words if v != '']
    return title_words


def get_best_match(candidates, title):
    best_match = 0
    best_index = 0
    best_score = 0

    title_words = format_string(title)
    
    for ind, candidate in enumerate(candidates):

        temp_candidate = candidate.replace('video game', '')
        candidate_words = format_string(temp_candidate)
        
        nb_common_words = 0
        if len(title_words) < len(candidate_words):
            smaller_title = title_words
            bigger_title = copy.copy(candidate_words)
        else:
            smaller_title = candidate_words
            bigger_title = copy.copy(title_words)

        for word in smaller_title:
            if word in bigger_title:
                nb_common_words += 1
                bigger_title.remove(word)
            elif word in words_subs:
                for sub_word in words_subs[word]:
                    if sub_word in bigger_title:
                        nb_common_words += 1
                        bigger_title.remove(sub_word)
        max_length = max(len(title_words), len(candidate_words))
        nb_smaller_words = nb_common_words / len(smaller_title)
        nb_common_words /= max_length

        # score = (nb_smaller_words + nb_common_words) / 2
        if nb_common_words > best_match:
            best_match = nb_common_words
            best_score = nb_smaller_words
            best_index = ind

    return best_index, best_match, best_score

In [5]:
def get_best_edit_distance(candidates, title):
    best_index = 0
    best_score = 2000

    new_title = ' '.join(format_string(title))
    for ind, candidate in enumerate(candidates):

        temp_candidate = candidate.replace('video game', '')
        temp_candidate = ' '.join(format_string(temp_candidate))
        distance = editdistance.distance(new_title, temp_candidate)
        if distance < best_score:
            best_score = distance
            best_index = ind

    return best_index, best_score

In [6]:
def get_soup(url, steam=False):
    if steam:
        
        webpage = requests.get(url, headers=soup_headers)
    else:
        webpage = requests.get(url, headers=soup_headers)
    return BeautifulSoup(webpage.text, 'html.parser')

In [15]:
def get_moby_url(title):
    temp_title = title
    score, edist_score = 0, 0
    url, edist_url = '', ''
    success = True
    try:
        base_url = 'https://www.mobygames.com/search/?q='
        url = base_url + temp_title

        soup = get_soup(url)
        urls = []
        table_elem = soup.find_all('table')[0]
        search_results = table_elem.find_all('b')
        candidates = []
        for result in search_results:
            search_title = result.text
            urls.append(result.find('a').attrs['href'])
            candidates.append(search_title)

        print(candidates)
        best_candidate = get_best_match(candidates, title)
        temp_title = candidates[best_candidate[0]]
        score = best_candidate[1]
        url = urls[best_candidate[0]]
        
        best_edist_candidate = get_best_edit_distance(candidates, title)
        edist_url = urls[best_edist_candidate[0]]
        edist_score = best_edist_candidate[1]

    except Exception as e:
        print(e)
        success = False

    return {'moby-title': temp_title,
            'moby-score': score, 'moby-edist-score': edist_score,
            'moby-url': url, 'moby-edist-url': edist_url,'moby-success': success}

In [8]:
def get_moby_info(url, score):
    soup = get_soup(url)
    if score >= 0.7:
        title_elem = soup.find('h1', {'class': 'mb-0'})
        title = title_elem.text
        next_elem = title_elem.find_next('div')
        if 'aka' in next_elem.text:
            title_aliases = next_elem.text.split('aka:\n')[1].strip()
            next_elem = next_elem.find_next('div')
        title_id = next_elem.text.split('Moby ID:' )[1].strip()
        print(title, '#', title_aliases, '#', title_id)

        temp_div = soup.find('div', {'class': 'info-release'})
        temp_dl = temp_div.find('dl')
        temp_elems = temp_dl.find_all(recursive=False)
        for el in range(0, len(temp_elems), 2):
            dt_elem = temp_elems[el].text
            dd_elem = temp_elems[el+1].text
            if 'Released' in dt_elem:
                release_date = dd_elem.split('on')[0].strip()
                continue
            if 'Publishers' in dt_elem:
                publishers = dd_elem.strip()
                continue
            if 'Developers' in dt_elem:
                developers = dd_elem.strip()
                continue
        print(release_date, '#', publishers, '#', developers)

        temp_div = soup.find('div', {'class': 'info-score'})
        temp_dl = temp_div.find('dl')
        temp_elems = temp_dl.find_all(recursive=False)
        for el in range(0, len(temp_elems), 2):
            dt_elem = temp_elems[el].text
            dd_elem = temp_elems[el+1].text
            if 'Moby Score' in dt_elem:
                dd_split = dd_elem.split('#')
                moby_score = dd_split[0].strip()
                moby_rank = dd_split[1].split(' of')[0].strip()
                continue
            if 'Critics' in dt_elem:
                dd_split = dd_elem.split('%')
                critics_score = dd_split[0].strip()
                critics_count = dd_split[1].replace('(', '').replace(')', '').strip()
                continue
        print(moby_score, '#', moby_rank, '#', critics_score, '#', critics_count)

        temp_div = soup.find('div', {'class': 'info-genres'})
        temp_dl = temp_div.find('dl')
        temp_elems = temp_dl.find_all(recursive=False)
        genre_dict = dict()
        for el in range(0, len(temp_elems), 2):
            dt_elem = temp_elems[el].text
            dd_elem = temp_elems[el+1].get_text(' ; ')
            genre_dict[dt_elem] = dd_elem
        print(genre_dict)

        try:
            description = soup.find('section', {'id': 'gameOfficialDescription'}).text.replace('\n\n', '\n').strip()
        except AttributeError as _:
            description = soup.find('section', {'id': 'gameDescription'}).text.replace('\n\n', '\n').strip()
        print(description)

        tags_elem = soup.find('section', {'id': 'gameGroups'})
        tags_li = tags_elem.find_all('li')
        tags = ' ; '.join([v.text.strip() for v in tags_li])
        print(tags)

        review_section = soup.find('section', {'id': 'critic-reviews'})
        reviews_elem = json.loads(str(review_section.find('critic-reviews').attrs.get(':reviews')))
        reviews_dict = dict()
        for review in reviews_elem:
            journal = review['source']['name']
            if journal not in reviews_dict:
                reviews_dict[journal] = []
            reviews_dict[journal].append({'review': review['citation'], 'score': review['normalized_score']}) 
        print(reviews_dict)

In [16]:
moby_best = get_moby_url('13 Sentinels - Aegis Rim')
print(moby_best)
get_moby_info(moby_best['moby-url'], moby_best['moby-score'])

list index out of range
{'moby-title': '13 Sentinels - Aegis Rim', 'moby-score': 0, 'moby-edist-score': 0, 'moby-url': 'https://www.mobygames.com/search/?q=13 Sentinels - Aegis Rim', 'moby-edist-url': '', 'moby-success': False}
