In [1]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd

import glob
import os

## Extracting Review Scores 

In [3]:
# Fetch a single page of reviews from Gamespot 
def fetch_gamespot_review_page(page_num):
    '''
    Download one listing page of Gamespot reviews.

    `page_num` is substituted into the `page` query parameter of the
    reviews listing URL. The response body is parsed with the lxml
    parser and the resulting parse tree is returned.
    '''
    response = requests.get(
        f'https://www.gamespot.com/games/reviews/?page={page_num}'
    )
    return BeautifulSoup(response.text, 'lxml')
    

![Pages and Review Card](./ss_gamespot_pages.png)

In [4]:
# Fetch all pages of reviews from Gamespot 
def fetch_gamespot_reviews(progress=False, start=1, stop=729): 
    '''
    Download a run of Gamespot review listing pages.

    Pages `start` up to, but not including, `stop` are fetched one
    after another and returned as a list of HTML parse trees. When
    `progress` is truthy a status line is printed after each page.
    '''
    collected = []
    for i in range(start, stop):
        collected.append(fetch_gamespot_review_page(i))
        if not progress:
            continue
        print(f'Downloading page {i} of {stop} ....... {round(i/stop*100)}% complete')
    return collected

Read the [HTML Structure](./review_card.html) of the review card to create a parser that will extract the required information. 

In [5]:
def extract_gamespot_critic_score_and_text(card):
    '''
    Extract the critic score and its text label from a review card.

    Returns a `(score, text)` tuple. Each element is the text of the
    matching element, or None when that element is absent. Both are
    None when the card has no `.card-review` section at all.
    '''
    sections = card.select('.card-review')
    if not sections:
        # Without this guard, indexing the empty result raises IndexError
        # and aborts the extraction of the whole page.
        return (None, None)
    review = sections[0]

    # Look each element up once and reuse the result.
    score_tag = review.find(class_='review-ring-score__score')
    text_tag = review.find(class_='review-ring-score__text')

    score = score_tag.text if score_tag else None
    text = text_tag.text if text_tag else None
    return (score, text)


In [6]:
## Extract a single review from a review card on Gamespot 
def extract_gamespot_review(card):
    '''
    Flatten one Gamespot review card into a dictionary.

    The keys are title, platform, review_date, critic_score,
    critic_score_text, url, comments, upvotes and img, in that order.
    '''
    title = card.h4.text
    platform = card.span.text
    review_date = card.time['datetime']

    # The critic score is optional; the helper reports missing parts as None
    critic_score, critic_score_text = extract_gamespot_critic_score_and_text(card)

    # Link to the review itself
    link = card.a['href']

    # The first metadata span is stored as comments, the second as upvotes
    metadata_spans = card.select('.card-metadata span')
    comments = metadata_spans[0].text
    upvotes = metadata_spans[1].text
    image = card.img['src']

    return {
        'title': title,
        'platform': platform,
        'review_date': review_date,
        'critic_score': critic_score,
        'critic_score_text': critic_score_text,
        'url': link,
        'comments': comments,
        'upvotes': upvotes,
        'img': image,
    }

In [7]:
# Extract all reviews in a page into a list of review dicts 
def extract_gamespot_reviews(page, reviews): 
    '''
    Add one review dict per review card on `page` to `reviews`.

    `reviews` is extended in place and the same list is returned.
    '''
    cards = page.select('.editorial .card-item')
    reviews.extend(extract_gamespot_review(card) for card in cards)
    return reviews

### Download Critic Review Data

In [8]:
# pages = fetch_gamespot_reviews(progress=True)

### Extract Review Data from HTML Parse Trees 

In [9]:
# len(pages)

In [10]:
# reviews = []
# for page in pages: 
#     extract_gamespot_reviews(page, reviews)

In [11]:
# len(reviews)

### Export Raw File

In [12]:
# df = pd.DataFrame(reviews)

In [13]:
# df.head()

In [14]:
# df.info()

In [15]:
# df.to_csv('gamespot_reviews_raw.csv', index=False)

### Process & Clean Raw Data

In [118]:
# Load the raw scrape from disk; the crawl cells above are commented out.
df = pd.read_csv('gamespot_reviews_raw.csv')

In [119]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img
0,Soundfall Review - Not Quite My Tempo,PC,"Friday, May 27, 2022 3:39pm",5.0,Mediocre,/reviews/soundfall-review-not-quite-my-tempo/1...,0,2,https://www.gamespot.com/a/uploads/screen_peti...
1,Apex Legends Mobile Review - Pocket-Sized Royale,AND,"Thursday, May 26, 2022 7:08pm",7.0,Good,/reviews/apex-legends-mobile-review-pocket-siz...,3,4,https://www.gamespot.com/a/uploads/screen_peti...
2,Hatsune Miku Project Diva Megamix+ Review - Th...,PC,"Thursday, May 26, 2022 1:33pm",8.0,Great,/reviews/hatsune-miku-project-diva-megamix-rev...,1,4,https://www.gamespot.com/a/uploads/screen_peti...
3,Sniper Elite 5 Review - Longer-Range,PC,"Thursday, May 26, 2022 8:00am",8.0,Great,/reviews/sniper-elite-5-review-longer-range/19...,26,7,https://www.gamespot.com/a/uploads/screen_peti...
4,Evil Dead: The Game Review - Somewhat Groovy,XBSX,"Friday, May 20, 2022 2:09pm",6.0,Fair,/reviews/evil-dead-the-game-review-somewhat-gr...,32,6,https://www.gamespot.com/a/uploads/screen_peti...


In [120]:
# Check column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15235 entries, 0 to 15234
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              15235 non-null  object 
 1   platform           15235 non-null  object 
 2   review_date        15235 non-null  object 
 3   critic_score       15234 non-null  float64
 4   critic_score_text  15234 non-null  object 
 5   url                15235 non-null  object 
 6   comments           15235 non-null  int64  
 7   upvotes            15235 non-null  int64  
 8   img                15235 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1.0+ MB


#### Extract Game Title from the Review Title 

The title that we have in our raw dataset is for the review. As we are trying to build a database of games, we must extract the title of the game into another column. 

In [121]:
# The scraped title belongs to the review, so free up the name `title` for the game.
df = df.rename(columns={'title': 'review_title'})

Luckily for us, the title of the game appears to come before the word "Review" or before a hyphen. Let's use a regular expression to extract it.

In [122]:
# Lazily capture everything up to the first "Review" (any case) or hyphen.
# The case-insensitive flag is scoped to the word with (?i:...): a global
# inline flag such as (?i) is only allowed at the very start of a pattern,
# and placing it mid-pattern is an error from Python 3.11 onwards.
# The scoped group does not capture, so the pattern still has two groups.
regex = r'(.+?)((?i:Review)|-)'
df['review_title'].str.extract(regex).head()

Unnamed: 0,0,1
0,Soundfall,Review
1,Apex Legends Mobile,Review
2,Hatsune Miku Project Diva Megamix+,Review
3,Sniper Elite 5,Review
4,Evil Dead: The Game,Review


That seems to have worked, but we need to look out for cases where our regular expression failed.

In [123]:
# Count, per capture group, the review titles the pattern failed to match.
df['review_title'].str.extract(regex).isnull().sum()

0    1
1    1
dtype: int64

It failed for one instance. Let's check that game out. 

In [124]:
# Show the rows whose first capture group is missing, i.e. the unmatched titles.
df[df['review_title'].str.extract(regex)[0].isnull()]

Unnamed: 0,review_title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img
712,Lumines Remastered: Groove Is In The Heart,"NS, PS4, XONE, PC","Thursday, Jun 28, 2018 11:56am",8.0,Great,/reviews/lumines-remastered-groove-is-in-the-h...,1,18,https://www.gamespot.com/a/uploads/screen_peti...


I manually looked up the game on Gamespot and its name is 'Lumines Remastered'. We will update the title after building the column. 

In [125]:
# Keep only the first capture group (the text before "Review" or a hyphen)
# and trim the surrounding whitespace.
df['title'] = df['review_title'].str.extract(regex)[0].str.strip()

df.head()

Unnamed: 0,review_title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img,title
0,Soundfall Review - Not Quite My Tempo,PC,"Friday, May 27, 2022 3:39pm",5.0,Mediocre,/reviews/soundfall-review-not-quite-my-tempo/1...,0,2,https://www.gamespot.com/a/uploads/screen_peti...,Soundfall
1,Apex Legends Mobile Review - Pocket-Sized Royale,AND,"Thursday, May 26, 2022 7:08pm",7.0,Good,/reviews/apex-legends-mobile-review-pocket-siz...,3,4,https://www.gamespot.com/a/uploads/screen_peti...,Apex Legends Mobile
2,Hatsune Miku Project Diva Megamix+ Review - Th...,PC,"Thursday, May 26, 2022 1:33pm",8.0,Great,/reviews/hatsune-miku-project-diva-megamix-rev...,1,4,https://www.gamespot.com/a/uploads/screen_peti...,Hatsune Miku Project Diva Megamix+
3,Sniper Elite 5 Review - Longer-Range,PC,"Thursday, May 26, 2022 8:00am",8.0,Great,/reviews/sniper-elite-5-review-longer-range/19...,26,7,https://www.gamespot.com/a/uploads/screen_peti...,Sniper Elite 5
4,Evil Dead: The Game Review - Somewhat Groovy,XBSX,"Friday, May 20, 2022 2:09pm",6.0,Fair,/reviews/evil-dead-the-game-review-somewhat-gr...,32,6,https://www.gamespot.com/a/uploads/screen_peti...,Evil Dead: The Game


In [126]:
# Correct 'Lumines Remastered: Groove Is In The Heart'
# This review title contains neither "Review" nor a hyphen, so the pattern
# left its game title empty; set it by hand.
df.loc[df['review_title'] == 'Lumines Remastered: Groove Is In The Heart', 'title'] = 'Lumines Remastered'

In [127]:
# Number of columns after adding `title`.
len(df.columns)

10

In [128]:
# Make title the first column and review_title the last column.
# Columns are selected by name rather than by position, so the cell gives the
# same result when it is re-run and does not depend on the exact column count.
middle_columns = [c for c in df.columns if c not in ('title', 'review_title')]
df = df[['title', *middle_columns, 'review_title']]

In [129]:
# Confirm the new column order.
df.head(2)

Unnamed: 0,title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img,review_title
0,Soundfall,PC,"Friday, May 27, 2022 3:39pm",5.0,Mediocre,/reviews/soundfall-review-not-quite-my-tempo/1...,0,2,https://www.gamespot.com/a/uploads/screen_peti...,Soundfall Review - Not Quite My Tempo
1,Apex Legends Mobile,AND,"Thursday, May 26, 2022 7:08pm",7.0,Good,/reviews/apex-legends-mobile-review-pocket-siz...,3,4,https://www.gamespot.com/a/uploads/screen_peti...,Apex Legends Mobile Review - Pocket-Sized Royale


#### Convert Review Date to a Datetime Type 
Let's ensure that the review date is being recognized as a datetime value. 

In [130]:
# No explicit format is passed, so pandas infers it from the date strings.
df['review_date'] = pd.to_datetime(df['review_date'])

#### Attach URL Subdirectory to Base Domain 
It appears that the `url` column has only captured the path portion of each link. Let's prepend the base domain to form the full URL, as it will be used for further data extraction. 

In [131]:
# Look at the stored links before changing them.
df['url'].head()

0    /reviews/soundfall-review-not-quite-my-tempo/1...
1    /reviews/apex-legends-mobile-review-pocket-siz...
2    /reviews/hatsune-miku-project-diva-megamix-rev...
3    /reviews/sniper-elite-5-review-longer-range/19...
4    /reviews/evil-dead-the-game-review-somewhat-gr...
Name: url, dtype: object

In [132]:
# Try the concatenation on a few rows before overwriting the column.
base = 'https://www.gamespot.com'
base + df['url'].head()

0    https://www.gamespot.com/reviews/soundfall-rev...
1    https://www.gamespot.com/reviews/apex-legends-...
2    https://www.gamespot.com/reviews/hatsune-miku-...
3    https://www.gamespot.com/reviews/sniper-elite-...
4    https://www.gamespot.com/reviews/evil-dead-the...
Name: url, dtype: object

In [133]:
# Prefix the base domain only where it is missing, so re-running this cell
# does not stack a second copy of the domain onto values already converted.
already_absolute = df['url'].str.startswith(base, na=False)
df['url'] = df['url'].where(already_absolute, base + df['url'])

### Save Cleaned Dataset

In [134]:
# Final check of dtypes and null counts before exporting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15235 entries, 0 to 15234
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   title              15235 non-null  object        
 1   platform           15235 non-null  object        
 2   review_date        15235 non-null  datetime64[ns]
 3   critic_score       15234 non-null  float64       
 4   critic_score_text  15234 non-null  object        
 5   url                15235 non-null  object        
 6   comments           15235 non-null  int64         
 7   upvotes            15235 non-null  int64         
 8   img                15235 non-null  object        
 9   review_title       15235 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 1.2+ MB


In [136]:
# Write the cleaned dataset; index=False keeps the row index out of the file.
df.to_csv('gamespot_reviews.csv', index=False)

## Extracting User Reviews

### Build Review Details Dataset

The dataset that we currently have only provides us a URL to the review details page. From that page, we must find the url to the game info page. 

#### Crawl Review Details Page

In [302]:
def extract_gamespot_review_details(page, url, base='https://www.gamespot.com'): 
    '''
    Pull the headline, reviewer and game-info link out of a review page.

    Returns a dict with the keys title, reviewer, current_review_url and
    info_url. `current_review_url` is always `url`; the other values stay
    None when the page is an error page (`body.body-error`) or when the
    matching element is not found. `info_url` is `base` followed by the
    link's href.
    '''
    review = {
        'title': None,
        'reviewer': None,
        'current_review_url': url,
        'info_url': None,
    }

    # Error pages carry no review content
    if page.select_one('body.body-error'):
        return review

    heading = page.h1
    if heading:
        review['title'] = heading.text.strip()

    author = page.select_one('a.byline-author__name')
    if author:
        review['reviewer'] = author.text.strip()

    # The game-info link is read from the follow/buy title anchor; an earlier
    # variant of this function read it from the '.subnav-list a' links instead.
    game_link = page.select_one('a.follow-buy__title')
    if game_link:
        review['info_url'] = base + game_link['href']

    return review

In [307]:
reviews = []
n = 0  # position of the first review held in the current, not-yet-saved chunk
urls = df['url']
out_dir = './gamespot_review_details'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing folders

# NOTE: chunk files left in the folder by an earlier run under a different
# naming scheme are also matched by the merge glob; clear the folder before
# re-running to avoid duplicated rows.
for i, url in enumerate(urls):
    # Periodically inform us of the progress
    if i % 25 == 0:
        print(f'Downloading review {i} of {urls.size} .... {round((i)/urls.size*100)}% complete')
        
    # Periodically save files in case of network errors.
    # Review i has not been fetched yet, so the chunk holds reviews n..i-1 and
    # the next chunk starts at i. Both bounds are zero-padded so that sorting
    # the file names alphabetically keeps the chunks in numeric order.
    if i > 0 and i % 1000 == 0: 
        pd.DataFrame(reviews).to_csv(
            f'{out_dir}/gamespot_review_details_{n:0>5}-{i-1:0>5}.csv',
            index=False
            )
        n = i
        reviews = []
        
    # Fetch data and parse it at the same time to avoid saving large Parse Trees
    r = requests.get(url)
    page = BeautifulSoup(r.text, 'lxml')
    reviews.append(extract_gamespot_review_details(page, url))

# Save remaining reviews to a file
pd.DataFrame(reviews).to_csv(
    f'{out_dir}/gamespot_review_details_{n:0>5}-{urls.size-1:0>5}.csv',
    index=False
    )

Downloading review 0 of 15235 .... 0% complete
Downloading review 25 of 15235 .... 0% complete
Downloading review 50 of 15235 .... 0% complete
Downloading review 75 of 15235 .... 0% complete
Downloading review 100 of 15235 .... 1% complete
Downloading review 125 of 15235 .... 1% complete
Downloading review 150 of 15235 .... 1% complete
Downloading review 175 of 15235 .... 1% complete
Downloading review 200 of 15235 .... 1% complete
Downloading review 225 of 15235 .... 1% complete
Downloading review 250 of 15235 .... 2% complete
Downloading review 275 of 15235 .... 2% complete
Downloading review 300 of 15235 .... 2% complete
Downloading review 325 of 15235 .... 2% complete
Downloading review 350 of 15235 .... 2% complete
Downloading review 375 of 15235 .... 2% complete
Downloading review 400 of 15235 .... 3% complete
Downloading review 425 of 15235 .... 3% complete
Downloading review 450 of 15235 .... 3% complete
Downloading review 475 of 15235 .... 3% complete
Downloading review 500 of

#### Build Review Details Dataset

In [324]:
# Collect the chunk files in sorted name order so they are merged in sequence
chunk_pattern = os.path.join('./gamespot_review_details', "gamespot_review_details_*.csv")
files = sorted(glob.glob(chunk_pattern))
files

['./gamespot_review_details/gamespot_review_details_00000-1000.csv',
 './gamespot_review_details/gamespot_review_details_01001-2000.csv',
 './gamespot_review_details/gamespot_review_details_02001-3000.csv',
 './gamespot_review_details/gamespot_review_details_03001-4000.csv',
 './gamespot_review_details/gamespot_review_details_04001-5000.csv',
 './gamespot_review_details/gamespot_review_details_05001-6000.csv',
 './gamespot_review_details/gamespot_review_details_06001-7000.csv',
 './gamespot_review_details/gamespot_review_details_07001-8000.csv',
 './gamespot_review_details/gamespot_review_details_08001-9000.csv',
 './gamespot_review_details/gamespot_review_details_09001-10000.csv',
 './gamespot_review_details/gamespot_review_details_10001-11000.csv',
 './gamespot_review_details/gamespot_review_details_11001-12000.csv',
 './gamespot_review_details/gamespot_review_details_12001-13000.csv',
 './gamespot_review_details/gamespot_review_details_13001-14000.csv',
 './gamespot_review_details/g

In [322]:
# NOTE(review): this names the bound method without calling it, so nothing is
# sorted here; the actual sorting is the `files.sort()` call in the cell above.
files.sort

In [325]:
# Read every chunk file and stack them into one frame with a fresh 0..n-1 index.
review_details = pd.concat(map(pd.read_csv, files), ignore_index=True)

In [327]:
# Preview the merged review details.
review_details.head(2)

Unnamed: 0,title,reviewer,current_review_url,info_url
0,Soundfall Review - Not Quite My Tempo,Alessandro Barbosa,https://www.gamespot.com/reviews/soundfall-rev...,https://www.gamespot.com/games/soundfall/
1,Apex Legends Mobile Review - Pocket-Sized Royale,Jordan Ramée,https://www.gamespot.com/reviews/apex-legends-...,https://www.gamespot.com/games/apex-legends-mo...


In [328]:
# Check how many rows are missing a title, reviewer or info URL.
review_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15235 entries, 0 to 15234
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   title               15214 non-null  object
 1   reviewer            14807 non-null  object
 2   current_review_url  15235 non-null  object
 3   info_url            15189 non-null  object
dtypes: object(4)
memory usage: 476.2+ KB


In [329]:
# Persist the merged review details as a single file.
review_details.to_csv('./gamespot_review_details.csv', index=False)

### Build User Reviews Dataset

Armed with the url to the game info page, we can crawl and build a dataset of user reviews. 

In [348]:
def extract_gamespot_user_reviews(page, review_url): 
    '''
    Collect the user-rating summary of a game's reviews page.

    Returns a dict keyed by review_url, title, metacritic, user_avg,
    count and the score buckets '10' down to '1'. `review_url` is
    copied from the argument so the row can be joined to the review
    details dataset; every other value defaults to None and is filled
    with stripped text when its element is found. The score buckets are
    only read when the page has exactly ten breakdown rows.
    '''
    summary_selectors = (
        ('title', '.gameObject__title'),
        ('metacritic', '.reviewObject__metacritic a'),
        ('user_avg', '.reviewObject__userAvg a'),
        ('count', '.breakdown-avgScore__title a'),
    )
    score_buckets = [str(score) for score in range(10, 0, -1)]

    # Start with every field present and empty
    user_review = {'review_url': review_url}
    for field, _ in summary_selectors:
        user_review[field] = None
    for bucket in score_buckets:
        user_review[bucket] = None

    for field, selector in summary_selectors:
        element = page.select_one(selector)
        if element:
            user_review[field] = element.text.strip()

    rows = page.select('.breakdown-score__row')
    if len(rows) == 10:
        for row in rows:
            label = row.select_one('span.col-score')
            tally = row.select_one('.col-score span')
            if label and tally:
                # The first two characters of the label give the score; the
                # tally text loses its first and last character
                score = label.text.strip()[:2].strip()
                user_review[score] = tally.text.strip()[1:-1]

    return user_review

In [406]:
user_reviews = []
# Take both URL columns from the same filtered rows so each info URL stays
# paired with the review it came from. Looking the review URL up in the
# unfiltered frame with the loop counter ignores both the dropped rows and
# the `start` offset, which attaches the wrong review URL to the data.
with_info = review_details[review_details['info_url'].notnull()]
urls = with_info['info_url'] + 'reviews/'
review_urls = with_info['current_review_url']
start = 7000  # position to resume from; 0 runs the whole crawl
n = start     # position of the first row held in the current, unsaved chunk

for i, (url, review_url) in enumerate(zip(urls.iloc[start:], review_urls.iloc[start:])):
    # Periodically inform us of the progress
    if i % 25 == 0:
        print(f'Downloading review {i+start} of {urls.size} .... {round((i+start)/urls.size*100)}% complete')
        
    # Periodically save files in case of network errors
    if i > 0 and i % 1000 == 0: 
        pd.DataFrame(user_reviews).to_csv(
            f'./gamespot_user_reviews/gamespot_user_review_{n:0>5}-{i+start-1:0>5}.csv',
            index=False
            )
        n = start + i
        user_reviews = []
        
    # Fetch data and parse it at the same time to avoid saving large Parse Trees
    r = requests.get(url)
    if r.ok:
        page = BeautifulSoup(r.text, 'lxml')
        review = extract_gamespot_user_reviews(page, review_url=review_url)
        user_reviews.append(review)

# Save remaining reviews to a file, labelled like the other chunks with the
# actual first and last positions instead of a hard-coded starting number
pd.DataFrame(user_reviews).to_csv(
    f'./gamespot_user_reviews/gamespot_user_review_{n:0>5}-{urls.size-1:0>5}.csv',
    index=False
    )

Downloading review 7000 of 15189 .... 46% complete
Downloading review 7025 of 15189 .... 46% complete
Downloading review 7050 of 15189 .... 46% complete
Downloading review 7075 of 15189 .... 47% complete
Downloading review 7100 of 15189 .... 47% complete
Downloading review 7125 of 15189 .... 47% complete
Downloading review 7150 of 15189 .... 47% complete
Downloading review 7175 of 15189 .... 47% complete
Downloading review 7200 of 15189 .... 47% complete
Downloading review 7225 of 15189 .... 48% complete
Downloading review 7250 of 15189 .... 48% complete
Downloading review 7275 of 15189 .... 48% complete
Downloading review 7300 of 15189 .... 48% complete
Downloading review 7325 of 15189 .... 48% complete
Downloading review 7350 of 15189 .... 48% complete
Downloading review 7375 of 15189 .... 49% complete
Downloading review 7400 of 15189 .... 49% complete
Downloading review 7425 of 15189 .... 49% complete
Downloading review 7450 of 15189 .... 49% complete
Downloading review 7475 of 1518

In [420]:
# Collect the chunk files in sorted name order so they are merged in sequence
chunk_pattern = os.path.join('./gamespot_user_reviews', "gamespot_user_review_*.csv")
files = sorted(glob.glob(chunk_pattern))
files

['./gamespot_user_reviews/gamespot_user_review_00000-00999.csv',
 './gamespot_user_reviews/gamespot_user_review_01000-01999.csv',
 './gamespot_user_reviews/gamespot_user_review_02000-02999.csv',
 './gamespot_user_reviews/gamespot_user_review_03000-03999.csv',
 './gamespot_user_reviews/gamespot_user_review_04000-04999.csv',
 './gamespot_user_reviews/gamespot_user_review_05000-05999.csv',
 './gamespot_user_reviews/gamespot_user_review_06000-06999.csv',
 './gamespot_user_reviews/gamespot_user_review_07000-07999.csv',
 './gamespot_user_reviews/gamespot_user_review_08000-08999.csv',
 './gamespot_user_reviews/gamespot_user_review_09000-09999.csv',
 './gamespot_user_reviews/gamespot_user_review_10000-10999.csv',
 './gamespot_user_reviews/gamespot_user_review_11000-11999.csv',
 './gamespot_user_reviews/gamespot_user_review_12000-12999.csv',
 './gamespot_user_reviews/gamespot_user_review_13000-13999.csv',
 './gamespot_user_reviews/gamespot_user_review_14000-14999.csv',
 './gamespot_user_reviews

In [421]:
# Read every chunk file and stack them into one frame with a fresh 0..n-1 index.
# Note that `user_reviews` is a DataFrame from here on, not the list used while downloading.
user_reviews = pd.concat(map(pd.read_csv, files), ignore_index=True)

In [422]:
# Preview the merged user reviews.
user_reviews.head()

Unnamed: 0,review_url,title,metacritic,user_avg,count,10,9,8,7,6,5,4,3,2,1
0,https://www.gamespot.com/reviews/soundfall-rev...,Soundfall,64.0,6.0,,,,,,,,,,,
1,https://www.gamespot.com/reviews/apex-legends-...,Apex Legends Mobile,,7.0,,,,,,,,,,,
2,https://www.gamespot.com/reviews/hatsune-miku-...,Hatsune Miku: Project Diva MegaMix,79.0,6.6,5.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
3,https://www.gamespot.com/reviews/sniper-elite-...,Sniper Elite 5,,4.6,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,https://www.gamespot.com/reviews/evil-dead-the...,Evil Dead: The Game,74.0,7.0,,,,,,,,,,,


In [423]:
# Check dtypes and how many rows lack a score breakdown.
user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15168 entries, 0 to 15167
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   review_url  15168 non-null  object 
 1   title       15168 non-null  object 
 2   metacritic  10608 non-null  float64
 3   user_avg    15168 non-null  object 
 4   count       12966 non-null  float64
 5   10          12965 non-null  float64
 6   9           12965 non-null  float64
 7   8           12965 non-null  float64
 8   7           12965 non-null  float64
 9   6           12965 non-null  float64
 10  5           12965 non-null  float64
 11  4           12965 non-null  float64
 12  3           12965 non-null  float64
 13  2           12965 non-null  float64
 14  1           12965 non-null  float64
dtypes: float64(12), object(3)
memory usage: 1.7+ MB


In [425]:
# Persist the merged user reviews as a single file.
user_reviews.to_csv('gamespot_user_reviews.csv', index=False)

In [394]:
# Spot check: the row at position 3000 once reviews without an info URL are
# dropped. Its index label shows whether positions and labels still line up.
review_details[review_details['info_url'].notnull()].iloc[3000]

title                                    Need for Speed: The Run Review
reviewer                                                  Carolyn Petit
current_review_url    https://www.gamespot.com/reviews/need-for-spee...
info_url              https://www.gamespot.com/games/need-for-speed-...
Name: 3010, dtype: object

In [367]:
# Spot check: the last collected row. Wrapping in a DataFrame suggests this was
# run while `user_reviews` was still the list built by the download loop — TODO confirm.
pd.DataFrame(user_reviews).iloc[-1]

review_url    https://www.gamespot.com/reviews/catherine-ful...
title                                                 Catherine
metacritic                                                   80
user_avg                                                    8.2
count                                                      1432
10                                                          317
9                                                           604
8                                                           309
7                                                            91
6                                                            27
5                                                            12
4                                                            11
3                                                            10
2                                                            13
1                                                            38
Name: 449, dtype: object

In [332]:
# Look at the info URLs for all reviews, including rows where none was found.
review_details['info_url']

0                https://www.gamespot.com/games/soundfall/
1        https://www.gamespot.com/games/apex-legends-mo...
2        https://www.gamespot.com/games/hatsune-miku-pr...
3           https://www.gamespot.com/games/sniper-elite-5/
4        https://www.gamespot.com/games/evil-dead-the-g...
                               ...                        
15230    https://www.gamespot.com/games/under-a-killing...
15231    https://www.gamespot.com/games/warhammer-shado...
15232        https://www.gamespot.com/games/whiplash-1996/
15233                https://www.gamespot.com/games/worms/
15234    https://www.gamespot.com/games/star-trek-the-n...
Name: info_url, Length: 15235, dtype: object

## Extracting Metadata 

In [2]:
def parse_gamespot_metadata(page, url):
    '''
    Collect the descriptive fields of a game page into a flat dict.

    The result has the keys url, title, release_date, platforms,
    description, developers, publishers, genres and esrb, in that
    order. `url` is copied from the argument. Single-value fields hold
    the element's text; list fields hold the item texts joined with
    ', '. A field is None when its element(s) are not found.
    '''
    def text_of(selector):
        # Text of the first match, or None when nothing matches
        node = page.select_one(selector)
        return node.text if node else None

    def joined(selector, skip_class=None):
        # Comma-separated item texts, or None when nothing matches.
        # Items whose class list contains `skip_class` are left out.
        items = page.select(selector)
        if not items:
            return None
        return ', '.join(
            item.text for item in items
            if skip_class is None
            or not ('class' in item.attrs and skip_class in item.attrs['class'])
        )

    return {
        'url': url,
        'title': text_of('h4.game-module__name'),
        'release_date': text_of('.game-module__release-date span'),
        # 'js-unhide-list' items are presumably the "show more" toggle
        # rather than a platform name — TODO confirm against the page markup
        'platforms': joined('.game-module__platform li', skip_class='js-unhide-list'),
        'description': text_of('.game-module__description'),
        'developers': joined('.game-module__developers li'),
        'publishers': joined('.game-module__publishers li'),
        'genres': joined('.game-module__genres li'),
        'esrb': text_of('.pod-object-stats__esrb dt'),
    }

In [3]:
def print_progress(curr, total, rate): 
    '''
    Report crawl progress on every `rate`-th item.

    Prints one status line when `curr` is an exact multiple of `rate`
    and stays silent otherwise. The percentage shown is `curr / total`
    rounded to a whole number.
    '''
    on_reporting_step = curr % rate == 0
    if not on_reporting_step:
        return
    print(f'Downloading review {curr} of {total} .... {round((curr)/total*100)}% complete')

def save(last, first, dataset, filepath, rate=False,): 
    '''
    Write `dataset` to a chunk file when a chunk is due.

    Nothing is written unless `last` is greater than `first`. When
    `rate` is truthy the chunk must also span exactly `rate` positions
    (`last - first == rate`). The file is named
    `<filepath>_<first>-<last - 1>.csv` with both numbers zero-padded
    to five digits. Returns True when a file was written, else False.
    '''
    if not last > first:
        return False
    if rate and last - first != rate:
        return False

    target = filepath + f'_{first:0>5}-{last-1:0>5}.csv'
    frame = pd.DataFrame(dataset)
    frame.to_csv(target, index=False)
    return True
        
def crawl(url, timeout=None): 
    '''
    Fetch `url` and return it as an HTML parse tree, or None on failure.

    None is returned when the request raises a `requests` exception
    (connection error, timeout, invalid URL, ...) or when the response
    status is not OK. `timeout` is passed straight to `requests.get`;
    the default of None waits indefinitely, as before.

    Only request errors are swallowed. The previous `finally: return`
    discarded every exception, including KeyboardInterrupt, which made
    a long crawl impossible to interrupt and hid unrelated bugs.
    '''
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: a failed request skips this URL instead of ending the crawl
        return None

    if not r.ok:
        return None
    return BeautifulSoup(r.text, 'lxml')

def parse(page, url, parser):
    '''
    Apply `parser` to a fetched page and its URL and return the result.
    '''
    parsed = parser(page, url)
    return parsed

def crawl_parse(url, parser):
    '''
    Fetch `url` and, if a page came back, run `parser` on it.

    Returns the parser's result, or None when the crawl produced no page.
    '''
    page = crawl(url)
    return parser(page, url) if page else None

In [4]:
def crawl_parse_save(urls, start, parser, filepath, print_rate=0, save_rate=0): 
    '''
    Crawl `urls` from position `start`, parse each page and save in chunks.

    Each URL is fetched and parsed with `parser`; falsy results are
    dropped. When `print_rate` is non-zero, progress is reported through
    print_progress. When `save_rate` is non-zero, the rows collected so
    far are handed to save() before each fetch, and the buffer is reset
    whenever save() reports that it wrote a file. Whatever is left at
    the end is saved without a rate check. `filepath` is the file name
    prefix passed to save(). Nothing is returned.
    '''
    records = []
    chunk_start = start

    # Count from `start` so `position` is the URL's place in the full series
    for position, url in enumerate(urls[start:], start=start):

        # Periodically inform us of the progress
        if print_rate:
            print_progress(curr=position, total=urls.size, rate=print_rate)

        # Periodically save files in case of network errors
        if save_rate and save(
            last=position,
            first=chunk_start,
            dataset=records,
            filepath=filepath,
            rate=save_rate
        ):
            chunk_start = position
            # Start a fresh list to conserve memory
            records = []

        # Fetch and parse in one step to avoid keeping large parse trees
        row = crawl_parse(url, parser)

        # Keep the row only if it was parsed successfully
        if row:
            records.append(row)

    # Save whatever has not been written yet
    save(
        last=urls.size,
        first=chunk_start,
        dataset=records,
        filepath=filepath
    )

In [5]:
# Reload the merged review details written earlier in the notebook.
review_details = pd.read_csv('./gamespot_review_details.csv')

In [6]:
# File name prefix; save() appends the '_<first>-<last>.csv' suffix to it.
filepath = './gamespot_metadata_set/gamespot_metadata'
# Only reviews for which an info URL was found can be crawled.
urls = review_details[review_details['info_url'].notnull()]['info_url']

# start=13000 begins at position 13000 of `urls` (presumably resuming an
# interrupted run — TODO confirm). Progress is reported every 25 URLs and a
# chunk file is attempted every 500.
crawl_parse_save(urls=urls, start=13000, parser=parse_gamespot_metadata, filepath=filepath, print_rate=25, save_rate=500)

Downloading review 13000 of 15189 .... 86% complete
Downloading review 13025 of 15189 .... 86% complete
Downloading review 13050 of 15189 .... 86% complete
Downloading review 13075 of 15189 .... 86% complete
Downloading review 13100 of 15189 .... 86% complete
Downloading review 13125 of 15189 .... 86% complete
Downloading review 13150 of 15189 .... 87% complete
Downloading review 13175 of 15189 .... 87% complete
Downloading review 13200 of 15189 .... 87% complete
Downloading review 13225 of 15189 .... 87% complete
Downloading review 13250 of 15189 .... 87% complete
Downloading review 13275 of 15189 .... 87% complete
Downloading review 13300 of 15189 .... 88% complete
Downloading review 13325 of 15189 .... 88% complete
Downloading review 13350 of 15189 .... 88% complete
Downloading review 13375 of 15189 .... 88% complete
Downloading review 13400 of 15189 .... 88% complete
Downloading review 13425 of 15189 .... 88% complete
Downloading review 13450 of 15189 .... 89% complete
Downloading 

#### Save to single file 

In [8]:
def find_file_paths(folder, filename):
    '''
    List the paths in `folder` that match the glob pattern `filename`.

    The matches are returned as a list sorted in ascending order.
    '''
    pattern = os.path.join(folder, filename)
    return sorted(glob.glob(pattern))

def save_to_merged_file(files, outfile): 
    '''
    Concatenate the CSV files in `files` and write them to `outfile`.

    The files are read in the order given, stacked with a fresh row
    index, and written without the index column. Returns None.
    '''
    frames = [pd.read_csv(path) for path in files]
    merged = pd.concat(frames, ignore_index=True)
    merged.to_csv(outfile, index=False)
    return None 

In [9]:
# Collect the metadata chunk files in sorted order.
files = find_file_paths(folder='gamespot_metadata_set', filename='gamespot_metadata_*.csv')

In [11]:
# Merge all metadata chunks into a single CSV.
save_to_merged_file(files, './gamespot_metadata.csv')

## Creating Master Dataset 