In [1]:
import requests 
from bs4 import BeautifulSoup
import re
import pandas as pd

## Extract Review Scores 

In [3]:
# Fetch a single page of reviews from Gamespot 
def fetch_gamespot_review_page(page_num, timeout=30):
    '''
    Fetch a page of reviews from gamespot.com 
    and return the page as an HTML parse tree. 

    Parameters
    ----------
    page_num : int
        Value placed in the ``page`` query parameter of the review listing URL.
    timeout : float or tuple, optional
        Seconds to wait for the server; forwarded to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    '''
    url = f'https://www.gamespot.com/games/reviews/?page={page_num}'
    # requests never times out by default; a stalled connection would
    # otherwise block the whole scrape indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it held review cards.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')
    

![Pages and Review Card](./ss_gamespot_pages.png)

In [4]:
# Fetch all pages of reviews from Gamespot 
def fetch_gamespot_reviews(progress=False, start=1, stop=729): 
    '''
    Fetch pages of reviews from gamespot.com and
    return a collection of pages as HTML parse trees. 

    Parameters
    ----------
    progress : bool, optional
        When True, print one progress line after each page is fetched.
    start : int, optional
        First page number to fetch (inclusive).
    stop : int, optional
        Page number to stop before (exclusive, as in ``range``).

    Returns
    -------
    list
        One parse tree per page, in page order. Empty when ``start >= stop``.
    '''
    # range(start, stop) excludes `stop`, so this is the true page count.
    total = stop - start
    pages = []
    for done, page_num in enumerate(range(start, stop), start=1):
        pages.append(fetch_gamespot_review_page(page_num))
        if progress:
            # Report progress relative to the pages actually requested so the
            # last line reads 100% whatever `start` is.
            percent = round(done / total * 100)
            print(f'Downloading page {page_num} ({done} of {total}) ....... {percent}% complete')
    return pages

Read the [HTML Structure](./review_card.html) of the review card to create a parser that will extract the required information. 

In [5]:
def extract_gamespot_critic_score_and_text(card):
    '''
    Extracts the critic score and text from the review card
    if they exist. 

    Parameters
    ----------
    card : bs4.Tag
        A review card element.

    Returns
    -------
    tuple
        ``(score, text)`` as strings. Either element is None when the
        corresponding tag is absent, and both are None when the card
        has no ``.card-review`` element at all.
    '''
    # select_one returns None (instead of raising IndexError like
    # select(...)[0]) when the card has no review block.
    review = card.select_one('.card-review')
    if review is None:
        return (None, None)

    # Look each tag up once rather than once for the check and once for the read.
    score_tag = review.find(class_='review-ring-score__score')
    text_tag = review.find(class_='review-ring-score__text')

    score = score_tag.text if score_tag is not None else None
    text = text_tag.text if text_tag is not None else None
    return (score, text)


In [6]:
## Extract a single review from a review card on Gamespot 
def extract_gamespot_review(card):
    '''
    Turn one review card into a plain dict.

    The returned keys, in order, are: title, platform, review_date,
    critic_score, critic_score_text, url, comments, upvotes, img.
    All values are taken from the card as-is (text or attribute values).
    '''
    # Headline fields
    title = card.h4.text
    platform = card.span.text
    review_date = card.time['datetime']

    # The critic score and its label may be missing, in which case both are None
    critic_score, critic_score_text = extract_gamespot_critic_score_and_text(card)

    # Link to the full review
    url = card.a['href']

    # Metadata spans: the first is read as comments, the second as upvotes
    metadata = card.select('.card-metadata span')
    comments = metadata[0].text
    upvotes = metadata[1].text
    img = card.img['src']

    return {
        'title': title,
        'platform': platform,
        'review_date': review_date,
        'critic_score': critic_score,
        'critic_score_text': critic_score_text,
        'url': url,
        'comments': comments,
        'upvotes': upvotes,
        'img': img,
    }

In [7]:
# Extract all reviews in a page into a list of review dicts 
def extract_gamespot_reviews(page, reviews): 
    '''
    Parse every review card on a listing page and add the resulting
    dicts to ``reviews``. The list is extended in place and also returned.
    '''
    cards = page.select('.editorial .card-item')
    reviews.extend(extract_gamespot_review(card) for card in cards)
    return reviews

### Download Critic Review Data

In [8]:
# pages = fetch_gamespot_reviews(progress=True)

### Extract Review Data from HTML Parse Trees 

In [9]:
# len(pages)

In [10]:
# reviews = []
# for page in pages: 
#     extract_gamespot_reviews(page, reviews)

In [11]:
# len(reviews)

### Export Raw File

In [12]:
# df = pd.DataFrame(reviews)

In [13]:
# df.head()

In [14]:
# df.info()

In [15]:
# df.to_csv('gamespot_reviews_raw.csv', index=False)

### Process & Clean Raw Data

In [118]:
df = pd.read_csv('gamespot_reviews_raw.csv')

In [119]:
df.head()

Unnamed: 0,title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img
0,Soundfall Review - Not Quite My Tempo,PC,"Friday, May 27, 2022 3:39pm",5.0,Mediocre,/reviews/soundfall-review-not-quite-my-tempo/1...,0,2,https://www.gamespot.com/a/uploads/screen_peti...
1,Apex Legends Mobile Review - Pocket-Sized Royale,AND,"Thursday, May 26, 2022 7:08pm",7.0,Good,/reviews/apex-legends-mobile-review-pocket-siz...,3,4,https://www.gamespot.com/a/uploads/screen_peti...
2,Hatsune Miku Project Diva Megamix+ Review - Th...,PC,"Thursday, May 26, 2022 1:33pm",8.0,Great,/reviews/hatsune-miku-project-diva-megamix-rev...,1,4,https://www.gamespot.com/a/uploads/screen_peti...
3,Sniper Elite 5 Review - Longer-Range,PC,"Thursday, May 26, 2022 8:00am",8.0,Great,/reviews/sniper-elite-5-review-longer-range/19...,26,7,https://www.gamespot.com/a/uploads/screen_peti...
4,Evil Dead: The Game Review - Somewhat Groovy,XBSX,"Friday, May 20, 2022 2:09pm",6.0,Fair,/reviews/evil-dead-the-game-review-somewhat-gr...,32,6,https://www.gamespot.com/a/uploads/screen_peti...


In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15235 entries, 0 to 15234
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              15235 non-null  object 
 1   platform           15235 non-null  object 
 2   review_date        15235 non-null  object 
 3   critic_score       15234 non-null  float64
 4   critic_score_text  15234 non-null  object 
 5   url                15235 non-null  object 
 6   comments           15235 non-null  int64  
 7   upvotes            15235 non-null  int64  
 8   img                15235 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1.0+ MB


#### Extract Game Title from the Review Title 

The title that we have in our raw dataset is for the review. As we are trying to build a database of games, we must extract the title of the game into another column. 

In [121]:
# The scraped 'title' is the review headline, so rename it to free 'title' for the game name.
df = df.rename(columns={'title': 'review_title'})

Luckily for us, the title of the game appears before the word "Review" or before a hyphen. Let's use a regular expression to extract everything up to the first of those.

In [122]:
# Capture (lazily) everything before the first "review" (any case) or hyphen.
# The inline (?i) flag must lead the pattern: a global flag placed mid-pattern
# is deprecated since Python 3.6 and is an error on Python 3.11+.
regex = r'(?i)(.+?)(Review|-)'
df['review_title'].str.extract(regex).head()

Unnamed: 0,0,1
0,Soundfall,Review
1,Apex Legends Mobile,Review
2,Hatsune Miku Project Diva Megamix+,Review
3,Sniper Elite 5,Review
4,Evil Dead: The Game,Review


That seems to have worked, but we need to look out for cases where our regular expression failed.

In [123]:
df['review_title'].str.extract(regex).isnull().sum()

0    1
1    1
dtype: int64

It failed for one instance. Let's check that game out. 

In [124]:
df[df['review_title'].str.extract(regex)[0].isnull()]

Unnamed: 0,review_title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img
712,Lumines Remastered: Groove Is In The Heart,"NS, PS4, XONE, PC","Thursday, Jun 28, 2018 11:56am",8.0,Great,/reviews/lumines-remastered-groove-is-in-the-h...,1,18,https://www.gamespot.com/a/uploads/screen_peti...


I manually looked up the game on Gamespot and its name is 'Lumines Remastered'. We will update the title after building the column. 

In [125]:
df['title'] = df['review_title'].str.extract(regex)[0].str.strip()

df.head()

Unnamed: 0,review_title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img,title
0,Soundfall Review - Not Quite My Tempo,PC,"Friday, May 27, 2022 3:39pm",5.0,Mediocre,/reviews/soundfall-review-not-quite-my-tempo/1...,0,2,https://www.gamespot.com/a/uploads/screen_peti...,Soundfall
1,Apex Legends Mobile Review - Pocket-Sized Royale,AND,"Thursday, May 26, 2022 7:08pm",7.0,Good,/reviews/apex-legends-mobile-review-pocket-siz...,3,4,https://www.gamespot.com/a/uploads/screen_peti...,Apex Legends Mobile
2,Hatsune Miku Project Diva Megamix+ Review - Th...,PC,"Thursday, May 26, 2022 1:33pm",8.0,Great,/reviews/hatsune-miku-project-diva-megamix-rev...,1,4,https://www.gamespot.com/a/uploads/screen_peti...,Hatsune Miku Project Diva Megamix+
3,Sniper Elite 5 Review - Longer-Range,PC,"Thursday, May 26, 2022 8:00am",8.0,Great,/reviews/sniper-elite-5-review-longer-range/19...,26,7,https://www.gamespot.com/a/uploads/screen_peti...,Sniper Elite 5
4,Evil Dead: The Game Review - Somewhat Groovy,XBSX,"Friday, May 20, 2022 2:09pm",6.0,Fair,/reviews/evil-dead-the-game-review-somewhat-gr...,32,6,https://www.gamespot.com/a/uploads/screen_peti...,Evil Dead: The Game


In [126]:
# Correct 'Lumines Remastered: Groove Is In The Heart'
df.loc[df['review_title'] == 'Lumines Remastered: Groove Is In The Heart', 'title'] = 'Lumines Remastered'

In [127]:
len(df.columns)

10

In [128]:
# Make title the first column and review_title the last column.
# Columns are selected by name rather than position so that re-running this
# cell leaves the order unchanged instead of swapping the two columns back.
middle_columns = [col for col in df.columns if col not in ('title', 'review_title')]
df = df[['title', *middle_columns, 'review_title']]

In [129]:
df.head(2)

Unnamed: 0,title,platform,review_date,critic_score,critic_score_text,url,comments,upvotes,img,review_title
0,Soundfall,PC,"Friday, May 27, 2022 3:39pm",5.0,Mediocre,/reviews/soundfall-review-not-quite-my-tempo/1...,0,2,https://www.gamespot.com/a/uploads/screen_peti...,Soundfall Review - Not Quite My Tempo
1,Apex Legends Mobile,AND,"Thursday, May 26, 2022 7:08pm",7.0,Good,/reviews/apex-legends-mobile-review-pocket-siz...,3,4,https://www.gamespot.com/a/uploads/screen_peti...,Apex Legends Mobile Review - Pocket-Sized Royale


#### Convert Review Date to a Datetime Type 
Let's ensure that the review date is being recognized as a datetime value. 

In [130]:
df['review_date'] = pd.to_datetime(df['review_date'])

#### Attach URL Subdirectory to Base Domain 
It appears that the url has only captured the subdirectory. Let's convert this into the full URL, as it will be used for further data extraction. 

In [131]:
df['url'].head()

0    /reviews/soundfall-review-not-quite-my-tempo/1...
1    /reviews/apex-legends-mobile-review-pocket-siz...
2    /reviews/hatsune-miku-project-diva-megamix-rev...
3    /reviews/sniper-elite-5-review-longer-range/19...
4    /reviews/evil-dead-the-game-review-somewhat-gr...
Name: url, dtype: object

In [132]:
base = 'https://www.gamespot.com'
base + df['url'].head()

0    https://www.gamespot.com/reviews/soundfall-rev...
1    https://www.gamespot.com/reviews/apex-legends-...
2    https://www.gamespot.com/reviews/hatsune-miku-...
3    https://www.gamespot.com/reviews/sniper-elite-...
4    https://www.gamespot.com/reviews/evil-dead-the...
Name: url, dtype: object

In [133]:
# Prefix only the values that do not already start with the base domain, so
# re-running this cell cannot produce
# 'https://www.gamespot.comhttps://www.gamespot.com/...'.
df['url'] = df['url'].where(df['url'].str.startswith(base), base + df['url'])

### Save Cleaned Dataset

In [134]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15235 entries, 0 to 15234
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   title              15235 non-null  object        
 1   platform           15235 non-null  object        
 2   review_date        15235 non-null  datetime64[ns]
 3   critic_score       15234 non-null  float64       
 4   critic_score_text  15234 non-null  object        
 5   url                15235 non-null  object        
 6   comments           15235 non-null  int64         
 7   upvotes            15235 non-null  int64         
 8   img                15235 non-null  object        
 9   review_title       15235 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(6)
memory usage: 1.2+ MB


In [136]:
df.to_csv('gamespot_reviews.csv', index=False)

## Extract User Reviews