# Extracting user reviews of games on Steam

In [5]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

import pandas as pd
import re
import string
import nltk

In [6]:
nltk.download(['stopwords', 'wordnet'])

[nltk_data] Downloading package stopwords to /home/sanjiv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sanjiv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Web scraping

In [7]:
# grabbing 500 reviews from each of the four game products sold on Steam

reviews_num = 500
game_ids = [292030, 1888930, 1151640, 1174180]

In [8]:
# infinite scroll page scraping

all_reviews = []
# to hide browser
# op = webdriver.ChromeOptions()
# op.add_argument('headless')
# driver = webdriver.Chrome(options=op)
driver = webdriver.Chrome()

for g in game_ids:

  r = driver.get("https://steamcommunity.com/app/" + str(g) + "/reviews/?p=1&browsefilter=mostrecent&filterLanguage=english")
  page = 1
  reviews = []

  while len(reviews) < reviews_num:

    # wait till scrolling loads the new review contents
    WebDriverWait(driver, 10).until(
      lambda x: x.find_element(By.CSS_SELECTOR, "#page" + str(page) + " .apphub_Card.modalContentLink.interactable .apphub_CardTextContent") and x.find_element(By.CSS_SELECTOR, "#page" + str(page) + " .apphub_Card.modalContentLink.interactable .apphub_UserReviewCardContent .title")
    )

    elems = driver.find_elements(By.CSS_SELECTOR, "#page" + str(page) + " .apphub_Card.modalContentLink.interactable")
    for ele in elems:
      rec_text = ele.find_element(By.CSS_SELECTOR, ".apphub_UserReviewCardContent .title").text
      # grabbing text inside of parent element excluding that of within the child elements
      all_review_text = ele.find_element(By.CSS_SELECTOR, ".apphub_CardTextContent").text
      child_text = ele.find_element(By.CSS_SELECTOR, ".apphub_CardTextContent .date_posted").text
      parent_text = all_review_text.replace(child_text, '')
      reviews.append({
        "game_id": g,
        "review": parent_text,
        "recommendation": rec_text
      })
    
    body = driver.find_element(By.CSS_SELECTOR, 'body')
    body.send_keys("scroll down", Keys.END)
    page += 1
  
  all_reviews.extend(reviews)

driver.quit()

In [9]:
print(len(all_reviews))
print(all_reviews[997:1002])

2000
[{'game_id': 1888930, 'review': "\nI always wanted to play this game, and longed for the day it came out on PC... I had the old ps3 but was never allowed the game when I were a wee lad. When the game came out, I waited patiently for the bugs to be resolved before purchasing. I am so glad I did. The game has played almost flawlessly. There's been the odd stutter in extra demanding parts of the game, and it is a demanding game, no doubts there. BUT holy moley is it worth it! Amazing story, great UI and game play, fantastic characters, graphics galore.\n\nNow patiently awaiting TLOU Part 2!üòÅ", 'recommendation': 'Recommended'}, {'game_id': 1888930, 'review': "\nIt's barely what I'd call playable now. It's plagued with shader compilation stutters even on a 20GB 7900xt. Looking down a hallway at a section of the game that's entirely indoors shouldn't use 14GB of VRAM. No excuses. The game looks gorgeous when it's not stuttering all over the place. Also had a weird sound bug that woul

## Data cleaning

In [10]:
df = pd.DataFrame.from_dict(all_reviews)
df.head()

Unnamed: 0,game_id,review,recommendation
0,292030,\nOne of the greatest RPGs to date and a trium...,Recommended
1,292030,"\nStunning upgraded graphics, best RPG game ever.",Recommended
2,292030,\nrowch,Recommended
3,292030,"\nHere I will leave the cat, friends who pass ...",Recommended
4,292030,\nI have been consumed,Recommended


In [11]:
# convert recommendation column to sentiment

df.recommendation = df.recommendation.apply(lambda r: 1 if r == 'Recommended' else 0)
df = df.rename(columns={'recommendation': 'sentiment'})
df.head()

Unnamed: 0,game_id,review,sentiment
0,292030,\nOne of the greatest RPGs to date and a trium...,1
1,292030,"\nStunning upgraded graphics, best RPG game ever.",1
2,292030,\nrowch,1
3,292030,"\nHere I will leave the cat, friends who pass ...",1
4,292030,\nI have been consumed,1


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   game_id    2000 non-null   int64 
 1   review     2000 non-null   object
 2   sentiment  2000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 47.0+ KB


In [14]:
# remove escape sequences, strip leading and trailing whitespaces
def escape_n_strip(s):
  escapes = '\b\n\r\t\\' 
  for c in escapes: 
    s = s.replace(c, ' ')
  return s.strip()

# shrink large whitespaces into one
def shrink_space(s):
  return re.sub('\s+', ' ', s)

# find more unwanted non alphanumeric characters
def find_impurities(s):
  aliens = set()
  for c in s:
    if not c.isalnum() and c not in string.punctuation:
      aliens.add(c)
  return aliens

# remove words in list 'r' from string 's' 
def remove_words(s, r):
  words = s.split(" ")
  words = list( filter(lambda x: False if x in r else True, words) )
  return " ".join(words)

In [15]:
# remove escape characters

df["review"] = df["review"].apply(escape_n_strip)

In [16]:
# find other strange characters

strangers = df["review"].apply(find_impurities).to_list()
strangers = list(set().union(*strangers))
if ' ' in strangers:
  strangers.remove(' ')
strangers

['Ôºè',
 '‚†¥',
 '‚†£',
 '‚¢¥',
 '‚°Ä',
 '‚ú°',
 '‚ô•',
 '‚†ñ',
 '‚†ô',
 '‚¢Ä',
 '‡πà',
 '‚£ô',
 '‚††',
 '‚¢è',
 '‚°ã',
 '‚¢∏',
 '‚òë',
 '‚†§',
 '‚°¥',
 '‚†¶',
 'üå∏',
 '‚£è',
 'üòä',
 '‚°ò',
 '‚£Ø',
 '‚£æ',
 '‚£ß',
 '‚†≥',
 '‚¢∞',
 '‚°∏',
 '‚£Æ',
 '‚†Ü',
 '‚£Ü',
 '‚îÇ',
 '‡∏∑',
 '‡∏µ',
 '‚°ß',
 '‚†ì',
 '‚†è',
 '‚¢õ',
 '‚¢§',
 '‚†∏',
 'üêê',
 '‚°ü',
 'üë®',
 '‚†ª',
 'üëâ',
 '‚£†',
 '‚†õ',
 '¬¥',
 '‚°å',
 '‚†∂',
 'üî•',
 '‚†à',
 '‚°õ',
 '‚°Ñ',
 'üòÅ',
 '‚†π',
 '‚¢¢',
 '‚£∑',
 '‚£≤',
 'üç≥',
 '¬∞',
 '‚Äô',
 '‚†É',
 '‚£π',
 '‚¢®',
 '‚°ê',
 '‚£Ω',
 '‚†Å',
 '‚Äò',
 '‚£§',
 '‚†ü',
 '‡πâ',
 '‚ù§',
 '‚£õ',
 '‚£ó',
 '‚¢π',
 'üëç',
 '‚£•',
 '‚Ç¨',
 '‚£ç',
 '‚¢ª',
 '\U0001fae1',
 '‚¢ø',
 '‚†ê',
 '‚†Ø',
 '‚†é',
 '‚¢ã',
 'Ôºø',
 '‚†¢',
 '‚†Ñ',
 '‚£º',
 '‚°Å',
 '‚£ª',
 '‚°ø',
 '‚£á',
 '‚£´',
 '‚Äî',
 '‚£ü',
 'üíã',
 '‚°ù',
 '‚¢ê',
 'Ôºå',
 'Ôºº',
 '‚¢†',
 '‚Äì',
 '‚†≤',
 '‚°†',
 '‚†Ä',
 '‚£∞',
 '‚¢©',
 '‚¢æ',
 '‚†Ç',
 '‚£∏',
 '‚£©',
 '‡∏∏',
 'üòç',
 '‚£Ä',
 'Ôºû',
 '‚£ø',
 '¬Ø',
 '‚¢ò',
 

In [17]:
# pattern for all punctuations to remove
punctuations =  "|".join([re.escape(sym) for sym in list(string.punctuation) + strangers])
print(punctuations)

# remove punctuations
df["review"] = df["review"].apply(lambda s: re.sub(punctuations, ' ', s))
df["review"] = df["review"].apply(shrink_space)

# we do not remove numbers because, many aspects of games require numbers for a review to be meaningful

!|"|\#|\$|%|\&|'|\(|\)|\*|\+|,|\-|\.|/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|\{|\||\}|\~|Ôºè|‚†¥|‚†£|‚¢¥|‚°Ä|‚ú°|‚ô•|‚†ñ|‚†ô|‚¢Ä|‡πà|‚£ô|‚††|‚¢è|‚°ã|‚¢∏|‚òë|‚†§|‚°¥|‚†¶|üå∏|‚£è|üòä|‚°ò|‚£Ø|‚£æ|‚£ß|‚†≥|‚¢∞|‚°∏|‚£Æ|‚†Ü|‚£Ü|‚îÇ|‡∏∑|‡∏µ|‚°ß|‚†ì|‚†è|‚¢õ|‚¢§|‚†∏|üêê|‚°ü|üë®|‚†ª|üëâ|‚£†|‚†õ|¬¥|‚°å|‚†∂|üî•|‚†à|‚°õ|‚°Ñ|üòÅ|‚†π|‚¢¢|‚£∑|‚£≤|üç≥|¬∞|‚Äô|‚†É|‚£π|‚¢®|‚°ê|‚£Ω|‚†Å|‚Äò|‚£§|‚†ü|‡πâ|‚ù§|‚£õ|‚£ó|‚¢π|üëç|‚£•|‚Ç¨|‚£ç|‚¢ª|ü´°|‚¢ø|‚†ê|‚†Ø|‚†é|‚¢ã|Ôºø|‚†¢|‚†Ñ|‚£º|‚°Å|‚£ª|‚°ø|‚£á|‚£´|‚Äî|‚£ü|üíã|‚°ù|‚¢ê|Ôºå|Ôºº|‚¢†|‚Äì|‚†≤|‚°†|‚†Ä|‚£∞|‚¢©|‚¢æ|‚†Ç|‚£∏|‚£©|‡∏∏|üòç|‚£Ä|Ôºû|‚£ø|¬Ø|‚¢ò|‚¢ó|‚£ã|‚†í|‚¢â|‚úî|¬£|‚úÖ|‚†û|üó£|‚†ã|‚Ä¶|‚£¶|üôå|‚£ù|Ô∏è|‚†ó|‚°∂|üò≠|ü§î|‚¢∑|‚£â|‚£ê|‚†â|‚°á|‚£Å|‡πå|‚£¥|‚Äç|‚£¨|‚†ö|‚†á|‚†ò|‚†ø|‚òê|‚†æ|‚°Ω|‡∏±|‚°ª|‚£Ñ|‚£ï|‚°Ü|üèº|‚¢¶|‚¢É|‚¢∂|‚°è|‡∏π|„ÄÄ|‚£ë|Ôø£|‚£∂|‚£à


In [18]:
# lower case

df["review"] = df["review"].apply(lambda x: x.lower())

In [19]:
# remove stopwords, english & spanish

eng_esp_stops = nltk.corpus.stopwords.words(['english', 'spanish'])
df["review"] = df["review"].apply(lambda x: remove_words(x, eng_esp_stops))
df["review"] = df["review"].apply(shrink_space)

In [20]:
# lemmatization

lemmatizer = nltk.stem.WordNetLemmatizer()
df["review"] = df["review"].apply( lambda x: " ".join([lemmatizer.lemmatize(c) for c in x.split(" ")]) )

In [21]:
df.head()

Unnamed: 0,game_id,review,sentiment
0,292030,one greatest rpgs date triumph genre even 7 ye...,1
1,292030,stunning upgraded graphic best rpg game ever,1
2,292030,rowch,1
3,292030,leave cat friend pas pet give thumb „Éï l „Éü x„Éé „ÉΩ...,1
4,292030,consumed,1


## View clean text & save

In [22]:
df.loc[19, "review"]

'ng next gen say nearly 300 hr masterpiece'

In [23]:
df.sentiment.unique()

array([1, 0])

In [24]:
df.to_csv('../../datasets/steam_reviews.csv', index=False)