In [1]:
import requests
import pandas as pd
from tqdm.notebook import tqdm
import gc
import time

# Fetch every app from Steam

In [2]:
all_games_url = "http://api.steampowered.com/ISteamApps/GetAppList/v0002/?format=json"
response = requests.get(all_games_url)
data = response.json()

In [3]:
data = data['applist']['apps']

In [4]:
len(data)

274653

# Data filtering
First we remove entries that are for sure not games (based on appid not divisible by 10)

Then we get rid of any entries that have the following in their name:
- Demo
- Playtest
- Soundtrack/Sound track
- Dedicated server
- DLC
- Art book
- Pack (like "Level Pack", "Music Pack", "Map Pack" etc.)
- Games with Japanese/Chinese/Korean etc. characters in their name (they probably won't contain an english description anyways)

In [5]:
# if the appid is not divisible by 10, it's not a game
data = [app for app in data if app['appid'] % 10 == 0]
len(data)

248917

In [6]:
data = [app for app in data if not app['name'].lower().endswith(" demo")
                            and " demo " not in app['name'].lower()
                            and "(demo)" not in app['name'].lower()
                            and " demo)" not in app['name'].lower()
                            and r"{demo}" not in app['name'].lower()
                            and r" demo}" not in app['name'].lower()]
len(data)

212852

In [7]:
data = [app for app in data if "playtest" not in app['name'].lower()]
len(data)

205039

In [8]:
data = [app for app in data if "soundtrack" not in app['name'].lower() and "sound track" not in app['name'].lower()]
len(data)

195396

In [9]:
data = [app for app in data if "dedicated server" not in app['name'].lower()]
len(data)

194917

In [10]:
data = [app for app in data if "dlc" not in app['name'].lower()]
len(data)

191209

In [11]:
data = [app for app in data if "art book" not in app['name'].lower()]
len(data)

190929

In [12]:
import re
def contains_cjk(text):
    return re.search(r'[\u3000-\u303F\u4E00-\u9FFF\uF900-\uFAFF\uFF00-\uFFEF]', text) is not None

data = [app for app in data if not contains_cjk(app['name'])]
len(data)

181280

Reduction:

273 977 -> 180 845

Reduction of 93 132 entries (34%)

# Game details fetching
We only keep apps that are games, have more than 500 recommendations and have "English" in their supported languages.

In [None]:
from requests.exceptions import JSONDecodeError

app_details_url = "http://store.steampowered.com/api/appdetails?appids="
app_details = pd.DataFrame(columns=["name", "steam_appid", "short_description", "detailed_description", "recommendations", "genres"])
# load existing data if available
newest_file = 7750
newest_index = 0
try:
    app_details = pd.read_csv(f"details/steam_games_details_partial_{newest_file}.csv")
    newest_index = app_details.iloc[-1]['steam_appid']
except:
    pass

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

gc_counter = 0
# counter = 0

# 86599 crashes
for game in tqdm(data):
    # counter += 1
    # if counter <= 86590:
    #     continue
    appid = game['appid']
    if appid <= newest_index:
        continue
    
    flag = False
    for _ in range(3):
        try:
            response = requests.get(app_details_url + str(appid), headers=headers, timeout=10)
            if response.status_code != 200:
                time.sleep(5)
                continue
            details = response.json()
            flag = True
            break
        except JSONDecodeError:
            time.sleep(5)
        except Exception as e:
            time.sleep(5)

    if not flag:
        continue
    if not details or str(appid) not in details:
        continue

    for _ in range(3):
        try:
            if details[str(appid)]['success']:
                info = details[str(appid)]['data']

                # check if it's a game, has more than 500 recommendations and "english" is in supported languages
                if info['type'] == 'game' and info['recommendations']['total'] > 500 and 'supported_languages' in info and 'English' in info['supported_languages']:
                    new_row = pd.DataFrame({
                        "name": [info.get('name', None)],
                        "steam_appid": [info.get('steam_appid', None)],
                        "short_description": [info.get('short_description', None)],
                        "detailed_description": [info.get('detailed_description', None)],
                        "recommendations": [info.get('recommendations', {}).get('total', None)],
                        "genres": [', '.join([genre['description'] for genre in info.get('genres', [])])]
                    })
                    app_details = pd.concat([app_details, new_row], ignore_index=True)

                if (len(app_details)) % 25 == 0:
                    app_details_len = len(app_details)
                    app_details.to_csv(f"details/steam_games_details_partial_{app_details_len}.csv", index=False)
            break
        except KeyError:
            # appid doesn't have expected structure
            # this happens if a game redirects to a bundle for example
            break
        except Exception as e:
            time.sleep(3)

    gc_counter += 1
    if gc_counter >= 100:
        gc.collect()
        gc_counter = 0

app_details.to_csv("steam_games_details.csv", index=False)

  0%|          | 0/181280 [00:00<?, ?it/s]