In [1]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import numpy as np

load_dotenv()

# Get the API key from the environment
API_KEY = os.getenv("STEAM_API_KEY")
MARKET_API_KEY = os.getenv("STEAM_MARKET_API_KEY")

In [3]:
steam_store = "https://store.steampowered.com"
# /appreviews/10500?
base_params = {
    "json": 1,
    "language": "english",
    "filter": "recent",
    "num_per_page": 100,
    "cursor": "*",
}

In [4]:
steam_market = 'https://api.steamapis.com'
app_list_url = f"{steam_market}/market/apps?"
market_params = {
    "api_key": MARKET_API_KEY
}

In [5]:
# only run the first time to get the list of all games
# you can pull allgames from csv after scraping
all_games = requests.get(app_list_url, params=market_params).json()
all_games_df = pd.DataFrame(all_games)
all_games_df.head()

Unnamed: 0,appID,name,is_free,price_overview
0,10500,Total War: EMPIRE – Definitive Edition,False,"{'final_formatted': '$24.99', 'initial_formatt..."
1,391070,Stellar 2D,False,"{'final_formatted': '$0.99', 'initial_formatte..."
2,576670,Sky Is Arrows,False,"{'final_formatted': '$9.99', 'initial_formatte..."
3,256410,Might & Magic: Duel of Champions,True,
4,546330,SPACE MOUSE 35th Anniversary edition,False,"{'final_formatted': '$4.99', 'initial_formatte..."


In [10]:
all_games_df.to_csv("data/steam/all_games.csv", index=False)

In [4]:
all_games_df = pd.read_csv("data/steam/all_games.csv")

In [6]:
all_games_df.head()

Unnamed: 0,appID,name,is_free,price_overview
0,10500,Total War: EMPIRE – Definitive Edition,False,"{'final_formatted': '$24.99', 'initial_formatt..."
1,391070,Stellar 2D,False,"{'final_formatted': '$0.99', 'initial_formatte..."
2,576670,Sky Is Arrows,False,"{'final_formatted': '$9.99', 'initial_formatte..."
3,256410,Might & Magic: Duel of Champions,True,
4,546330,SPACE MOUSE 35th Anniversary edition,False,"{'final_formatted': '$4.99', 'initial_formatte..."


In [7]:
# the following code is to get the reviews for all games in the list
reviews_from_api = []

In [24]:
# This is the main loop that will get all reviews for all games

# urllib to encode the cursor
import urllib

# get index of last game that was scraped
last_scrap = reviews_from_api[-1]['app_id']
offset = all_games_df[all_games_df['appID'] == last_scrap].index[0]

for i, app_id in enumerate(all_games_df["appID"], start=offset):
    cursor = '*'
    reviews_url = f"{steam_store}/appreviews/{app_id}?"
    past_cursors = []

    positive = False

    print(f"Getting reviews for app {i+1}/{len(all_games_df) - offset}", end="\r")

    while cursor:
        params = base_params.copy()
        params["cursor"] = cursor
        params["review_type"] = 'positive' if positive else "negative"
        url = f"{reviews_url}"\
        + "json=1&language=english&filter=recent&num_per_page=100&"\
        + f"cursor={cursor}&review_type={'positive' if positive else 'negative'}"
        # print(url)
        res = requests.get(url)
        reviews = res.json()
        cursor = reviews.get("cursor")

        if cursor:
            cursor = urllib.parse.quote(cursor)
        if cursor in past_cursors:
            cursor = '*'
            if positive:
                break
            positive = True
            past_cursors = []
            continue
        if not cursor:
            cursor = '*'
            if positive:
                break
            positive = True
            past_cursors = []
            continue
        past_cursors.append(cursor)

        if cursor:
            for review in reviews["reviews"]:
                author = review["author"]["steamid"]

                reviews_from_api.append({
                    "app_id": app_id,
                    "author": author,
                    "recommended": positive
                })
        else:
            break

# sort reviews by review_weighted
reviews_df = pd.DataFrame(reviews_from_api)

# set max number of rows to none
pd.set_option('display.max_rows', None)

reviews_df.tail()


Duplicate cursor found, breaking
10500
AoJ4s9yN7awCfKT3Aw%3D%3D
['AoJ4wvf0sosDcP7htwQ%3D', 'AoJwqtLR2oQDdLnj5AM%3D', 'AoJwgLCC6PwCeLvLhAM%3D', 'AoJws/K5nPUCcMTCqAI%3D', 'AoJ4mIWpzO8CeY%2B96wE%3D', 'AoJ4pu%2Bz2ecCdNSxtQE%3D', 'AoJ4waeY%2BOICeZ3qngE%3D', 'AoJ4h8La5dkCf4OocA%3D%3D', 'AoJwxt/QjtECdO%2BHSQ%3D%3D', 'AoJwr/qjx8oCfKCKNA%3D%3D', 'AoJw%2BdbN57gCfM2MDw%3D%3D', 'AoJ4s9yN7awCfKT3Aw%3D%3D']
Duplicate cursor found, breaking
10500
AoJ4zcP4tKwCetuXBA%3D%3D
['AoJ4nejy2I8DeJe9%2BQQ%3D', 'AoJw1LnX144DcpeE6wQ%3D', 'AoJwmdvH540Df6Wq3gQ%3D', 'AoJ4wquFlI0Dcp/O1AQ%3D', 'AoJw0qHwt4wDfK6cygQ%3D', 'AoJw9aKZ9osDcfeGvgQ%3D', 'AoJw%2BrLriosDdqurtAQ%3D', 'AoJ4oprls4oDe5n3qwQ%3D', 'AoJ4zaDWwokDee2hoAQ%3D', 'AoJ4z7ad6IgDcvKalwQ%3D', 'AoJwxL7N9IcDdojXjQQ%3D', 'AoJ4p8DnhIcDfILMgwQ%3D', 'AoJw6MfV9YUDcLXs9QM%3D', 'AoJ4y6KroIUDdqPH7AM%3D', 'AoJw38Omw4QDffWi3wM%3D', 'AoJ4hMT1zYMDeo%2Bb1gM%3D', 'AoJ44c270YIDerWyzAM%3D', 'AoJ4kOuN44EDf77fwgM%3D', 'AoJw6uXwmIEDc5a2vAM%3D', 'AoJ4zszks4ADftzxtAM%3D', 'AoJw5syj2/8

JSONDecodeError: Unexpected UTF-8 BOM (decode using utf-8-sig): line 1 column 1 (char 0)

In [42]:
reviews_df = pd.DataFrame(reviews_from_api)

In [43]:
display(reviews_df.head())
display(reviews_df.tail())

Unnamed: 0,app_id,author,recommended
0,10500,76561198140439494,False
1,10500,76561198001149578,False
2,10500,76561199667392107,False
3,10500,76561198024411821,False
4,10500,76561198297284962,False


Unnamed: 0,app_id,author,recommended
4894712,578080,76561198345113950,True
4894713,578080,76561198120839561,True
4894714,578080,76561198278501688,True
4894715,578080,76561198193633982,True
4894716,578080,76561198106200010,True


In [44]:
name_id = pd.DataFrame(all_games_df[['appID', 'name']])

In [45]:
name_id = name_id.rename(columns={'appID': 'app_id'})

In [48]:
game_reviews = reviews_df.merge(name_id, on='app_id')
display(game_reviews.head(2))
display(game_reviews.tail(2))

Unnamed: 0,app_id,author,recommended,name
0,10500,76561198140439494,False,Total War: EMPIRE – Definitive Edition
1,10500,76561198001149578,False,Total War: EMPIRE – Definitive Edition


Unnamed: 0,app_id,author,recommended,name
4932246,578080,76561198193633982,True,PLAYERUNKNOWN'S BATTLEGROUNDS
4932247,578080,76561198106200010,True,PLAYERUNKNOWN'S BATTLEGROUNDS


In [49]:
game_reviews.drop(columns=['app_id'], inplace=True)
display(game_reviews.head())
display(game_reviews.tail())

Unnamed: 0,author,recommended,name
0,76561198140439494,False,Total War: EMPIRE – Definitive Edition
1,76561198001149578,False,Total War: EMPIRE – Definitive Edition
2,76561199667392107,False,Total War: EMPIRE – Definitive Edition
3,76561198024411821,False,Total War: EMPIRE – Definitive Edition
4,76561198297284962,False,Total War: EMPIRE – Definitive Edition


Unnamed: 0,author,recommended,name
4932243,76561198345113950,True,PLAYERUNKNOWN'S BATTLEGROUNDS
4932244,76561198120839561,True,PLAYERUNKNOWN'S BATTLEGROUNDS
4932245,76561198278501688,True,PLAYERUNKNOWN'S BATTLEGROUNDS
4932246,76561198193633982,True,PLAYERUNKNOWN'S BATTLEGROUNDS
4932247,76561198106200010,True,PLAYERUNKNOWN'S BATTLEGROUNDS


In [50]:
game_reviews.to_csv("data/steam/long_reviews.csv", index=False)

In [8]:
game_reviews = pd.read_csv("data/steam/long_reviews.csv")

In [9]:
print(np.shape(game_reviews))

(4932248, 3)


In [10]:
game_reviews.duplicated().sum()

39171

In [11]:
gr_no_dup = game_reviews.drop_duplicates()

In [13]:
# find drop any author that has less than 2 reviews
author_counts = gr_no_dup['author'].value_counts()
author_counts = author_counts[author_counts > 2]

gr_no_dup = gr_no_dup[gr_no_dup['author'].isin(author_counts.index)]

len(gr_no_dup)

1296152

In [16]:
gr_no_dup['recommended'] = gr_no_dup['recommended'].apply(lambda x: 1 if x else -1)

In [18]:
gr_no_dup['recommended'].value_counts()

 1    1109932
-1     186220
Name: recommended, dtype: int64

In [17]:
# create a game pivot table with positive reviews as 1 and negative reviews as -1 and no reviews as 0
# gr_no_dup['recommended'] = gr_no_dup['recommended'].map({True: 1, False: -1})
game_pivot = gr_no_dup.pivot_table(index='name', columns='author', values='recommended', fill_value=0)


In [42]:
gr_no_dup.to_csv("data/steam/processed_dataset.csv", index=False)

In [21]:
from scipy.sparse import csr_matrix

game_sparse = csr_matrix(game_pivot)

In [23]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(algorithm='brute')
model.fit(game_sparse)

In [25]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from typing import List
from fuzzywuzzy import fuzz, process



In [26]:
def get_likely_titles(game_title: str, df: pd.DataFrame = game_pivot, threshold: int = 80) -> List[str]:
    # return the title if it is an exact match
    if game_title in df.index.values:
        return [game_title]
    
    # otherwise, use fuzzy matching
    likely_titles = []
    for title in df.index.values.tolist():
        ratio = fuzz.ratio(game_title.lower(), title.lower())
        ratio = max(ratio, fuzz.partial_ratio(game_title.lower(), title.lower()))
        if ratio >= threshold:
            likely_titles.append(title)
    return likely_titles

In [27]:
def get_vectors(df: pd.DataFrame, game_titles: List[str]) -> np.ndarray:
    vectors = []
    for title in game_titles:
        vectors.append(df.loc[title].values)
    return np.array(vectors)

In [29]:
def get_recommendations(game_titles: List[str], model: NearestNeighbors = model,
    df: pd.DataFrame = game_pivot, n_neighbors: int = 5) -> List[str]:
    vectors = get_vectors(game_pivot, game_titles)
    avg_vector = np.mean(vectors, axis=0).reshape(1, -1)
    x_neighbors = n_neighbors + len(game_titles)
    distances, indices = model.kneighbors(avg_vector, n_neighbors=x_neighbors)
    recommendations = [df.index[i] for i in indices.flatten()]
    
    filtered_recommendations = [game for game in recommendations if game not in game_titles][:n_neighbors]
    
    return filtered_recommendations

In [41]:
titles = []
titles += get_likely_titles("Wobbly Jungle")

rec_games = []

if len(titles) == 0:
    print("No games found. Please try again.")
    
else:
    rec_games = get_recommendations(titles)

    print("Recommended games:")
    for i, game in enumerate(rec_games):
        print(f"{i + 1}. {game}")

Recommended games:
1. Death
2. Guardian Of December
3. Urizen Shadows of the Cold Deluxe Frosty Edition
4. Final Core
5. Fantastic 4 In A Row 2


In [13]:
params = {
    "key": API_KEY,
    "format": "json",
    "page": 2
}
app_list = requests.get(app_list_url, params=base_params).json()
app_list


{'applist': {'apps': [{'appid': 1941401, 'name': ''},
   {'appid': 1897482, 'name': ''},
   {'appid': 2112761, 'name': ''},
   {'appid': 2016512, 'name': ''},
   {'appid': 1820332, 'name': ''},
   {'appid': 1360782, 'name': ''},
   {'appid': 662172, 'name': ''},
   {'appid': 216938, 'name': 'Pieterw test app76 ( 216938 )'},
   {'appid': 660010, 'name': 'test2'},
   {'appid': 660130, 'name': 'test3'},
   {'appid': 1118314, 'name': ''},
   {'appid': 1275822, 'name': ''},
   {'appid': 1343832, 'name': ''},
   {'appid': 1828741, 'name': ''},
   {'appid': 1927051, 'name': ''},
   {'appid': 1496152, 'name': ''},
   {'appid': 1983382, 'name': ''},
   {'appid': 1808781, 'name': ''},
   {'appid': 1977312, 'name': ''},
   {'appid': 1700632, 'name': ''},
   {'appid': 1829051, 'name': ''},
   {'appid': 1567401, 'name': ''},
   {'appid': 2092072, 'name': ''},
   {'appid': 2119422, 'name': ''},
   {'appid': 596501, 'name': ''},
   {'appid': 2156011, 'name': ''},
   {'appid': 2177061, 'name': ''},
  

In [14]:
review_url = 'http://store.steampowered.com/appreviews/440?json=1'

In [9]:
requests.get(review_url, params=base_params).json()

{'success': 1,
 'query_summary': {'num_reviews': 20,
  'review_score': 5,
  'review_score_desc': 'Mixed',
  'total_positive': 12194,
  'total_negative': 11291,
  'total_reviews': 23485},
 'reviews': [{'recommendationid': '166633634',
   'author': {'steamid': '76561199102102581',
    'num_games_owned': 52,
    'num_reviews': 1,
    'playtime_forever': 102902,
    'playtime_last_two_weeks': 1018,
    'playtime_at_review': 99999,
    'last_played': 1719527834},
   'language': 'english',
   'review': "TF2 is an amazing game, yet its neglected by Valve. I'm gunna do my part for this awesome community and fight for a better TF2, one without bots and a game that won't go ignored anymore.\n\n#fixtf2",
   'timestamp_created': 1717465234,
   'timestamp_updated': 1717465234,
   'voted_up': False,
   'votes_up': 161,
   'votes_funny': 0,
   'weighted_vote_score': '0.842052221298217773',
   'comment_count': 0,
   'steam_purchase': True,
   'received_for_free': False,
   'written_during_early_access