In [21]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
import gc
import keras
import json


In [2]:
# Load the detailed SteamSpy dump into memory as plain Python objects.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, so non-ASCII text in the file could be mis-decoded
# or fail to load on machines whose default is not UTF-8.
with open(
    "./input/steamspy/detailed/steam_spy_detailed.json", "r", encoding="utf-8"
) as f:
    raw_file = json.load(f)


# Return format for an app:

- appid - Steam Application ID. If it's 999999, then data for this application is hidden on developer's request, sorry.
- name - game's name
- ~~developer - comma separated list of the developers of the game~~
- ~~publisher - comma separated list of the publishers of the game~~
- ~~score_rank - score rank of the game based on user reviews~~
- ~~owners - owners of this application on Steam as a range.~~
- average_forever - average playtime since March 2009. In minutes.
- ~~average_2weeks - average playtime in the last two weeks. In minutes.~~
- median_forever - median playtime since March 2009. In minutes.
- ~~median_2weeks - median playtime in the last two weeks. In minutes.~~
- ~~ccu - peak CCU yesterday.~~
- ~~price - current US price in cents.~~
- initialprice - original US price in cents.
- ~~discount - current discount in percents.~~
- tags - game's tags with votes in JSON array.
- languages - list of supported languages.
- genre - list of genres.

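Strikethrough indicates a variable that is dropped and not used in our analysis. (`appid` is also dropped as a column, but the app IDs remain available as the dataframe index.)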


In [3]:
# SteamSpy fields that play no part in the analysis.
UNUSED_COLUMNS = [
    "appid",
    "developer",
    "publisher",
    "score_rank",
    "userscore",
    "owners",
    "average_2weeks",
    "median_2weeks",
    "price",
    "discount",
    "ccu",
]

# Raw SteamSpy field name -> display name used in the rest of the notebook.
DISPLAY_NAMES = {
    "name": "Name",
    "positive": "Positive Reviews",
    "negative": "Negative Reviews",
    "average_forever": "Average Playtime",
    "median_forever": "Median Playtime",
    "initialprice": "Price",
    "languages": "Languages",
    "genre": "Genres",
    "tags": "Tags",
}

# Transpose so that each app becomes one row of the frame.
spy_data = pd.DataFrame.from_records(raw_file).T

# The parsed JSON is not needed once the frame exists; free it before going on.
del raw_file
gc.collect()

spy_data = spy_data.drop(columns=UNUSED_COLUMNS).rename(columns=DISPLAY_NAMES)


The following line (currently commented out) expands the `Tags` column into many separate columns, one per tag name, where each value is that tag's vote count from the tags dictionary (and 0 where a game does not have the tag). This may or may not be necessary for the analysis, so it is left here for further discussion later.


In [4]:
# pd.concat([spy_data.drop(["Tags"], axis=1), spy_data['Tags'].apply(pd.Series, dtype='uint32')], axis=1).fillna(0)


In [5]:
# Preview the first five rows of the cleaned SteamSpy frame.
spy_data.head(n=5)


Unnamed: 0,Name,Positive Reviews,Negative Reviews,Average Playtime,Median Playtime,Price,Languages,Genres,Tags
10,Counter-Strike,196594,5073,10978,311,999,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 5393, 'FPS': 4819, 'Multiplayer': 3..."
100,Counter-Strike: Condition Zero,13442,1535,813,57,999,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 1339, 'FPS': 979, 'Shooter': 723, '..."
1000000,ASCENXION,27,5,0,0,999,"English, Korean, Simplified Chinese","Action, Adventure, Indie","{'Shoot 'Em Up': 186, 'Metroidvania': 181, 'Bu..."
1000010,Crown Trick,3812,584,775,1033,1999,"English, Simplified Chinese, Japanese, Traditi...","Adventure, Indie, RPG, Strategy","{'Rogue-like': 264, 'Turn-Based Combat': 250, ..."
1000030,"Cook, Serve, Delicious! 3?!",1470,102,83,107,1999,English,"Action, Indie, Simulation, Strategy","{'Typing': 219, 'Management': 209, 'Casual': 2..."


TODO: Read in the player count data (this needs a lot of RAM; don't attempt it with less than roughly 16 GB), take the per-game average, join the result with the SteamSpy dataframe, and then run the analysis with player count as the target. See the CS 345 final project for more information on how to implement this.


In [6]:
# Load the Steam Charts player-count dump into memory.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default.
with open("./input/steam_charts/steam_charts.json", "r", encoding="utf-8") as f:
    raw_player_count_file = json.load(f)


In [7]:
# Collapse every app's recorded player counts into one mean value.
# Each app's entry is laid out as a single row so that the row-wise mean
# averages over all of its recorded values.
mean_dict = {
    app_id: pd.DataFrame(history, index=[app_id]).mean(axis=1).iloc[0]
    for app_id, history in raw_player_count_file.items()
}

# One row per app, ordered by the index labels.
player_count_data = (
    pd.DataFrame(mean_dict, index=["Mean Concurrent Players All Time"])
    .transpose()
    .sort_index()
)


In [8]:
# Preview the first five per-app mean player counts.
player_count_data.head(n=5)


Unnamed: 0,Mean Concurrent Players All Time
10,11408.154961
100,49.588629
1000000,0.382979
1000010,69.729662
1000030,46.01487


In [9]:
# Keep only the apps present in both sources, matching rows on the index.
all_data = pd.merge(
    spy_data,
    player_count_data,
    left_index=True,
    right_index=True,
    how="inner",
)
all_data.head(n=5)


Unnamed: 0,Name,Positive Reviews,Negative Reviews,Average Playtime,Median Playtime,Price,Languages,Genres,Tags,Mean Concurrent Players All Time
10,Counter-Strike,196594,5073,10978,311,999,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 5393, 'FPS': 4819, 'Multiplayer': 3...",11408.154961
100,Counter-Strike: Condition Zero,13442,1535,813,57,999,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 1339, 'FPS': 979, 'Shooter': 723, '...",49.588629
1000000,ASCENXION,27,5,0,0,999,"English, Korean, Simplified Chinese","Action, Adventure, Indie","{'Shoot 'Em Up': 186, 'Metroidvania': 181, 'Bu...",0.382979
1000010,Crown Trick,3812,584,775,1033,1999,"English, Simplified Chinese, Japanese, Traditi...","Adventure, Indie, RPG, Strategy","{'Rogue-like': 264, 'Turn-Based Combat': 250, ...",69.729662
1000030,"Cook, Serve, Delicious! 3?!",1470,102,83,107,1999,English,"Action, Indie, Simulation, Strategy","{'Typing': 219, 'Management': 209, 'Casual': 2...",46.01487


In [10]:
# Languages and Genres are comma-separated strings; turn each into a list of
# labels. A missing entry stays missing after the split.
all_data = all_data.assign(
    Languages=all_data["Languages"].str.split(", "),
    Genres=all_data["Genres"].str.split(", "),
)

# Drop rows that cannot be used for modelling. Both list columns are passed to
# MultiLabelBinarizer further down, which cannot iterate over a missing value,
# so Genres is checked here as well as Languages (previously only Languages
# was). Rows without a player-count target are removed too.
all_data = all_data.dropna(
    subset=["Languages", "Genres", "Mean Concurrent Players All Time"]
)


Probably want to trim this a little bit to stop double counts, but also low priority.


In [11]:
# Encode the scalar feature columns.
# NOTE(review): OrdinalEncoder replaces each value with the index of its
# category, so the models see category indices for these columns rather than
# the raw magnitudes - confirm this is the intended treatment.
oe = OrdinalEncoder(dtype="uint32")
column_transform = make_column_transformer(
    (
        oe,
        [
            "Positive Reviews",
            "Negative Reviews",
            "Average Playtime",
            "Median Playtime",
            "Price",
        ],
    )
)
X = column_transform.fit_transform(all_data)

# One binarizer per multi-label column, so each keeps its own `classes_` and the
# indicator columns can be mapped back to label names. Reusing a single instance
# would overwrite the language classes when it is refit on the genres.
mlb_languages = MultiLabelBinarizer()
mlb_genres = MultiLabelBinarizer()
mlb = mlb_genres  # kept for compatibility: `mlb` used to end up fitted on Genres
X_languages = mlb_languages.fit_transform(all_data["Languages"].tolist())
X_genres = mlb_genres.fit_transform(all_data["Genres"].tolist())

# Final column layout: encoded scalar features, language flags, genre flags.
# A single horizontal stack replaces the per-row np.append loop and yields the
# same 2-D matrix.
Xt = np.hstack([X, X_languages, X_genres])


In [12]:
# Regression target as a standalone array (copied, so it is independent of the frame).
y = all_data["Mean Concurrent Players All Time"].to_numpy(copy=True)


In [13]:
# Shuffle and split the feature matrix and target with a fixed seed.
# NOTE(review): test_size=0.75 holds out three quarters of the rows and fits on
# the remaining quarter - confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    Xt, y, random_state=42, shuffle=True, test_size=0.75
)
# Shapes in the order: train features, train target, test features, test target.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))


((12959, 83), (12959,), (38880, 83), (38880,))

In [14]:
# Lasso regression: fit on the training split, then score on the held-out split.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
lasso.score(X_test, y_test)


-0.12912198395030883

In [15]:
# Elastic net regression: fit on the training split, then score on the held-out split.
enet = ElasticNet(random_state=42).fit(X_train, y_train)
enet.score(X_test, y_test)


-0.018161184741588343

In [16]:
# Ridge regression: fit on the training split, then score on the held-out split.
ridge = Ridge(random_state=42).fit(X_train, y_train)
ridge.score(X_test, y_test)


-0.12880602685362774

In [17]:
# Ordinary least squares: fit on the training split, then score on the held-out split.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)


-0.13000249992474489

In [18]:
# Support vector regression with default settings: fit, then score on the held-out split.
svr = SVR().fit(X_train, y_train)
svr.score(X_test, y_test)


0.0011547566807220688

In [19]:
# Nu support vector regression with default settings: fit, then score on the held-out split.
nsvr = NuSVR().fit(X_train, y_train)
nsvr.score(X_test, y_test)


0.0011700983796757436

In [23]:
# Linear support vector regression with a raised iteration cap.
# max_iter is written as an integer: the float 1e6 is rejected by the
# constructor-parameter validation in recent scikit-learn releases.
# random_state is pinned, as in the other seeded model cells, so the score is
# reproducible across runs.
lsvr = LinearSVR(max_iter=1_000_000, random_state=42)
lsvr.fit(X_train, y_train)
lsvr.score(X_test, y_test)




0.004542628684198413