In [1]:
import numpy as np
import pandas as pd
import itertools
from collections import Counter
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# dataset url: https://www.kaggle.com/datasets/tristan581/all-55000-games-on-steam-november-2022
df = pd.read_csv('steam_games.csv', sep=';')
#df = df[df['Price'] > 0]
df = df[['Tags', 'Owners']]
df

  df = pd.read_csv('steam_games.csv', sep=';')


Unnamed: 0,Tags,Owners
0,"Action: 5426, FPS: 4831, Multiplayer: 3392, Sh...","10,000,000 .. 20,000,000"
1,"""Shoot Em Up"": 186, Metroidvania: 181, Bullet ...","0 .. 20,000"
2,"Rogue-like: 268, Turn-Based Combat: 254, RPG: ...","200,000 .. 500,000"
3,"Typing: 221, Management: 213, Casual: 209, Dif...","100,000 .. 200,000"
4,"Action: 22, Casual: 22, Indie: 21, Simulation: 20","0 .. 20,000"
...,...,...
55686,"Education: 22, VR: 6, Space: 5","0 .. 20,000"
55687,"Casual: 21, Indie: 21, Adventure: 20","0 .. 20,000"
55688,"Animation & Modeling: 25, 3D: 13","20,000 .. 50,000"
55689,"Early Access: 21, Indie: 20, RPG: 20, Strategy...","0 .. 20,000"


In [24]:
# select entries with at least 6 tags 
tag_count = [len([items_list.strip().split(':')[0] for items_list in str(items_str).split(',')]) for items_str in df['Tags']]
df['tag_count'] = tag_count
df_filtered = df[df['tag_count'] >= 6][['Tags', 'Owners']].copy()
#df_filtered
df.drop(columns=['tag_count'], inplace=True)

In [5]:
# encode y labels
le_owners = preprocessing.LabelEncoder()
le_owners.fit(df_filtered['Owners'])
df_filtered['Owners'] = le_owners.transform(df_filtered['Owners'])
#df_filtered

In [6]:
# encode tags
unique_tags = list(set(itertools.chain(*[[items_list.strip().split(':')[0] for items_list in str(items_str).split(',')] for items_str in df['Tags']])))
tag_id, tag_label = pd.factorize(unique_tags)
encoded_tags = dict(zip(tag_label, tag_id))

df_tags = pd.DataFrame(columns=['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])

for i, tags in enumerate(df_filtered['Tags']):
    df_tags.loc[len(df_tags.index)] = [encoded_tags[items_list.strip().split(':')[0]] for items_list in str(tags).split(',')][:6]
#     df_tags.loc[len(df_tags.index)] = [items_list.strip().split(':')[0] for items_list in str(tags).split(',')][:6]
# df_tags

In [7]:
df_filtered.reset_index(drop=True, inplace=True)
df_tags['y'] = df_filtered['Owners']
df_tags

Unnamed: 0,x1,x2,x3,x4,x5,x6,y
0,83,412,420,225,1,385,2
1,259,362,404,52,274,409,0
2,366,136,331,441,187,104,7
3,335,365,33,409,210,255,3
4,83,153,331,104,157,302,3
...,...,...,...,...,...,...,...
41796,83,73,239,29,94,309,0
41797,420,383,158,104,83,153,7
41798,158,33,153,403,115,105,10
41799,158,153,420,342,311,383,3


In [8]:
X = df_tags.iloc[: , :6]
y = df_tags.y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Decision Tree Classifier

In [13]:
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, pred):.5f}')

accuracy score: 0.64865


### MLP Classifier

In [14]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=42)
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, pred):.5f}')

accuracy score: 0.64587


### Random Forest Classifier

In [15]:
clf = RandomForestClassifier(max_depth=5, random_state=42)
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, pred):.5f}')

accuracy score: 0.64587


### KNeighborsClassifier

In [18]:
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, pred):.5f}')

accuracy score: 0.64377


In [11]:
# Decision Tree Classifier: 0.48971 - 0.64865
# MLP Classifier: 0.50005 - 0.64587
# Random Forest Classifier: 0.64195 - 0.64587
# K Nearest Neighbors Classifier: 0.61324 - 0.64377

### Locate bias

In [26]:
df_bias = df.groupby(['Owners']).count().sort_values(by=['Tags'], ascending=False)
df_bias.rename(columns={'Tags':'Entries'}, inplace=True)
df_bias.index.names = ['Interest']
df_bias

Unnamed: 0_level_0,Entries
Interest,Unnamed: 1_level_1
"0 .. 20,000",37938
"20,000 .. 50,000",7285
"50,000 .. 100,000",3695
"100,000 .. 200,000",2518
"200,000 .. 500,000",2162
"500,000 .. 1,000,000",933
"1,000,000 .. 2,000,000",526
"2,000,000 .. 5,000,000",335
"5,000,000 .. 10,000,000",97
"10,000,000 .. 20,000,000",41
