In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [11]:
# Load the Metacritic games table from disk, report its (rows, columns)
# shape on stdout, and preview the first records as the cell output.
csv_path = "../data/metacritic_games.csv"
data = pd.read_csv(csv_path)
print(data.shape)
data.head()


(20422, 18)


Unnamed: 0,name,platform,developer,publisher,genre(s),players,rating,attribute,release_date,link,critic_positive,critic_neutral,critic_negative,metascore,user_positive,user_neutral,user_negative,user_score
0,Command & Conquer,PC,Westwood Studios,Virgin Interactive,Sci-Fi,1-4,T,,"Aug 31, 1995",/game/pc/command-conquer,5,0,0,94,47,0,1,8.9
1,Full Throttle,PC,LucasArts,LucasArts,Adventure,,,,"Apr 30, 1995",/game/pc/full-throttle,6,2,0,86,18,1,0,8.7
2,Battle Arena Toshinden,PS,Tamsoft,SCEA,Action,1-2,T,,"Sep 9, 1995",/game/playstation/battle-arena-toshinden,1,3,0,69,1,0,1,5.8
3,Sid Meier's Civilization II,PC,MPS Labs,MicroProse,Strategy,1 Player,K-A,,"Feb 29, 1996",/game/pc/sid-meiers-civilization-ii,7,0,0,94,46,0,1,8.9
4,Quake,PC,id Software,id Software,Action,1-16,M,,"Jun 22, 1996",/game/pc/quake,9,0,0,94,84,4,1,8.8


In [12]:
# Total votes per game: the user total and the critic total are each the sum
# of that group's positive, neutral and negative counts.
user_vote_columns = ["user_positive", "user_neutral", "user_negative"]
critic_vote_columns = ["critic_positive", "critic_neutral", "critic_negative"]

# Sum across the three count columns of every row on the underlying NumPy
# array, so the result is a plain ndarray with one total per game.
number_of_users = data[user_vote_columns].to_numpy().sum(axis=1)
number_of_critics = data[critic_vote_columns].to_numpy().sum(axis=1)

data["number_of_users"] = number_of_users
data["number_of_critics"] = number_of_critics


In [13]:
# Remove the columns that are not carried forward. The per-sentiment vote
# counts are dropped here because their sums were already stored in
# number_of_users / number_of_critics.
columns_to_drop = [
    "name",
    "players",
    "attribute",
    "link",
    "critic_positive",
    "critic_neutral",
    "critic_negative",
    "user_positive",
    "user_neutral",
    "user_negative",
]
data = data.drop(columns=columns_to_drop)
data.head()


Unnamed: 0,platform,developer,publisher,genre(s),rating,release_date,metascore,user_score,number_of_users,number_of_critics
0,PC,Westwood Studios,Virgin Interactive,Sci-Fi,T,"Aug 31, 1995",94,8.9,48,5
1,PC,LucasArts,LucasArts,Adventure,,"Apr 30, 1995",86,8.7,19,8
2,PS,Tamsoft,SCEA,Action,T,"Sep 9, 1995",69,5.8,2,4
3,PC,MPS Labs,MicroProse,Strategy,K-A,"Feb 29, 1996",94,8.9,47,7
4,PC,id Software,id Software,Action,M,"Jun 22, 1996",94,8.8,89,9


In [14]:
# Change release date to release year.
# Dates are strings such as "Aug 31, 1995", so the year is the last four
# characters. The vectorised .str accessor replaces the per-row Python loop:
# it yields the same strings for string inputs and passes a missing date
# through as NaN instead of raising TypeError when slicing a non-string.
# The year is kept as a string, exactly as before.
release_year = data["release_date"].str[-4:]

data["release_date"] = release_year
data.head()


Unnamed: 0,platform,developer,publisher,genre(s),rating,release_date,metascore,user_score,number_of_users,number_of_critics
0,PC,Westwood Studios,Virgin Interactive,Sci-Fi,T,1995,94,8.9,48,5
1,PC,LucasArts,LucasArts,Adventure,,1995,86,8.7,19,8
2,PS,Tamsoft,SCEA,Action,T,1995,69,5.8,2,4
3,PC,MPS Labs,MicroProse,Strategy,K-A,1996,94,8.9,47,7
4,PC,id Software,id Software,Action,M,1996,94,8.8,89,9


In [15]:
# Print the distinct values found in every remaining column, one array per
# column, in column order.
column_list = data.columns.to_list()
for label in column_list:
    distinct_values = data[label].unique()
    print(distinct_values)


['PC' 'PS' 'N64' 'DC' 'PS2' 'XBOX' 'GBA' 'GC' 'DS' 'X360' 'PSP' 'WII'
 'PS3' '3DS' 'VITA' 'WIIU' 'PS4' 'XONE' 'Switch']
['Westwood Studios' 'LucasArts' 'Tamsoft' ... 'Deeli network'
 'Undercoders' 'Snap Finger Click Ltd']
['Virgin Interactive' 'LucasArts' 'SCEA' ... 'Deeli network' 'Undercoders'
 'Snap Finger Click Ltd']
['Sci-Fi' 'Adventure' 'Action' 'Strategy' 'Role-Playing' 'Driving'
 'Action Adventure' 'Miscellaneous' 'Simulation' 'Sports' 'Puzzle'
 "Beat-'Em-Up" 'General' 'First-Person' 'Pinball' 'Wargame' 'Modern'
 'Shooter' 'Fantasy' 'Compilation' 'Action RPG' 'Dancing' 'Street'
 'PC-style RPG' 'Music' 'Sim' 'Platformer' 'Rally / Offroad' 'Board Games'
 'Olympic Sports' 'Formula One' 'WWII' 'Massively Multiplayer Online'
 'Ice Hockey' 'Horror' 'GT / Street' 'Music Maker' 'Historic' 'Fighting'
 'Edutainment' 'Tycoon' 'Alternative' 'Arcade' '3D' 'Party' 'Card Battle'
 'Tactical' 'Console-style RPG' 'Traditional' 'Third-Person' 'WWI'
 'Virtual Life' 'Real-Time' 'Scrolling' 'Flight'

In [16]:
# Quantize some columns into quantile (equal-frequency) bins.
# Each column is replaced by its integer bin index, and the bin edges are
# printed so the raw-value -> bin mapping can be read back later.
for column_name, n_bins in (
    ("metascore", 7),
    ("number_of_users", 8),
    ("number_of_critics", 8),
):
    data[column_name], bins = pd.qcut(
        data[column_name], n_bins, labels=False, retbins=True
    )
    print(bins)

# User score uses the string "tbd" for games without a score. Those rows get
# their own bin, 7, one past the quantile bins 0-6 of the real scores.
#
# The "tbd" rows are identified from the original strings rather than by
# comparing the numeric value with 7. The previous approach cast the scores
# to int before masking on `!= 7`, so every real score from 7.0 to 7.9 was
# truncated to 7, skipped by the mask, and ended up indistinguishable from
# "tbd" (quantile bins 3 and 4 could never appear in the result).
tbd_bin = 7
is_tbd = data["user_score"] == "tbd"

# Quantize every real score, including genuine 7.x values.
rated_user_score = data.loc[~is_tbd, "user_score"].astype(float)
quantized_user_score, bins = pd.qcut(
    rated_user_score, 7, labels=False, retbins=True
)

# Start from "every row is tbd" and overwrite the rated rows positionally;
# the mask and the quantized values come from the same rows in the same order.
user_score_bins = pd.Series(tbd_bin, index=data.index, dtype="int64")
user_score_bins.loc[~is_tbd] = quantized_user_score.to_numpy()
data["user_score"] = user_score_bins
print(bins)

data.head()


[ 8. 55. 63. 68. 73. 77. 82. 99.]
[0.000e+00 1.000e+00 2.000e+00 3.000e+00 5.000e+00 9.000e+00 1.600e+01
 3.800e+01 4.689e+03]
[  0.   5.   7.  10.  14.  19.  26.  39. 118.]
[0.1 5.5 6.4 7.1 7.5 7.9 8.3 9.4]


Unnamed: 0,platform,developer,publisher,genre(s),rating,release_date,metascore,user_score,number_of_users,number_of_critics
0,PC,Westwood Studios,Virgin Interactive,Sci-Fi,T,1995,6,6,7,0
1,PC,LucasArts,LucasArts,Adventure,,1995,6,6,6,2
2,PS,Tamsoft,SCEA,Action,T,1995,3,1,1,0
3,PC,MPS Labs,MicroProse,Strategy,K-A,1996,6,6,7,1
4,PC,id Software,id Software,Action,M,1996,6,6,7,2


In [17]:
# Encode categorical columns to numerical values.
# One LabelEncoder instance is reused: it is fit on, and then transforms,
# each listed column in turn, replacing the column with integer codes.
le = LabelEncoder()
categorical_columns = [
    "platform",
    "developer",
    "publisher",
    "genre(s)",
    "rating",
    "release_date",
]
for categorical_column in categorical_columns:
    data[categorical_column] = le.fit_transform(data[categorical_column])
data.head()


Unnamed: 0,platform,developer,publisher,genre(s),rating,release_date,metascore,user_score,number_of_users,number_of_critics
0,6,4515,2048,50,7,0,6,6,7,0
1,6,2330,1113,6,8,0,6,6,6,2
2,7,4006,1629,3,7,0,3,1,1,0
3,6,2369,1185,56,4,1,6,6,7,1
4,6,4696,2205,3,5,1,6,6,7,2


In [18]:
# Persist the prepared dataset as two NumPy files.
# Features: every column except metascore, written as a 2-D array.
data_without_metascore = data.drop(columns=["metascore"]).to_numpy()
np.save("../data/metacritic_data.npy", data_without_metascore)

# Target: the metascore column on its own, written as a 1-D array.
metascore = data["metascore"].to_numpy()
np.save("../data/metascore.npy", metascore)
