In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the per-game metadata table, using the file's first column as the row index.
metacritic_game_info = pd.read_csv(
    'data/metacritic_game_info.csv',
    index_col=0,
)
metacritic_game_info

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player
3,SoulCalibur,1999,Namco,Action;Fighting;3D,Dreamcast,98,8.6,1-2
4,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,Xbox360,98,7.9,1 Player
...,...,...,...,...,...,...,...,...
4995,Donut County,2018,BenEsposito,Action Adventure;General,PC,77,8.1,No Online Multiplayer
4996,MotorStorm: Apocalypse,2011,EvolutionStudios,Driving;Racing;Simulation;Rally / Offroad;Rall...,PlayStation3,77,7.7,4 Online
4997,The Last Guy,2008,SCEJapanStudio,Action Adventure;Sci-Fi;Sci-Fi;General,PlayStation3,77,6.8,1 Player
4998,Valiant Hearts: The Great War,2014,UbisoftMontpellier,Platformer;2D;Action;Platformer;2D,PlayStation4,77,8.4,not specified


In [3]:
# Load the per-review table (one row per user comment), using the file's
# first column as the row index.
metacritic_game_user_comments = pd.read_csv(
    'data/metacritic_game_user_comments.csv',
    index_col=0,
)
metacritic_game_user_comments

Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA
...,...,...,...,...,...
283978,Etrian Odyssey Untold: The Millennium Girl,3DS,7,"Extremely similar to EO:4, which obviously isn...",RileyWRussell
283979,Etrian Odyssey Untold: The Millennium Girl,3DS,0,Typical overrated Atlus trash. A game i should...,TemplarGR
283980,Etrian Odyssey Untold: The Millennium Girl,3DS,9,While I find the story mode to have annoying c...,midipon
283981,Etrian Odyssey Untold: The Millennium Girl,3DS,8,"Pretty good, but it certainly lacks the visual...",night4


In [4]:
'''
Map title + platform to an index into the game data

Result: `game_ids`, a Series aligned with the rows of
`metacritic_game_user_comments`, whose values are index labels of
`metacritic_game_info`.

Raises IndexError if any comment's (Title, Platform) pair has no matching
row in the game data.
'''
# Lookup table: one row per (Title, Platform) pair holding the index label of
# the FIRST matching game-info row. Rows with a missing key are dropped so a
# missing key in a comment stays unmatched instead of pairing NaN with NaN
# during the merge.
game_id_lookup = (
    metacritic_game_info[['Title', 'Platform']]
    .assign(Game_ID=metacritic_game_info.index)
    .dropna(subset=['Title', 'Platform'])
    .drop_duplicates(subset=['Title', 'Platform'], keep='first')
)

# A single hash join replaces a full scan of the game table per comment row.
# A many-to-one left merge yields exactly one output row per comment, in the
# same order as the comments table.
comment_game_matches = metacritic_game_user_comments[['Title', 'Platform']].merge(
    game_id_lookup,
    on=['Title', 'Platform'],
    how='left',
    validate='many_to_one',
)

# Fail loudly, naming the offending games, when a comment has no match.
unmatched_comments = comment_game_matches['Game_ID'].isna()
if unmatched_comments.any():
    missing_pairs = (
        comment_game_matches.loc[unmatched_comments, ['Title', 'Platform']]
        .drop_duplicates()
        .head(10)
        .to_records(index=False)
        .tolist()
    )
    raise IndexError(
        f"{int(unmatched_comments.sum())} comment(s) reference a (Title, Platform) "
        f"pair missing from the game data, e.g. {missing_pairs}"
    )

# The merge resets the index, so re-attach the comments' own index to let the
# result align row-for-row when it is assigned back as a column.
game_ids = pd.Series(
    comment_game_matches['Game_ID'].astype(metacritic_game_info.index.dtype).to_numpy(),
    index=metacritic_game_user_comments.index,
)

In [5]:
'''
Swap the title + platform columns for a single Game_ID column that indexes
into the game data
'''
# NOTE: this rebinds the table to a version without Title/Platform, so the cell
# cannot be run a second time without reloading the comments first.
metacritic_game_user_comments = (
    metacritic_game_user_comments
    .drop(columns=['Title', 'Platform'])
    .assign(Game_ID=game_ids)  # aligned on the row index, appended as last column
)
metacritic_game_user_comments

Unnamed: 0,Userscore,Comment,Username,Game_ID
0,10,"Everything in OoT is so near at perfection, it...",SirCaestus,0
1,10,I won't bore you with what everyone is already...,Kaistlin,0
2,10,Anyone who gives the masterpiece below a 7 or ...,Jacody,0
3,10,I'm one of those people who think that this is...,doodlerman,0
4,10,This game is the highest rated game on Metacr...,StevenA,0
...,...,...,...,...
283978,7,"Extremely similar to EO:4, which obviously isn...",RileyWRussell,3419
283979,0,Typical overrated Atlus trash. A game i should...,TemplarGR,3419
283980,9,While I find the story mode to have annoying c...,midipon,3419
283981,8,"Pretty good, but it certainly lacks the visual...",night4,3419


In [6]:
'''
Write the processed tables to disk: the game data, the full review table, and
a copy of the review table without the Comment column
'''
metacritic_game_info.to_csv(path_or_buf='data/game_info.csv')
metacritic_game_user_comments.to_csv(path_or_buf='data/user_data.csv')
(
    metacritic_game_user_comments
    .drop(columns=['Comment'])
    .to_csv(path_or_buf='data/user_data_no_comments.csv')
)

In [7]:
# Build train / validation / test splits, keeping every user's reviews together
# NOTE(review): user_data.csv appears to be written with its row index and is
# read here without index_col, so an extra "Unnamed: 0" column is probably
# carried into every file saved below -- confirm this is intended.
user_data = pd.read_csv('data/user_data.csv')

# Discard reviews posted under the anonymous account names
user_data = user_data[
    (user_data["Username"] != "[Anonymous]")
    & (user_data["Username"] != "AnonymousMC")
]

# Keep only users with at least 3 reviews, then regroup the survivors by user
user_groups = (
    user_data
    .groupby('Username')
    .filter(lambda reviews: len(reviews) >= 3)
    .groupby('Username')
)

# 70 - 10 - 20 split over users (not rows): first hold out 20% of the users for
# testing, then carve validation out of the remaining 80% (0.8 * 0.875 = 0.7).
train_validation_groups, test_groups = train_test_split(
    list(user_groups),  # list of (username, reviews-frame) pairs
    train_size=.8,
    random_state=314,
)
train_groups, validation_groups = train_test_split(
    train_validation_groups,
    train_size=.875,
    random_state=314159,
)

# Stitch each split's per-user frames back into a single table
train_set = pd.concat([frame for _, frame in train_groups])
test_set = pd.concat([frame for _, frame in test_groups])
validation_set = pd.concat([frame for _, frame in validation_groups])

# Report row counts; row shares only approximate 70/20/10 because users differ
# in how many reviews they wrote
train_count, test_count, validation_count = (
    len(train_set),
    len(test_set),
    len(validation_set),
)
total_count = sum((train_count, test_count, validation_count))
print(f"Train: {train_count} ({train_count/total_count})")
print(f"Test: {test_count} ({test_count/total_count})")
print(f"Validation: {validation_count} ({validation_count/total_count})")

# Save train-test-validation data
train_set.to_csv(path_or_buf='data/user_data_train.csv', index=False)
test_set.to_csv(path_or_buf='data/user_data_test.csv', index=False)
validation_set.to_csv(path_or_buf='data/user_data_validation.csv', index=False)

# Save train-test-validation data with no comments (for upload to GitHub)
train_set.drop(columns=['Comment']).to_csv(
    path_or_buf='data/user_data_train_no_comments.csv', index=False
)
test_set.drop(columns=['Comment']).to_csv(
    path_or_buf='data/user_data_test_no_comments.csv', index=False
)
validation_set.drop(columns=['Comment']).to_csv(
    path_or_buf='data/user_data_validation_no_comments.csv', index=False
)

Train: 105858 (0.696447956209662)
Test: 31221 (0.20540536984282584)
Validation: 14918 (0.09814667394751213)
