### Import Libraries

In [2]:
import warnings
warnings.filterwarnings("ignore")
import re
import pandas as pd

In [3]:
# Loading clean data - processed by game data analysis
dataGames = pd.read_csv('data/cleaned/steam_games_reviews1.csv')

dataUsers = pd.read_csv('data/cleaned/purchase_play_rating.csv', usecols= ['user', 'game', 'hrs'])

In [4]:
dataGames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40833 entries, 0 to 40832
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   url                         40833 non-null  object 
 1   types                       40831 non-null  object 
 2   name                        40817 non-null  object 
 3   desc_snippet                27612 non-null  object 
 4   recent_reviews              2706 non-null   object 
 5   all_reviews                 28470 non-null  object 
 6   release_date                37654 non-null  object 
 7   developer                   40490 non-null  object 
 8   publisher                   35733 non-null  object 
 9   popular_tags                37888 non-null  object 
 10  game_details                40313 non-null  object 
 11  languages                   40797 non-null  object 
 12  achievements                12194 non-null  float64
 13  genre                       403

### Data Pre-processing

In [5]:
dataGames['name'] = dataGames['name'].fillna('')

# create column ID for game and user dataset
dataGames["ID"] = ""
dataUsers["ID"] = ""

In [6]:
# remove spaces and special character from game name in both dataset
for i, row in dataGames.iterrows():
    clean = re.sub('[^A-Za-z0-9]+', '', row["name"])
    clean = clean.lower()
    dataGames.at[i, 'ID'] = clean

for i, row in dataUsers.iterrows():
    clean = re.sub('[^A-Za-z0-9]+', '', row["game"])
    clean = clean.lower()
    dataUsers.at[i, 'ID'] = clean

In [7]:
dataGames.head(2)

Unnamed: 0,url,types,name,desc_snippet,recent_reviews,all_reviews,release_date,developer,publisher,popular_tags,...,genre,game_description,mature_content,minimum_requirements,recommended_requirements,original_price,discount_price,review_qualification,percentage_positive_review,ID
0,https://store.steampowered.com/app/379720/DOOM/,app,DOOM,Now includes all three premium DLC packs (Unto...,"Very Positive,(554),- 89% of the 554 user revi...","Very Positive,(42,550),- 92% of the 42,550 use...","May 12, 2016",id Software,"Bethesda Softworks,Bethesda Softworks","FPS,Gore,Action,Demons,Shooter,First-Person,Gr...",...,Action,"About This Game Developed by id software, the...",,"Minimum:,OS:,Windows 7/8.1/10 (64-bit versions...","Recommended:,OS:,Windows 7/8.1/10 (64-bit vers...",$19.99,$14.99,Very Positive,92,doom
1,https://store.steampowered.com/app/578080/PLAY...,app,PLAYERUNKNOWN'S BATTLEGROUNDS,PLAYERUNKNOWN'S BATTLEGROUNDS is a battle roya...,"Mixed,(6,214),- 49% of the 6,214 user reviews ...","Mixed,(836,608),- 49% of the 836,608 user revi...","Dec 21, 2017",PUBG Corporation,"PUBG Corporation,PUBG Corporation","Survival,Shooter,Multiplayer,Battle Royale,PvP...",...,"Action,Adventure,Massively Multiplayer",About This Game PLAYERUNKNOWN'S BATTLEGROUND...,Mature Content Description The developers de...,"Minimum:,Requires a 64-bit processor and opera...","Recommended:,Requires a 64-bit processor and o...",$29.99,,Mixed,49,playerunknownsbattlegrounds


In [21]:
dataUsers.head(2)

Unnamed: 0,user,game,hrs,ID
0,5250,Alien Swarm,4.9,alienswarm
1,5250,Cities Skylines,144.0,citiesskylines


In [8]:
# find all the games in the game dataset that match the games in user dataset
gameArrayUsers = dataUsers["ID"].unique()
print(len(gameArrayUsers))

5151


In [9]:
criteriaTest = dataGames['ID'].isin(gameArrayUsers)
usedGames = dataGames[criteriaTest]
print(len(usedGames))

3036


In [24]:
# relevant info for recommendation: genre game_details popular_tags publisher developer
usedGames.loc[:, 'genre'] = usedGames['genre'].fillna('')
usedGames.loc[:, 'game_details'] = usedGames['game_details'].fillna('')
usedGames.loc[:, 'popular_tags'] = usedGames['popular_tags'].fillna('')
usedGames.loc[:, 'publisher'] = usedGames['publisher'].fillna('')
usedGames.loc[:, 'developer'] = usedGames['developer'].fillna('')

In [25]:
def clean_data(x):
    if isinstance(x, str):
        return x.replace(" ", "")
    else:
        print(x)
        return x

In [26]:
# remove spaces between the word. This way
usedGames.loc[:, 'genre'] = usedGames['genre'].apply(clean_data)
usedGames.loc[:, 'game_details'] = usedGames['game_details'].apply(clean_data)
usedGames.loc[:, 'popular_tags'] = usedGames['popular_tags'].apply(clean_data)
usedGames.loc[:, 'publisher'] = usedGames['publisher'].apply(clean_data)
usedGames.loc[:, 'developer'] = usedGames['developer'].apply(clean_data)


In [10]:
# create some column containing a mix of different information
usedGames["genre_publisher_developer"] = usedGames['genre'] + usedGames['publisher'] + usedGames['developer']
usedGames["genre_popular_tags_developer"] = usedGames['genre'] + usedGames['popular_tags'] + usedGames['developer']
usedGames["genre_popular_tags_game_details"] = usedGames['genre'] + usedGames['popular_tags'] + usedGames['game_details']
usedGames["genre_publisher_developer_game_details"] = usedGames['genre'] + usedGames['publisher'] + usedGames['developer'] + usedGames['game_details']

In [28]:
usedGames.drop_duplicates("name")
usedGames.to_csv('data/processed_games_for_content-based.csv', index=False)

In [30]:
usedGames.head(5)

Unnamed: 0,name,developer,publisher,popular_tags,game_details,genre,ID,genre_publisher_developer,genre_popular_tags_developer,genre_popular_tags_game_details,genre_publisher_developer_game_details
3,DayZ,BohemiaInteractive,"BohemiaInteractive,BohemiaInteractive","Survival,Zombies,OpenWorld,Multiplayer,PvP,Mas...","Multi-player,OnlineMulti-Player,SteamWorkshop,...","Action,Adventure,MassivelyMultiplayer",dayz,"Action,Adventure,MassivelyMultiplayerBohemiaIn...","Action,Adventure,MassivelyMultiplayerSurvival,...","Action,Adventure,MassivelyMultiplayerSurvival,...","Action,Adventure,MassivelyMultiplayerBohemiaIn..."
4,EVE Online,CCP,"CCP,CCP","Space,MassivelyMultiplayer,Sci-fi,Sandbox,MMOR...","Multi-player,OnlineMulti-Player,MMO,Co-op,Onli...","Action,FreetoPlay,MassivelyMultiplayer,RPG,Str...",eveonline,"Action,FreetoPlay,MassivelyMultiplayer,RPG,Str...","Action,FreetoPlay,MassivelyMultiplayer,RPG,Str...","Action,FreetoPlay,MassivelyMultiplayer,RPG,Str...","Action,FreetoPlay,MassivelyMultiplayer,RPG,Str..."
12,TERA,"Bluehole,Inc.","EnMasseEntertainment,EnMasseEntertainment","FreetoPlay,MMORPG,MassivelyMultiplayer,RPG,Ope...","Multi-player,MMO,Co-op,SteamTradingCards,Parti...","Action,Adventure,FreetoPlay,MassivelyMultiplay...",tera,"Action,Adventure,FreetoPlay,MassivelyMultiplay...","Action,Adventure,FreetoPlay,MassivelyMultiplay...","Action,Adventure,FreetoPlay,MassivelyMultiplay...","Action,Adventure,FreetoPlay,MassivelyMultiplay..."
14,Stonehearth,RadiantEntertainment,"(none),(none)","CityBuilder,Building,Sandbox,Strategy,Survival...","Single-player,Multi-player,OnlineMulti-Player,...","Indie,Simulation,Strategy",stonehearth,"Indie,Simulation,Strategy(none),(none)RadiantE...","Indie,Simulation,StrategyCityBuilder,Building,...","Indie,Simulation,StrategyCityBuilder,Building,...","Indie,Simulation,Strategy(none),(none)RadiantE..."
20,Call of Duty®: Black Ops,Treyarch,"Activision,Activision","Action,FPS,Zombies,Multiplayer,Shooter,Singlep...","Single-player,Multi-player,Co-op,SteamAchievem...",Action,callofdutyblackops,"ActionActivision,ActivisionTreyarch","ActionAction,FPS,Zombies,Multiplayer,Shooter,S...","ActionAction,FPS,Zombies,Multiplayer,Shooter,S...","ActionActivision,ActivisionTreyarchSingle-play..."
