In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv("../../data/steam/data.raw.csv")

In [3]:
df.shape

(18842, 7)

In [4]:
# remove elements that are not games
df = df[df['type'] == 'game']
df.shape

(8637, 7)

In [5]:
def clean_txt(txt):
    # remove non ascii characters
    txt2 = ''.join([i if ord(i) < 128 else ' ' for i in str(txt)])
    
    # no xml entities either
    txt3 = re.sub('&[^;]{1,6};',' ',txt2)
    
    # clean whitespace
    txt4 = re.sub('\s+',' ',txt3).lstrip().rstrip()
    
    
    return txt4

df['name'] = df['name'].map(lambda value: clean_txt(value))
df['detailed_description'] = df['detailed_description'].map(lambda value: clean_txt(value))
df['about_the_game'] = df['about_the_game'].map(lambda value: clean_txt(value))

In [6]:
# taking out duplicate names
df = df.drop_duplicates(['name'])
df.shape

(8522, 7)

In [7]:
# no empty names (these are probably due to games whose name was in chinese or japanese)
df = df.drop(df.apply(lambda row: row['name'].strip() == '',axis=1))
df.shape

(8520, 7)

In [10]:
# no need for this because it's just games now
df = df.drop(['type'],axis=1)
df

Unnamed: 0,appid,name,detailed_description,about_the_game,categories,genres
2,30,Day of Defeat,enlist in an intense brand of axis vs. allied ...,enlist in an intense brand of axis vs. allied ...,"Multi-player,Valve Anti-Cheat enabled",Action
3,40,Deathmatch Classic,enjoy fast-paced multiplayer gaming with death...,enjoy fast-paced multiplayer gaming with death...,"Multi-player,Valve Anti-Cheat enabled",Action
4,50,Half-Life: Opposing Force,return to the black mesa research facility as ...,return to the black mesa research facility as ...,"Single-player,Multi-player,Valve Anti-Cheat en...",Action
5,60,Ricochet,a futuristic action game that challenges your ...,a futuristic action game that challenges your ...,"Multi-player,Valve Anti-Cheat enabled",Action
6,70,Half-Life,named game of the year by over 50 publications...,named game of the year by over 50 publications...,"Single-player,Multi-player,Valve Anti-Cheat en...",Action
7,80,Counter-Strike: Condition Zero,"with its extensive tour of duty campaign, a ne...","with its extensive tour of duty campaign, a ne...","Single-player,Multi-player,Valve Anti-Cheat en...",Action
10,130,Half-Life: Blue Shift,made by gearbox software and originally releas...,made by gearbox software and originally releas...,Single-player,Action
13,220,Half-Life 2,1998. half-life sends a shock through the game...,1998. half-life sends a shock through the game...,"Single-player,Steam Achievements,Steam Trading...",Action
14,240,Counter-Strike: Source,the next installment of the world's # 1 online...,the next installment of the world's # 1 online...,"Multi-player,Cross-Platform Multiplayer,Steam ...",Action
15,280,Half-Life: Source,"winner of over 50 game of the year awards, hal...","winner of over 50 game of the year awards, hal...",Single-player,Action


In [11]:
df = df.reset_index().drop(['index'],axis=1)
df.shape

(8520, 6)

In [12]:
df.to_csv("../../data/steam/data.clean.csv",index=False,index_label=False)