In [5]:
from pandas import read_csv
from tqdm.notebook import tqdm
import random
import re
from collections import OrderedDict

In [6]:
df = read_csv("data/steam_games_data.csv")
df

Unnamed: 0,name,steam_appid,short_description,detailed_description,recommendations,genres
0,Counter-Strike,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,162799,Action
1,Team Fortress Classic,20,One of the most popular online action games of...,One of the most popular online action games of...,6688,Action
2,Day of Defeat,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,4326,Action
3,Deathmatch Classic,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,2349,Action
4,Half-Life: Opposing Force,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,23052,Action
...,...,...,...,...,...,...
10875,Summer Crush,2163060,Summer Crush is a visual novel for adults wher...,"<span class=""bb_img_ctn""><video class=""bb_img""...",383,Indie
10876,Yet Another Zombie Survivors,2163330,The Horde is coming but you're ready to fight ...,"<p class=""bb_paragraph"" ><span class=""bb_img_c...",10989,"Action, Casual, Indie, RPG, Early Access"
10877,Tales of Spark,2165390,&quot;Tales of Spark&quot; is a Hack &amp; Sla...,"<h1>Development Plan</h1><p><span class=""bb_im...",554,"Action, Adventure, Indie, RPG, Early Access"
10878,Taora : Survival,2165470,Taora is an open world survival game with uniq...,"<span class=""bb_img_ctn""><img class=""bb_img"" s...",305,"Action, Adventure, RPG, Simulation, Early Access"


In [7]:
def clean_description(description):
    regex_pattern = re.compile(r"<.*?>")

    # remove all html tags
    cleaned_description = re.sub(regex_pattern, "", description)
    # remove weird html artifacts like those: &lt;p&gt;
    cleaned_description = re.sub(r"&lt;.*?&gt;", " ", cleaned_description)
    cleaned_description = re.sub(r"&\w+?;", " ", cleaned_description)
    # remove tabs and new lines
    cleaned_description = cleaned_description.replace("\t", "").replace("\n", " ").strip()
    # remove multiple spaces
    cleaned_description = ' '.join(cleaned_description.split())
    return cleaned_description

In [8]:
import py3langid as langid

# remove duplicate steam_appid
df = df.drop_duplicates(subset="steam_appid")

def is_english(text):
    lang, _ = langid.classify(text)
    return lang == "en"

english_descriptions = df["short_description"].apply(is_english)
df = df[english_descriptions].reset_index(drop=True)
df

Unnamed: 0,name,steam_appid,short_description,detailed_description,recommendations,genres
0,Counter-Strike,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,162799,Action
1,Team Fortress Classic,20,One of the most popular online action games of...,One of the most popular online action games of...,6688,Action
2,Day of Defeat,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,4326,Action
3,Deathmatch Classic,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,2349,Action
4,Half-Life: Opposing Force,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,23052,Action
...,...,...,...,...,...,...
10607,Summer Crush,2163060,Summer Crush is a visual novel for adults wher...,"<span class=""bb_img_ctn""><video class=""bb_img""...",383,Indie
10608,Yet Another Zombie Survivors,2163330,The Horde is coming but you're ready to fight ...,"<p class=""bb_paragraph"" ><span class=""bb_img_c...",10989,"Action, Casual, Indie, RPG, Early Access"
10609,Tales of Spark,2165390,&quot;Tales of Spark&quot; is a Hack &amp; Sla...,"<h1>Development Plan</h1><p><span class=""bb_im...",554,"Action, Adventure, Indie, RPG, Early Access"
10610,Taora : Survival,2165470,Taora is an open world survival game with uniq...,"<span class=""bb_img_ctn""><img class=""bb_img"" s...",305,"Action, Adventure, RPG, Simulation, Early Access"


In [9]:
# clean descriptions
df.loc[:, ["short_description"]] = df.apply(lambda row: clean_description(row["short_description"]), axis=1)
df.loc[:, ["detailed_description"]] = df.apply(lambda row: clean_description(row["detailed_description"]), axis=1)

# get rid of entries with empty descriptions
df = df[df["short_description"].str.len() > 0]
df = df[df["detailed_description"].str.len() > 0]

# get rid of entries with no genres
df = df[df["genres"].notna()]

# split genres into list
df["genres"] = df["genres"].apply(lambda x: x.split(", "))

# legal genres
genres = {"Indie", "Action", "Adventure", "Simulation", "RPG", "Strategy", "Casual"}

# filter genres to only legal ones
def filter_genres(genres_list):
    return [genre for genre in genres_list if genre in genres]

df.loc[:, ["genres"]] = df.apply(lambda row: filter_genres(row["genres"]), axis=1)

# get rid of entries with no legal genres
df = df[df["genres"].map(len) > 0]

df.reset_index(drop=True, inplace=True)

# remove the genre names from the descriptions
def remove_genre_names(description, genres_list):
    for genre in genres_list:
        description = re.sub(rf"\b{re.escape(genre)}\b", "", description, flags=re.IGNORECASE)
    # remove multiple spaces again
    description = ' '.join(description.split())
    return description

df.loc[:, ["short_description"]] = df.apply(lambda row: remove_genre_names(row["short_description"], row["genres"]), axis=1)
df.loc[:, ["detailed_description"]] = df.apply(lambda row: remove_genre_names(row["detailed_description"], row["genres"]), axis=1)

df

Unnamed: 0,name,steam_appid,short_description,detailed_description,recommendations,genres
0,Counter-Strike,10,Play the world's number 1 online game. Engage ...,Play the world's number 1 online game. Engage ...,162799,[Action]
1,Team Fortress Classic,20,One of the most popular online games of all ti...,One of the most popular online games of all ti...,6688,[Action]
2,Day of Defeat,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,4326,[Action]
3,Deathmatch Classic,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,2349,[Action]
4,Half-Life: Opposing Force,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,23052,[Action]
...,...,...,...,...,...,...
10408,Summer Crush,2163060,Summer Crush is a visual novel for adults wher...,Summer Crush is a visual novel for adults wher...,383,[Indie]
10409,Yet Another Zombie Survivors,2163330,The Horde is coming but you're ready to fight ...,You've had enough of hiding behind the barrica...,10989,"[Action, Casual, Indie, RPG]"
10410,Tales of Spark,2165390,Tales of Spark is a Hack Slash with an Eastern...,Development PlanFollow and join usFor inquirie...,554,"[Action, Adventure, Indie, RPG]"
10411,Taora : Survival,2165470,Taora is an open world survival game with uniq...,Taora is an island that is infested with zombi...,305,"[Action, Adventure, RPG, Simulation]"


In [10]:
df.to_csv("data/steam_games_data_cleaned.csv", index=False)

In [11]:
import nltk
import string

# convert to lowercase
df["short_description"] = df["short_description"].apply(lambda x: x.lower())
df["detailed_description"] = df["detailed_description"].apply(lambda x: x.lower())

In [12]:
# remove punctuation
df["short_description"] = df["short_description"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))
df["detailed_description"] = df["detailed_description"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))

In [13]:
# remove stopwords
nltk.download("punkt_tab")
nltk.download("stopwords")
EN_STOPWORDS = set(nltk.corpus.stopwords.words("english"))

def remove_stopwords(text):
    tokens = nltk.tokenize.word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in EN_STOPWORDS]
    return " ".join(filtered_tokens)

df["short_description"] = df["short_description"].apply(lambda x: remove_stopwords(x))
df["detailed_description"] = df["detailed_description"].apply(lambda x: remove_stopwords(x))

[nltk_data] Downloading package punkt_tab to /home/stachu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/stachu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
lemmatizer = nltk.stem.WordNetLemmatizer()

nltk.download("wordnet")

def lemmatize_text(text):
    tokens = nltk.tokenize.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized_tokens)

df["short_description"] = df["short_description"].apply(lambda x: lemmatize_text(x))
df["detailed_description"] = df["detailed_description"].apply(lambda x: lemmatize_text(x))

[nltk_data] Downloading package wordnet to /home/stachu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
df.to_csv("data/steam_games_data_deep_cleaned.csv", index=False)