This script cleans up the features CSV and exports the cleaned dataset (written to "features_cleaned.csv") for "game_feature_data.ipynb".

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Load the raw feature table from CSV.
#
# Source: https://www.kaggle.com/datasets/thedevastator/get-your-game-on-metacritic-recommendations-and
#
# Caveat: this dataset is only up to date through 2016, which is fairly old by
# now. It still contains a significant number of games, so we will see how it
# works out.

file_path = "games-features-edit.csv"
df = pd.read_csv(file_path)
print(df.shape[0])  # row count of the freshly loaded table
df.head()

12624


Unnamed: 0,ResponseName,ReleaseDate,Metacritic,RecommendationCount,IsFree,GenreIsNonGame,GenreIsIndie,GenreIsAction,GenreIsAdventure,GenreIsCasual,GenreIsStrategy,GenreIsRPG,GenreIsSimulation,GenreIsEarlyAccess,GenreIsFreeToPlay,GenreIsSports,GenreIsRacing,GenreIsMassivelyMultiplayer,PriceInitial
0,Counter-Strike,Nov 1 2000,88,68991,False,False,False,True,False,False,False,False,False,False,False,False,False,False,9.99
1,Team Fortress Classic,Apr 1 1999,0,2439,False,False,False,True,False,False,False,False,False,False,False,False,False,False,4.99
2,Day of Defeat,May 1 2003,79,2319,False,False,False,True,False,False,False,False,False,False,False,False,False,False,4.99
3,Deathmatch Classic,Jun 1 2001,0,888,False,False,False,True,False,False,False,False,False,False,False,False,False,False,4.99
4,Half-Life: Opposing Force,Nov 1 1999,0,2934,False,False,False,True,False,False,False,False,False,False,False,False,False,False,4.99


In [4]:
# Keep only released games: drop rows whose release date is one of the
# placeholder labels instead of an actual date.
df = df[~df['ReleaseDate'].isin(['Coming Soon', 'To Be Announced.'])]

# add a release-date (year) feature to our data
import re
# Function to extract the digits from a piece of text.
def extract_numbers(text):
    """Return every digit in ``text`` joined into a single string.

    All runs of digits are collected in order of appearance and concatenated,
    so ``"Nov 1 2000"`` yields ``"12000"``. The result is a string (it is not
    converted to an integer). Returns ``None`` when ``text`` has no digits.
    """
    digits = ''.join(re.findall(r'\d+', text))
    # An empty join means nothing matched; report that as None.
    return digits or None

def lastfour(num):
    """Return the last four digits of ``num`` as an int, or ``None`` if missing.

    Intended for the digit strings produced by ``extract_numbers``: for
    ``"12000"`` (from "Nov 1 2000") the trailing four digits ``2000`` are the
    year. Inputs with fewer than four digits are converted whole.

    Parameters
    ----------
    num : str, int, float or None
        Digit string, or a numeric value. Numeric values are truncated to an
        integer before their digits are read, so ``2000.0`` gives ``2000``.

    Returns
    -------
    int or None
        ``None`` for ``None``, the string ``'nan'``, a NaN value, or an empty
        string; otherwise the integer formed by the last four characters.

    Raises
    ------
    ValueError
        If the trailing characters of a string input are not a valid integer.
    """
    # Missing markers: None, the text 'nan' (left behind by astype(str)), or a real NaN.
    if num is None or num == 'nan':
        return None
    if pd.isna(num):
        return None
    if not isinstance(num, str):
        # Numeric input cannot be sliced directly; go through int() first so a
        # float like 2000.0 becomes "2000" rather than "2000.0".
        num = str(int(num))
    if not num:
        # Nothing to convert; int('') would raise.
        return None
    # Slicing already returns the whole string when it has fewer than four characters.
    return int(num[-4:])

# Reduce every release date to its year: pull out the digits, then keep the last four.
df['ReleaseDate'] = (
    df['ReleaseDate']
    .astype(str)
    .apply(extract_numbers)
    .apply(lastfour)
)

# Feature columns added below:
#   After2014 - 1 if the release year is later than 2014, else 0. 2014 was chosen
#               because most games in this dataset came out around 2015-2016, so it
#               is a good indicator of whether a game counts as "old" or not.
#               This is the code used to see the distribution:
#               category_counts = df['ReleaseDate'].value_counts()
#               category_counts.plot(kind='bar', edgecolor='black', color='skyblue', figsize=(7, 5))
#   Expensive - 1 if the initial price is more than 20 dollars, else 0.
df = (
    df.dropna()  # clean out NaNs if any (note: a NaN in any column drops the row)
    .loc[lambda frame: frame['ReleaseDate'] > 1900]  # clean out weird year values
    .assign(
        After2014=lambda frame: (frame['ReleaseDate'] > 2014).astype(int),
        Expensive=lambda frame: (frame['PriceInitial'] > 20).astype(int),
    )
    # filter out games that no one recommended; this will reduce the data size by a good 60%
    .loc[lambda frame: frame['RecommendationCount'] > 0]
)

print(len(df))
df.head()


4846


Unnamed: 0,ResponseName,ReleaseDate,Metacritic,RecommendationCount,IsFree,GenreIsNonGame,GenreIsIndie,GenreIsAction,GenreIsAdventure,GenreIsCasual,...,GenreIsRPG,GenreIsSimulation,GenreIsEarlyAccess,GenreIsFreeToPlay,GenreIsSports,GenreIsRacing,GenreIsMassivelyMultiplayer,PriceInitial,After2014,Expensive
0,Counter-Strike,2000.0,88,68991,False,False,False,True,False,False,...,False,False,False,False,False,False,False,9.99,0,0
1,Team Fortress Classic,1999.0,0,2439,False,False,False,True,False,False,...,False,False,False,False,False,False,False,4.99,0,0
2,Day of Defeat,2003.0,79,2319,False,False,False,True,False,False,...,False,False,False,False,False,False,False,4.99,0,0
3,Deathmatch Classic,2001.0,0,888,False,False,False,True,False,False,...,False,False,False,False,False,False,False,4.99,0,0
4,Half-Life: Opposing Force,1999.0,0,2934,False,False,False,True,False,False,...,False,False,False,False,False,False,False,4.99,0,0


In [5]:
# Save the cleaned dataset to disk; index=False keeps the row index from being written as an extra column.

df.to_csv('features_cleaned.csv', index=False)

In [None]:
# Convert the head of the cleaned table to a Markdown table for the website (index=False leaves the row index out).
# NOTE(review): DataFrame.to_markdown relies on the optional `tabulate` package — confirm it is installed.

markdown_table = df.head().to_markdown(index=False)
print(markdown_table)

| ResponseName              |   ReleaseDate |   Metacritic |   RecommendationCount | IsFree   | GenreIsNonGame   | GenreIsIndie   | GenreIsAction   | GenreIsAdventure   | GenreIsCasual   | GenreIsStrategy   | GenreIsRPG   | GenreIsSimulation   | GenreIsEarlyAccess   | GenreIsFreeToPlay   | GenreIsSports   | GenreIsRacing   | GenreIsMassivelyMultiplayer   |   PriceInitial |   After2014 |   Expensive |
|:--------------------------|--------------:|-------------:|----------------------:|:---------|:-----------------|:---------------|:----------------|:-------------------|:----------------|:------------------|:-------------|:--------------------|:---------------------|:--------------------|:----------------|:----------------|:------------------------------|---------------:|------------:|------------:|
| Counter-Strike            |          2000 |           88 |                 68991 | False    | False            | False          | True            | False              | False           | Fal