## 1. Load Packages and Data

In [19]:
import pandas as pd
import ast
import numpy as np

# Load the data
# NOTE(review): the df.head() output in this notebook shows an "Unnamed: 0"
# column, which suggests the CSV was written with its index included; consider
# passing index_col=0 here — confirm against how the file was produced.
df = pd.read_csv("../data/steam_data.csv")
print(df.shape)  # (number of rows, number of columns)

(89618, 47)


In [21]:
df.head()

Unnamed: 0,appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,...,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
0,730,Counter-Strike 2,2012-08-21,0,0.0,1,"For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...",,...,879,5174,350,0,1212356,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939,82,96473
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0,0.0,0,"LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...","LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...",Play PUBG: BATTLEGROUNDS for free. Land on str...,,...,0,0,0,0,616738,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842,68,16720
2,570,Dota 2,2013-07-09,0,0.0,2,"The most-played game on Steam. Every day, mill...","The most-played game on Steam. Every day, mill...","Every day, millions of players worldwide enter...",“A modern multiplayer masterpiece.” 9.5/10 – D...,...,1536,898,892,0,555977,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595,80,29366
3,271590,Grand Theft Auto V Legacy,2015-04-13,17,0.0,0,"When a young street hustler, a retired bank ro...","When a young street hustler, a retired bank ro...",Grand Theft Auto V for PC offers players the o...,,...,771,7101,74,0,117698,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832,92,17517
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,17,3.99,9,Edition Comparison Ultimate Edition The Tom Cl...,“One of the best first-person shooters ever ma...,"Tom Clancy's Rainbow Six® Siege is an elite, t...",,...,682,2434,306,80,89916,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020,76,12608


## 2. Clean Data

In [22]:
# Publisher names that the classifier treats as major (AAA/AA) labels.
AAA_PUBLISHERS = [
    'Electronic Arts',
    'Ubisoft',
    'Activision',
    'Blizzard Entertainment',
    'Take-Two Interactive',
    'Rockstar Games',
    '2K',
    'Bethesda Softworks',
    'Microsoft Studios',
    'Xbox Game Studios',
    'PlayStation PC LLC',
    'Sony Interactive Entertainment',
    'Nintendo',
    'Sega',
    'Bandai Namco',
    'Capcom',
    'Square Enix',
    'Konami',
    'Warner Bros. Games',
    'THQ Nordic',
    'Paradox Interactive',
    '505 Games',
    'Deep Silver',
    'Focus Entertainment',
    'Nexon',
    'Tencent',
    'NetEase',
    'Amazon Games',
]


def is_major_publisher(pub_string):
    """Tell whether a publisher string mentions any entry of AAA_PUBLISHERS.

    The comparison is a case-insensitive substring test, so a short entry such
    as '2K' also matches when it appears inside a longer publisher name.

    Args:
        pub_string: Publisher text for one game; anything that is not a ``str``
            (for example NaN or None) is treated as "no publisher".

    Returns:
        True if at least one AAA_PUBLISHERS name occurs in ``pub_string``,
        otherwise False.
    """
    if not isinstance(pub_string, str):
        return False

    haystack = pub_string.lower()
    return any(name.lower() in haystack for name in AAA_PUBLISHERS)

In [23]:
def clean_steam_data(df):
    """Return a cleaned copy of the raw Steam frame with derived columns.

    The input frame is not modified. The returned copy has:

    * ``price`` coerced to numeric, with unparseable/missing values set to 0.0;
    * ``tags_list`` holding the parsed ``tags`` literal (an empty list when the
      value is missing, malformed, or not a container);
    * ``game_type`` set to ``'Indie'`` or ``'AAA/AA'``.

    Args:
        df: DataFrame with at least ``price``, ``tags`` and ``publishers``
            columns.

    Returns:
        A new DataFrame with the columns described above added/replaced.
    """
    df_clean = df.copy()

    # 1. Fix Prices
    df_clean['price'] = pd.to_numeric(df_clean['price'], errors='coerce').fillna(0.0)

    # 2. Parse Tags
    def parse_list(x):
        """Parse a stringified Python literal of tags; fall back to [] on failure."""
        if not isinstance(x, str):
            return []
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Only the errors literal_eval raises for malformed input are
            # swallowed; KeyboardInterrupt and friends still propagate.
            return []
        # A scalar literal (e.g. "123") is not a tag collection and would break
        # the iteration in classify_game, so treat it as "no tags".
        if not isinstance(parsed, (dict, list, tuple, set)):
            return []
        return parsed

    print("Parsing tags...")
    df_clean['tags_list'] = df_clean['tags'].apply(parse_list)

    # 3. Create "Is_Indie" Feature
    def classify_game(row):
        """Label one row; rules are checked in order: tag, publisher, price."""
        # Iterating a dict yields its keys, i.e. the tag names.
        # str() guards against non-string tag entries.
        tags = [str(t).lower() for t in row['tags_list']]
        pub = row['publishers'] if pd.notnull(row['publishers']) else ""
        price = row['price']

        # Explicit Tag
        if 'indie' in tags:
            return 'Indie'

        # AAA Check
        if is_major_publisher(pub):
            return 'AAA/AA'

        # Price Heuristic
        # If it costs > $40 and NO indie tag, it's likely AA/AAA
        if price > 39.99:
            return 'AAA/AA'

        return 'Indie'  # Default

    df_clean['game_type'] = df_clean.apply(classify_game, axis=1)

    return df_clean

# Run the cleaning step on the raw frame, then print how many games landed in
# each game_type class.
df_processed = clean_steam_data(df)
print(df_processed['game_type'].value_counts())

Parsing tags...
game_type
Indie     87856
AAA/AA     1762
Name: count, dtype: int64


## 3. Feature Engineering

In [26]:
# 1. One indicator column per headline genre
target_genres = ['Action', 'RPG', 'Strategy', 'Adventure', 'Simulation', 'Casual']

for genre in target_genres:
    # 'genre_<name>' is 1 when the (case-sensitive) tag is present, otherwise 0
    df_processed[f'genre_{genre}'] = df_processed['tags_list'].map(
        lambda tags: int(genre in tags)
    )

# 2. Age of each title in whole days, measured from the moment this cell runs
df_processed['release_date'] = pd.to_datetime(
    df_processed['release_date'], errors='coerce'
)
df_processed['days_since_release'] = (
    pd.Timestamp.now() - df_processed['release_date']
).dt.days

# 3. Unparseable dates became NaT above and so have no age; discard those rows
df_processed = df_processed.dropna(subset=['days_since_release'])

df_processed.head(3)

Unnamed: 0,appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,...,num_reviews_recent,tags_list,game_type,genre_Action,genre_RPG,genre_Strategy,genre_Adventure,genre_Simulation,genre_Casual,days_since_release
0,730,Counter-Strike 2,2012-08-21,0,0.0,1,"For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...",,...,96473,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",Indie,1,0,1,0,0,0,4840
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0,0.0,0,"LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...","LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...",Play PUBG: BATTLEGROUNDS for free. Land on str...,,...,16720,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",Indie,1,0,1,0,1,0,2892
2,570,Dota 2,2013-07-09,0,0.0,2,"The most-played game on Steam. Every day, mill...","The most-played game on Steam. Every day, mill...","Every day, millions of players worldwide enter...",“A modern multiplayer masterpiece.” 9.5/10 – D...,...,29366,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",Indie,1,1,1,0,1,0,4518


In [28]:
# Write the cleaned, feature-enriched frame to disk; index=False keeps the
# DataFrame's row index out of the CSV.
df_processed.to_csv("../data/steam_cleaned.csv", index=False)