In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import json
import time
import os
import joblib
import warnings
from collections import defaultdict

#sklearn
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectFromModel, RFE

#metrics
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score, mean_absolute_error

# Global plot styling: seaborn "whitegrid" theme with the "Set2" colour palette.
sns.set_theme(style="whitegrid")
sns.set_palette("Set2")

# NOTE(review): this suppresses every warning category for the whole session,
# including pandas deprecation/chained-assignment warnings; consider narrowing it.
warnings.filterwarnings('ignore')

# Print every column and allow wide lines when DataFrames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# `seed` is presumably the shared random seed for later cells (not used in this cell).
seed = 42
# Directory name "models", created here if it does not exist yet.
modelfolder = "models"
os.makedirs(modelfolder, exist_ok=True)

In [2]:
print("1) Data Loading")
print("=============================================")

# Load games.csv and steamdb.json, outer-merge them on a string 'app_id' key and
# leave the result in `mergeddata` (working copy in `alldata`). Every frame starts
# out empty so the cell still completes when a file is missing or unreadable.
gamesdata = pd.DataFrame(columns=['app_id'])
steamdata = pd.DataFrame(columns=['app_id'])
mergeddata = pd.DataFrame()
alldata = pd.DataFrame()

try:
    print("Reading gamescsv...")
    gamesdata = pd.read_csv('games.csv')
    print("Size of games is: " + str(len(gamesdata)))
    # The id header may be spelled 'App ID' or 'appid'; normalise to 'app_id'.
    if 'App ID' in gamesdata.columns:
        gamesdata = gamesdata.rename(columns={'App ID': 'app_id'})
    elif 'appid' in gamesdata.columns:
        gamesdata = gamesdata.rename(columns={'appid': 'app_id'})
    if 'app_id' in gamesdata.columns:
        # Compare ids as strings so both sources share one key dtype.
        gamesdata['app_id'] = gamesdata['app_id'].astype(str)
except FileNotFoundError:
    print("couldnt find gamescsv")

try:
    print("Reading steamdbjson...")
    with open('steamdb.json','r',encoding='utf-8') as f:
        stdata = json.load(f)
    steamdata = pd.DataFrame(stdata)
    print("Size of steamdb data is: " + str(len(steamdata)))
    steamdata = steamdata.rename(columns={
        'sid': 'app_id',
        'name': 'title_steamdb',
        'full_price': 'steamdb_full_price',
        'current_price': 'steamdb_current_price'
    })
    if 'app_id' in steamdata.columns:
        steamdata['app_id'] = steamdata['app_id'].astype(str)
except FileNotFoundError:
    print("steamdbjson not found")
except json.JSONDecodeError:
    print("Steamdbjson may be corrupted")

if not gamesdata.empty and not steamdata.empty and 'app_id' in gamesdata.columns and 'app_id' in steamdata.columns:
    print("Checking overlap:")
    gameids = set(gamesdata['app_id'])
    steamids = set(steamdata['app_id'])
    overlapids = gameids.intersection(steamids)
    print("Games data unique: " + str(len(gameids)) + ", steamdb data unique: " + str(len(steamids)) + ", overlap: " + str(len(overlapids)))
    if min(len(gameids), len(steamids))>0:
        # Overlap is reported relative to the smaller of the two id sets.
        print("Overlap percent: " + str(len(overlapids)/min(len(gameids), len(steamids))*100) + "%")

    if 'tags' in gamesdata.columns and 'tags' in steamdata.columns:
        steamdata = steamdata.rename(columns={'tags':'tags_steamdb'})
        print("renamed tags column to tags_steamdb so no collisions")

    print("Merging...")
    mergeddata = pd.merge(gamesdata, steamdata, on='app_id', how='outer', suffixes=('','_steamdb'))

    print("Checking duplicates in appid after merge")
    initial_len = len(mergeddata)
    mergeddata = mergeddata.drop_duplicates(subset='app_id', keep='first')
    diff_len = initial_len - len(mergeddata)
    if diff_len>0:
        print("Dropped " + str(diff_len) + " duplicate rows new shape is " + str(mergeddata.shape))

    duplicatecols = mergeddata.columns[mergeddata.columns.duplicated()]
    if len(duplicatecols)>0:
        print("Found duplicate cols: " + str(duplicatecols.tolist()))
        # Build a fresh list of labels and assign it, rather than writing into
        # `columns.values` (that mutates the Index buffer behind pandas' back).
        # Each repeated label gets a unique "<name>_duplicate[k]" replacement.
        takennames = set(mergeddata.columns)
        seennames = set()
        newcolumns = []
        for ccol in mergeddata.columns:
            if ccol not in seennames:
                seennames.add(ccol)
                newcolumns.append(ccol)
                continue
            newc = ccol+"_duplicate"
            suffixnum = 2
            while newc in takennames:
                newc = ccol+"_duplicate"+str(suffixnum)
                suffixnum += 1
            takennames.add(newc)
            print("Renaming " + ccol + " to " + newc)
            newcolumns.append(newc)
        mergeddata.columns = newcolumns

    mergeddata = mergeddata.reset_index(drop=True)
    print("Merged data has " + str(len(mergeddata)) + " unique rows in total")
    print("num columns is: " + str(mergeddata.shape[1]))
else:
    print("Can't merge: no data or columns missing")
    if not gamesdata.empty and 'app_id' in gamesdata.columns:
        # Fall back to the games table alone when steamdb is unavailable.
        mergeddata = gamesdata
        print("Games data only")
    else:
        print("No data")
        mergeddata = pd.DataFrame()

if not mergeddata.empty:
    # Later cells work on `alldata`; the copy keeps `mergeddata` untouched.
    alldata = mergeddata.copy()
    print("Initial shape: " + str(alldata.shape))
else:
    print("Empty df being used")
    alldata = pd.DataFrame()

1) Data Loading
Reading gamescsv...
Size of games is: 65111
Reading steamdbjson...
Size of steamdb data is: 53981
Checking overlap:
Games data unique: 65109, steamdb data unique: 53981, overlap: 47609
Overlap percent: 88.19584668679721%
Merging...
Checking duplicates in appid after merge
Dropped 2 duplicate rows new shape is (71481, 59)
Merged data has 71481 unique rows in total
num columns is: 59
Initial shape: (71481, 59)


In [3]:
print("2) Data Preprocessing")
print("===============================================")

def extractprice(astr):
    """Parse a price-like value into a float; return NaN when no number is found.

    Everything except digits, '.' and ',' is stripped first. Separators are then
    resolved as follows:
      * both '.' and ',' present -> whichever occurs last is the decimal mark
        ("1,299.99" and "1.299,99" both give 1299.99);
      * only commas -> a single comma not followed by exactly three digits is a
        decimal mark ("12,99" -> 12.99); otherwise commas are thousands marks;
      * several dots and no comma -> dots are thousands marks.
    """
    if pd.isna(astr):
        return np.nan
    try:
        s = re.sub(r'[^\d.,]','',str(astr))
    except Exception:
        # An object whose str() fails is treated as "no price".
        return np.nan
    lastdot = s.rfind('.')
    lastcomma = s.rfind(',')
    if lastdot != -1 and lastcomma != -1:
        if lastcomma > lastdot:
            s = s.replace('.','').replace(',','.')
        else:
            s = s.replace(',','')
    elif lastcomma != -1:
        if s.count(',') == 1 and len(s) - lastcomma - 1 != 3:
            s = s.replace(',','.')
        else:
            s = s.replace(',','')
    elif s.count('.') > 1:
        s = s.replace('.','')
    # Accept a bare fractional part such as ".99" as well.
    mm = re.search(r'\d+(?:\.\d+)?|\.\d+', s)
    return float(mm.group(0)) if mm else np.nan

if not alldata.empty:
    #cleaning col names
    # Lower-case snake_case labels; labels that collide after cleaning keep the
    # first occurrence and later ones get a "_dup<k>" suffix. A new label list is
    # assigned instead of mutating `columns.values` in place.
    cleanednames = [re.sub(r'\W+','_',str(col)).lower().strip('_') for col in alldata.columns]
    seencount = {}
    finalnames = []
    dupc = []
    for cname in cleanednames:
        k = seencount.get(cname, 0)
        seencount[cname] = k + 1
        if k == 0:
            finalnames.append(cname)
        else:
            finalnames.append(cname+"_dup"+str(k))
            if cname not in dupc:
                dupc.append(cname)
    if dupc:
        print("Found duplicates after cleaning: " + str(dupc))
    alldata.columns = finalnames
    if dupc:
        print("Renamed cols so no collisions occur")

    if 'app_id_steamdb' in alldata.columns and 'app_id' in alldata.columns:
        alldata = alldata.drop(columns=['app_id_steamdb'])
    if 'title_steamdb' in alldata.columns and 'title' in alldata.columns:
        # Prefer the games title, fall back to the steamdb one.
        alldata['title'] = alldata['title'].fillna(alldata['title_steamdb'])
        alldata = alldata.drop(columns=['title_steamdb'])

    if 'title' not in alldata.columns:
        alldata['title'] = 'Unknown Game'
    alldata['title'] = alldata['title'].fillna('Unknown Game')

    pcols = ['launch_price','steamdb_full_price','steamdb_current_price']
    print("Fixing price columns")
    for c in pcols:
        if c in alldata.columns:
            alldata[c] = alldata[c].apply(extractprice)

    print("Creating target_price if possible")
    # Use whichever of steamdb_full_price / launch_price has more non-null values
    # (launch_price wins ties); steamdb_full_price is backfilled from launch_price.
    used_target = False
    has_launch = 'launch_price' in alldata.columns
    has_full = 'steamdb_full_price' in alldata.columns
    launch_count = alldata['launch_price'].notna().sum() if has_launch else 0
    if has_full and alldata['steamdb_full_price'].notna().sum() > launch_count:
        alldata['target_price'] = alldata['steamdb_full_price']
        if has_launch:
            alldata['target_price'] = alldata['target_price'].fillna(alldata['launch_price'])
        print("picking steamdb_full_price as main target then fallback on launch_price if missing")
        used_target = True
    elif has_launch:
        alldata['target_price'] = alldata['launch_price']
        print("picking launch_price as main target")
        used_target = True
    else:
        print("no price col found for target")
        alldata['target_price'] = np.nan

    if used_target and alldata['target_price'].notna().any():
        medianp = alldata['target_price'].median()
        maxp = alldata['target_price'].max()
        # Heuristic: a median above 100 or a max above 200 triggers dividing every
        # value above 200 by 100. NOTE(review): presumably those values are in
        # cents - confirm against the data source.
        if pd.notna(medianp) and pd.notna(maxp) and (medianp>100 or maxp>200):
            print("these prices look too big dividing by 100 for anything above 200")
            for cc in pcols + ['target_price']:
                if cc in alldata.columns:
                    hmask = alldata[cc]>200  # NaN compares False, so missing values are skipped
                    if hmask.any():
                        alldata.loc[hmask,cc] = alldata.loc[hmask,cc]/100.0

    # 'target_price' always exists from here on (every branch above creates it).
    print("Handling outliers (capping)")
    # q1/q3 are the 1st and 99th percentiles (not quartiles); the fence is
    # 1.5 * their spread, with a floor of 0.99 on the lower bound.
    q1 = alldata['target_price'].quantile(0.01)
    q3 = alldata['target_price'].quantile(0.99)
    iqr = q3 - q1
    lower = max(0.99, q1 - 1.5*iqr)
    upper = q3 + 1.5*iqr
    capmask = (alldata['target_price']<lower) | (alldata['target_price']>upper)
    if capmask.any():
        print("capping " + str(capmask.sum()) + " outliers to range: " + str(lower) + " to " + str(upper))
        alldata.loc[alldata['target_price']<lower,'target_price'] = lower
        alldata.loc[alldata['target_price']>upper,'target_price'] = upper

    alldata['log_price'] = np.log1p(alldata['target_price'])
    print("made a log1p price col log_price")

    # Buckets are right-closed intervals, e.g. '$5-$9.99' covers (4.99, 9.99].
    pricebreaks = [0,4.99,9.99,14.99,19.99,29.99,39.99,59.99,float('inf')]
    plabels = ['$0-$4.99','$5-$9.99','$10-$14.99','$15-$19.99','$20-$29.99','$30-$39.99','$40-$59.99','$60+']
    alldata['price_bucket'] = pd.cut(alldata['target_price'], bins=pricebreaks, labels=plabels, right=True)
    alldata['price_bucket_id'] = pd.cut(alldata['target_price'], bins=pricebreaks, labels=False, right=True)
    print("made price buckets")

    # Keep only rows with a usable target: present, positive and at most 200.
    firstrows = len(alldata)
    alldata = alldata.dropna(subset=['target_price'])
    alldata = alldata[(alldata['target_price']>0) & (alldata['target_price']<=200)]
    droprows = firstrows - len(alldata)
    if droprows>0:
        print("dropped " + str(droprows) + " rows no target or invalid price")

    alldata = alldata.reset_index(drop=True)
    print("Preproc final shape is " + str(alldata.shape))
else:
    print("df is empty")

2) Data Preprocessing
Found duplicates after cleaning: ['tags']
Renamed cols so no collisions occur
Fixing price columns
Creating target_price if possible
picking launch_price as main target
these prices look too big dividing by 100 for anything above 200
Handling outliers (capping)
capping 104 outliers to range: 0.99 to 123.49000000000001
made a log1p price col log_price
made price buckets
dropped 6372 rows no target or invalid price
Preproc final shape is (65109, 62)


In [4]:
print("3) Basic Exploration")
print("============================================")

# Save three EDA figures to PNG files, then print summary statistics and the
# correlation of a few numeric columns with the target price.
if alldata.empty or 'target_price' not in alldata.columns or alldata['target_price'].nunique()<=0:
    print("not enough data to do EDA")
else:
    # (column, title, x label, output file, clip x-axis at the 99th percentile?)
    histspecs = [
        ('target_price', "Distribution of Game Prices", "Price", 'eda_price_distribution.png', True),
        ('log_price', "Distribution of Log Price", "Log Price", 'eda_log_price_distribution.png', False),
    ]
    for histcol, histtitle, histxlabel, histfile, clipx in histspecs:
        plt.figure(figsize=(12,6))
        sns.histplot(alldata[histcol], bins=50, kde=True)
        plt.title(histtitle)
        plt.xlabel(histxlabel)
        plt.ylabel("Count of Games")
        if clipx:
            plt.xlim(0, alldata['target_price'].quantile(0.99))
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(histfile)
        plt.close()

    # Bar chart of games per price bucket, with each bar's count written above it.
    plt.figure(figsize=(12,6))
    pcounts = alldata['price_bucket'].value_counts().sort_index()
    ax = sns.barplot(x=pcounts.index, y=pcounts.values)
    plt.title("Price Bucket Distribution")
    plt.xlabel("Range")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    for barpos in range(len(pcounts.values)):
        barcount = pcounts.values[barpos]
        ax.text(barpos, barcount+100, str(barcount), ha='center')
    plt.tight_layout()
    plt.savefig('eda_price_bucket_distribution.png')
    plt.close()

    targetcol = alldata['target_price']
    print("Some price stats:")
    print("Num of games: " + str(len(alldata)))
    print("Median price: " + str(targetcol.median()))
    print("Mean price: " + str(targetcol.mean()))
    print("25th: " + str(targetcol.quantile(0.25)) + "  75th: " + str(targetcol.quantile(0.75)))

    # Correlate only those candidate columns that exist and are numeric.
    maybe_cols = ['reviews_score_fancy','meta_score','revenue_estimated','reviews_total','achievements','hltb_single']
    fewcols = ['target_price'] + [
        cc for cc in maybe_cols
        if cc in alldata.columns and pd.api.types.is_numeric_dtype(alldata[cc])
    ]
    if len(fewcols)>1:
        cmat = alldata[fewcols].corr()['target_price'].sort_values(ascending=False)
        print("some correlation with target price:")
        print(cmat)

3) Basic Exploration
Some price stats:
Num of games: 65109
Median price: 6.99
Mean price: 10.20861278625075
25th: 3.99  75th: 12.99
some correlation with target price:
target_price     1.000000
hltb_single      0.115804
meta_score       0.098041
reviews_total    0.052872
achievements    -0.019015
Name: target_price, dtype: float64


In [None]:
print("4) Feature Engineering")
print("==============================================")

def make_features(dftrain, dftest=None, reference_date=None):
    """Engineer model features for a train frame and, optionally, a test frame.

    Both inputs are copied, never modified. Every data-driven statistic
    (developer counts, quantile thresholds, tier edges, imputation medians)
    is computed on the train copy only and then applied to both copies, and
    every transformation is applied to both copies in the same way.

    Parameters
    ----------
    dftrain : pandas.DataFrame
        Training rows. Feature groups whose source columns are absent are skipped.
    dftest : pandas.DataFrame or None
        Optional rows that receive the same transformations.
    reference_date : datetime.datetime or None
        Date against which game age is measured. Defaults to 2025-04-21.

    Returns
    -------
    tuple
        ``(train_copy, test_copy_or_None, feature_names)`` where
        ``feature_names`` lists the engineered numeric columns that have more
        than one distinct value in the train copy.

    Notes
    -----
    ``is_free_or_cheap`` is derived from the price column itself (the target
    when ``target_price`` is present), so using it as a predictor leaks the
    target. It is kept for backward compatibility.
    """
    print("feature engineering for train size: ", len(dftrain), " test size: ", (len(dftest) if dftest is not None else 0))
    traincopy = dftrain.copy()
    testcopy = dftest.copy() if dftest is not None else None
    # Every transformation loops over this list so train and test stay in sync.
    frames = [traincopy] if testcopy is None else [traincopy, testcopy]
    now = datetime(2025,4,21) if reference_date is None else reference_date

    enged = []

    # ---- release dates -------------------------------------------------
    datecols = [c for c in ['release_date','published_store','published_meta','published_igdb'] if c in traincopy.columns]
    if datecols:
        for df in frames:
            for col in datecols:
                # NOTE(review): values are parsed as unix seconds; anything else becomes NaT.
                df[col] = pd.to_datetime(df[col], errors='coerce', unit='s', origin='unix')
            df['earliest_release_date'] = df[datecols].min(axis=1)

        if traincopy['earliest_release_date'].notna().any():
            for df in frames:
                rawage = (now - df['earliest_release_date']).dt.days
                if df is traincopy and (rawage<0).any():
                    print("fixing " + str((rawage<0).sum()) + " future release age to 0")
                # Future releases get age 0 in BOTH frames, regardless of what train contains.
                df['game_age_days'] = rawage.clip(lower=0)
                df['age_years'] = df['game_age_days']/365.25
                df['age_years_log'] = np.log1p(np.maximum(0, df['age_years']))
                df['release_year'] = df['earliest_release_date'].dt.year
                df['release_month'] = df['earliest_release_date'].dt.month
                df['release_quarter'] = df['earliest_release_date'].dt.quarter
                df['release_is_q4'] = (df['release_quarter']==4).astype(int)
                df['is_new_release'] = (df['game_age_days']<=90).astype(int)
            enged.extend(['game_age_days','age_years','age_years_log','release_year','release_month','release_quarter','release_is_q4','is_new_release'])
        else:
            print("no valid dates found")
    else:
        print("no date col found")

    # ---- review scores -------------------------------------------------
    print("doing reviewscore normalization")
    # Column -> maximum of its scale, used to rescale each score into [0, 1].
    rmap = {
        'reviews_score_fancy':100,
        'meta_score':100,
        'meta_uscore':10,
        'igdb_score':100,
        'igdb_uscore':100,
        'gfq_rating':5
    }
    normed = []
    for col,maxv in rmap.items():
        if col not in traincopy.columns:
            continue
        try:
            ncol = col+"_norm"
            for df in frames:
                if col=='reviews_score_fancy':
                    # This column may carry a trailing '%'.
                    df[col] = pd.to_numeric(df[col].astype(str).str.replace('%','',regex=False).str.strip(), errors='coerce')
                df[ncol] = (df[col]/maxv).clip(0,1)
                if col=='reviews_score_fancy':
                    df['is_highly_rated'] = (df[ncol]>=0.8).astype(int)
            normed.append(ncol)
            enged.append(ncol)
            if col=='reviews_score_fancy':
                enged.append('is_highly_rated')
        except Exception as e:
            # Best effort: one unparsable score column must not abort the run.
            print("issue with " + col + " " + str(e))

    weightsdict = {
        'reviews_score_fancy_norm':0.45,
        'meta_score_norm':0.25,
        'igdb_score_norm':0.2,
        'meta_uscore_norm':0.05,
        'igdb_uscore_norm':0.05
    }
    avw = {k:v for k,v in weightsdict.items() if k in traincopy.columns} if normed else {}
    if avw:
        # Re-normalise the weights over the scores that are actually available.
        ssum = sum(avw.values())
        nweights = {k:(v/ssum) for k,v in avw.items()}
        for df in frames:
            # Missing scores contribute 0 to the weighted sum.
            df['combined_score'] = sum(df[c].fillna(0)*w for c,w in nweights.items()).clip(0,1)
        enged.append('combined_score')

        if len(avw)>1:
            scols = list(avw.keys())
            for df in frames:
                df['score_variance'] = df[scols].var(axis=1)
            # "Controversial" = variance above the train 75th percentile.
            t75 = traincopy['score_variance'].quantile(0.75)
            for df in frames:
                df['is_controversial'] = (df['score_variance']>t75).astype(int)
            enged.extend(['score_variance','is_controversial'])
        print("made combined_score from the available review norms")
    else:
        for df in frames:
            df['combined_score'] = 0.5
        if normed:
            print("no weighted scores, setting combined_score=0.5")
        else:
            print("no normed scores found, setting combined_score=0.5")
        enged.append('combined_score')

    # ---- tags / genres / categories -----------------------------------
    print("Processing tags genres categories")
    tagfields = [f for f in ['tags','genres','categories'] if f in traincopy.columns]

    kp = {
        'has_dlc_keywords':r'DLC|Expansion|Content Pack',
        'has_free_keywords':r'Free to Play|Free|F2P',
        'has_premium_keywords':r'AAA|Premium|Quality',
        'has_indie_keywords':r'Indie|Small|Casual',
        'has_franchise_keywords':r'Franchise|Series|Sequel'
    }
    common_tags = ['Indie','Action','RPG','Strategy','Casual','Adventure','Simulation','Singleplayer','Multiplayer','Early Access','Free to Play','Open World','Puzzle','Story Rich','Shooter','Platformer','Racing']

    binflags = []
    for fld in tagfields:
        try:
            ccount = fld+"_count"
            for df in frames:
                # Fill BEFORE casting: astype(str) would turn NaN into the text 'nan'.
                df[fld] = df[fld].fillna('').astype(str)
                # Count non-empty comma-separated entries, so a missing value counts 0.
                df[ccount] = df[fld].apply(lambda x: sum(1 for tok in x.split(',') if tok.strip()))
            enged.append(ccount)

            if fld=='tags':
                for kkk, pat in kp.items():
                    for df in frames:
                        df[kkk] = df[fld].str.contains(pat, case=False, na=False).astype(int)
                    enged.append(kkk)

            if fld in ['tags','genres']:
                for tg in common_tags:
                    flcol = fld+"_has_"+tg.lower().replace(' ','_')
                    for df in frames:
                        df[flcol] = df[fld].str.contains(tg, case=False, na=False).astype(int)
                    binflags.append(flcol)
                    enged.append(flcol)
        except Exception as e:
            print("issue with " + fld + " " + str(e))

    print("made " + str(len(binflags)) + " flags from tags or genres")

    # ---- compound genre flags -----------------------------------------
    print("making compound features")
    try:
        combos = 0
        # (new flag, the two genre flags that must both be set)
        combofeat = [
            ('is_action_rpg', ['genres_has_rpg','genres_has_action']),
            ('is_indie_casual', ['genres_has_indie','genres_has_casual']),
            ('is_strategy_sim', ['genres_has_strategy','genres_has_simulation']),
        ]
        for nm, needed in combofeat:
            if all(kk in traincopy.columns for kk in needed):
                for df in frames:
                    df[nm] = (df[needed[0]] & df[needed[1]]).astype(int)
                combos+=1
                enged.append(nm)
        print("made " + str(combos) + " compound feats")
    except Exception as e:
        print("error making combos " + str(e))

    # ---- developers / publishers --------------------------------------
    print("processing developer publisher")
    bigcos = [
        'Ubisoft','Electronic Arts','EA','Activision','Blizzard','Square Enix','Capcom','Bethesda','Microsoft','Sony','Nintendo','Take-Two','2K Games','Rockstar','SEGA','Warner Bros','Bandai Namco','CD Projekt','Epic Games','THQ','Paradox','Deep Silver'
    ]
    # Whole-word match: a plain substring test would let 'EA' hit "Team", "Dreams", ...
    aaa_re = re.compile(r'\b(?:' + '|'.join(re.escape(b) for b in bigcos) + r')\b', re.IGNORECASE)
    for fff in [f for f in ['developers','publishers'] if f in traincopy.columns]:
        try:
            for df in frames:
                # Fill before casting so missing names become 'Unknown', not 'nan'.
                df[fff] = df[fff].fillna('Unknown').astype(str)

            # Catalogue size per name, counted on train only; unseen names count as 1.
            valcounts = traincopy[fff].value_counts()
            for df in frames:
                df[fff+'_game_count'] = df[fff].map(valcounts).fillna(1).astype(int)
                df[fff+'_is_prolific'] = (df[fff+'_game_count']>10).astype(int)
                df[fff+'_is_major'] = (df[fff+'_game_count']>50).astype(int)
                df[fff+'_log_games'] = np.log1p(df[fff+'_game_count'])
                df[fff+'_is_aaa'] = df[fff].apply(lambda name: 1 if aaa_re.search(name) else 0)

            enged.extend([fff+'_game_count', fff+'_is_prolific', fff+'_is_major', fff+'_log_games'])
            enged.append(fff+'_is_aaa')
        except Exception as e:
            print("error dev/pub " + str(e))

    # ---- numeric columns ----------------------------------------------
    print("processing numeric features")
    numz = ['reviews_total','achievements','stsp_owners','hltb_single','hltb_complete','revenue_estimated']
    for cc in numz:
        if cc not in traincopy.columns:
            continue
        logc = cc+"_log"
        for df in frames:
            df[cc] = pd.to_numeric(df[cc], errors='coerce')
            df[logc] = np.log1p(np.maximum(0, df[cc]))
        enged.append(logc)

        if cc=='hltb_single':
            binsy = [-1,2,5,10,20,40,float('inf')]
            labsy = ['Very Short (<2h)','Short (2-5h)','Medium (5-10h)','Long (10-20h)','Very Long (20-40h)','Massive (40h+)']
            for df in frames:
                df['playtime_category'] = pd.cut(df[cc], bins=binsy, labels=labsy)

        if cc=='achievements':
            abins = [-1,10,25,50,100,float('inf')]
            alabs = ['Few','Some','Average','Many','Massive']
            for df in frames:
                df['achievement_tier'] = pd.cut(df[cc], bins=abins, labels=alabs)
                df['has_achievements'] = (df[cc]>0).astype(int)
            enged.append('has_achievements')

    # Quartile tiers, computed once (not once per numeric column) from the
    # positive train values. These are categorical helper columns and are not
    # added to the returned feature list.
    labsz = ['Low','Medium-Low','Medium','High']
    for xco in ['revenue_estimated','stsp_owners']:
        if xco not in traincopy.columns:
            continue
        for df in frames:
            df[xco] = pd.to_numeric(df[xco], errors='coerce')
        if traincopy[xco].notna().sum()<=4:
            continue
        nonz = traincopy[xco][traincopy[xco]>0]
        if len(nonz)==0:
            print("not enough non zero in " + xco)
            continue
        qvals = np.unique(np.percentile(nonz,[0,25,50,75,100]))
        if len(qvals)<2:
            # All positive values are identical: no interval can be formed.
            print("not enough distinct values in " + xco)
            continue
        tiercol = xco+"_tier"
        for df in frames:
            df[tiercol] = pd.cut(df[xco], bins=qvals, labels=labsz[:len(qvals)-1], include_lowest=True, duplicates='drop')

    # ---- interaction features -----------------------------------------
    print("creating advanced feats now maybe")
    if all(x in traincopy.columns for x in ['combined_score','reviews_total_log']):
        for df in frames:
            df['score_x_log_reviews'] = df['combined_score']*df['reviews_total_log']
        enged.append('score_x_log_reviews')

    if all(x in traincopy.columns for x in ['combined_score','age_years_log']):
        for df in frames:
            df['score_x_age'] = df['combined_score']*df['age_years_log']
        enged.append('score_x_age')

    if all(x in traincopy.columns for x in ['game_age_days','reviews_total']):
        tmedian = traincopy['game_age_days'][traincopy['game_age_days']>0].median()
        if pd.isna(tmedian) or tmedian<=0:
            tmedian = 365

        def revperday(df, med):
            # Games older than 30 days use their own age; younger or undated
            # games use the train median age. Missing review counts give 0.
            age = df['game_age_days']
            reviews = df['reviews_total']
            return (reviews/age).where(age>30, reviews/med).fillna(0)

        for df in frames:
            df['reviews_per_day'] = revperday(df, tmedian)
            df['reviews_per_day_log'] = np.log1p(df['reviews_per_day'])
            df['recent_popularity'] = df['reviews_per_day'] * np.minimum(1, 365/df['game_age_days'].clip(lower=30))
        enged.extend(['reviews_per_day','reviews_per_day_log'])
        enged.append('recent_popularity')

    # Mean of up to three [0, 1] components: combined score, achievements/100
    # and single-player length scaled by max(40, train 95th percentile).
    qavail = [c for c in ['combined_score','achievements','hltb_single'] if c in traincopy.columns]
    if qavail:
        tmaxp = max(40, traincopy['hltb_single'].quantile(0.95)) if 'hltb_single' in qavail else None
        for df in frames:
            total = 0
            if 'combined_score' in qavail:
                total = total + df['combined_score']
            if 'achievements' in qavail:
                total = total + np.clip(df['achievements']/100, 0,1)
            if 'hltb_single' in qavail:
                total = total + np.clip(df['hltb_single']/tmaxp,0,1)
            df['quality_indicator'] = total/len(qavail)
        enged.append('quality_indicator')
        print("made quality_indicator from " + str(len(qavail)) + " feats")

    # ---- free / cheap flag --------------------------------------------
    print("Free or cheap games feature")
    pcol = None
    for cand in ['target_price','launch_price','steamdb_full_price']:
        if cand in traincopy.columns:
            pcol = cand
            break

    if pcol:
        # NOTE(review): when pcol is 'target_price' this flag is computed from the
        # target itself (leakage). Kept so existing callers see the same column.
        cheapmask = traincopy[pcol]<1
        if cheapmask.any():
            print("There are " + str(cheapmask.sum()) + " free or cheap titles in train data")
        for df in frames:
            # Always create the column, so the name appended below really exists.
            df['is_free_or_cheap'] = (df[pcol]<1).astype(int) if pcol in df.columns else 0

        if 'tags' in traincopy.columns:
            microk = ['Free to Play','Microtransactions','In-App Purchases','F2P']
            for df in frames:
                df['has_microtransactions'] = df['tags'].apply(lambda x: 1 if any(k.lower() in str(x).lower() for k in microk) else 0)
            enged.append('has_microtransactions')
    else:
        print("No price col to check free or cheap")
        for df in frames:
            df['is_free_or_cheap'] = 0
    enged.append('is_free_or_cheap')

    # ---- clean-up: infinities and missing values ----------------------
    tnum = traincopy.select_dtypes(include=np.number).columns
    for df in frames:
        # Each frame is checked on its own; test infinities must not depend on train.
        numcols = df.select_dtypes(include=np.number).columns
        infmask = np.isinf(df[numcols].to_numpy(dtype=float, na_value=np.nan)).any(axis=1)
        if infmask.any():
            if df is traincopy:
                print("replacing " + str(infmask.sum()) + " infinite with nan in train")
            df[numcols] = df[numcols].replace([np.inf, -np.inf], np.nan)

    for cl in tnum:
        # Impute with the train median (0 when the train column is entirely
        # missing); test gaps are filled even if the train column has none.
        tmed = traincopy[cl].median()
        fillv = tmed if pd.notna(tmed) else 0
        for df in frames:
            if cl in df.columns and df[cl].isna().any():
                df[cl] = df[cl].fillna(fillv)

    print("Checking low variance features")
    lowvar = [
        c for c in tnum
        if c not in ['target_price','log_price','price_bucket_id'] and traincopy[c].nunique()<=1
    ]
    if lowvar:
        print("found " + str(len(lowvar)) + " features no variance: " + str(lowvar[:10]))

    print("Done with Feature Engineering. Made " + str(len(enged)) + " feats")
    goodfeats = [f for f in enged if f not in lowvar]
    return traincopy, testcopy, goodfeats

4) Feature Engineering


In [6]:
print("5) Models from Scratch")
print("==================================================")

#ridge reg
def standardize(X):
    """Z-score every column of ``X`` except the first one.

    The first column is passed through unchanged (callers in this notebook
    put the bias column of ones there). A 1-D input is treated as a single
    column.

    Returns
    -------
    (X_std, means, stds)
        ``X_std`` has the same shape as the (2-D) input. ``means`` and
        ``stds`` describe columns 1..end; ``stds`` already includes the
        1e-7 offset that protects against division by zero. When there is
        at most one column, the input is returned as-is together with two
        empty arrays.
    """
    data = np.asarray(X)
    if data.ndim == 1:
        data = data.reshape(-1, 1)
    if data.shape[1] <= 1:
        return data, np.zeros(0), np.ones(0)

    lead, rest = data[:, :1], data[:, 1:]
    means = np.mean(rest, axis=0)
    stds = np.std(rest, axis=0) + 1e-7
    scaled = (rest - means) / stds
    return np.hstack((lead, scaled)), means, stds

def RidgePredict(X, W, means=None, stds=None):
    """Predict with ridge weights ``W`` on the raw feature matrix ``X``.

    A bias column of ones is prepended to ``X``. When both ``means`` and
    ``stds`` are given (the statistics returned by ``RidgeFit``), the
    features are scaled with them; otherwise the features are standardized
    with statistics computed from ``X`` itself.

    ``X`` must be 2-D; ``W`` holds the bias weight followed by one weight per
    feature.
    """
    feats = np.asarray(X)
    n_rows, _ = feats.shape
    design = np.append(np.ones((n_rows, 1)), feats, axis=1)

    if means is None or stds is None:
        # No training statistics supplied: fall back to the batch's own.
        design_std, _, _ = standardize(design)
        return design_std.dot(W)

    scaled = (design[:, 1:] - means) / stds
    return np.hstack((design[:, :1], scaled)).dot(W)

def RidgeUpdate(X, Y, W, lr, lam):
    """Take one gradient-descent step of ridge regression.

    ``X`` is the raw 2-D feature matrix (no bias column); it is given a bias
    column and standardized inside this call. ``W`` holds the bias weight
    followed by one weight per feature. The L2 penalty ``lam`` is not applied
    to the bias weight.

    Returns
    -------
    (W_new, cost)
        The updated weights and the RMSE of ``Y`` against the predictions
        made with ``W_new``. For an empty ``X`` the weights are returned
        unchanged with a cost of ``np.inf``.
    """
    feats = np.asarray(X)
    target = np.asarray(Y)
    weights = np.asarray(W)

    n_rows, _ = feats.shape
    if n_rows == 0:
        return weights, np.inf

    design, _, _ = standardize(np.append(np.ones((n_rows, 1)), feats, axis=1))

    residual = target - design.dot(weights)
    grad_mse = -2 * design.T.dot(residual) / n_rows
    grad_l2 = 2 * lam * weights
    grad_l2[0] = 0  # leave the bias term unregularized

    stepped = weights - lr * (grad_mse + grad_l2)
    rmse = np.sqrt(np.mean((target - design.dot(stepped)) ** 2))
    return stepped, rmse

def RidgeFit(X, Y, lr=0.01, numiter=1000, lam=0.1, verbose=False, pstep=100, early_stop=True, pat=20, tol=1e-4):
    """Fit ridge regression by batch gradient descent.

    Parameters
    ----------
    X : array-like, 2-D
        Raw feature matrix without a bias column.
    Y : array-like
        Targets, one per row of ``X``.
    lr : float
        Initial learning rate.
    numiter : int
        Maximum number of gradient steps.
    lam : float
        L2 penalty strength passed to ``RidgeUpdate``.
    verbose : bool
        Print progress every ``pstep`` iterations and on schedule events.
    pstep : int
        Progress print interval (iterations).
    early_stop : bool
        When True, the learning rate is halved after ``pat`` consecutive
        steps that fail to improve the best cost by more than ``tol``;
        training stops once the rate falls below 1% of ``lr``, and the best
        weights seen are returned if they beat the final ones.
    pat : int
        Patience (steps without improvement) before the rate is halved.
    tol : float
        Minimum cost improvement that counts as progress.

    Returns
    -------
    (W, costhist, mus, sigs)
        Weights (bias first), the per-iteration cost history, and the feature
        means / standard deviations to pass to ``RidgePredict``.

    Notes
    -----
    If the cost becomes NaN or infinite the loop stops and the lowest-cost
    weights seen so far are returned, never the diverged ones. With
    ``numiter=0`` the initial all-zero weights are returned.
    """
    X, Y = np.asarray(X), np.asarray(Y)
    n, e = X.shape
    Xb = np.append(np.ones((n, 1)), X, axis=1)
    # Only the scaling statistics are needed here; RidgeUpdate standardizes
    # X itself on every call.
    _, mus, sigs = standardize(Xb)

    W = np.zeros(e + 1)
    costhist = []
    bestc = np.inf
    bestw = W.copy()
    cost = np.inf  # keeps the final comparison well-defined when numiter == 0
    ps = 0
    lrr = lr
    diverged = False

    for i in range(numiter):
        W, cost = RidgeUpdate(X, Y, W, lrr, lam)
        costhist.append(cost)

        if not np.isfinite(cost):
            # Checked first so a NaN/inf step can never be recorded as "best"
            # or drive the learning-rate schedule.
            print("cost is messed up at i= ", i+1)
            diverged = True
            break

        if early_stop:
            if cost < bestc - tol:
                bestc = cost
                bestw = W.copy()
                ps = 0
            else:
                ps += 1
            if ps >= pat:
                lrr *= 0.5
                ps = 0
                if verbose:
                    print("reducing learning rate now -> ", lrr)
                if lrr < lr*0.01:
                    if verbose:
                        print(" stopping early at i= ", i+1)
                    break
        elif cost < bestc:
            # Without early stopping the final weights are normally returned;
            # the best ones are tracked only as a fallback for divergence.
            bestc = cost
            bestw = W.copy()

        if verbose and (i+1) % pstep == 0:
            print("iter= ", i+1, " cost= ", cost, " lr= ", lrr)

    # `bestc < cost` is False when cost is NaN, so divergence is handled
    # explicitly instead of relying on the comparison.
    if diverged or (early_stop and bestc < cost):
        if verbose:
            print("using best weights cost= ", bestc)
        return bestw, costhist, mus, sigs
    return W, costhist, mus, sigs

#Gradient boosting
def boostIteration(X, y, H, eta, maxleaf=8, minleaf=1, rseed=42, sw=None):
    """Run one boosting round: fit a tree to the residuals and update ``H``.

    A ``DecisionTreeRegressor`` limited to ``maxleaf`` leaf nodes and at
    least ``minleaf`` samples per leaf is fitted to ``y - H`` (optionally
    with per-sample weights ``sw``).

    Returns
    -------
    (H_new, tree)
        ``H + eta * tree.predict(X)`` and the fitted tree.
    """
    X, y, H = np.asarray(X), np.asarray(y), np.asarray(H)
    residual = y - H

    # Only forward sample weights when the caller actually supplied them.
    fit_kwargs = {} if sw is None else {'sample_weight': sw}
    learner = DecisionTreeRegressor(
        max_leaf_nodes=maxleaf,
        min_samples_leaf=minleaf,
        random_state=rseed
    )
    learner.fit(X, residual, **fit_kwargs)

    return H + eta * learner.predict(X), learner

def GBfit(X, y, n=100, lr=0.1, maxleaf=8, minleaf=1, verbose=False, pstep=10, Xv=None, yv=None, esr=10, usew=False):
    """Fit a gradient-boosted ensemble of regression trees.

    Parameters
    ----------
    X, y : array-like
        Training features and targets.
    n : int
        Maximum number of boosting rounds.
    lr : float
        Shrinkage applied to every tree's contribution.
    maxleaf, minleaf : int
        Tree size limits forwarded to ``boostIteration``.
    verbose, pstep :
        Print train (and validation) RMSE every ``pstep`` rounds.
    Xv, yv : array-like, optional
        Validation set. When both are given, validation RMSE is tracked, the
        loop stops after ``esr`` rounds without an improvement larger than
        1e-4, and the returned ensemble is cut back to the best round.
    esr : int
        Early-stopping patience in rounds.
    usew : bool
        When True, each round re-weights samples proportionally to their
        current absolute residual.

    Returns
    -------
    (initp, trees, lr)
        The constant initial prediction (mean of ``y``), the fitted trees and
        the learning rate, i.e. the arguments ``GBpredict`` needs.

    Notes
    -----
    Each round uses ``random_state = seed + i`` where ``seed`` is the
    notebook-level constant.
    """
    X, y = np.asarray(X), np.asarray(y)
    initp = np.mean(y)
    H = np.full(y.shape[0], initp)
    trees = []
    bestval = np.inf
    bestit = 0
    nocount = 0

    uniform = np.ones(y.shape[0]) / y.shape[0] if usew else None
    sw = uniform

    has_val = Xv is not None and yv is not None
    if has_val:
        Xv, yv = np.asarray(Xv), np.asarray(yv)
        # Running validation prediction: adding one tree per round performs
        # the same accumulation as GBpredict without re-evaluating every
        # earlier tree on each iteration.
        Hv = np.full(Xv.shape[0], initp)

    for i in range(n):
        H, t = boostIteration(X, y, H, lr, maxleaf, minleaf, rseed=seed+i, sw=sw)
        trees.append(t)
        trrmse = np.sqrt(np.mean((y-H)**2))

        if usew:
            eabs = np.abs(y-H)
            total = np.sum(eabs)
            # A perfect fit leaves nothing to normalize by; fall back to
            # uniform weights instead of producing NaN weights.
            sw = eabs/total if total > 0 else uniform

        if has_val:
            Hv += lr*t.predict(Xv)
            vrmse = np.sqrt(np.mean((yv - Hv)**2))
            if vrmse < bestval-1e-4:
                bestval = vrmse
                bestit = i
                nocount = 0
            else:
                nocount += 1
            if nocount >= esr:
                if verbose:
                    print("earlystop at iteration= ", i+1, " best iteration= ", bestit+1)
                break
            if verbose and (i+1) % pstep == 0:
                print(" iter= ", i+1, "/", n, " train rmse= ", trrmse, " val rmse= ", vrmse)
        else:
            if verbose and (i+1) % pstep == 0:
                print(" iter= ", i+1, "/", n, " train rmse= ", trrmse)

    if has_val:
        # Keep only the rounds up to the best validation score, whether the
        # loop stopped early or simply ran out of rounds.
        trees = trees[:bestit+1]
    return initp, trees, lr

def GBpredict(X, initp, trees, lr):
    """Predict with a boosted ensemble produced by ``GBfit``.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix, columns in the order used for training.
    initp : number
        Constant initial prediction.
    trees : iterable
        Fitted learners exposing ``predict(X)``.
    lr : float
        Shrinkage applied to each tree's output.

    Returns
    -------
    numpy.ndarray
        ``initp + lr * sum(tree.predict(X))`` as a float array, one value
        per row.
    """
    X = np.asarray(X)
    # Force a float accumulator: with an integer initp np.full would build an
    # int array and the in-place += of float contributions would raise.
    yp = np.full(X.shape[0], initp, dtype=float)
    for t in trees:
        yp += lr*t.predict(X)
    return yp

#feat selection
def select_features(X,y, method='importance', n_features=None, est=None):
    """Select a subset of feature columns.

    Parameters
    ----------
    X : 2-D array
        Feature matrix (must expose ``.shape``).
    y : array-like
        Targets used to fit the selection estimator.
    method : {'importance', 'rfe', 'none'}
        ``'importance'`` uses ``SelectFromModel`` with a median threshold,
        ``'rfe'`` uses recursive feature elimination (20% of the features
        removed per step), ``'none'`` returns ``X`` untouched.
    n_features : int, optional
        Upper bound ('importance') or exact count ('rfe') of features to
        keep. Defaults to 70% of the columns, capped at 100. Integer values
        are clamped to ``[1, n_columns]``.
    est : estimator, optional
        Estimator providing importances; defaults to a depth-5 decision tree
        seeded with the notebook-level ``seed``.

    Returns
    -------
    (X_selected, idx)
        The reduced matrix and the indices of the kept columns.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method=='none':
        return X, np.arange(X.shape[1])
    if method not in ('importance', 'rfe'):
        # Reject bad input before any estimator is built or fitted.
        raise ValueError("unknown method " + str(method))

    n_cols = X.shape[1]
    if n_features is None:
        n_features = min(int(n_cols*0.7), 100)
    if isinstance(n_features, (int, np.integer)):
        # A count of 0 (narrow inputs, e.g. shape[1]//2 on one column) or a
        # count above the column total is not a usable selection size.
        n_features = max(1, min(int(n_features), n_cols))

    if est is None:
        est = DecisionTreeRegressor(max_depth=5, random_state=seed)

    if method=='importance':
        selector = SelectFromModel(est, threshold='median', max_features=n_features)
    else:
        selector = RFE(est, n_features_to_select=n_features, step=0.2)

    Xs = selector.fit_transform(X,y)
    idx = np.where(selector.get_support())[0]
    print("selected " + str(Xs.shape[1]) + " feats using " + method)
    return Xs, idx

5) Models from Scratch


In [7]:
print("6) Training Models")
print("===========================================")

if not alldata.empty and 'target_price' in alldata.columns and alldata['target_price'].notna().all():
    if 'price_bucket_id' not in alldata.columns:
        pbb = [0,4.99,9.99,14.99,19.99,29.99,39.99,59.99,float('inf')]
        alldata['price_bucket_id'] = pd.cut(alldata['target_price'], bins=pbb, labels=False, right=True)

    keepcols = [c for c in alldata.columns if c not in ['target_price','price_bucket','price_bucket_id','log_price','hours_per_dollar','is_high_value','content_price_ratio','content_price_ratio_log','optimal_price_heuristic','recommended_discount_heuristic','price_elasticity']]
    print("Making base feats and targets")

    yreg = alldata['target_price'].copy()
    ylog = alldata['log_price'].copy()
    ybuck = alldata['price_bucket_id'].copy()

    Xbase = alldata[keepcols].copy()

    print("split train test 80 20 with stratify on bucket")
    X_train_base, X_test_base, ytrainreg, ytestreg = train_test_split(
        Xbase, yreg, test_size=0.2, random_state=seed, stratify=alldata['price_bucket_id']
    )
    ytrainlog = ylog.loc[ytrainreg.index]
    ytestlog = ylog.loc[ytestreg.index]
    ytrainbucket = ybuck.loc[ytrainreg.index]
    ytestbucket = ybuck.loc[ytestreg.index]

    print("train size= ", X_train_base.shape[0], " test size= ", X_test_base.shape[0])

    print("feature engineering separately for train test")
    xtraineng, xtesteng, englist = make_features(X_train_base, X_test_base)
    print("got " + str(len(englist)) + " eng feats in total")

    nf = 5
    cv = StratifiedKFold(n_splits=nf, shuffle=True, random_state=seed)
    print("trying " + str(nf) + "-fold strat cv now")

    allresults = {}

    def evalmodel(yt, yp, name, part="test"):
        """Score predictions and record the scores in ``allresults``.

        Computes RMSE, MAE, MAPE (in percent) and R2 for ``yp`` against
        ``yt``, appends each value to ``allresults[name][part][metric]``
        (creating the nested entries on first use) and returns the metrics
        of this call as a dict.
        """
        mm = {
            'RMSE': np.sqrt(mean_squared_error(yt, yp)),
            'MAE': mean_absolute_error(yt, yp),
            'MAPE (%)': mean_absolute_percentage_error(yt, yp)*100,
            'R2': r2_score(yt, yp),
        }
        history = allresults.setdefault(name, {}).setdefault(part, {})
        for metric, value in mm.items():
            history.setdefault(metric, []).append(value)
        return mm

    def priceToBucket(prices):
        """Map prices to integer price-bucket ids.

        Buckets are right-closed intervals between the edges below, numbered
        from 0. Values outside every interval (for example 0 or negative
        prices, since the first interval is open on the left) come back
        as NaN.
        """
        edges = [0, 4.99, 9.99, 14.99, 19.99, 29.99, 39.99, 59.99, float('inf')]
        bucket_ids = pd.cut(prices, bins=edges, labels=False, right=True)
        return bucket_ids

    def bucketAcc(yt, yp, tol=0):
        """Percentage of predictions that land in the right price bucket.

        ``yt`` holds true bucket ids and ``yp`` predicted prices, which are
        bucketed with ``priceToBucket``. With ``tol=0`` the bucket must match
        exactly; otherwise a prediction counts when it is at most ``tol``
        buckets away.
        """
        predicted = priceToBucket(yp)
        hits = (predicted==yt) if tol==0 else (np.abs(predicted-yt)<=tol)
        return np.mean(hits)*100

    print("feature selection approach maybe")
    numfeats = [f for f in englist if pd.api.types.is_numeric_dtype(xtraineng[f])]

    xtrainnum = xtraineng[numfeats].copy()
    xtestnum = xtesteng[numfeats].copy()
    xtrainnum.replace([np.inf, -np.inf], np.nan, inplace=True)
    xtestnum.replace([np.inf, -np.inf], np.nan, inplace=True)
    xtrainnum.fillna(0, inplace=True)
    xtestnum.fillna(0, inplace=True)

    selmethods = ['importance','none']

    # ridge
    print("train ridge models")
    ridgepars = {
        'regular':{
            'lr':0.005,
            'iterations':1000,
            'lambda':0.5,
            'early_stopping':True
        },
        'log':{
            'lr':0.01,
            'iterations':800,
            'lambda':0.1,
            'early_stopping':True
        }
    }
    final_ridge = {}

    for ttype in ['regular','log']:
        tname = 'Price' if ttype=='regular' else 'LogPrice'
        print("training ridge for: ", tname)
        for selm in selmethods:
            mname = "MLR_Ridge_"+tname
            if selm!='none':
                mname+="_"+selm
            pars = ridgepars[ttype]
            print("model= ", mname," params= ", pars)
            foldpreds=[]
            foldy=[]
            foldms = []
            ytrainThis = ytrainreg if ttype=='regular' else ytrainlog
            ytestThis = ytestreg if ttype=='regular' else ytestlog
            for fold,(trix,valix) in enumerate(cv.split(xtrainnum, ytrainbucket)):
                print("fold= ", fold+1, " / ", nf)
                Xftrain, Xfval = xtrainnum.iloc[trix], xtrainnum.iloc[valix]
                yftrain, yfval = ytrainThis.iloc[trix], ytrainThis.iloc[valix]
                if selm!='none':
                    print("doing ", selm, " selection")
                    Xft_sel, idxsel = select_features(Xftrain.values, yftrain.values, method=selm, n_features=min(100, Xftrain.shape[1]//2))
                    Xfv_sel = Xfval.values[:,idxsel]
                    featnames = Xftrain.columns[idxsel].tolist()
                else:
                    Xft_sel = Xftrain.values
                    Xfv_sel = Xfval.values
                    featnames = Xftrain.columns.tolist()

                stime = time.time()
                W, ch, mmu, mss = RidgeFit(
                    Xft_sel, yftrain.values,
                    lr=pars['lr'],
                    numiter=pars['iterations'],
                    lam=pars['lambda'],
                    early_stop=pars['early_stopping'],
                    verbose=True,
                    pstep=200
                )
                ttime = time.time()-stime
                yvpred = RidgePredict(Xfv_sel, W, mmu, mss)
                foldpreds.append(yvpred)
                foldy.append(yfval.values)
                ms = evalmodel(yfval.values, yvpred, mname, part="fold_"+str(fold+1))
                ms['Training Time']=ttime
                foldms.append(ms)
                print("fold metrics= rmse= ", ms['RMSE'], " mae= ", ms['MAE'], " mape= ", ms['MAPE (%)'], " r2= ", ms['R2'])
            avgrmse = np.mean([m['RMSE'] for m in foldms])
            avgmae = np.mean([m['MAE'] for m in foldms])
            avgmape = np.mean([m['MAPE (%)'] for m in foldms])
            avgr2 = np.mean([m['R2'] for m in foldms])
            print("cv average= rmse= ", avgrmse, " mae= ", avgmae, " mape= ", avgmape, " r2= ", avgr2)

            if selm=='none':
                print("train final on full training set no selection")
                if selm!='none':
                    pass
                else:
                    Xtrain_sel = xtrainnum.values
                    Xtest_sel = xtestnum.values
                    final_feats = xtrainnum.columns.tolist()

                stime2 = time.time()
                W, ch, mmu, mss = RidgeFit(
                    Xtrain_sel, ytrainThis.values,
                    lr=pars['lr'],
                    numiter=pars['iterations'],
                    lam=pars['lambda'],
                    early_stop=pars['early_stopping'],
                    verbose=True,
                    pstep=200
                )
                fttime2 = time.time()-stime2
                ytpred = RidgePredict(Xtest_sel, W, mmu, mss)
                testms = evalmodel(ytestThis.values, ytpred, mname, part="test")
                testms['Training Time']=fttime2

                if ttype=='log':
                    ypreorig = np.expm1(ytpred)
                    ytestorig = np.expm1(ytestThis.values)
                    om = {}
                    om['RMSE (orig)'] = np.sqrt(mean_squared_error(ytestorig, ypreorig))
                    om['MAE (orig)'] = mean_absolute_error(ytestorig, ypreorig)
                    om['MAPE (%) (orig)'] = mean_absolute_percentage_error(ytestorig, ypreorig)*100
                    for k,v in om.items():
                        if k not in allresults[mname]["test"]:
                            allresults[mname]["test"][k]=[]
                        allresults[mname]["test"][k].append(v)
                    print("test metrics on original scale= ", om)

                if ttype=='log':
                    pred4b = np.expm1(ytpred)
                else:
                    pred4b = ytpred
                bacc = bucketAcc(ytestbucket.values, pred4b, tol=0)
                bacc1 = bucketAcc(ytestbucket.values, pred4b, tol=1)
                if 'Price Bucket Accuracy' not in allresults[mname]["test"]:
                    allresults[mname]["test"]['Price Bucket Accuracy']=[]
                allresults[mname]["test"]['Price Bucket Accuracy'].append(bacc)
                if 'Price Bucket Accuracy (±1)' not in allresults[mname]["test"]:
                    allresults[mname]["test"]['Price Bucket Accuracy (±1)']=[]
                allresults[mname]["test"]['Price Bucket Accuracy (±1)'].append(bacc1)
                print("bucket acc= ", bacc, " bucket acc ±1= ", bacc1)

                fm = {
                    'weights':W,
                    'means':mmu,
                    'stds':mss,
                    'feature_names':final_feats,
                    'params':pars,
                    'metrics':testms,
                    'is_log_model':(ttype=='log')
                }
                fnam = mname.lower()+"_final.joblib"
                pathf = os.path.join(modelfolder, fnam)
                joblib.dump(fm, pathf)
                print("saved final model= ", pathf)
                final_ridge[mname]=fm

    # GBDT
    print("Training gradient boosting from scratch")
    gpar = {
        'regular':{
            'n_estimators':200,
            'learning_rate':0.05,
            'max_leaves':12,
            'min_samples':20,
            'early_stopping':True,
            'early_stopping_rounds':10,
            'use_sample_weights':True
        },
        'log':{
            'n_estimators':200,
            'learning_rate':0.03,
            'max_leaves':12,
            'min_samples':20,
            'early_stopping':True,
            'early_stopping_rounds':10,
            'use_sample_weights':False
        }
    }
    final_gb = {}

    for ttype in ['regular','log']:
        tname = 'Price' if ttype=='regular' else 'LogPrice'
        print("Training GBDT for= ", tname)
        for selm in selmethods:
            mname = "GBDT_"+tname
            if selm!='none':
                mname+="_"+selm
            pp = gpar[ttype]
            print("model= ", mname, " params= ", pp)
            foldpreds=[]
            foldy=[]
            foldms=[]
            ytrainThis = ytrainreg if ttype=='regular' else ytrainlog
            ytestThis = ytestreg if ttype=='regular' else ytestlog
            for fold,(trix,valix) in enumerate(cv.split(xtrainnum, ytrainbucket)):
                print("fold= ", fold+1, "/", nf)
                Xftrain, Xfval = xtrainnum.iloc[trix], xtrainnum.iloc[valix]
                yftrain, yfval = ytrainThis.iloc[trix], ytrainThis.iloc[valix]
                if selm!='none':
                    print("selection= ", selm)
                    Xft_sel, idxsel = select_features(Xftrain.values, yftrain.values, method=selm, n_features=min(100, Xftrain.shape[1]//2))
                    Xfv_sel = Xfval.values[:,idxsel]
                    fnames = Xftrain.columns[idxsel].tolist()
                else:
                    Xft_sel = Xftrain.values
                    Xfv_sel = Xfval.values
                    fnames = Xftrain.columns.tolist()
                startt = time.time()
                ip, trees, lrate = GBfit(
                    Xft_sel, yftrain.values,
                    n=pp['n_estimators'],
                    lr=pp['learning_rate'],
                    maxleaf=pp['max_leaves'],
                    minleaf=pp['min_samples'],
                    verbose=True,
                    pstep=20,
                    Xv=Xfv_sel,
                    yv=yfval.values,
                    esr=pp['early_stopping_rounds'],
                    usew=pp['use_sample_weights']
                )
                ttt = time.time()-startt
                yvpred = GBpredict(Xfv_sel, ip, trees, lrate)
                foldpreds.append(yvpred)
                foldy.append(yfval.values)
                ms = evalmodel(yfval.values, yvpred, mname, part="fold_"+str(fold+1))
                ms['Training Time']=ttt
                foldms.append(ms)
                print("fold metrics= rmse= ", ms['RMSE'], " mae= ", ms['MAE'], " mape= ", ms['MAPE (%)'], " r2= ", ms['R2'])
            avgrmse = np.mean([m['RMSE'] for m in foldms])
            avgmae = np.mean([m['MAE'] for m in foldms])
            avgmape = np.mean([m['MAPE (%)'] for m in foldms])
            avgr2 = np.mean([m['R2'] for m in foldms])
            print("cv avg= rmse= ", avgrmse, " mae= ", avgmae, " mape= ", avgmape, " r2= ", avgr2)

            if selm=='none':
                print("final training on full data now for ", mname)
                if selm!='none':
                    pass
                else:
                    Xtrain_sel = xtrainnum.values
                    Xtest_sel = xtestnum.values
                    finfnames = xtrainnum.columns.tolist()
                valsize = 0.15
                # Hold out 15% of the training rows for early stopping. The rows
                # are drawn from a generator seeded with the notebook-wide `seed`
                # so the saved final model does not depend on the global NumPy
                # RNG state at the time the cell runs.
                split_rng = np.random.RandomState(seed)
                vix = split_rng.choice(len(Xtrain_sel), int(len(Xtrain_sel)*valsize), replace=False)
                masktr = np.ones(len(Xtrain_sel), dtype=bool)
                masktr[vix]=False
                Xtr_final = Xtrain_sel[masktr]
                ytr_final = ytrainThis.values[masktr]
                Xval_final = Xtrain_sel[~masktr]
                yval_final = ytrainThis.values[~masktr]

                st3 = time.time()
                ip, trees, lrate = GBfit(
                    Xtr_final, ytr_final,
                    n=pp['n_estimators'],
                    lr=pp['learning_rate'],
                    maxleaf=pp['max_leaves'],
                    minleaf=pp['min_samples'],
                    verbose=True,
                    pstep=20,
                    Xv=Xval_final,
                    yv=yval_final,
                    esr=pp['early_stopping_rounds'],
                    usew=pp['use_sample_weights']
                )
                ftdur3 = time.time()-st3
                ytpred = GBpredict(Xtest_sel, ip, trees, lrate)
                testms = evalmodel(ytestThis.values, ytpred, mname, part="test")
                testms['Training Time'] = ftdur3

                if ttype=='log':
                    ypreor = np.expm1(ytpred)
                    ytestor = np.expm1(ytestThis.values)
                    om = {}
                    om['RMSE (orig)']=np.sqrt(mean_squared_error(ytestor, ypreor))
                    om['MAE (orig)']=mean_absolute_error(ytestor, ypreor)
                    om['MAPE (%) (orig)']=mean_absolute_percentage_error(ytestor, ypreor)*100
                    for k,v in om.items():
                        if k not in allresults[mname]["test"]:
                            allresults[mname]["test"][k]=[]
                        allresults[mname]["test"][k].append(v)
                    print("test metrics orig scale= ", om)
                if ttype=='log':
                    p4b = np.expm1(ytpred)
                else:
                    p4b = ytpred
                bacc = bucketAcc(ytestbucket.values, p4b, tol=0)
                bacc1 = bucketAcc(ytestbucket.values, p4b, tol=1)
                if 'Price Bucket Accuracy' not in allresults[mname]["test"]:
                    allresults[mname]["test"]['Price Bucket Accuracy']=[]
                allresults[mname]["test"]['Price Bucket Accuracy'].append(bacc)
                if 'Price Bucket Accuracy (±1)' not in allresults[mname]["test"]:
                    allresults[mname]["test"]['Price Bucket Accuracy (±1)']=[]
                allresults[mname]["test"]['Price Bucket Accuracy (±1)'].append(bacc1)
                print("bucket acc= ", bacc, " plusminus1= ", bacc1)
                fmodel = {
                    'initial_pred':ip,
                    'trees':trees,
                    'learning_rate':lrate,
                    'feature_names':finfnames,
                    'params':pp,
                    'metrics':testms,
                    'is_log_model':(ttype=='log')
                }
                fname = mname.lower()+"_final.joblib"
                ppth = os.path.join(modelfolder, fname)
                joblib.dump(fmodel, ppth)
                print("saved final model= ", ppth)
                final_gb[mname]=fmodel

    print("7) Feature Importance")
    print("====================================================")

    def _price_scale_mae(model_name, model_info):
        """Return a model's test MAE in price units.

        Log-target models report 'MAE' on the log scale, which is not
        comparable with the price-scale MAE of the regular models. Their
        back-transformed 'MAE (orig)' is recorded by the training loops in
        allresults[model_name]['test'], not in the model's own 'metrics'
        dict, so it is looked up there as well.
        """
        metrics = model_info['metrics']
        if not model_info['is_log_model']:
            return metrics['MAE']
        if 'MAE (orig)' in metrics:
            return metrics['MAE (orig)']
        recorded = allresults.get(model_name, {}).get('test', {}).get('MAE (orig)')
        if recorded:
            return recorded[-1]
        # Last resort: keep the model in the ranking, but say the number is
        # not on the same scale as the others.
        print("no original-scale MAE recorded for ", model_name, " using log-scale MAE")
        return metrics['MAE']

    def _pick_best(models):
        """Return (name, mae) of the lowest price-scale MAE; (None, inf) if empty."""
        top_name, top_mae = None, float('inf')
        for mn, mi in models.items():
            mae = _price_scale_mae(mn, mi)
            if mae < top_mae:
                top_mae = mae
                top_name = mn
        return top_name, top_mae

    best_ridge_name, best_ridge_mae = _pick_best(final_ridge)
    best_gb_name, best_gb_mae = _pick_best(final_gb)

    print("best ridge= ", best_ridge_name, " mae= ", best_ridge_mae)
    print("best GBDT= ", best_gb_name, " mae= ", best_gb_mae)

    print("----- ridge feature importance maybe -----")
    if best_ridge_name and best_ridge_name in final_ridge:
        minf = final_ridge[best_ridge_name]
        W = minf['weights']
        fnames = minf['feature_names']
        if len(W)>1 and len(fnames)==len(W)-1:
            coefs = W[1:]
            dfimp = pd.DataFrame({'Feature':fnames, 'Coefficient':coefs})
            dfimp['Abs_Coefficient']=np.abs(dfimp['Coefficient'])
            dfimp.sort_values('Abs_Coefficient', ascending=False, inplace=True)
            print("Top 20 feats by absolute coef:")
            print(dfimp[['Feature','Coefficient']].head(20))

            plt.figure(figsize=(12,10))
            topf = dfimp.head(20).copy()
            topf.sort_values('Coefficient', inplace=True)
            colorz = ['red' if x<0 else 'green' for x in topf['Coefficient']]
            plt.barh(topf['Feature'], topf['Coefficient'], color=colorz)
            plt.title("Top 20 Ridge Features " + best_ridge_name)
            plt.axvline(x=0, color='black', linestyle='-', alpha=0.5)
            plt.grid(True, axis='x')
            plt.tight_layout()
            plt.savefig('feature_importance_ridge_improved.png')
            plt.close()
            print("saved ridge feats png")

            cats = {
                'Age':['age_','is_new_release'],
                'Quality':['combined_score','quality_indicator','is_highly_rated'],
                'Genre':['genres_has_','is_action_rpg'],
                'Publisher':['publisher','developer'],
                'Engagement':['achievements','has_achievements','reviews_'],
                'GameContent':['hltb_','tags_count','genres_count']
            }
            catimp = {}
            for cat, patlist in cats.items():
                cfeats = []
                for pat in patlist:
                    cfeats.extend([f for f in dfimp['Feature'] if pat in f])
                sabs = dfimp[dfimp['Feature'].isin(cfeats)]['Abs_Coefficient'].sum()
                catimp[cat] = sabs
            if catimp:
                plt.figure(figsize=(10,6))
                catdf = pd.DataFrame({'Category':list(catimp.keys()), 'Importance':list(catimp.values())})
                catdf.sort_values('Importance', ascending=False, inplace=True)
                plt.barh(catdf['Category'], catdf['Importance'])
                plt.title("Feature Category Importance Ridge " + best_ridge_name)
                plt.grid(True, axis='x')
                plt.tight_layout()
                plt.savefig('feature_category_importance.png')
                plt.close()
                print("saved cat importance")
                print("by cat:")
                sst = sum(catimp.values())
                for c,v in sorted(catimp.items(), key=lambda x: x[1], reverse=True):
                    print(c,": ", v, " -> ", v/sst*100, "%")
        else:
            print("mismatch in shape")
    else:
        print("no ridge model to do feat importance")

    print("----- GBDT feature importance maybe -----")
    if best_gb_name and best_gb_name in final_gb:
        minf = final_gb[best_gb_name]
        trees = minf['trees']
        fnames = minf['feature_names']
        if trees:
            print("we have " + str(len(trees)) + " trees checking feature_importances")
            timports = []
            for t in trees:
                if hasattr(t,'feature_importances_'):
                    timports.append(t.feature_importances_)
            if timports and all(len(imp)==len(fnames) for imp in timports):
                aimp = np.mean(timports, axis=0)
                dfimp = pd.DataFrame({'Feature':fnames, 'Avg_Importance':aimp})
                dfimp.sort_values('Avg_Importance', ascending=False, inplace=True)
                print("Top 20 feats for GBDT:")
                print(dfimp.head(20))

                plt.figure(figsize=(12,10))
                topf = dfimp.head(20).copy()
                plt.barh(topf['Feature'], topf['Avg_Importance'])
                plt.title("Top 20 DGBT feats " + best_gb_name)
                plt.grid(True, axis='x')
                plt.tight_layout()
                plt.savefig('feature_importance_GBDT_improved.png')
                plt.close()
                print("saved GDBT feats png")

                if len(timports)>1:
                    stdev = np.std(timports, axis=0)
                    dfimp['Importance_StdDev'] = stdev
                    dfimp['Importance_CV'] = stdev/(aimp+1e-10)
                    print("importance stability top10:")
                    stable = dfimp.head(10)
                    print(stable[['Feature','Avg_Importance','Importance_StdDev','Importance_CV']])
            else:
                print("Couldnt do average importances")
        else:
            print("No trees found in best GBDT model?")
    else:
        print("No GBDT model for feat importance")

    print("8) Model Load & Inference example")
    print("======================================")
    best_model_name = best_ridge_name if best_ridge_mae<=best_gb_mae else best_gb_name
    # best_model_name is None when no final model was trained; guard the
    # substring checks so that case is reported by the handler below instead
    # of crashing the cell.
    best_model_type = "Ridge" if (best_model_name is not None and "Ridge" in best_model_name) else "GBDT"
    is_log = best_model_name is not None and "LogPrice" in best_model_name
    print("Best model is= ", best_model_name)

    try:
        if best_model_name is None:
            raise ValueError("no final model was trained")
        modfile = best_model_name.lower()+"_final.joblib"
        mp = os.path.join(modelfolder, modfile)
        loaded_model = joblib.load(mp)
        print("Loaded model from disk")

        def predict_with_model(dfxx, modelinfo, mtype="Ridge"):
            """Predict prices for the rows of ``dfxx`` with a saved model dict.

            ``modelinfo`` is the dict stored by the training section
            ('feature_names', 'is_log_model' and the Ridge or GBDT
            parameters). Features missing from ``dfxx`` are filled with 0.
            The input frame is not modified. Inf/NaN values are replaced by
            0, matching how the training and test matrices were prepared.
            Predictions of log-target models are mapped back with ``expm1``.
            """
            flz = modelinfo['feature_names']
            for f in flz:
                if f not in dfxx.columns:
                    print("missing feature= ", f, " filling with 0")
            # reindex builds a new frame (missing columns -> 0) instead of
            # writing new columns into the caller's DataFrame.
            feats = dfxx.reindex(columns=flz, fill_value=0)
            feats = feats.replace([np.inf, -np.inf], np.nan).fillna(0)
            XX = feats.values
            if mtype=="Ridge":
                W = modelinfo['weights']
                mmu = modelinfo['means']
                mss = modelinfo['stds']
                p = RidgePredict(XX, W, mmu, mss)
            else:
                ipred = modelinfo['initial_pred']
                tr = modelinfo['trees']
                lr = modelinfo['learning_rate']
                p = GBpredict(XX, ipred, tr, lr)
            if modelinfo['is_log_model']:
                p = np.expm1(p)
            return p

        nsample = min(5, len(xtesteng))
        # Seeded draw so the printed example is the same on every run.
        sidx = np.random.RandomState(seed).choice(len(xtesteng), nsample, replace=False)
        sampleX = xtesteng.iloc[sidx]
        if is_log:
            samplT = np.expm1(ytestlog.iloc[sidx].values)
        else:
            samplT = ytestreg.iloc[sidx].values

        preds = predict_with_model(sampleX, loaded_model, best_model_type)

        print("Sample predictions with loaded model:")
        print("Game Title                            ActualPrice   PredPrice     AbsErr")
        print("---------------------------------------------------------------------")
        # The sampled rows are identified by index *labels*, so they must be
        # looked up with .loc; .iloc would treat the labels as positions.
        sample_labels = ytestreg.iloc[sidx].index
        if 'title' in alldata.columns:
            gtitle = alldata.loc[sample_labels, 'title'].values
        else:
            gtitle = ["Game "+str(i+1) for i in range(nsample)]
        for i,(tt,ac,pr) in enumerate(zip(gtitle, samplT, preds)):
            tt = str(tt)  # titles may be missing (NaN) or non-string
            shortt = (tt[:37]+'...') if len(tt)>40 else tt
            print(shortt.ljust(40), str(ac).rjust(12), str(round(pr,2)).rjust(12), str(round(abs(ac-pr),2)).rjust(12))
        smae = mean_absolute_error(samplT, preds)
        smape = mean_absolute_percentage_error(samplT, preds)*100
        print("sample mae= ", smae, " sample mape= ", smape, " %")

        print("in order to do inference with the saved model just do these steps basically:")
        if best_model_type=="Ridge":
            print("1) joblibload the file")
            print("2) get weights, means, stds, feature_names from it")
            print("3) arrange input features in the same columns order")
            print("4) call RidgePredict( X, W, means, stds ) to get predictions")
            if is_log:
                print("5) do np.expm1 on predictions to revert from log scale")
        else:
            print("1) joblibload the file")
            print("2) get initial_pred, trees, learning_rate, feature_names from it")
            print("3) arrange input features similarly")
            print("4) call GBpredict( X, init_pred, trees, lr ) to get predictions")
            if is_log:
                print("5) again do np.expm1 if it's log model")

    except Exception as e:
        # Deliberately best-effort: the demo must not stop the rest of the cell.
        print("couldnt load or use model reason= ", e)

    print("9) Price Elasticity")
    print("======================================================================")

    def est_elasticity(row, leak_free=True):
        """Heuristic price-elasticity estimate for one game.

        ``row`` is any mapping with ``.get`` (a DataFrame row or a dict).
        Starting from -1.5, the estimate is shifted by quality, rating,
        publisher, age, review volume and genre signals, then clamped to
        [-3.0, -0.2]. Missing keys fall back to neutral defaults, and NaN
        values are ignored for numeric inputs and flags alike.

        ``leak_free`` is accepted for interface compatibility and is not used.
        """
        def _flag(key):
            # NaN is truthy in Python; treat it as "flag not set".
            val = row.get(key, 0)
            return bool(pd.notna(val) and val)

        base = -1.5
        ag = row.get('age_years',1)
        rv = row.get('reviews_total',0)
        qi = row.get('quality_indicator',0.5)

        if pd.notna(qi):
            base += qi*0.8
        if _flag('is_highly_rated'):
            base += 0.3
        if _flag('publishers_is_aaa'):
            base += 0.35
        if pd.notna(ag) and ag>0:
            # Older games: up to 0.6 more elastic.
            base -= min(0.6, (ag/3)*0.2)
        if pd.notna(rv) and rv>0:
            # More reviews: up to 0.5 less elastic.
            base += min(0.5, np.log1p(rv)/9)

        # Genre adjustments, applied in this order.
        genre_shifts = (
            ('genres_has_indie', -0.25),
            ('genres_has_casual', -0.3),
            ('genres_has_rpg', 0.25),
            ('genres_has_strategy', 0.15),
            ('is_action_rpg', 0.1),
        )
        for key, shift in genre_shifts:
            if _flag(key):
                base += shift

        return max(-3.0, min(-0.2, base))

    # Score every test-set row; this adds a 'price_elasticity' column to xtesteng in place.
    xtesteng['price_elasticity'] = xtesteng.apply(est_elasticity, axis=1)

    # Histogram of the estimated elasticities, written to disk (figure is closed, not shown inline).
    plt.figure(figsize=(10,6))
    sns.histplot(xtesteng['price_elasticity'], kde=True, bins=30)
    plt.title("Price Elasticity Distribution")
    plt.xlabel("Price Elasticity")
    plt.grid(True)
    plt.savefig('elasticity_distribution_leak_free.png')
    plt.close()
    print("Wrote elasticity distribution figure")

    # Box plot of elasticity split by the publisher flag, only when that column exists.
    if 'publishers_is_aaa' in xtesteng.columns:
        plt.figure(figsize=(10,6))
        sns.boxplot(x='publishers_is_aaa', y='price_elasticity', data=xtesteng)
        plt.title("Elasticity by Publisher Type")
        plt.xlabel("AAA or Not")
        plt.ylabel("Elasticity")
        plt.grid(True)
        plt.savefig('elasticity_by_publisher_leak_free.png')
        plt.close()
        print("Wrote elasticity by Publisher figure")

    print("Making discount recs")
    def rec_discount(row, leak_free=True):
        """Recommend a discount percentage for one game from heuristic rules.

        Sums additive contributions from age, review score, quality, estimated
        price elasticity and review volume, then applies publisher and genre
        adjustments. The total is converted to percent and clamped to [0, 90].

        Args:
            row: Object exposing ``.get(key, default)`` (a DataFrame row as passed
                by ``DataFrame.apply(axis=1)``, or a plain dict). Missing keys fall
                back to the defaults used below.
            leak_free: Accepted for call compatibility; not used by this function.

        Returns:
            The recommended discount in percent, between 0 and 90 inclusive.
        """
        def _flag(name):
            # NaN is truthy in Python, so a missing flag must be rejected
            # explicitly; otherwise it would be counted as "set".
            val = row.get(name, 0)
            return pd.notna(val) and bool(val)

        disc = 0.0
        ag = row.get('age_years', 1)
        sc = row.get('combined_score', 0.5)
        el = row.get('price_elasticity', -1.5)
        rv = row.get('reviews_total', 0)
        ql = row.get('quality_indicator', 0.5)
        has_age = pd.notna(ag)

        # Age: piecewise-linear ramp that is continuous at 0.5, 1 and 2 and
        # saturates at 0.55 for the oldest titles.
        if has_age and ag > 0:
            if ag < 0.5:
                aged = 0.05 * (ag / 0.5)
            elif ag < 1:
                aged = 0.05 + (ag - 0.5) * 0.15
            elif ag < 2:
                aged = 0.125 + (ag - 1) * 0.125
            else:
                aged = 0.25 + min(0.3, (ag - 2) * 0.1)
            disc += aged

        # Review score: lower scores get a larger discount; never subtracts.
        if pd.notna(sc):
            if sc < 0.4:
                scd = 0.15
            elif sc < 0.7:
                scd = 0.15 - (sc - 0.4) * 0.33
            else:
                scd = 0.05 - (sc - 0.7) * 0.15
            if scd > 0:
                disc += scd

        if pd.notna(ql):
            disc += (1 - ql) * 0.15

        # Elasticity magnitude rescaled from [0.2, 3.0] onto [0, 1].
        if pd.notna(el):
            nel = np.clip((abs(el) - 0.2) / 2.8, 0, 1)
            disc += nel * 0.25

        # Review volume: fewer reviews -> larger discount, fading out at log1p(rv) == 8.
        if pd.notna(rv) and rv >= 0:
            lr = np.log1p(rv)
            disc += max(0, 0.2 * (1 - np.clip(lr / 8, 0, 1)))

        # AAA adjustment depends on age; skip it when age is missing instead of
        # comparing None/NaN against a number.
        if _flag('publishers_is_aaa') and has_age:
            if ag < 1:
                disc -= 0.1
            elif ag > 3:
                disc += 0.05

        action_rpg = _flag('is_action_rpg')
        if _flag('genres_has_indie'):
            disc += 0.05
        if _flag('genres_has_casual'):
            disc += 0.05
        if _flag('genres_has_rpg') and not action_rpg:
            disc -= 0.05
        if action_rpg:
            disc -= 0.03
        if _flag('genres_has_simulation'):
            disc -= 0.05

        return min(90, max(0, disc * 100))

    # Score every test-set row; rec_discount reads the 'price_elasticity' column added earlier.
    xtesteng['recommended_discount'] = xtesteng.apply(rec_discount, axis=1)

    # Histogram of the recommended discounts, written to disk (figure is closed, not shown inline).
    plt.figure(figsize=(10,6))
    sns.histplot(xtesteng['recommended_discount'], bins=30, kde=True)
    plt.title("distribution recommended discount maybe")
    plt.xlabel("discount %")
    plt.grid(True)
    plt.savefig('discount_distribution_leak_free.png')
    plt.close()
    print("saved discount distribution figure")

    print("Price optimization with best model predictions")
    # Only run the simulation when the selected model is one of the stored final models.
    if best_model_name in final_ridge or best_model_name in final_gb:
        # Look the model record up in whichever of the two dicts holds it.
        moinfo = final_ridge.get(best_model_name) or final_gb.get(best_model_name)
        # Model family is inferred from the model name.
        mty = "Ridge" if "Ridge" in best_model_name else "GBDT"
        # predict_with_model is defined elsewhere in the notebook; presumably it returns
        # one predicted price per row of xtesteng (it is indexed positionally below) -- TODO confirm.
        basepred = predict_with_model(xtesteng, moinfo, mty)

        def sim_price_change(dff, basep, minf, mtype, pads=None):
            """Simulate average revenue under uniform percentage price changes.

            Every game starts from a notional quantity of 100 units at its base
            price. For each adjustment the quantity response is derived from the
            game's elasticity, with an asymmetric second-order term (stronger for
            price increases, weaker for price cuts).

            Args:
                dff: DataFrame with a 'price_elasticity' column, one row per game.
                basep: Base prices, aligned row-for-row with ``dff``.
                minf: Not used by this function; kept for call compatibility.
                mtype: Not used by this function; kept for call compatibility.
                pads: Iterable of price adjustments in percent. Defaults to
                    [-30, -20, -10, -5, 0, 5, 10, 20, 30].

            Returns:
                DataFrame with one row per adjustment and the columns
                'price_adj_pct', 'avg_price', 'avg_quantity', 'avg_revenue' and
                'avg_rev_change_pct'.
            """
            if pads is None:
                pads = [-30,-20,-10,-5,0,5,10,20,30]
            elas = dff['price_elasticity'].values
            basep = np.asarray(basep, dtype=float)
            baseq = np.full(len(basep), 100)
            res = []
            for ap in pads:
                pch = ap / 100
                newp = basep * (1 + pch)
                # Asymmetric response: amplified for increases, damped for cuts.
                curve = (1 + 0.5 * pch) if pch >= 0 else (1 - 0.3 * pch)
                qch = pch * elas * curve
                newq = np.maximum(0, baseq * (1 + qch))
                newr = newp * newq
                # new_revenue / base_revenue equals (1 + pch) * newq / baseq, so the
                # price cancels out. Using that form avoids 0/0 = NaN for games whose
                # base price is 0, which would otherwise turn the average into NaN.
                revc = ((1 + pch) * (newq / baseq) - 1) * 100
                res.append({
                    'price_adj_pct': ap,
                    'avg_price': np.mean(newp),
                    'avg_quantity': np.mean(newq),
                    'avg_revenue': np.mean(newr),
                    'avg_rev_change_pct': np.mean(revc)
                })
            return pd.DataFrame(res)

        # Price adjustments (in percent) to simulate; passed explicitly instead of using the default grid.
        scen = [-30,-20,-10,-5,0,5,10,15,20,25,30]
        simres = sim_price_change(xtesteng, basepred, moinfo, mty, pads=scen)
        print("sim results for price changes")
        print(simres[['price_adj_pct','avg_price','avg_revenue','avg_rev_change_pct']])

        # Revenue change vs. price change, with dashed reference lines through the origin.
        plt.figure(figsize=(10,6))
        plt.plot(simres['price_adj_pct'], simres['avg_rev_change_pct'], 'o-')
        plt.axhline(y=0, color='r', linestyle='--')
        plt.axvline(x=0, color='r', linestyle='--')
        plt.grid(True)
        plt.title("price change vs revenue change simulation")
        plt.xlabel("price change%")
        plt.ylabel("rev change%")
        plt.tight_layout()
        plt.savefig('price_elasticity_simulation.png')
        plt.close()
        print("saved simulation result figure")

        # Pick the scenario with the highest average revenue change.
        bestidx = simres['avg_rev_change_pct'].idxmax()
        bestrow = simres.loc[bestidx]
        print("optimal strategy maybe is= ", bestrow['price_adj_pct'], " % price adjustment")
        print("estimated revenue impact= ", bestrow['avg_rev_change_pct'], " %")

        print("some sample game recs")
        # Draw up to five distinct row positions from the test set.
        # NOTE(review): no seed is set in this block, so the sample may differ between runs
        # unless the global NumPy RNG was seeded earlier -- confirm.
        ns = min(5, len(xtesteng))
        sids = np.random.choice(len(xtesteng), ns, replace=False)
        sFeatures = xtesteng.iloc[sids]
        spreds = basepred[sids]
        sels = xtesteng.iloc[sids]['price_elasticity'].values
        sdisc = xtesteng.iloc[sids]['recommended_discount'].values

        stitles=[]
        if 'title' in alldata.columns:
            # NOTE(review): index *labels* of ytestreg are passed to positional .iloc here; that only
            # selects the right rows if alldata has a default 0..n-1 index -- verify (.loc may be intended).
            stitles = alldata.iloc[ytestreg.iloc[sids].index]['title'].values
        else:
            stitles = ["Game " + str(i+1) for i in range(ns)]
        for i,(tt,pp,ee,dd) in enumerate(zip(stitles, spreds, sels, sdisc)):
            # Titles longer than 40 characters are cut to 37 plus an ellipsis.
            # NOTE(review): len(tt) assumes every title is a string; a missing (NaN) title would raise -- confirm.
            shortt = (tt[:37]+'...') if len(tt)>40 else tt
            print("\nGame= ", shortt)
            print("pred price= ", round(pp,2), " elasticity= ", round(ee,2), " recommended discount= ", round(dd,1), "%")
            # Map the elasticity estimate onto one of three pricing suggestions.
            if ee>-1.0:
                padj = "+5% to +15%"
                desc = "Premium pricing recommended"
            elif ee>-1.5:
                padj = "-5% to +5%"
                desc = "Maybe keep current price"
            else:
                padj = "-15% to -5%"
                desc = "Try lowering price"
            print("Price adjustment recommendation= ", padj)
            print(desc)
            # Predicted price after applying the recommended discount.
            newsale = pp*(1-dd/100)
            print("Sale price around= ", round(newsale,2))
    else:
        # The selected model name is in neither final_ridge nor final_gb.
        print("No best model for price optimization")

    # Section 10 banner: final comparison of all evaluated models.
    print("10) Conclusion")
    print("===========================================")

    print("comparing final model performance:")
    def format_res(rd):
        """Build a per-model summary table from the 'test' metrics in ``rd``.

        Args:
            rd: Mapping of model name -> mapping of split name -> metrics mapping.
                Models without a 'test' entry are left out.

        Returns:
            DataFrame with one row per model and one column per metric found;
            each cell is ``np.mean`` of the stored metric value.
        """
        # (key in the stored metrics, column label in the summary table)
        metric_columns = (
            ('RMSE', 'RMSE'),
            ('RMSE (orig)', 'RMSE (orig)'),
            ('MAE', 'MAE'),
            ('MAPE (%)', 'MAPE (%)'),
            ('R2', 'R2'),
            ('Price Bucket Accuracy', 'Bucket Acc'),
            ('Price Bucket Accuracy (±1)', 'Bucket Acc (±1)'),
        )
        summary = {}
        for model_name, splits in rd.items():
            if 'test' not in splits:
                continue
            test_metrics = splits['test']
            summary[model_name] = {
                label: np.mean(test_metrics[key])
                for key, label in metric_columns
                if key in test_metrics
            }
        # Models end up as rows after the transpose.
        return pd.DataFrame(summary).T

    # Split the collected results by whether the model name contains 'LogPrice'.
    regmods = {k:v for k,v in allresults.items() if 'LogPrice' not in k}
    lgmods = {k:v for k,v in allresults.items() if 'LogPrice' in k}

    if regmods:
        regdf = format_res(regmods)
        if not regdf.empty:
            print("direct price models:")
            # Lowest MAE first when that column is available.
            if 'MAE' in regdf.columns:
                regdf=regdf.sort_values('MAE')
            print(regdf)
    if lgmods:
        lgdf = format_res(lgmods)
        if not lgdf.empty:
            print("log price models:")
            # Lowest MAE first; fall back to 'RMSE (orig)' when MAE is absent.
            if 'MAE' in lgdf.columns:
                lgdf=lgdf.sort_values('MAE')
            elif 'RMSE (orig)' in lgdf.columns:
                lgdf=lgdf.sort_values('RMSE (orig)')
            print(lgdf)

    # Look up the stored record of the selected model; stays None if it is in neither dict.
    best_info = None
    if best_model_name in final_ridge:
        best_info = final_ridge[best_model_name]
    elif best_model_name in final_gb:
        best_info = final_gb[best_model_name]

    if best_info:
        print("Best model= ", best_model_name)
        print("MAE= ", best_info['metrics']['MAE'])
        # The remaining metrics are optional and only printed when present.
        if 'Price Bucket Accuracy (±1)' in best_info['metrics']:
            pacc = best_info['metrics']['Price Bucket Accuracy (±1)']
            print("price bracket accuracy ±1= ", pacc)
        if best_info['is_log_model'] and 'RMSE (orig)' in best_info['metrics']:
            print("RMSE on original scale= ", best_info['metrics']['RMSE (orig)'])

    print("done")
else:
    # Branch of the enclosing top-level `if` (its condition is outside this excerpt).
    print("empty data")

6) Training Models
Making base feats and targets
split train test 80 20 with stratify on bucket
train size=  52087  test size=  13022
feature engineering separately for train test
feature engineering for train size:  52087  test size:  13022
no valid dates found
doing reviewscore normalization
made combined_score from the available review norms
Processing tags genres categories
made 34 flags from tags or genres
making compound features
made 3 compound feats
processing developer publisher stuff
processing numeric features
creating advanced feats now maybe
made quality_indicator from 3 feats
Free or cheap games feature
There are 4125 free or cheap titles in train data
Checking low variance features
found 14 features no variance: ['reviews_d7', 'reviews_d30', 'reviews_d90', 'name_slug', 'revenue_estimated', 'has_dlc_keywords', 'has_premium_keywords', 'genres_has_singleplayer', 'genres_has_open_world', 'genres_has_puzzle']
Done with Feature Engineering. Made 76 feats
got 67 eng feats in to