Importing important libraries

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import math
import os

from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MultiLabelBinarizer, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest, f_regression

import warnings
warnings.filterwarnings('ignore')

import ast
from collections import Counter
from wordcloud import WordCloud
# from sentence_transformers import SentenceTransformer

from numpy import unique
from scipy.stats import skew
import pickle
import json
import joblib

Load Testing Data
- send path of csv files of testing data as parameter 
- Use input() to enter the path of each CSV file if it is not in the same folder as this notebook; if it is, just enter the file name.
- use GUI and make button for upload data 

Question: Do I really need to know the name of each CSV file, or can I detect which file is which from its columns and merge them correctly?

In [7]:
# Each raw CSV export is declared next to the DataFrame that is loaded from it.
DEMOS_PATH = '../../data/raw/demos.csv'
demo_df = pd.read_csv(DEMOS_PATH)

DLCS_PATH = '../../data/raw/dlcs.csv'
dlc_df = pd.read_csv(DLCS_PATH)

# low_memory=False makes pandas read this file in a single pass instead of
# inferring column types chunk by chunk.
BASE_GAMES_PATH = '../../data/raw/info_base_games.csv'
info_base_games_df = pd.read_csv(BASE_GAMES_PATH, low_memory=False)

GAMALYTIC_PATH = '../../data/raw/gamalytic_steam_games.csv'
gamalytic_steam_games_df = pd.read_csv(GAMALYTIC_PATH)

Preprocess DLC , DEMO CSV 

yeah I can drop nulls here BUT

I CAN'T DROP ANY NULLS in GAMALYTIC CSV FILE

In [8]:
# Preview the first rows of the raw DLC table before any preprocessing.
dlc_df.head()

Unnamed: 0,base_appid,dlc_appid,name
0,1786750,2568660,家出王女 - 全年齢版ストーリー&グラフィック追加 DLC
1,1981700,2563730,Jacob's Quest - Voyage
2,2009450,2552980,Invector: Rhythm Galaxy - Latin Power Song Pack
3,1133420,2550750,Hero or Villain: Genesis — Supercharged!
4,2533950,2551000,Hot And Lovely ：Uniform - adult patch


In [9]:
def preprocess_dlc_demo(demo_df,dlc_df):
    """Clean the demo and DLC tables and build per-game indicator features.

    Both input frames are cleaned in place: the auto-increment column is
    removed, the base-game id column is renamed to ``appid``, ids are coerced
    to integers (rows with an unparsable id are dropped), duplicates are
    removed and the ``name`` column is normalised.

    The function can be called again on frames it has already cleaned
    (for example when the notebook cell is re-run) without raising.

    Parameters
    ----------
    demo_df : pandas.DataFrame
        Demo table with the columns ``full_game_appid`` (or ``appid``),
        ``demo_appid`` and ``name``.
    dlc_df : pandas.DataFrame
        DLC table with the columns ``base_appid`` (or ``appid``),
        ``dlc_appid`` and ``name``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(demo_indicators, dlc_indicators)``; one row per ``appid`` with the
        columns ``has_demo``/``demo_count`` and ``has_dlc``/``dlc_count``.
    """
    # Drop the auto-incremented column when it exists. errors="ignore" keeps
    # the call safe on a re-run (column already gone) and on frames that never
    # had the column.
    for df in (demo_df, dlc_df):
        df.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

    # Standardize column names (a no-op when the column is already "appid")
    demo_df.rename(columns={"full_game_appid": "appid"}, inplace=True)
    dlc_df.rename(columns={"base_appid": "appid"}, inplace=True)

    # Convert data types
    for df, appid_col in [(demo_df, "demo_appid"), (dlc_df, "dlc_appid")]:
        # Unparsable ids become NaN so they can be dropped just below
        df[appid_col] = pd.to_numeric(df[appid_col], errors="coerce")
        df["appid"] = pd.to_numeric(df["appid"], errors="coerce")

        # Drop rows where appid or demo/dlc_appid is missing/NaN
        df.dropna(subset=[appid_col, "appid"], inplace=True)

        df[appid_col] = df[appid_col].astype(int)
        df["appid"] = df["appid"].astype(int)
        df["name"] = df["name"].astype(str)

    # Remove duplicates by unique identifier
    demo_df.drop_duplicates(subset=["demo_appid"], inplace=True)
    dlc_df.drop_duplicates(subset=["dlc_appid"], inplace=True)

    def clean_name_column(df, column_name):
        """Strip the words 'demo'/'dlc', collapse whitespace and title-case the names."""
        df[column_name] = (df[column_name]
            .str.replace(r"(?i)\bdemo\b", "", regex=True)
            .str.replace(r"(?i)\bdlc\b", "", regex=True)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
            .str.title())
        return df

    demo_df = clean_name_column(demo_df, "name")
    dlc_df = clean_name_column(dlc_df, "name")

    # Renumber the rows after the drops above
    demo_df.reset_index(drop=True, inplace=True)
    dlc_df.reset_index(drop=True, inplace=True)

    def create_indicator_features(df, group_by_col, count_col, feature_prefix):
        """Return one row per game with has_<prefix> (always 1) and <prefix>_count."""
        # Number of demo/DLC rows for each game
        indicators = df.groupby(group_by_col)[count_col].count().reset_index()

        # Every game present in this table has at least one demo/DLC
        indicators[f'has_{feature_prefix}'] = 1
        indicators[f'{feature_prefix}_count'] = indicators[count_col]

        # e.g. columns: appid, has_demo, demo_count
        return indicators[[group_by_col, f'has_{feature_prefix}', f'{feature_prefix}_count']]

    demo_indicators = create_indicator_features(demo_df, 'appid', 'demo_appid', 'demo')
    dlc_indicators = create_indicator_features(dlc_df, 'appid', 'dlc_appid', 'dlc')
    return demo_indicators , dlc_indicators



    

In [10]:
# Apply preprocessing to the DLC and demo tables.
# Note: preprocess_dlc_demo also modifies demo_df and dlc_df in place, in
# addition to returning the two indicator tables.
demo_indicators , dlc_indicators = preprocess_dlc_demo(demo_df,dlc_df)

In [11]:
# Inspect the per-game DLC indicator table (appid, has_dlc, dlc_count).
dlc_indicators

Unnamed: 0,appid,has_dlc,dlc_count
0,380,1,1
1,420,1,1
2,2810,1,1
3,4700,1,1
4,4850,1,1
...,...,...,...
5439,3433030,1,1
5440,3445550,1,1
5441,3452270,1,1
5442,3454830,1,1


In [12]:
# Drop corrupted rows whose 'appid' is not a number, e.g. the repeated header
# row that contains the column names (it was previously removed by its
# hard-coded position, index 9929). Filtering by content works for any input
# file and is safe to re-run; .copy() gives an independent frame so later
# column assignments do not act on a slice.
info_base_games_df = info_base_games_df[
    pd.to_numeric(info_base_games_df['appid'], errors='coerce').notna()
].copy()

# Combine the Gamalytic table, the base-game info and the demo/DLC indicators
def merge_all_csvs():
    """Left-join every table onto the Gamalytic games and return the result.

    Reads the notebook-level frames gamalytic_steam_games_df,
    info_base_games_df, demo_indicators and dlc_indicators. As a side effect
    the Gamalytic 'steamId' column is renamed to 'appid' and the 'appid'
    column of both the info and Gamalytic frames is cast to int, in place.

    Returns
    -------
    pandas.DataFrame
        One row per Gamalytic row, with the matching columns of the other
        tables attached (missing where a game has no match).
    """
    join_key = "appid"
    gamalytic_steam_games_df.rename(columns={"steamId": join_key}, inplace=True)

    # The join key must have the same dtype on both sides of the merge
    for frame in (info_base_games_df, gamalytic_steam_games_df):
        frame[join_key] = frame[join_key].astype(int)

    # Gamalytic is the left table throughout, so none of its rows are lost
    return (
        gamalytic_steam_games_df
        .merge(info_base_games_df, on=join_key, how="left")
        .merge(demo_indicators, on=join_key, how="left")
        .merge(dlc_indicators, on=join_key, how="left")
    )
 
    

In [13]:
# Build the single working frame that every later cell transforms.
final_df = merge_all_csvs()

In [14]:
# Inspect the merged frame (pandas truncates the rendered rows).
final_df

Unnamed: 0,appid,price,copiesSold,publisherClass,reviewScore,aiContent,name,metacritic,steam_achievements,steam_trading_cards,workshop_support,genres,achievements_total,release_date,supported_platforms,has_demo,demo_count,has_dlc,dlc_count
0,730,0.00,302158048,AAA,87,,Counter-Strike 2,,False,True,True,"Action, Free To Play",1.0,"Aug 21, 2012","['windows', 'linux']",,,,
1,570,0.00,212896574,AAA,82,,Dota 2,90.0,False,True,True,"Action, Strategy, Free To Play",,"Jul 9, 2013","['windows', 'mac', 'linux']",,,,
2,578080,0.00,161971233,AAA,59,,PUBG: BATTLEGROUNDS,,False,False,False,"Action, Adventure, Massively Multiplayer, Free...",37.0,"Dec 21, 2017",['windows'],,,,
3,440,0.00,99060457,AAA,90,,Team Fortress 2,92.0,True,True,True,"Action, Free To Play",520.0,"Oct 10, 2007","['windows', 'linux']",,,,
4,1172470,0.00,67554185,AAA,67,,Apex Legends™,88.0,True,True,False,"Action, Adventure, Free To Play",12.0,"Nov 4, 2020",['windows'],,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94852,582110,7.99,0,Hobbyist,0,,,,,,,,,,,,,,
94853,514510,9.99,0,Indie,0,,,,,,,,,,,,,,
94854,497380,14.99,0,Indie,0,,,,,,,,,,,,,,
94855,456430,1.99,0,Indie,0,,,,,,,,,,,,,,


In [20]:
# Only the last expression of a cell is rendered, so of the statements below
# just the dtype of 'aiContent' is shown in the output.
final_df
final_df.columns # every column listed here needs its null values handled

final_df['appid'].dtype
final_df['aiContent'].dtype

dtype('float64')

Preprocessing features of infobase and gamalytic 

1. For each feature here , I need to handle nulls and outliers 

2. Feature Engineering (Create new features)

3. Feature Transformation (Scaling , Standardization , log transform , encoding)

We should handle nulls before encoding, because the encoders raise errors on null values.


# Handling Nulls

In [None]:
# If a game has no Steam achievements and its total is missing, the total is 0.
no_achievements = (final_df['steam_achievements'] == False) & final_df['achievements_total'].isna()
final_df.loc[no_achievements, 'achievements_total'] = 0

# Every remaining null is filled with the per-column default loaded from
# default_values.json (presumably computed on the training data, e.g. the
# median for achievements_total -- confirm against the training notebook).
with open('Joblib_JSON_files/default_values.json', 'r') as f:
    default_values = json.load(f)

for column, value in default_values.items():
    # 'DROP' looks like a marker for columns that are removed later rather
    # than imputed (presumably -- verify); do not write it into the data.
    if value == 'DROP' or column not in final_df.columns:
        continue
    # Assign the result back: fillna(..., inplace=True) on a column selection
    # is chained assignment and does not update final_df under pandas
    # Copy-on-Write.
    final_df[column] = final_df[column].fillna(value)

# has_demo, demo_count, has_dlc, dlc_count: a game missing from the demo/DLC
# tables has none, so the left-join nulls become 0.
for indicator in ("has_demo", "demo_count", "has_dlc", "dlc_count"):
    final_df[indicator] = final_df[indicator].fillna(0).astype(int)

{'Unnamed: 0': 'DROP',
 'full_game_appid': 0,
 'demo_appid': 0,
 'name': '#NAME?',
 'base_appid': 0,
 'dlc_appid': 0,
 'appid': 0,
 'metacritic': 73,
 'steam_achievements': False,
 'steam_trading_cards': False,
 'workshop_support': False,
 'genres': 'Action, Adventure, Indie',
 'achievements_total': 18.0,
 'release_date': 'Coming soon',
 'supported_platforms': "['windows']",
 'steamId': 3596260.0,
 'price': 0.0,
 'publisherClass': 'Hobbyist',
 'reviewScore': 80.0,
 'aiContent': 'DROP'}

# Handling Outliers , Negative Values

In [None]:
# Bounds ('low', 'high') and replacement value ('median') for reviewScore
with open('Joblib_JSON_files/reviewScore_outliers.json', 'r') as f:
    review_score_data = json.load(f)
# No outlier handling for: name, steam_achievements, steam_trading_cards,
# workshop_support, genres, release_date, supported_platforms,
# publisherClass, has_demo, has_dlc

# achievements_total is only coerced to numeric in a later cell of this
# notebook, so it may still hold non-numeric values here; coerce it before
# any numeric comparison (unparsable entries become NaN).
final_df['achievements_total'] = pd.to_numeric(final_df['achievements_total'], errors='coerce')

# metacritic, achievements_total, price: clamp into [0, 100]
for column in ('metacritic', 'achievements_total', 'price'):
    final_df[column] = final_df[column].clip(lower=0, upper=100)

# reviewScore: values outside [low, high] are replaced by the stored median
review_score = final_df['reviewScore']
review_score_is_outlier = (
    (review_score < review_score_data['low']) | (review_score > review_score_data['high'])
)
final_df.loc[review_score_is_outlier, 'reviewScore'] = review_score_data['median']

# demo_count, dlc_count: clamp into [0, 100]
for column in ('demo_count', 'dlc_count'):
    final_df[column] = final_df[column].clip(lower=0, upper=100)

In [None]:
# Make achievements_total numeric; entries that cannot be parsed become null
final_df['achievements_total'] = pd.to_numeric(final_df['achievements_total'], errors='coerce')

# A positive achievement total contradicts steam_achievements == False,
# so the flag is switched to True for those rows
flag_contradicts_total = (final_df['steam_achievements'] == False) & (final_df['achievements_total'] > 0)
final_df.loc[flag_contradicts_total, 'steam_achievements'] = True



# Features Transformation (Transformers for features) / (Scalers - Encoders)

In [None]:
def _scale_column(scaler, values):
    """Apply a fitted single-feature scaler to one column and return a flat array.

    The column is reshaped to (n_samples, 1) because scikit-learn transformers
    reject 1-D input, and the result is flattened so it can be assigned back
    to a DataFrame column.
    NOTE(review): assumes each saved scaler is a scikit-learn transformer fit
    on exactly one feature -- confirm against the training notebook.
    """
    as_column = np.asarray(values).reshape(-1, 1)
    return np.asarray(scaler.transform(as_column)).ravel()


# metacritic
# Load the fitted transformer and use it directly on the test data (do NOT
# refit): the transformation must be the one learned on the training data.
metacritic_scaler = joblib.load('Joblib_JSON_files/metacritic_scaler.joblib')
final_df['metacritic'] = _scale_column(metacritic_scaler, final_df['metacritic'])

# steam_achievements, steam_trading_cards, workshop_support: booleans -> 0/1
final_df['steam_achievements'] = final_df['steam_achievements'].astype(int)
final_df['steam_trading_cards'] = final_df['steam_trading_cards'].astype(int)
final_df['workshop_support'] = final_df['workshop_support'].astype(int)


# genres: "A, B, C" -> list of genres
final_df['genres'] = final_df['genres'].fillna('')
final_df['genres'] = final_df['genres'].apply(lambda x: x.split(', ') if x else [])
with open('Joblib_JSON_files/non_game_genres.json', 'r') as f:
    non_game_genres = json.load(f)
def replace_non_game_genres(genres_list):
    """Replace every genre listed in non_game_genres with 'Other'."""
    return [
        genre if genre not in non_game_genres else 'Other'
        for genre in genres_list
    ]
final_df['genres'] = final_df['genres'].apply(replace_non_game_genres)
mlb_genres = joblib.load('Joblib_JSON_files/mlb_genres.joblib')
known_genres = set(mlb_genres.classes_)
def filter_unseen_genres(genres_list):
    """Keep only the genres the fitted binarizer knows about."""
    return [genre for genre in genres_list if genre in known_genres]

final_df['genres'] = final_df['genres'].apply(filter_unseen_genres)
val_genres_encoded = mlb_genres.transform(final_df['genres'])
val_genres_df = pd.DataFrame(val_genres_encoded, columns=[f'genre_{c}' for c in mlb_genres.classes_], index=final_df.index)
final_df = pd.concat([final_df, val_genres_df], axis=1)


# achievements_total
achievements_total_scaler = joblib.load('Joblib_JSON_files/achievements_total_scaler.joblib')
final_df['achievements_total'] = _scale_column(achievements_total_scaler, final_df['achievements_total'])


# supported_platforms: the column holds list literals as text, e.g. "['windows', 'mac']"
final_df['supported_platforms'] = final_df['supported_platforms'].apply(ast.literal_eval)
# Load the fitted multi-label binarizer and encode the platform lists
mlb_platforms = joblib.load('Joblib_JSON_files/mlb_platforms.joblib')
platforms_encoded = mlb_platforms.transform(final_df['supported_platforms'])
val_platforms_df = pd.DataFrame(platforms_encoded, columns=[f'platform_{c}' for c in mlb_platforms.classes_], index=final_df.index)
final_df = pd.concat([final_df, val_platforms_df], axis=1)


# price, reviewScore
price_reviews_scaler = joblib.load('Joblib_JSON_files/price_reviews_scaler.joblib')
final_df['price'] = np.log1p(final_df['price'])
# Scale with the scaler loaded for these two columns (the achievements_total
# scaler was being applied here, which uses the wrong fitted statistics).
# NOTE(review): assumes price_reviews_scaler was fit on the two columns
# [log1p(price), reviewScore] in this order -- confirm against the training notebook.
final_df[['price', 'reviewScore']] = price_reviews_scaler.transform(
    final_df[['price', 'reviewScore']].to_numpy()
)

# publisherClass: AA and AAA are merged into a single 'Other' class
mapping = {'AA': 'Other', 'AAA': 'Other'}
final_df['publisherClass'] = final_df['publisherClass'].replace(mapping)
# Load the fitted one-hot encoder
ohe_publisher = joblib.load('Joblib_JSON_files/ohe_publisher.joblib')
encoded_publisher = ohe_publisher.transform(final_df[['publisherClass']])
# OneHotEncoder may return a SciPy sparse matrix; make it dense before
# wrapping it in a DataFrame.
if hasattr(encoded_publisher, 'toarray'):
    encoded_publisher = encoded_publisher.toarray()
encoded_publisher_df = pd.DataFrame(encoded_publisher, columns=ohe_publisher.get_feature_names_out(['publisherClass']), index=final_df.index)
final_df = pd.concat([final_df, encoded_publisher_df], axis=1)



# Feature Engineering (Generating new features)

In [None]:
# name
# Roman numerals i..x and single digits 2..9 appearing as separate words are
# treated as sequel markers (both are applied to lower-cased names).
roman_re = re.compile(r'\b(?:i{1,3}|iv|v|vi|vii|viii|ix|x)\b')
digit_re = re.compile(r'\b[2-9]\b')
keywords = ['vr', 'remaster', 'collector', 'collection', 'edition', 'bundle', 'playtest']

def cap_ratio(s):
    """Return the share of characters in ``s`` that are upper-case.

    Parameters
    ----------
    s : str
        A game name. Missing or non-string values (None, NaN) and the empty
        string are accepted.

    Returns
    -------
    float or int
        ``upper-case characters / len(s)``; 0 when ``s`` is empty or not a
        string.
    """
    # NaN is a truthy float, so a plain `not s` check lets it through and the
    # character loop then fails; test the type explicitly.
    if not isinstance(s, str) or not s:
        return 0
    upper_count = sum(1 for ch in s if ch.isupper())
    return upper_count / len(s)


# Remove punctuation first, so the length/word/caps features are computed on clean text
names_no_punct = final_df['name'].str.replace(r'[^\w\s]', '', regex=True)
final_df['name'] = names_no_punct

# Size of the name: characters and whitespace-separated words
final_df['name_len'] = names_no_punct.str.len()
final_df['name_words'] = names_no_punct.str.split().str.len()

# Share of upper-case characters (must be taken before lower-casing)
final_df['name_cap_ratio'] = names_no_punct.apply(cap_ratio)

# From here on the names are lower-case
names_lower = names_no_punct.str.lower()
final_df['name'] = names_lower

# A roman numeral or a single digit 2-9 as its own word marks a sequel
has_roman_numeral = names_lower.str.contains(roman_re)
has_sequel_digit = names_lower.str.contains(digit_re)
final_df['is_sequel'] = (has_roman_numeral | has_sequel_digit).astype(int)

# One 0/1 flag per keyword that appears as a whole word in the name
for kw in keywords:
    final_df[f'name_has_{kw}'] = final_df['name'].str.contains(fr'\b{kw}\b').astype(int)



# Scale all name-derived features with the scaler loaded from disk
scale_cols = ['name_len', 'name_words', 'name_cap_ratio', 'is_sequel']
scale_cols = scale_cols + [f'name_has_{kw}' for kw in keywords]

name_is_sequel_scaler = joblib.load('Joblib_JSON_files/name_is_sequel_scaler.joblib')
final_df[scale_cols] = name_is_sequel_scaler.transform(final_df[scale_cols])
#--------------------------------------------------------------------------------------
# metacritic
# Flag rows that have a metacritic value.
# NOTE(review): earlier cells of this notebook fill metacritic nulls from
# default_values.json, so this flag is expected to be 1 for every row by the
# time it is computed here -- confirm whether the training pipeline derived it
# before imputation.
final_df['has_metacritic'] = final_df['metacritic'].notna().astype(int)

# steam_achievements: no new feature (the mode of the training data, 0 or 1, is saved in a file)

# steam_trading_cards: no new feature

# workshop_support: no new feature

# genres: no new feature

# achievements_total: no new feature

# release_date
final_df['release_date'] = final_df['release_date'].astype(str).str.strip().str.lower() # normalise to text, trim leading/trailing spaces and lower-case

YEAR_DATA_COLLECTED = 2024
# Release-date texts that do not pin down an exact date.
# ^ anchors the start of the string, $ the end.
unknown_release_dates = [
    r'^coming soon$',
    r'^to be announced$',
    r'^\s*$',  # empty string
    r'^q[1-4][^a-zA-Z0-9]+\d{4}$',  # quarter and year, e.g. q2 2025
    r'\d{4}$',  # year only
    r'^\d{1,2}[^a-zA-Z0-9]+[A-Za-z]{3}$'  # day and month, no year
]
def has_known_release(x):
    """Return 0 when the release-date text is vague or missing, else 1.

    The value is converted to text, trimmed and lower-cased, then tested
    against each pattern in ``unknown_release_dates`` from the start of the
    string.
    """
    text = str(x).strip().lower()
    is_unknown = any(re.match(pattern, text) for pattern in unknown_release_dates)
    return 0 if is_unknown else 1
# 1 when the release-date text gives a concrete date, 0 when it is vague or missing
final_df['is_release_date_known'] = final_df['release_date'].apply(has_known_release)

# Open question: should "Coming soon" really be flagged as an upcoming game?
# (It currently is, through the final fallback of the function below.)
# Regex legend: [^a-zA-Z0-9]+ consumes a run of non-alphanumeric separator characters.
def preprocess_release_date(x):
    """Normalise a release-date text to "D Mon YYYY" and flag upcoming games.

    The formats are tried in order: day-month-year, month-day-year,
    month-year, day-month, quarter-year, year only. Anything else (such as
    "Coming soon") falls back to a fixed future date.

    Returns
    -------
    tuple
        ``(date_text, is_upcoming)`` where ``is_upcoming`` is 1 when the year
        is later than ``YEAR_DATA_COLLECTED`` (or the date is unknown), else 0.
    """
    text = str(x).strip()

    def _is_upcoming(year_text):
        # 1 when the year lies after the data-collection year
        return 1 if int(year_text) > YEAR_DATA_COLLECTED else 0

    # 8-Dec-2022 or 8 Dec 2022
    day_month_year = re.search(r'(\d{1,2})[^a-zA-Z0-9]+([A-Za-z]{3})[^a-zA-Z0-9]+(\d{2,4})', text)
    if day_month_year:
        day, month, year = day_month_year.groups()
        return f"{day} {month} {year}", _is_upcoming(year)

    # Dec-8-2022 or Dec 8, 2022
    month_day_year = re.search(r'([A-Za-z]{3})[^a-zA-Z0-9]+(\d{1,2})[^a-zA-Z0-9]+(\d{4})', text)
    if month_day_year:
        month, day, year = month_day_year.groups()
        return f"{day} {month} {year}", _is_upcoming(year)

    # Dec-2022 or Dec 2022: the day defaults to the middle of the month
    month_year = re.search(r'([A-Za-z]{3})[^a-zA-Z0-9]+(\d{2,4})', text)
    if month_year:
        month, year = month_year.groups()
        return f"15 {month} {year}", _is_upcoming(year)

    # 8-Dec (no year): replaced by a fixed placeholder date
    day_month = re.search(r'(\d{1,2})[^a-zA-Z0-9]+([A-Za-z]{3})', text)
    if day_month:
        return "15 NOV 2020", 0

    # q1 2023, q2-2024: the date defaults to the middle of the quarter
    quarter_year = re.search(r'(q[1-4])[^a-zA-Z0-9]+(\d{4})', text, re.IGNORECASE)
    if quarter_year:
        quarter, year = quarter_year.groups()
        middle_of_quarter = {
            'q1': '15 Feb',
            'q2': '15 May',
            'q3': '15 Aug',
            'q4': '15 Nov',
        }[quarter.lower()]
        return f"{middle_of_quarter} {year}", _is_upcoming(year)

    # 2023 (year only)
    if re.fullmatch(r'\d{4}', text):
        if YEAR_DATA_COLLECTED < int(text):
            return f"1 Jun {text}", 1
        return f"1 JAN {text}", 0

    # "Coming soon", "To be announced", ... etc.
    return "1 JUN 2026", 1

# Normalise every release-date text and split the (date_text, is_upcoming)
# pairs into two columns. The pairs are unpacked directly instead of through
# .apply(pd.Series), which builds one Series per row (slow on large frames)
# and yields no columns at all on an empty frame.
parsed_release_dates = [preprocess_release_date(value) for value in final_df['release_date']]
final_df['release_date'] = [date_text for date_text, _ in parsed_release_dates]
final_df['is_upcoming'] = [is_upcoming for _, is_upcoming in parsed_release_dates]

# Texts that still cannot be parsed become NaT
final_df['release_date'] = pd.to_datetime(final_df['release_date'], errors='coerce')

# Year of release; 0 stands for an unparsable date
final_df['year'] = final_df['release_date'].dt.year.fillna(0).astype(int)

# Position within the year encoded on a circle, so the end of December sits
# next to the start of January; rows without a date (sentinel -1) get 0 for both.
fraction_of_year = np.where(~final_df['release_date'].isna(), (final_df['release_date'].dt.dayofyear - 1) / 365, -1)
final_df['sin_day'] = np.where(fraction_of_year == -1, 0, np.sin(2 * np.pi * fraction_of_year))
final_df['cos_day'] = np.where(fraction_of_year == -1, 0, np.cos(2 * np.pi * fraction_of_year))



# supported_platforms

# price

# publisherClass

# reviewScore

# aiContent

# has_demo , demo_count , has_dlc, dlc_count

If name was null, we set it to a unique placeholder value that acts as its flag,

so all the new features generated from name need to be set to their mode for those rows.

# Dropping not used features

In [None]:
# Remove the raw columns that have been replaced by engineered/encoded
# features, together with the identifier and text columns.
final_df.drop(
    columns=[
        'release_date',
        'aiContent',
        'supported_platforms',
        'publisherClass',
        'appid',
        'name',
        'genres',
    ],
    inplace=True,
)

Detect whether the task is regression or classification from the type of the output (target variable).


In [None]:
def isRegression():
    """Return 1 when the target looks numeric (regression), 0 otherwise.

    The decision is based on the dtype of final_df['copiesSold']: an object
    dtype is treated as classification, any other dtype as regression.
    """
    target_is_text = final_df['copiesSold'].dtype == 'object'
    return 0 if target_is_text else 1

flag_Is_regression = isRegression()

In [None]:
# Reference list of column names (presumably copied from the processed
# training frame -- verify). It includes the target 'copiesSold'.
# NOTE(review): 'metacritic_preprocessed' is not created in the visible part
# of this notebook -- confirm it is not required by the saved model.
features = ['steam_achievements', 'steam_trading_cards', 'workshop_support',
       'achievements_total', 'is_release_date_known', 'is_upcoming', 'year',
       'sin_day', 'cos_day', 'price', 'reviewScore', 'has_demo', 'demo_count',
       'has_dlc', 'dlc_count', 'copiesSold', 'metacritic_preprocessed',
       'has_metacritic', 'genre_Action', 'genre_Adventure', 'genre_Casual',
       'genre_Early Access', 'genre_Free To Play', 'genre_Gore', 'genre_Indie',
       'genre_Massively Multiplayer', 'genre_Nudity', 'genre_Other',
       'genre_RPG', 'genre_Racing', 'genre_Sexual Content', 'genre_Simulation',
       'genre_Sports', 'genre_Strategy', 'genre_Violent', 'platform_linux',
       'platform_mac', 'platform_windows', 'name_len', 'name_words',
       'name_cap_ratio', 'is_sequel', 'name_has_vr', 'name_has_remaster',
       'name_has_collector', 'name_has_collection', 'name_has_edition',
       'name_has_bundle', 'name_has_playtest', 'publisherClass_Indie',
       'publisherClass_Other']

# Print every name in a comment-like "#  <name>  :" form
for i in features:
    print("# ",i," : ")

steam_achievements
steam_trading_cards
workshop_support
achievements_total
is_release_date_known
is_upcoming
year
sin_day
cos_day
price
reviewScore
has_demo
demo_count
has_dlc
dlc_count
copiesSold
metacritic_preprocessed
has_metacritic
genre_Action
genre_Adventure
genre_Casual
genre_Early Access
genre_Free To Play
genre_Gore
genre_Indie
genre_Massively Multiplayer
genre_Nudity
genre_Other
genre_RPG
genre_Racing
genre_Sexual Content
genre_Simulation
genre_Sports
genre_Strategy
genre_Violent
platform_linux
platform_mac
platform_windows
name_len
name_words
name_cap_ratio
is_sequel
name_has_vr
name_has_remaster
name_has_collector
name_has_collection
name_has_edition
name_has_bundle
name_has_playtest
publisherClass_Indie
publisherClass_Other


# Model Prediction and Evaluation

In [None]:
# Separate the target column (Y) from the model inputs (X)
TARGET_VARIABLE = 'copiesSold'
Y = final_df[TARGET_VARIABLE]
X = final_df.drop(columns=[TARGET_VARIABLE])

In [None]:
if flag_Is_regression != 1:
    # Classification path: not implemented yet, only announced
    print("classification")
else:
    # Regression path: load the saved model together with the list of
    # feature names stored next to it
    XGBOOST_regressor = joblib.load('Joblib_JSON_files/XGBoost_MI.joblib')
    with open('Joblib_JSON_files/XGBoost_MI.json', 'r') as f:
        XGBOOST_regressor_features = json.load(f)

    # Predict using only the stored feature columns, in the stored order
    model_inputs = X[XGBOOST_regressor_features]
    y_pred_xgboost = XGBOOST_regressor.predict(model_inputs)

    # TODO: evaluate y_pred_xgboost against the actual values in Y
