Importing important libraries

In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import math
import os

from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MultiLabelBinarizer, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest, f_regression

import warnings
warnings.filterwarnings('ignore')

import ast
from collections import Counter
from wordcloud import WordCloud
# from sentence_transformers import SentenceTransformer

from numpy import unique
from scipy.stats import skew
import json
import joblib

from enum import Enum, auto

In [26]:
with open('../../Joblib_JSON_files/test_data_path.json', 'r') as f:
        test_data_path = json.load(f)

Load Testing Data
- send path of csv files of testing data as parameter 
- use input() to write path of csv files if they are not in the same folder , if they are just write its name 
- use GUI and make button for upload data 

question: Do I really to know the name of each csv file or by col I can detect it and merge it correctly?

In [27]:
# Define file paths
DEMOS_PATH = test_data_path['demos']
DLCS_PATH = test_data_path['dlcs']
BASE_GAMES_PATH = test_data_path['info_base_games']
GAMALYTIC_PATH = test_data_path['gamalytic_steam_games']

# Read CSV files
demo_df = pd.read_csv(DEMOS_PATH)
dlc_df = pd.read_csv(DLCS_PATH)
info_base_games_df = pd.read_csv(BASE_GAMES_PATH, low_memory=False)
gamalytic_steam_games_df = pd.read_csv(GAMALYTIC_PATH)

Preprocess DLC , DEMO CSV 

yeah I can drop nulls here BUT

I CAN'T DROP ANY NULLS in GAMALYTIC CSV FILE

In [28]:
dlc_df.head()

Unnamed: 0,base_appid,dlc_appid,name
0,833360,2755940,Star Valor - Base Building
1,1482000,1536330,LineArt Jigsaw Puzzle - Erotica 2 ArtBook
2,1611430,2022750,The Bridge Curse Road to Salvation The art of ...
3,1667770,1667780,Samurai Shampoo: Support the developers!
4,623580,624390,POBEDA - SOUNDTRACK


In [29]:
demo_df.head()

Unnamed: 0.1,Unnamed: 0,full_game_appid,demo_appid,name
0,150,3198820,3214770,Passer Demo
1,3467,2385230,2404570,Bad Times at the Silver Lake Demo
2,13942,1383370,1536710,Trail of Ayash Demo
3,1547,1871940,1974490,ZOA Protocol Demo
4,903,2644110,3349920,Pumpkin Ghost Demo


In [30]:
def preprocess_dlc_demo(demo_df,dlc_df):
    demo_df.drop("Unnamed: 0", axis=1, inplace=True) # Drop auto_incremented columns
    
    # Standardize column names
    demo_df.rename(columns={"full_game_appid": "appid"}, inplace=True)
    dlc_df.rename(columns={"base_appid": "appid"}, inplace=True)
    
    
    # Convert data types
    for df, appid_col in [(demo_df, "demo_appid"), (dlc_df, "dlc_appid")]:
        df[appid_col] = pd.to_numeric(df[appid_col], errors="coerce") 
        df["appid"] = pd.to_numeric(df["appid"], errors="coerce")

        # Drop rows where appid or demo/dlc_appid is missing/NaN
        df.dropna(subset=["appid"], inplace=True)

        df[appid_col] = df[appid_col].astype(int)
        df["appid"] = df["appid"].astype(int)
        df["name"] = df["name"].astype(str)


    #Remove duplicates by unique identifier
    demo_df.drop_duplicates(subset=["demo_appid"], inplace=True)
    dlc_df.drop_duplicates(subset=["dlc_appid"], inplace=True)

    # Clean the 'name' column
    def clean_name_column(df, column_name):
        df[column_name] = (df[column_name]
            .str.replace(r"(?i)\bdemo\b", "", regex=True)
            .str.replace(r"(?i)\bdlc\b", "", regex=True)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
            .str.title())
        return df

    demo_df = clean_name_column(demo_df, "name")
    dlc_df = clean_name_column(dlc_df, "name")

    # Reset the indices of the dataframe to the original order.
    demo_df.reset_index(drop=True, inplace=True)
    dlc_df.reset_index(drop=True, inplace=True)

    # Create features : HAS_DEMO , HAS_DLC , DEMO_COUNT , DLC_COUNT
    def create_indicator_features(df, group_by_col, count_col, feature_prefix):
        # group by appid to count the demo/dlc count for each game 
        indicators = df.groupby(group_by_col)[count_col].count().reset_index()
        
        # create has_feature column
        indicators[f'has_{feature_prefix}'] = 1

        # create count column
        indicators[f'{feature_prefix}_count'] = indicators[count_col]
        
        # e.g.: return demo_df[['appid', 'has_demo', 'demo_count']]  
        return indicators[[group_by_col, f'has_{feature_prefix}', f'{feature_prefix}_count']]

    demo_indicators = create_indicator_features(demo_df, 'appid', 'demo_appid', 'demo')
    dlc_indicators = create_indicator_features(dlc_df, 'appid', 'dlc_appid', 'dlc')
    return demo_indicators , dlc_indicators

In [31]:
# Apply Preprocessing for dlc and demo csv files 
demo_indicators , dlc_indicators = preprocess_dlc_demo(demo_df,dlc_df)

In [32]:
dlc_indicators

Unnamed: 0,appid,has_dlc,dlc_count
0,9200,1,1
1,9940,1,1
2,16450,1,1
3,18070,1,1
4,26800,1,1
...,...,...,...
1084,3276220,1,1
1085,3317540,1,1
1086,3331000,1,1
1087,3361210,1,1


In [33]:
demo_indicators

Unnamed: 0,appid,has_demo,demo_count
0,2100,1,1
1,2720,1,1
2,3200,1,1
3,3360,1,1
4,3380,1,1
...,...,...,...
3084,3488700,1,1
3085,3489620,1,1
3086,3489640,1,1
3087,3491470,1,1


In [34]:
# Drop Corrupted Sample That Contains The Column Names
info_base_games_df = info_base_games_df.drop(index=9929)

# Merging all csv files into one csv file
def merge_all_csvs():
    gamalytic_steam_games_df.rename(columns={"steamId": "appid"}, inplace=True)

    # Ensure both DataFrames have the same data type for 'appid'
    info_base_games_df['appid'] = info_base_games_df['appid'].astype(int)
    gamalytic_steam_games_df['appid'] = gamalytic_steam_games_df['appid'].astype(int)

    # Merge info_base_games_df with gamalytic_steam_games_df into a new DataFrame
    merged_df = gamalytic_steam_games_df.merge(info_base_games_df, on="appid", how="left")

    # Merge demo_indicators with merged_df
    merged_df = merged_df.merge(demo_indicators, on="appid", how="left")

    # Merge dlc_indicators with merged_df
    merged_df = merged_df.merge(dlc_indicators, on="appid", how="left")

    return merged_df

In [35]:
final_df = merge_all_csvs()

In [36]:
final_df

Unnamed: 0,appid,price,copiesSold,publisherClass,reviewScore,aiContent,name,metacritic,steam_achievements,steam_trading_cards,workshop_support,genres,achievements_total,release_date,supported_platforms,has_demo,demo_count,has_dlc,dlc_count
0,970560,29.99,Platinum,AA,97,,,,,,,,,,,,,,
1,1077150,2.99,Bronze,Hobbyist,33,,,,,,,,,,,,,,
2,1563710,0.00,Silver,Hobbyist,50,,CyberGrid: Tower defense,,False,False,False,"Free To Play, Indie, Strategy",,"Mar 26, 2021",['windows'],,,,
3,402220,4.99,Silver,Hobbyist,57,,,,,,,,,,,,,,
4,1377680,3.99,Bronze,Hobbyist,100,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18681,376730,1.99,Gold,Indie,76,,,,,,,,,,,,,,
18682,48000,9.99,Platinum,Indie,93,,,,,,,,,,,,,,
18683,690550,4.99,Silver,Hobbyist,90,,,,,,,,,,,,,,
18684,2691900,12.99,Bronze,Hobbyist,100,,,,,,,,,,,1.0,1.0,,


In [37]:
final_df.columns # THESE ARE THE COLUMNS THAT WE SHOULD HANDLE NULL VALUES IN ANY OF THEM

Index(['appid', 'price', 'copiesSold', 'publisherClass', 'reviewScore',
       'aiContent', 'name', 'metacritic', 'steam_achievements',
       'steam_trading_cards', 'workshop_support', 'genres',
       'achievements_total', 'release_date', 'supported_platforms', 'has_demo',
       'demo_count', 'has_dlc', 'dlc_count'],
      dtype='object')

Preprocessing features of infobase and gamalytic 

1. For each feature here , I need to handle nulls and outliers 

2. Feature Engineering (Create new features)

3. Feature Transformation (Scaling , Standardization , log transform , encoding)

We should handle nulls before encoding cuz encoding will cause error for null values


# Handling Nulls

In [38]:
# if steam achievement is false and achievement total is null so set it zero
final_df.loc[(final_df['steam_achievements'] == False) & (final_df['achievements_total'].isna()), 'achievements_total'] = 0


# if Steam achievement is true and achievement total is null so set it by median (laod it)
# Handling Nulls for each feature
with open('../../Joblib_JSON_files/default_values.json', 'r') as f:
    default_values = json.load(f)

for column, value in default_values.items():
    if column in final_df.columns:
        final_df[column].fillna(value, inplace=True) 

# has_demo , demo_count , has_dlc, dlc_count
final_df["has_demo"] = final_df["has_demo"].fillna(0).astype(int)
final_df["demo_count"] = final_df["demo_count"].fillna(0).astype(int)
final_df["has_dlc"] = final_df["has_dlc"].fillna(0).astype(int)
final_df["dlc_count"] = final_df["dlc_count"].fillna(0).astype(int)

# Handling Outliers , Negative Values

In [39]:
with open('../../Joblib_JSON_files/reviewScore_outliers.json', 'r') as f:
    review_score_data = json.load(f)
# name , steam_achievements , steam_trading_cards,workshop_support,genres, release_date, supported_platforms,# publisherClass, has_demo, has_dlc

# Ensure numeric types for comparison
final_df['metacritic'] = pd.to_numeric(final_df['metacritic'], errors='coerce')
final_df['achievements_total'] = pd.to_numeric(final_df['achievements_total'], errors='coerce')

# metacritic
final_df.loc[(final_df['metacritic'] > 97), 'metacritic'] = 97
final_df.loc[(final_df['metacritic'] < 20), 'metacritic'] = 20

# achievements_total
final_df.loc[(final_df['achievements_total'] > 5394), 'achievements_total'] = 5394
final_df.loc[(final_df['achievements_total'] < 0), 'achievements_total'] = 0

# price
final_df.loc[(final_df['price'] > 1900) , 'price'] = 1900
final_df.loc[(final_df['price'] < 0) , 'price'] = 0


# reviewScore
final_df.loc[final_df['reviewScore'] < 60.0,  'reviewScore'] = 80.0
final_df.loc[final_df['reviewScore'] > 94.0, 'reviewScore'] = 80.0

# demo_count  
final_df.loc[(final_df['demo_count'] > 1) , 'demo_count'] = 1
final_df.loc[(final_df['demo_count'] < 0) , 'demo_count'] = 0

# dlc_count
final_df.loc[(final_df['dlc_count'] > 3) , 'dlc_count'] = 3
final_df.loc[(final_df['dlc_count'] < 0) , 'dlc_count'] = 0

In [40]:
# Convert 'Total achievement' to number, and if there errors set them null
final_df['achievements_total'] = pd.to_numeric(final_df['achievements_total'], errors='coerce')

# if total achievement has value and steam achievement is false so convert it into true
final_df.loc[(final_df['achievements_total'] > 0) & (final_df['steam_achievements'] == False), 'steam_achievements'] = True

# Features Transformation (Transformers for features) / (Scalers - Encoders)

In [41]:
# metacritic
# Load the transformer
metacritic_scaler = joblib.load('../../Joblib_JSON_files/metacritic_scaler.joblib')

# Use it directly on test data (DON’T refit) we transform based fitting on training data 
final_df['metacritic'] = metacritic_scaler.transform(final_df[['metacritic']])

# steam_achievements 
final_df['steam_achievements'] = final_df['steam_achievements'].astype(int)

# steam_trading_cards
final_df['steam_trading_cards'] = final_df['steam_trading_cards'].astype(int)
# workshop_support
final_df['workshop_support'] = final_df['workshop_support'].astype(int)


# genres
final_df['genres'] = final_df['genres'].fillna('')
final_df['genres'] = final_df['genres'].apply(lambda x: x.split(', ') if x else [])

mlb_genres = joblib.load('../../Joblib_JSON_files/mlb_genres.joblib')
known_genres = set(mlb_genres.classes_)

def replace_unknown_game_genres(genres_list):
    return [
        genre if genre in known_genres else 'Other'
        for genre in genres_list
    ]   
final_df['genres'] = final_df['genres'].apply(replace_unknown_game_genres)

#def filter_unseen_genres(genres_list):
#   return [genre for genre in genres_list if genre in known_genres]
#final_df['genres'] = final_df['genres'].apply(lambda x: filter_unseen_genres(x))

val_genres_encoded = mlb_genres.transform(final_df['genres'])
val_genres_df = pd.DataFrame(val_genres_encoded, columns=[f'genre_{c}' for c in mlb_genres.classes_], index=final_df.index)
final_df = pd.concat([final_df, val_genres_df], axis=1)


# achievements_total
achievements_total_scaler = joblib.load('../../Joblib_JSON_files/achievements_total_scaler.joblib')
final_df['achievements_total'] = achievements_total_scaler.transform(final_df[['achievements_total']])



# supported_platforms
final_df['supported_platforms'] = final_df['supported_platforms'].apply(ast.literal_eval)
# Load Multi Label Binarizer object : mlb
mlb_platforms = joblib.load('../../Joblib_JSON_files/mlb_platforms.joblib')
# Transform : platforms_encoded
platforms_encoded = mlb_platforms.transform(final_df['supported_platforms'])
val_platforms_df = pd.DataFrame(platforms_encoded, columns=[f'platform_{c}' for c in mlb_platforms.classes_], index=final_df.index)
final_df = pd.concat([final_df, val_platforms_df], axis=1)


price_reviews_scaler = joblib.load('../../Joblib_JSON_files/price_reviews_scaler.joblib')
# price
final_df['price'] = np.log1p(final_df['price'])
# Transform both columns together to match the scaler's fit
final_df[['price', 'reviewScore']] = price_reviews_scaler.transform(final_df[['price', 'reviewScore']])

# publisherClass
mapping = {'AA': 'Other', 'AAA': 'Other'}
#known_publishers = ["Hobbyist","Indie"]
#def replace_unknown_publisher(publisher_list):
#    for publisher in publisher_list:
#        if publisher not in known_publishers:
#            return "Other"
#        else:
#            return publisher
final_df['publisherClass'] = final_df['publisherClass'].replace(mapping)        
#final_df['publisherClass'] = final_df['publisherClass'].apply(replace_unknown_publisher)
# Load one hot encoder (should be OneHotEncoder, not LabelEncoder)
ohe_publisher = joblib.load('../../Joblib_JSON_files/ohe_publisher.joblib')
# If your encoder was fitted with handle_unknown='ignore', this will work:
encoded_publisher = ohe_publisher.transform(final_df[['publisherClass']])
encoded_publisher_df = pd.DataFrame(encoded_publisher.toarray() if hasattr(encoded_publisher, "toarray") else encoded_publisher, columns=ohe_publisher.get_feature_names_out(['publisherClass']), index=final_df.index)
final_df = pd.concat([final_df, encoded_publisher_df], axis=1)



# Feature Engineering (Generating new features)

In [42]:
# name
roman_re = re.compile(r'\b(?:i{1,3}|iv|v|vi|vii|viii|ix|x)\b')
digit_re = re.compile(r'\b[2-9]\b')
keywords = ['vr', 'remaster', 'collector', 'collection', 'edition', 'bundle', 'playtest']

def cap_ratio(s):
    if not s:
        return 0
    upper_count = sum(1 for ch in s if ch.isupper())
    return upper_count / len(s)


# Strip punctuation
final_df['name'] = final_df['name'].str.replace(r'[^\w\s]', '', regex=True)

# add character count and word count features
final_df['name_len'] = final_df['name'].str.len()
final_df['name_words'] = final_df['name'].str.split().str.len()

# add caps ratio feature
final_df['name_cap_ratio'] = final_df['name'].apply(cap_ratio)

# transform all names to lowercase
final_df['name'] = final_df['name'].str.lower()

# check if game is a sequel and add is_sequel feature
final_df['is_sequel'] = (
    final_df['name'].str.contains(roman_re) |
    final_df['name'].str.contains(digit_re)
).astype(int)

# add useful keyword features
for kw in keywords:
    final_df[f'name_has_{kw}'] = final_df['name'].str.contains(fr'\b{kw}\b').astype(int)



# Scaling these features
scale_cols = [
    'name_len', 'name_words', 'name_cap_ratio', 'is_sequel'
] + [f'name_has_{kw}' for kw in keywords]

name_is_sequel_scaler = joblib.load('../../Joblib_JSON_files/name_is_sequel_scaler.joblib')
final_df[scale_cols]  = name_is_sequel_scaler.transform(final_df[scale_cols])
#--------------------------------------------------------------------------------------
# metacritic
final_df['has_metacritic'] = final_df['metacritic'].notna().astype(int)
standardizer_metacritic = joblib.load('../../Joblib_JSON_files/metacritic_scaler.joblib')
final_df['metacritic'] = standardizer_metacritic.transform(final_df[['metacritic']])
# steam_achievements (Mode of traning data is 0 or 1 (saved in file))

# steam_trading_cards

# workshop_support

# genres

# achievements_total

# release_date
final_df['release_date'] = final_df['release_date'].astype(str).str.strip().str.lower() # to remove any leading or trailing spaces if exist

YEAR_DATA_COLLECTED = 2024
# ^: start of string , $: end of string
# not have specific release date
unknown_release_dates = [ 
    r'^coming soon$',
    r'^to be announced$',
    r'^\s*$',  # empty string
    r'^q[1-4][^a-zA-Z0-9]+\d{4}$', #q2 2025
    r'\d{4}$',  # year only
    r'^\d{1,2}[^a-zA-Z0-9]+[A-Za-z]{3}$' # DAY MONTH
]
def has_known_release(x):
    x = str(x).strip().lower()
    for pattern in unknown_release_dates:
        if re.match(pattern, x):
            return 0
    return 1
final_df['is_release_date_known'] = final_df['release_date'].apply(has_known_release)

# I don't we should add "Coming soon" as it is upcoming game or not
# [^a-zA-Z0-9]+ : continue taking if not alphanumeric , [^0-9]+ : continue taking if not number
def preprocess_release_date(x):
    x = str(x).strip()

    # 8-Dec-2022 or 8 Dec 2022
    match = re.search(r'(\d{1,2})[^a-zA-Z0-9]+([A-Za-z]{3})[^a-zA-Z0-9]+(\d{2,4})', x)
    if match:
        day, month, year = match.groups()
        if(int(year) > YEAR_DATA_COLLECTED):
            return f"{day} {month} {year}" , 1
        return f"{day} {month} {year}" , 0

    # Dec-8-2022 or Dec 8, 2022
    match = re.search(r'([A-Za-z]{3})[^a-zA-Z0-9]+(\d{1,2})[^a-zA-Z0-9]+(\d{4})', x)
    if match:
        month, day, year = match.groups()
        if(int(year) > YEAR_DATA_COLLECTED):
            return f"{day} {month} {year}" , 1
        return f"{day} {month} {year}" , 0

    # Dec-2022 or Dec 2022
    match = re.search(r'([A-Za-z]{3})[^a-zA-Z0-9]+(\d{2,4})', x)
    if match:
        month, year = match.groups()
        if(int(year) > YEAR_DATA_COLLECTED):
            return f"15 {month} {year}" , 1
        return f"15 {month} {year}" , 0
        

    # 8-Dec (no year)
    match = re.search(r'(\d{1,2})[^a-zA-Z0-9]+([A-Za-z]{3})', x)
    if match:
        return "15 NOV 2020",0
        #return np.nan,0

    # q1 2023, q2-2024
    match = re.search(r'(q[1-4])[^a-zA-Z0-9]+(\d{4})', x, re.IGNORECASE)
    if match:
        q, y = match.groups()
        q = q.lower()
        quarter_map = {
            'q1': '15 Feb',
            'q2': '15 May',
            'q3': '15 Aug',
            'q4': '15 Nov',
        }
        if(int(y) > YEAR_DATA_COLLECTED):
            return f"{quarter_map[q]} {y}" , 1
        return f"{quarter_map[q]} {y}" , 0

    # 2023 (year only)
    if re.fullmatch(r'\d{4}', x):
        year = int(x)
        if YEAR_DATA_COLLECTED < year:
            return f"1 Jun {x}" , 1
        else:
            return f"1 JAN {x}" , 0


    # "Coming soon", "To be announced", ... etc.
    return "1 JUN 2026",1
    #return np.nan,0

# for each value in column release date we will apply this function to it 

final_df[['release_date', 'is_upcoming']] = final_df['release_date'].apply(preprocess_release_date).apply(pd.Series) # for unpacking the series tuple into 2 columns

final_df['release_date'] = pd.to_datetime(final_df['release_date'], errors='coerce') 

final_df['year'] = final_df['release_date'].dt.year.fillna(2026).astype(int)

fraction_of_year = np.where(~final_df['release_date'].isna(), (final_df['release_date'].dt.dayofyear - 1) / 365, -1)
final_df['sin_day'] = np.where(fraction_of_year == -1, 0, np.sin(2 * np.pi * fraction_of_year))
final_df['cos_day'] = np.where(fraction_of_year == -1, 0, np.cos(2 * np.pi * fraction_of_year))



# supported_platforms

# price

# publisherClass

# reviewScore

# aiContent

# has_demo , demo_count , has_dlc, dlc_count

If name was null , we set it by unique value as its flag 

so we need to set all new features generated from name by mode !

In [43]:
with open('../../Joblib_JSON_files/name_features_vals.json', 'r') as f:
        name_features_mod = json.load(f)
name_features = ['name_len','name_words',"name_cap_ratio","is_sequel","name_has_vr","name_has_remaster","name_has_collector" , "name_has_collection" , "name_has_edition" , "name_has_bundle" , "name_has_playtest"]
final_df.loc[final_df['name'] == "Name$", name_features] = [name_features_mod[feature] for feature in name_features]

In [44]:
nan_counts = final_df.isna().sum()
print(nan_counts[nan_counts > 0].sort_values(ascending=False))

aiContent    18686
dtype: int64


In [45]:
def isRegression():
    if(final_df['copiesSold'].dtype == 'object'): # Classification
        return 0
    else : # (final_df['copiesSold'].dtype == 'int64')
        return 1

flag_Is_regression = isRegression() 

# Dropping not used features

In [46]:
final_df.drop('release_date',axis=1,inplace=True)
final_df.drop('aiContent',axis=1,inplace=True)
final_df.drop('supported_platforms',axis=1,inplace=True)
final_df.drop('publisherClass',axis=1,inplace=True)
final_df.drop(columns=['appid', 'name', 'genres'], inplace=True)

In [47]:
features = ['steam_achievements', 'steam_trading_cards', 'workshop_support',
       'achievements_total', 'is_release_date_known', 'is_upcoming', 'year',
       'sin_day', 'cos_day', 'price', 'reviewScore', 'has_demo', 'demo_count',
       'has_dlc', 'dlc_count', 'copiesSold', 'metacritic_preprocessed',
       'has_metacritic', 'genre_Action', 'genre_Adventure', 'genre_Casual',
       'genre_Early Access', 'genre_Free To Play', 'genre_Gore', 'genre_Indie',
       'genre_Massively Multiplayer', 'genre_Nudity', 'genre_Other',
       'genre_RPG', 'genre_Racing', 'genre_Sexual Content', 'genre_Simulation',
       'genre_Sports', 'genre_Strategy', 'genre_Violent', 'platform_linux',
       'platform_mac', 'platform_windows', 'name_len', 'name_words',
       'name_cap_ratio', 'is_sequel', 'name_has_vr', 'name_has_remaster',
       'name_has_collector', 'name_has_collection', 'name_has_edition',
       'name_has_bundle', 'name_has_playtest', 'publisherClass_Indie',
       'publisherClass_Other']

#for i in features:
#    print("# ",i," : ")

Detect its regression or classification depending on type of output (target variable)


# Model Prediction and Evaluation

In [48]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,classification_report
TARGET_VARIABLE = 'copiesSold'
X = final_df.drop(TARGET_VARIABLE,axis=1) # All features 
Y = final_df[TARGET_VARIABLE]

if flag_Is_regression == 1 :
    print("Regression")
    # Load models 
    wrapper_gbr_1 = joblib.load('../../Joblib_JSON_files/gradient_boosting_regressor_no_release_date.joblib')
    wrapper_gbr_2 = joblib.load('../../Joblib_JSON_files/gradient_boosting_regressor_with_release_date.joblib')
    wrapper_catBoost = joblib.load('../../Joblib_JSON_files/catboost_regressor.joblib')

    # Load features used for each model
    with open('../../Joblib_JSON_files/gradient_boosting_regressor_features_no_release_date.json', 'r') as f:
        gbr_1_features = json.load(f)
    with open('../../Joblib_JSON_files/gradient_boosting_regressor_features_with_release_date.json', 'r') as f:
        gbr_2_features = json.load(f)
    with open('../../Joblib_JSON_files/catboost_regressor_features.json', 'r') as f:
        catBoost_features = json.load(f)

    # predict
    y_pred_gbr_1 = wrapper_gbr_1.predict(X[gbr_1_features])
    y_pred_gbr_2 = wrapper_gbr_2.predict(X[gbr_2_features])
    y_pred_catBoost = wrapper_catBoost.predict(X[catBoost_features])

    # Results for GBR 1
    r2_gbr_1 = r2_score(Y, y_pred_gbr_1)
    mse_gbr_1 = mean_squared_error(Y,y_pred_gbr_1)
    mae_gbr_1 = mean_absolute_error(Y,y_pred_gbr_1)
    print("R^2 GBR 1 : ",r2_gbr_1)
    print("MSE GBR 1 : ",mse_gbr_1)
    print("MAE GBR 1 : ",mae_gbr_1)

    # Results for GBR 2
    r2_gbr_2 = r2_score(Y, y_pred_gbr_2)
    mse_gbr_2 = mean_squared_error(Y,y_pred_gbr_2)
    mae_gbr_2 = mean_absolute_error(Y,y_pred_gbr_2)
    print("R^2 GBR 2 : ",r2_gbr_2)
    print("MSE GBR 2 : ",mse_gbr_2)
    print("MAE GBR 2 : ",mae_gbr_2)

    # Results for Cat Boost 
    r2_catBoost = r2_score(Y, y_pred_catBoost)
    mse_catBoost = mean_squared_error(Y,y_pred_catBoost)
    mae_catBoost = mean_absolute_error(Y,y_pred_catBoost)
    print("R^2 Cat Boost : ",r2_catBoost)
    print("MSE Cat Boost : ",mse_catBoost)
    print("MAE Cat Boost : ",mae_catBoost)
    

else:
    print("classification")
    encoder = joblib.load('../../Joblib_JSON_files/ohe_copiesSold.joblib')
    Y = encoder.transform(Y)
    # LabelEncoder returns a 1D array, so just wrap it as a Series or DataFrame
    Y = pd.Series(Y, name='copiesSold', index=X.index)
    

    # Load classifier and feature selector
    XGBOOST_classifier = joblib.load('../../Joblib_JSON_files/XGBoost_classifier.joblib')

    with open('../../Joblib_JSON_files/XGBoost_classifier.json', 'r') as f:
        XGBOOST_classifier_features = json.load(f)

    # Transform features only (not Y)
    X_aligned = X[XGBOOST_classifier_features]
    
    # Predict
    y_pred_xgboost = XGBOOST_classifier.predict(X_aligned)

    # Evaluate
    acc = accuracy_score(Y, y_pred_xgboost)
    print("Accuracy Score:", acc)
    print(classification_report(Y, y_pred_xgboost))
   

classification
Accuracy Score: 0.5862142780691427
              precision    recall  f1-score   support

           0       0.83      0.75      0.79      9189
           1       0.41      0.18      0.25      2939
           2       0.67      0.11      0.18      1454
           3       0.38      0.67      0.49      5104

    accuracy                           0.59     18686
   macro avg       0.57      0.42      0.43     18686
weighted avg       0.63      0.59      0.57     18686

