In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import math

from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MultiLabelBinarizer, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest, f_regression

import warnings
warnings.filterwarnings('ignore')

import ast
from collections import Counter


from numpy import unique
from scipy.stats import skew

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

In [3]:
# Locations of the preprocessed splits (relative to this notebook).
TRAINING_DATA = '../../../data/preprocessed/train.parquet'
VALIDATION_DATA = '../../../data/preprocessed/validation.parquet'
TESTING_DATA = '../../../data/preprocessed/test.parquet'

# Read the three splits in one pass, in the same order as the paths above.
df_train, df_validation, df_small_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (TRAINING_DATA, VALIDATION_DATA, TESTING_DATA)
)

# The evaluation frame is validation followed by test, re-indexed from 0.
df_test = pd.concat([df_validation, df_small_test], ignore_index=True)

In [None]:
# Display the column index of the training frame (features plus the target).
df_train.columns
# Optional display tweaks / target peek, left disabled:
#pd.set_option('display.max_rows', 50)
#pd.set_option('display.max_columns', None)
#df_train['copiesSold']

In [None]:
# Target vectors: the 'copiesSold' column, copied so they are detached from the source frames.
y_train = df_train['copiesSold'].copy()
y_test = df_test['copiesSold'].copy()

# Feature matrices: every remaining column. DataFrame.drop returns a new frame,
# so df_train / df_test themselves are left untouched.
X_train = df_train.drop(columns='copiesSold')
X_test = df_test.drop(columns='copiesSold')

# Summarise dtypes and non-null counts of the evaluation features.
X_test.info()


Functions for plotting R², MSE, and MAE against the number of selected features

In [None]:
def plot_r_square(r2_scores,labelName):
    """Plot R² against the number of top features selected.

    Parameters
    ----------
    r2_scores : sequence of float
        One R² value per feature-subset size; element ``i`` is drawn at
        x = ``i + 1``.
    labelName : str
        Legend label for the curve.
    """
    # The x-axis is derived from the scores themselves rather than from the
    # global X_train, so the function keeps working if X_train is modified
    # (or absent) after the scores were computed.
    feature_counts = range(1, len(r2_scores) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(feature_counts, r2_scores, marker='o', color='darkred',label=labelName)
    plt.title("R² Score vs Number of Selected Features")
    plt.xlabel("Number of Top Features Selected")
    plt.ylabel("R² Score")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_mse(mse_vals,labelName):
    """Plot mean squared error against the number of top features selected.

    Parameters
    ----------
    mse_vals : sequence of float
        One MSE value per feature-subset size; element ``i`` is drawn at
        x = ``i + 1``.
    labelName : str
        Legend label for the curve.
    """
    # The x-axis is derived from the values themselves rather than from the
    # global X_train, so the function keeps working if X_train is modified
    # (or absent) after the errors were computed.
    feature_counts = range(1, len(mse_vals) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(feature_counts, mse_vals, marker='o', color='darkgreen',label=labelName)
    plt.title("Mean Square Error vs Number of Selected Features")
    plt.xlabel("Number of Top Features Selected")
    plt.ylabel("MSE Score")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_mae(mae_vals,labelName):
    """Plot mean absolute error against the number of top features selected.

    Parameters
    ----------
    mae_vals : sequence of float
        One MAE value per feature-subset size; element ``i`` is drawn at
        x = ``i + 1``.
    labelName : str
        Legend label for the curve.
    """
    # The x-axis is derived from the values themselves rather than from the
    # global X_train, so the function keeps working if X_train is modified
    # (or absent) after the errors were computed.
    feature_counts = range(1, len(mae_vals) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(feature_counts, mae_vals, marker='o', color='darkblue',label=labelName)
    # Title spelling corrected ("Absoulte" -> "Absolute").
    plt.title("Mean Absolute Error vs Number of Selected Features")
    plt.xlabel("Number of Top Features Selected")
    plt.ylabel("MAE Score")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

Get the number of features that yields the highest R² and the lowest MSE and MAE



In [None]:
def evaluate(r2_scores,mse_vals,mae_vals):
    """Print the best value of each metric and the feature count that produced it.

    Each argument is a list whose element ``i`` is the metric obtained with
    ``i + 1`` features. R² is maximised; MSE and MAE are minimised. When the
    best value occurs more than once, the first (smallest) feature count is
    reported, because ``list.index`` returns the first match.
    """
    top_r2 = max(r2_scores)
    k_r2 = 1 + r2_scores.index(top_r2)

    lowest_mse = min(mse_vals)
    k_mse = 1 + mse_vals.index(lowest_mse)

    lowest_mae = min(mae_vals)
    k_mae = 1 + mae_vals.index(lowest_mae)

    summary = (
        f"Max R² = {top_r2:.4f} at {k_r2} features \n"
        f"MIN MSE = {lowest_mse:.4f} at {k_mse} features \n"
        f"MIN MAE = {lowest_mae:.4f} at {k_mae} features \n"
    )
    print(summary)
    

**K best features using Mutual information feature selection method**

What do mutual information scores mean?
Each score represents how much information a feature provides about the target (output) variable.

Higher score: the feature is more useful for predicting the target.

Lower score (close to 0): the feature is not informative; it may be noise or irrelevant.

`mutual_info_regression` can score both numerical and categorical features,
but it expects numerical input only, so categorical columns must already be numerically encoded.

In [None]:
# Remove the same set of columns from both feature matrices before feature selection.
# (An earlier, disabled variant dropped 'name_len' and 'name_cap_ratio' as well and
# kept 'is_release_date_known'.)
X_train, X_test = (
    feature_frame.drop(
        columns=['name_words', 'has_demo', 'has_dlc', 'has_metacritic', 'is_release_date_known']
    )
    for feature_frame in (X_train, X_test)
)


In [None]:
# Output (Target) : Numerical (Continuous) --> Regression
# Input features : Both numerical and categorical

# mutual_info_regression uses random number generation internally, so without a
# seed the scores (and therefore the ranking printed below) change from run to run.
# Fixing random_state makes this cell reproducible under Restart & Run All.
selector = SelectKBest(
    lambda X, y: mutual_info_regression(X, y, random_state=42),
    k='all',
)
selector.fit(X_train, y_train)

# Get scores (information of each feature)
mi_scores = selector.scores_

# Combine feature names with scores
feature_scores = list(zip(X_train.columns, mi_scores))

# Sort by information score in descending order
sorted_scores = sorted(feature_scores, key=lambda x: x[1], reverse=True)

for feature, score in sorted_scores:
    print(f"{feature}: {score:.4f}")

Plotting

In [None]:
# Split the (feature, score) pairs into two parallel tuples for plotting.
featuress, scores = zip(*sorted_scores)

# Horizontal bar chart of the mutual-information score of every feature.
fig, ax = plt.subplots(figsize=(20, 20))
ax.barh(featuress, scores, color='skyblue')
ax.set_xlabel("Mutual Information Score")
ax.set_title("Feature Importance Based on Mutual Information")
ax.invert_yaxis()  # Highest scores on top
fig.tight_layout()
plt.show()

## Gradient Boosting model
Try different numbers of top-k features.

In [None]:
# Sweep over k = 1..n_features: keep the k features with the highest mutual
# information, fit a gradient-boosting regressor on a quantile-normalised
# target, and record R², MSE and MAE on the evaluation split.
r2_scores = []
mse = []
mae = []

# The mutual-information scores depend only on (X_train, y_train), so they are
# computed once, with a fixed seed, instead of being re-estimated (with fresh
# random noise) on every iteration. This makes the feature subsets nested and
# reproducible, and removes n_features - 1 redundant MI computations.
mi_scores_cached = mutual_info_regression(X_train, y_train, random_state=42)

# NOTE(review): k is later chosen from scores measured on X_test / y_test;
# confirm a separate hold-out exists for the final performance estimate.
for i in range(1,X_train.shape[1] + 1):
    gbr = GradientBoostingRegressor(max_depth=5,random_state=42)
    # SelectKBest accepts a score function returning a single array of scores;
    # here it simply hands back the precomputed ones.
    selector = SelectKBest(lambda X, y: mi_scores_cached,k=i)
    selector.fit(X_train,y_train)

    sel_x_train = selector.transform(X_train)
    sel_x_test = selector.transform(X_test)

    # The regressor is trained on the quantile-transformed target; predictions
    # are mapped back to the original scale by the wrapper.
    target_transformer = QuantileTransformer(output_distribution='normal',random_state=42)
    wrapper_model = TransformedTargetRegressor(regressor=gbr,transformer=target_transformer)
    wrapper_model.fit(sel_x_train,y_train)
    y_pred = wrapper_model.predict(sel_x_test)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    mse.append(mean_squared_error(y_test,y_pred))
    mae.append(mean_absolute_error(y_test,y_pred))
    
    
    

In [None]:
# Metric curves for the gradient-boosting sweep, one figure per metric.
plot_r_square(r2_scores, labelName="Gradient Boosting Regressor")
plot_mse(mse, labelName="Gradient Boosting Regressor")
plot_mae(mae, labelName="Gradient Boosting Regressor")

# Print the best value of each metric and the feature count that achieved it.
evaluate(r2_scores, mse, mae)

In [None]:
# Raw R² values of the gradient-boosting sweep (element i corresponds to i + 1 features).
r2_scores

## Decision Tree Regressor 


In [None]:
# Sweep over k = 1..n_features: keep the k features with the highest mutual
# information, fit a decision-tree regressor on a quantile-normalised target,
# and record R², MSE and MAE on the evaluation split.
r2_scores_ID3 = []
mse_ID3 = []
mae_ID3 = []

# The mutual-information scores depend only on (X_train, y_train), so they are
# computed once, with a fixed seed, instead of being re-estimated (with fresh
# random noise) on every iteration. This makes the feature subsets nested and
# reproducible, and removes n_features - 1 redundant MI computations.
mi_scores_cached = mutual_info_regression(X_train, y_train, random_state=42)

# NOTE(review): k is later chosen from scores measured on X_test / y_test;
# confirm a separate hold-out exists for the final performance estimate.
for i in range(1,X_train.shape[1] + 1):
    # Seeded like every other model in this notebook so the sweep is reproducible.
    model_ID3 = DecisionTreeRegressor(random_state=42)
    # SelectKBest accepts a score function returning a single array of scores;
    # here it simply hands back the precomputed ones.
    selector = SelectKBest(lambda X, y: mi_scores_cached,k=i)
    selector.fit(X_train,y_train)

    sel_x_train = selector.transform(X_train)
    sel_x_test = selector.transform(X_test)

    # The regressor is trained on the quantile-transformed target; predictions
    # are mapped back to the original scale by the wrapper.
    target_transformer = QuantileTransformer(output_distribution='normal',random_state=42)
    wrapper_model = TransformedTargetRegressor(regressor=model_ID3,transformer=target_transformer)
    wrapper_model.fit(sel_x_train,y_train)
    y_pred = wrapper_model.predict(sel_x_test)

    r2 = r2_score(y_test, y_pred)
    r2_scores_ID3.append(r2)

    mse_ID3.append(mean_squared_error(y_test,y_pred))
    mae_ID3.append(mean_absolute_error(y_test,y_pred))

Plotting 

In [None]:
# Metric curves for the decision-tree sweep, one figure per metric.
plot_r_square(r2_scores_ID3, labelName="Decision Tree Regressor")
plot_mse(mse_ID3, labelName="Decision Tree Regressor")
plot_mae(mae_ID3, labelName="Decision Tree Regressor")

# Print the best value of each metric and the feature count that achieved it.
evaluate(r2_scores_ID3, mse_ID3, mae_ID3)

In [None]:
# Raw R² values of the decision-tree sweep (element i corresponds to i + 1 features).
r2_scores_ID3

Gradient Boosting variants : XGBoost, LightGBM, CatBoost

In [None]:
# Sweep over k = 1..n_features for three boosting libraries at once. For each k
# the same top-k feature subset is used to fit XGBoost, LightGBM and CatBoost
# on a quantile-normalised target; R², MSE and MAE are recorded per model.
r2_scores_XGBOOST = []
r2_scores_LGBM = []
r2_scores_CatBoost = []

mse_XGBOOST = []
mae_XGBOOST = []

mse_LGBM = []
mae_LGBM = []

mse_CatBoost = []
mae_CatBoost = []

# The mutual-information scores depend only on (X_train, y_train), so they are
# computed once, with a fixed seed, instead of being re-estimated (with fresh
# random noise) on every iteration. This makes the feature subsets nested and
# reproducible, and removes n_features - 1 redundant MI computations.
mi_scores_cached = mutual_info_regression(X_train, y_train, random_state=42)

# NOTE(review): k is later chosen from scores measured on X_test / y_test;
# confirm a separate hold-out exists for the final performance estimate.
for i in range(1,X_train.shape[1] + 1):
    model_XGB = XGBRegressor(random_state=42)
    model_LGBM = LGBMRegressor(random_state=42)
    model_CatBoost = CatBoostRegressor(verbose=0, random_state=42)

    # SelectKBest accepts a score function returning a single array of scores;
    # here it simply hands back the precomputed ones.
    selector = SelectKBest(lambda X, y: mi_scores_cached,k=i)
    selector.fit(X_train,y_train)

    sel_x_train = selector.transform(X_train)
    sel_x_test = selector.transform(X_test)

    # Same fit/score procedure for each library, in the original order
    # (XGBoost, LightGBM, CatBoost); each result goes to that model's own lists.
    for regressor, r2_log, mse_log, mae_log in (
        (model_XGB, r2_scores_XGBOOST, mse_XGBOOST, mae_XGBOOST),
        (model_LGBM, r2_scores_LGBM, mse_LGBM, mae_LGBM),
        (model_CatBoost, r2_scores_CatBoost, mse_CatBoost, mae_CatBoost),
    ):
        # The regressor is trained on the quantile-transformed target;
        # predictions are mapped back to the original scale by the wrapper.
        target_transformer = QuantileTransformer(output_distribution='normal',random_state=42)
        wrapper_model = TransformedTargetRegressor(regressor=regressor,transformer=target_transformer)
        wrapper_model.fit(sel_x_train,y_train)
        y_pred = wrapper_model.predict(sel_x_test)
        r2 = r2_score(y_test, y_pred)
        r2_log.append(r2)
        mse_log.append(mean_squared_error(y_test,y_pred))
        mae_log.append(mean_absolute_error(y_test,y_pred))

    

Plotting for XGBOOST

In [None]:
# Metric curves for the XGBoost sweep, one figure per metric.
plot_r_square(r2_scores_XGBOOST, labelName="XGBOOST Regressor")
plot_mse(mse_XGBOOST, labelName="XGBOOST Regressor")
plot_mae(mae_XGBOOST, labelName="XGBOOST Regressor")

# Print the best value of each metric and the feature count that achieved it.
evaluate(r2_scores_XGBOOST, mse_XGBOOST, mae_XGBOOST)

Plotting for LightGBM

In [None]:
# Metric curves for the LightGBM sweep, one figure per metric.
plot_r_square(r2_scores_LGBM, labelName="LightGBM Regressor")
plot_mse(mse_LGBM, labelName="LightGBM Regressor")
plot_mae(mae_LGBM, labelName="LightGBM Regressor")

# Print the best value of each metric and the feature count that achieved it.
evaluate(r2_scores_LGBM, mse_LGBM, mae_LGBM)

Plotting for  CatBoost

In [None]:
# Metric curves for the CatBoost sweep, one figure per metric.
plot_r_square(r2_scores_CatBoost, labelName="CatBoost Regressor")
plot_mse(mse_CatBoost, labelName="CatBoost Regressor")
plot_mae(mae_CatBoost, labelName="CatBoost Regressor")

# Print the best value of each metric and the feature count that achieved it.
evaluate(r2_scores_CatBoost, mse_CatBoost, mae_CatBoost)

In [None]:
# Raw R² values of the CatBoost sweep (element i corresponds to i + 1 features).
r2_scores_CatBoost

In [None]:
# Sweep over k = 1..n_features: keep the k features with the highest mutual
# information, fit an ordinary linear regression on a quantile-normalised
# target, and record R², MSE and MAE on the evaluation split.
r2_scores_linear = []
mse_gradient_linear = []
mae_gradient_linear = []

# The mutual-information scores depend only on (X_train, y_train), so they are
# computed once, with a fixed seed, instead of being re-estimated (with fresh
# random noise) on every iteration. This makes the feature subsets nested and
# reproducible, and removes n_features - 1 redundant MI computations.
mi_scores_cached = mutual_info_regression(X_train, y_train, random_state=42)

# NOTE(review): k is later chosen from scores measured on X_test / y_test;
# confirm a separate hold-out exists for the final performance estimate.
for i in range(1,X_train.shape[1] + 1):
    model_linear = LinearRegression()
    # SelectKBest accepts a score function returning a single array of scores;
    # here it simply hands back the precomputed ones.
    selector = SelectKBest(lambda X, y: mi_scores_cached,k=i)
    selector.fit(X_train,y_train)

    sel_x_train = selector.transform(X_train)
    sel_x_test = selector.transform(X_test)

    # The regressor is trained on the quantile-transformed target; predictions
    # are mapped back to the original scale by the wrapper.
    target_transformer = QuantileTransformer(output_distribution='normal',random_state=42)
    wrapper_model = TransformedTargetRegressor(regressor=model_linear,transformer=target_transformer)
    wrapper_model.fit(sel_x_train,y_train)
    y_pred = wrapper_model.predict(sel_x_test)

    r2 = r2_score(y_test, y_pred)
    r2_scores_linear.append(r2)
    mse_gradient_linear.append(mean_squared_error(y_test,y_pred))
    mae_gradient_linear.append(mean_absolute_error(y_test,y_pred))

In [None]:
# Metric curves for the linear-regression sweep, one figure per metric.
plot_r_square(r2_scores_linear, labelName="Linear Regression")
plot_mse(mse_gradient_linear, labelName="Linear Regression")
plot_mae(mae_gradient_linear, labelName="Linear Regression")

# Print the best value of each metric and the feature count that achieved it.
evaluate(r2_scores_linear, mse_gradient_linear, mae_gradient_linear)