## Imports

In [79]:
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import seaborn as sns

In [80]:
# Directory (relative to the notebook's working directory) that holds the
# preprocessed parquet splits. The trailing slash is required because file
# names are appended to it by plain string concatenation.
PREPROCESSED_DATA_PATH = "../../../data/preprocessed/"

## Read Data

In [81]:
# Read the train / validation / test parquet files in that order.
train_df, val_df, small_test_df = (
    pd.read_parquet(PREPROCESSED_DATA_PATH + file_name)
    for file_name in ("train.parquet", "validation.parquet", "test.parquet")
)

# The evaluation set is the validation rows stacked on top of the test rows.
test_df = pd.concat([val_df, small_test_df], axis=0)

## Split Data Into Training and Testing

In [82]:
TARGET = 'copiesSold'

# Separate the label column from the predictors for both splits.
X_train, y_train = train_df.drop(columns=TARGET), train_df[TARGET]
X_test, y_test = test_df.drop(columns=TARGET), test_df[TARGET]

In [83]:
# Lift the column cap so every feature is visible in the preview below.
# This setting is global and stays in effect for the rest of the session.
pd.options.display.max_columns = None
X_train.head()

Unnamed: 0_level_0,steam_achievements,steam_trading_cards,workshop_support,achievements_total,is_release_date_known,is_upcoming,year,sin_day,cos_day,price,reviewScore,has_demo,demo_count,has_dlc,dlc_count,metacritic_preprocessed,has_metacritic,genre_Action,genre_Adventure,genre_Casual,genre_Early Access,genre_Free To Play,genre_Gore,genre_Indie,genre_Massively Multiplayer,genre_Nudity,genre_Other,genre_RPG,genre_Racing,genre_Sexual Content,genre_Simulation,genre_Sports,genre_Strategy,genre_Violent,platform_linux,platform_mac,platform_windows,name_len,name_words,name_cap_ratio,is_sequel,name_has_vr,name_has_remaster,name_has_collector,name_has_collection,name_has_edition,name_has_bundle,name_has_playtest,publisherClass_Indie,publisherClass_Other
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
11655,0,0,0,-0.127185,1,0.0,2024,0.230306,-0.973118,-0.959158,-0.331901,0,0,0,0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-0.129008,-0.485228,-0.42215,-0.241008,-0.141551,-0.018005,-0.025111,-0.045946,-0.125187,-0.012731,0.0,0.0,0.0
9303,0,0,0,-0.127185,1,0.0,2024,-0.060213,-0.998186,-1.631332,1.07229,0,0,0,0,0.0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,-0.430898,0.109341,0.140908,-0.241008,-0.141551,-0.018005,-0.025111,-0.045946,-0.125187,-0.012731,0.0,0.0,0.0
56618,0,0,0,-0.127185,1,0.0,2007,0.179767,-0.983709,-0.06118,-0.331901,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0.575401,0.109341,0.301146,-0.241008,-0.141551,-0.018005,-0.025111,-0.045946,-0.125187,-0.012731,0.0,1.0,0.0
55579,1,0,0,0.105103,1,0.0,2019,-0.188227,0.982126,-1.631332,-0.331901,0,0,0,0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,-0.129008,0.703911,0.243282,-0.241008,7.064584,-0.018005,-0.025111,-0.045946,-0.125187,-0.012731,0.0,1.0,0.0
64439,1,0,0,-0.069113,1,0.0,2022,0.280231,-0.959933,-1.631332,1.180305,0,0,1,1,0.0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,-0.833418,-1.079798,-0.496087,-0.241008,-0.141551,-0.018005,-0.025111,-0.045946,-0.125187,-0.012731,0.0,0.0,0.0


In [84]:
# Preview the first few target values to sanity-check the label column.
y_train.head()

index
11655    0
9303     0
56618    0
55579    1
64439    1
Name: copiesSold, dtype: int64

## Define Models and Hyperparameter Grids

In [85]:
# models = {
#     'LogisticRegression': {
#         'get_model': lambda param: LogisticRegression(C=param, max_iter=1000, solver='liblinear'),
#         'param_name': 'C',
#         'param_values': [1]
#     },
#     'LinearSVC': {
#         'get_model': lambda param: LinearSVC(C=param, max_iter=10000),
#         'param_name': 'C',
#         'param_values': [0.1, 1, 10]
#     },
#     'XGBoost': {
#         'get_model': lambda param: XGBClassifier(learning_rate=param, eval_metric='logloss', n_jobs=-1),
#         'param_name': 'learning_rate',
#         'param_values': [0.1, 0.2, 0.3]
#     }
# }

## Feature Selection and Hyperparameter Tuning

In [86]:
# best_params = {}
# best_selectors = {}
#
# for model_name, info in models.items():
#     scores = []
#     for val in info['param_values']:
#         model = info['get_model'](val)
#         # Feature Selection, RFECV automatically finds the best number of features
#         print(f"\nRunning RFECV for {model_name}...")
#         selector = RFECV(estimator=model, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
#         X_sel = selector.fit_transform(X_train, y_train)
#         # Evaluate performance on selected features with the current hyperparameter
#         print(f"\nRunning CV for {model_name}...")
#         cv_scores = cross_val_score(model, X_sel, y_train, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
#         scores.append(np.mean(cv_scores))
#
#     # Plot hyperparameter tuning curve
#     plt.figure()
#     plt.plot(info['param_values'], scores, marker='o')
#     plt.xlabel(info['param_name'])
#     plt.ylabel('CV Accuracy')
#     plt.title(f'{model_name} Hyperparameter Tuning')
#     plt.show()
#
#     # Record best hyperparameter and corresponding RFECV selector (which contains the best features selected for this model)
#     best_index = int(np.argmax(scores))
#     best_param = info['param_values'][best_index]
#     best_params[model_name] = best_param
#     best_model = info['get_model'](best_param)
#     best_selectors[model_name] = RFECV(estimator=best_model, cv=3, scoring='accuracy')
#     best_selectors[model_name].fit(X_train, y_train)

## Training and Testing

In [87]:
# # Train final models and collect metrics
# train_times = {}
# test_times = {}
# accuracies = {}
#
# for model_name, info in models.items():
#     selector = best_selectors[model_name]
#     X_train_sel = selector.transform(X_train)
#     X_test_sel = selector.transform(X_test)
#     model = info['get_model'](best_params[model_name])
#
#     # Training
#     start = time.time()
#     model.fit(X_train_sel, y_train)
#     train_times[model_name] = time.time() - start
#
#     # Testing
#     start = time.time()
#     y_pred = model.predict(X_test_sel)
#     test_times[model_name] = time.time() - start
#
#     # Accuracy
#     accuracies[model_name] = accuracy_score(y_test, y_pred)
#
# # Bar plot: Test Accuracy
# print(list(accuracies.keys()))
# print(list(round(accuracy, 3) for accuracy in accuracies.values()))
# plt.figure()
# plt.bar(list(accuracies.keys()), list(accuracies.values()))
# plt.ylabel('Accuracy')
# plt.title('Test Accuracy Comparison')
# plt.show()
#
# # Bar plot: Training Time
# plt.figure()
# plt.bar(list(train_times.keys()), list(train_times.values()))
# plt.ylabel('Training Time (s)')
# plt.title('Training Time Comparison')
# plt.show()
#
# # Bar plot: Testing Time
# plt.figure()
# plt.bar(list(test_times.keys()), list(test_times.values()))
# plt.ylabel('Testing Time (s)')
# plt.title('Testing Time Comparison')
# plt.show()

In [88]:
# Search space for every classifier that is tuned below.
#
# Layout of each entry:
#   'class'  - estimator class to instantiate
#   'params' - one list of candidate values per tunable hyperparameter, plus a
#              'fixed' dict of keyword arguments passed to every instance.
# Insertion order matters: hyperparameters are tuned in the order listed.
models = {
    'LogisticRegression': {
        'class': LogisticRegression,
        'params': {
            'C': [0.01, 0.1, 1],
            # 'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'lbfgs', 'saga'],
            'fixed': {
                # 'solver': 'liblinear',
                'penalty': 'l2',
                'max_iter': 5000,
                'class_weight': 'balanced',
            }
        }
    },
    'SVC': {
        'class': SVC,
        'params': {
            'C': [0.1, 1.0, 10],
            'kernel': ['linear', 'rbf', 'poly'],
            'degree': [2, 3, 4],                    # Polynomial degree (active when kernel = 'poly')
            'fixed': {
                'max_iter': 10000,
                # 'probability' is deliberately NOT set here. With
                # probability=True every SVC.fit runs an extra internal
                # 5-fold cross-validation to calibrate probabilities, which
                # multiplies the cost of each fit done during tuning while
                # leaving predict() (and therefore accuracy) unchanged.
                # The final training cell enables it itself right before
                # fitting the one model it calls predict_proba on.
                'class_weight': 'balanced'
            }
        }
    },
    'XGBoost': {
        'class': XGBClassifier,
        'params': {
            'learning_rate': [0.01, 0.05, 0.1],
            'max_depth': [3, 5, 7],
            'fixed': {
                'eval_metric': 'mlogloss',
                'use_label_encoder': False,
                'n_jobs': -1
            }
        }
    }
}

In [89]:
def _make_ranking_estimator(model_class, model_args):
    """Build the estimator RFECV uses to rank features for one configuration.

    RFECV needs the fitted estimator to expose ``coef_`` or
    ``feature_importances_``. ``SVC`` only exposes ``coef_`` with a linear
    kernel (and its default kernel is 'rbf'), so handing it to RFECV with any
    other kernel raises. For those configurations the ranking is done by a
    linear-kernel SVC that shares all remaining arguments; every other
    estimator is used exactly as configured.
    """
    if model_class is SVC and model_args.get('kernel', 'rbf') != 'linear':
        return model_class(**{**model_args, 'kernel': 'linear'})
    return model_class(**model_args)


def _fit_selector(model_class, model_args):
    """Fit and return an RFECV feature selector on the training data."""
    selector = RFECV(
        _make_ranking_estimator(model_class, model_args),
        cv=3, scoring='accuracy', n_jobs=-1
    )
    selector.fit(X_train, y_train)
    return selector


def _score_config(model_class, model_args):
    """Return the mean 3-fold CV accuracy of one configuration.

    The model is scored on the columns kept by an RFECV selector fitted for
    the same configuration.
    NOTE(review): the selector is fitted on all of X_train before the CV
    split, so these scores are optimistic; they are only used to rank
    candidate values against each other.
    """
    X_sel = _fit_selector(model_class, model_args).transform(X_train)
    return cross_val_score(
        model_class(**model_args), X_sel, y_train,
        cv=3, scoring='accuracy', n_jobs=-1
    ).mean()


def _sweep_parameter(model_class, base_args, param_name, param_values):
    """Score every candidate value of one hyperparameter on top of ``base_args``."""
    scores = []
    for value in param_values:
        print(f"  Testing {param_name} = {value}")
        score = _score_config(model_class, {**base_args, param_name: value})
        scores.append(score)
        print(f"    Score: {score:.4f}")
    return scores


def _record_best(model_name, param_name, param_values, scores):
    """Keep the sweep for plotting and remember the winning value and its score."""
    param_plot_data[model_name][param_name] = {
        'values': param_values,
        'scores': scores
    }
    best_idx = int(np.argmax(scores))
    best_params[f"{model_name}_{param_name}"] = param_values[best_idx]
    best_scores[f"{model_name}_{param_name}"] = scores[best_idx]


best_params = {}      # "<model>_<param>" -> best value found
best_selectors = {}   # model name -> fitted RFECV used by the final training cell
best_scores = {}      # "<model>_<param>" -> CV accuracy of the best value
param_plot_data = {model: {} for model in models}

for model_name, config in models.items():
    print(f"\n=== Tuning {model_name} ===")
    model_class = config['class']
    fixed_params = config['params'].get('fixed', {})

    # Each hyperparameter is swept on its own, with the others left at the
    # estimator's defaults.
    for param_name, param_values in config['params'].items():
        if param_name == 'fixed':
            continue

        # Skip degree for now — handle after kernel is selected
        if model_name == 'SVC' and param_name == 'degree':
            continue

        print(f"\nTuning {param_name}...")
        param_scores = _sweep_parameter(model_class, fixed_params, param_name, param_values)
        _record_best(model_name, param_name, param_values, param_scores)

    # Special case: if SVC and best kernel is poly, tune degree now
    if model_name == 'SVC' and best_params.get('SVC_kernel') == 'poly':
        print(f"\nTuning degree for {model_name} since kernel='poly'")
        degree_values = config['params']['degree']
        poly_args = {
            **fixed_params,
            'kernel': 'poly',
            'C': best_params['SVC_C']  # use best C found
        }
        degree_scores = _sweep_parameter(model_class, poly_args, 'degree', degree_values)
        _record_best(model_name, 'degree', degree_values, degree_scores)

    # The final training cell builds each model from ALL of its best values
    # combined, so the selector it applies is fitted once, here, for that same
    # combined configuration (rather than for whichever parameter happened to
    # be tuned last).
    prefix = f"{model_name}_"
    tuned_params = {
        key[len(prefix):]: value
        for key, value in best_params.items()
        if key.startswith(prefix)
    }
    best_selectors[model_name] = _fit_selector(model_class, {**fixed_params, **tuned_params})


=== Tuning LogisticRegression ===

Tuning C...
  Testing C = 0.01


KeyboardInterrupt: 

In [None]:
# Summary: one line per tuned hyperparameter with its best value and CV score.
print("\n=== Best Parameters ===")
for param, val in best_params.items():
    score = best_scores.get(param)
    # Format the score only when one was recorded; applying ':.4f' to the
    # 'N/A' placeholder string would raise ValueError.
    score_text = f"{score:.4f}" if score is not None else "N/A"
    print(f"{param}: {val} (score: {score_text})")

# One figure per (model, hyperparameter) sweep: accuracy against candidate value.
print("\n=== Plotting Accuracy vs Hyperparameter ===")
for model_name, param_data in param_plot_data.items():
    for param_name, curve in param_data.items():
        candidates, accuracy = curve['values'], curve['scores']
        fig, ax = plt.subplots(figsize=(8, 5))

        # Categorical candidates (e.g. kernel names) get bars; numeric ones a line.
        is_categorical = isinstance(candidates[0], str)
        if is_categorical:
            sns.barplot(x=candidates, y=accuracy, ax=ax)
        else:
            sns.lineplot(x=candidates, y=accuracy, marker='o', ax=ax)

        ax.set_title(f"{model_name}: Accuracy vs {param_name}")
        ax.set_xlabel(param_name)
        ax.set_ylabel("Accuracy")
        ax.grid(True)
        fig.tight_layout()
        plt.show()


In [None]:
# Per-model results, keyed by model name.
train_times = {}   # seconds spent in fit()
test_times = {}    # seconds spent predicting the test set
accuracies = {}    # test-set accuracy

for model_name, info in models.items():
    # Keep only the features chosen for this model during tuning.
    selector = best_selectors[model_name]
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)

    # Reconstruct best parameters: fixed arguments plus every tuned value
    # stored under "<model_name>_<param>".
    model_class = info['class']
    model_args = {**info['params'].get('fixed', {})}

    prefix = model_name + "_"
    for key, val in best_params.items():
        if key.startswith(prefix):
            # Strip only the leading prefix; str.replace would also remove
            # any later occurrence inside the parameter name.
            model_args[key[len(prefix):]] = val

    # Enable probabilities for SVM
    if model_name == "SVC":
        model_args["probability"] = True

    model = model_class(**model_args)

    # Train (perf_counter is monotonic, so the elapsed time cannot be
    # distorted by system clock adjustments).
    start = time.perf_counter()
    model.fit(X_train_sel, y_train)
    train_times[model_name] = time.perf_counter() - start

    # Predict
    start = time.perf_counter()

    if model_name == "SVC":
        y_proba = model.predict_proba(X_test_sel)
        # argmax gives a column index into predict_proba's output; map it
        # through classes_ to get the actual class label.
        y_pred = model.classes_[np.argmax(y_proba, axis=1)]
    else:
        y_pred = model.predict(X_test_sel)

    test_times[model_name] = time.perf_counter() - start

    # Accuracy
    accuracies[model_name] = accuracy_score(y_test, y_pred)

# Three bar charts comparing the models: test accuracy, training time, testing time.
comparison_charts = (
    (accuracies, 'Accuracy', 'Test Accuracy Comparison'),
    (train_times, 'Training Time (s)', 'Training Time Comparison'),
    (test_times, 'Testing Time (s)', 'Testing Time Comparison'),
)

for per_model, y_label, chart_title in comparison_charts:
    plt.figure()
    plt.bar(list(per_model.keys()), list(per_model.values()))
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.show()
