## Imports

In [None]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [None]:
# Relative path of the directory that holds the preprocessed parquet files.
# The trailing slash is required: file names are appended to this value with
# plain string concatenation when the splits are read.
PREPROCESSED_DATA_PATH = "../../../data/preprocessed/"

## Read Data

In [None]:
# Load the three preprocessed splits (in train / validation / test order)
# from parquet files located under PREPROCESSED_DATA_PATH.
train_df, val_df, small_test_df = (
    pd.read_parquet(PREPROCESSED_DATA_PATH + file_name)
    for file_name in ("train.parquet", "validation.parquet", "test.parquet")
)

# Stack the validation and test splits row-wise into a single held-out set.
held_out_parts = [val_df, small_test_df]
test_df = pd.concat(held_out_parts, axis=0)

## Split Training and Testing Data Into Features and Target

In [None]:
# Name of the label column; every other column is treated as a feature.
TARGET = 'copiesSold'

# Separate the label column from the feature columns of the training frame.
y_train = train_df[TARGET]
X_train = train_df.drop(columns=TARGET)

# Same separation for the held-out frame.
y_test = test_df[TARGET]
X_test = test_df.drop(columns=TARGET)

In [None]:
# Lift the column display limit so the preview shows every feature column.
pd.options.display.max_columns = None
# Preview the first five rows of the training features.
X_train.head(5)

In [None]:
# Preview the first five training labels.
y_train.head(5)

## Define Models and Hyperparameter Grids

In [None]:
# Registry of candidate classifiers, keyed by display name. Each entry holds:
#   'get_model'    - factory that builds a fresh estimator for one value of the
#                    tuned hyperparameter
#   'param_name'   - name of the tuned hyperparameter (used as a plot label)
#   'param_values' - candidate values to try for that hyperparameter
models = {}

models['LogisticRegression'] = dict(
    get_model=lambda param: LogisticRegression(C=param, max_iter=1000, solver='liblinear'),
    param_name='C',
    param_values=[1],
)

models['LinearSVC'] = dict(
    get_model=lambda param: LinearSVC(C=param, max_iter=10000),
    param_name='C',
    param_values=[0.1, 1, 10],
)

models['XGBoost'] = dict(
    get_model=lambda param: XGBClassifier(learning_rate=param, eval_metric='logloss', n_jobs=-1),
    param_name='learning_rate',
    param_values=[0.1, 0.2, 0.3],
)

## Feature Selection and Hyperparameter Tuning

In [None]:
# For every model in `models`: run RFECV once per candidate hyperparameter
# value, score the resulting feature subset with 3-fold CV, plot the tuning
# curve, and remember the best value together with its fitted selector.
#
# Outputs (both keyed by model name):
#   best_params    - hyperparameter value with the highest mean CV accuracy
#   best_selectors - the fitted RFECV that produced that score
best_params = {}
best_selectors = {}

for model_name, info in models.items():
    scores = []
    # Fitted selectors, index-aligned with `scores`, so the winner can be
    # reused instead of running the expensive RFECV fit a second time.
    selectors = []
    for val in info['param_values']:
        model = info['get_model'](val)
        # Feature Selection, RFECV automatically finds the best number of features
        print(f"\nRunning RFECV for {model_name}...")
        selector = RFECV(estimator=model, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
        X_sel = selector.fit_transform(X_train, y_train)
        selectors.append(selector)
        # Evaluate performance on selected features with the current hyperparameter.
        # NOTE: the features were selected using all of X_train/y_train before
        # this CV runs, so the scores are optimistically biased; they are used
        # here only to rank candidate values against each other.
        print(f"\nRunning CV for {model_name}...")
        cv_scores = cross_val_score(model, X_sel, y_train, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
        scores.append(np.mean(cv_scores))

    # Plot hyperparameter tuning curve
    plt.figure()
    plt.plot(info['param_values'], scores, marker='o')
    plt.xlabel(info['param_name'])
    plt.ylabel('CV Accuracy')
    plt.title(f'{model_name} Hyperparameter Tuning')
    plt.show()

    # Record the best hyperparameter and the selector that was fitted for it.
    # Reusing that selector keeps the stored feature subset identical to the
    # one the recorded score was computed on (np.argmax picks the first
    # maximum on ties).
    best_index = int(np.argmax(scores))
    best_params[model_name] = info['param_values'][best_index]
    best_selectors[model_name] = selectors[best_index]

## Training and Testing

In [None]:
# Train final models and collect metrics.
# Each dict is keyed by model name:
#   train_times - seconds spent in model.fit on the selected training features
#   test_times  - seconds spent in model.predict on the selected test features
#   accuracies  - accuracy_score of the predictions against y_test
train_times = {}
test_times = {}
accuracies = {}

for model_name, info in models.items():
    # Reduce both splits to the feature subset chosen for this model.
    selector = best_selectors[model_name]
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)
    model = info['get_model'](best_params[model_name])

    # Training. perf_counter is a monotonic high-resolution clock, so short
    # intervals are not rounded to zero or skewed by system clock adjustments.
    start = time.perf_counter()
    model.fit(X_train_sel, y_train)
    train_times[model_name] = time.perf_counter() - start

    # Testing
    start = time.perf_counter()
    y_pred = model.predict(X_test_sel)
    test_times[model_name] = time.perf_counter() - start

    # Accuracy
    accuracies[model_name] = accuracy_score(y_test, y_pred)

# One bar plot per metric, in the order: test accuracy, training time,
# testing time.
for metric, y_label, plot_title in (
    (accuracies, 'Accuracy', 'Test Accuracy Comparison'),
    (train_times, 'Training Time (s)', 'Training Time Comparison'),
    (test_times, 'Testing Time (s)', 'Testing Time Comparison'),
):
    plt.figure()
    plt.bar(list(metric.keys()), list(metric.values()))
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.show()