In [None]:
import os
import json
from itertools import product
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import numpy as np
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [2]:
import pandas as pd

# Load training and testing data
train_csv_path = "train_labels_cleaned.csv"
test_csv_path = "test_labels_cleaned.csv"
valid_csv_path = "validate_labels_cleaned.csv"

train_data = pd.read_csv(train_csv_path)
valid_data = pd.read_csv(valid_csv_path)
test_data = pd.read_csv(test_csv_path)

# Ensure both CSVs have 'filepath' and 'label' columns
train_file_paths = train_data['image_path'].values
train_labels = train_data['class'].values

test_file_paths = test_data['image_path'].values
test_labels = test_data['class'].values

valid_file_paths = valid_data['image_path'].values
valid_labels = valid_data['class'].values

class_labels = np.unique(train_labels)
label_encoder = LabelEncoder()

In [3]:
# Define base folder to save results
results_base_folder = "./results_rf/"
os.makedirs(results_base_folder, exist_ok=True)

In [6]:
X_train = np.load("./resnet100_train_features.npy")
y_train = label_encoder.fit_transform(train_labels)

X_val = np.load("./resnet100_val_features.npy")
y_val = label_encoder.fit_transform(valid_labels)

X_test = np.load("./resnet100_test_features.npy")
y_test = label_encoder.fit_transform(test_labels)
# # Convert numpy arrays to PyTorch tensors
# X_train = torch.tensor(X_train, dtype=torch.float32)
# y_train = torch.tensor(y_train, dtype=torch.long)

# Use only 15% of the training data
# X_train, _, y_train, _ = train_test_split(X_train_full, y_train_full, test_size=0.95, random_state=42)

In [4]:
# Define hyperparameter combinations
n_estimators_range = [50, 100, 150]
max_depth_range = [10, 20, None]
hyperparameter_combinations = product(n_estimators_range, max_depth_range)

In [9]:
%%time
# Dictionary to store results
results_dict = {}

# Hyperparameter search
best_val_acc = -np.inf
best_model_path = None
best_hyperparams = None

for n_estimators, max_depth in hyperparameter_combinations:
    print(f"Training with n_estimators={n_estimators}, max_depth={max_depth}")
    
    # Train Random Forest
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on validation set
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred, average='weighted')
    val_recall = recall_score(y_val, y_val_pred, average='weighted')
    val_f1 = f1_score(y_val, y_val_pred, average='weighted')

    # Save the best model and hyperparameters
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        best_model_path = os.path.join(results_base_folder, f"best_model_{n_estimators}_depth{max_depth}.joblib")
        joblib.dump(model, best_model_path)
        best_hyperparams = {"n_estimators": n_estimators, "max_depth": max_depth}
        print(f"Best model saved with Acc score = {val_accuracy:.4f}")

    # Save results for this combination
    results = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "val_metrics": {
            "accuracy": val_accuracy,
            "precision": val_precision,
            "recall": val_recall,
            "f1": val_f1,
        }
    }
    results_dict[f"{n_estimators}_depth{max_depth}"] = results

# Evaluate best model on train, validation, and test sets
best_model = joblib.load(best_model_path)

def evaluate(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    return accuracy, precision, recall, f1

train_metrics = evaluate(best_model, X_train, y_train)
val_metrics = evaluate(best_model, X_val, y_val)
test_metrics = evaluate(best_model, X_test, y_test)

final_results = {
    "train_metrics": {
        "accuracy": train_metrics[0],
        "precision": train_metrics[1],
        "recall": train_metrics[2],
        "f1": train_metrics[3],
    },
    "val_metrics": {
        "accuracy": val_metrics[0],
        "precision": val_metrics[1],
        "recall": val_metrics[2],
        "f1": val_metrics[3],
    },
    "test_metrics": {
        "accuracy": test_metrics[0],
        "precision": test_metrics[1],
        "recall": test_metrics[2],
        "f1": test_metrics[3],
    },
    "best_hyperparameters": best_hyperparams
}

# Save final metrics and best hyperparameters to a JSON file
metrics_path = os.path.join(results_base_folder, "final_metrics.json")
with open(metrics_path, "w") as f:
    json.dump(final_results, f, indent=4)

print(f"Final results saved to {metrics_path}")


Training with n_estimators=100, max_depth=None
Best model saved with F1 score = 0.6253
Training with n_estimators=150, max_depth=10
Training with n_estimators=150, max_depth=20
Best model saved with F1 score = 0.6282
Training with n_estimators=150, max_depth=None
Best model saved with F1 score = 0.6321
Final results saved to ./results_rf/final_metrics.json
CPU times: user 58min 30s, sys: 5.23 s, total: 58min 35s
Wall time: 59min 6s
