In [210]:
import os
import time
import torch
import pandas as pd
import numpy as np
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, confusion_matrix
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
import argparse
import sys
import random
import pickle
import os
# export PYTHONPATH="../quantumVM:$PYTHONPATH"
sys.path.insert(0,'../')
from qnet import *

data_path = "./E_NET_EMBEDDINGS/efficientnet_b3"
# data_path = "./E_NET_EMBEDDINGS/urban8k_efficientnet_b3"
variance_threshold=0.1#0.95,0.8,0.75,0.6,0.5,0.4


In [211]:
def create_args(data_path,variance_threshold):
    parser = argparse.ArgumentParser(description="Train a simple MLP or SVC with K-Fold Cross Validation")
    parser.add_argument('--data_path', type=str, default=data_path, help='Path to the dataset directory containing CSV files')
    parser.add_argument('--pca', action='store_true', help='Perform evaluation only')
    parser.add_argument('--variance_threshold', type=float, default=variance_threshold, help='Batch size for training (only applicable for MLP)')
    args, unknown = parser.parse_known_args()
    return args



args = create_args(data_path, variance_threshold)


In [212]:

root_PCA = f"{args.data_path}_PCA"
os.makedirs(root_PCA, exist_ok=True)

folders = os.listdir(args.data_path)
print("Folders: ", folders)

saved_embeddings = []

for folder in folders:
    folder_path = os.path.join(args.data_path, folder)
    
    if not os.path.isdir(folder_path):
        continue

    print(f"FOLDER: {folder}".center(60, "-"))

    train_csv = os.path.join(folder_path, 'train_embeddings.csv')
    val_csv = os.path.join(folder_path, 'val_embeddings.csv')

    if not os.path.exists(train_csv) or not os.path.exists(val_csv):
        print(f"CSV files not found in {folder}, skipping.")
        continue

    print(f"Train data: {train_csv}")
    print(f"Val data: {val_csv}")

    exp = f"{folder}_PCA{args.variance_threshold}"
    experiment_folder = os.path.join(root_PCA, exp)
    os.makedirs(experiment_folder, exist_ok=True)
    print(f"Output folder: {experiment_folder}")

    # Assuming EmbeddingDataset is a custom class to load features and labels
    train_dataset = EmbeddingDataset(train_csv, shuffle=True)
    val_dataset = EmbeddingDataset(val_csv, shuffle=False)

    X_train, y_train = train_dataset.features, train_dataset.labels
    X_val, y_val = val_dataset.features, val_dataset.labels

    print(f"EMBEDDINGS SHAPES".center(60, " "))
    print(f"Training: X: {X_train.shape} - y: {y_train.shape}\nValidation: X: {X_val.shape} - y: {y_val.shape}")

    # Apply PCA
    X_train_reduced, pca_model = apply_pca(X_train, variance_threshold=args.variance_threshold)
    X_val_reduced = pca_model.transform(X_val)

    print(f"PCA EMBEDDINGS SHAPES".center(60, " "))
    print(f"Training: X: {X_train_reduced.shape} - Validation: X: {X_val_reduced.shape}")

    # Combine the features and labels
    X_train_with_labels = np.hstack([X_train_reduced, y_train.reshape(-1, 1)])
    X_val_with_labels = np.hstack([X_val_reduced, y_val.reshape(-1, 1)])

    # Create column names: feat1, feat2, ..., featN, label
    num_features = X_train_reduced.shape[1]
    columns = [f"feat{i+1}" for i in range(num_features)] + ['label']

    # Save as DataFrame with column names
    train_df = pd.DataFrame(X_train_with_labels, columns=columns)
    val_df = pd.DataFrame(X_val_with_labels, columns=columns)

    # Define save paths
    train_save_path = os.path.join(experiment_folder, f'train_embeddings.csv')
    val_save_path = os.path.join(experiment_folder, f'val_embeddings.csv')
    
    # Save to CSV with column names
    train_df.to_csv(train_save_path, index=False)
    val_df.to_csv(val_save_path, index=False)
    
    saved_embeddings.append({
        'train': train_save_path,
        'val': val_save_path
    })

    print("\nEmbeddings were saved in the following locations:")
for saved in saved_embeddings:
    print(f"Train: {saved['train']}")
    print(f"Val: {saved['val']}")

print("Finished processing..")

Folders:  ['efficientnet_b3_8_bs64', 'efficientnet_b3_32_bs64', 'efficientnet_b3_64_bs64', 'efficientnet_b3_2_bs64', 'efficientnet_b3_1536_bs64', 'efficientnet_b3_16_bs64', '.ipynb_checkpoints', 'efficientnet_b3_4_bs64']
---------------FOLDER: efficientnet_b3_8_bs64---------------
Train data: ./E_NET_EMBEDDINGS/efficientnet_b3/efficientnet_b3_8_bs64/train_embeddings.csv
Val data: ./E_NET_EMBEDDINGS/efficientnet_b3/efficientnet_b3_8_bs64/val_embeddings.csv
Output folder: ./E_NET_EMBEDDINGS/efficientnet_b3_PCA/efficientnet_b3_8_bs64_PCA0.1
                     EMBEDDINGS SHAPES                      
Training: X: (9152, 8) - y: (9152,)
Validation: X: (1050, 8) - y: (1050,)
                   PCA EMBEDDINGS SHAPES                    
Training: X: (9152, 1) - Validation: X: (1050, 1)

Embeddings were saved in the following locations:
--------------FOLDER: efficientnet_b3_32_bs64---------------
Train data: ./E_NET_EMBEDDINGS/efficientnet_b3/efficientnet_b3_32_bs64/train_embeddings.csv
Val da