In [None]:
# Parameters
# Every value below defaults to None and must be overridden before the later cells run
# (the coercion cell calls int()/float() on several of them, which fails on None).
# NOTE(review): presumably overridden by a parameter-injection tool such as papermill — confirm.

# File paths. model_file is where the trained model is written and later re-read;
# feature_file / target_file are presumably the .npy feature and label arrays — confirm.
feature_file = None
target_file = None
model_file = None

# ViT architecture settings, forwarded to ViTConfig.
# patch_x / patch_y form the patch_size pair, in that order.
patch_x = None
patch_y = None
hidden_size = None
num_hidden_layers = None
num_attention_heads = None
intermediate_size = None
hidden_dropout_prob = None
attention_probs_dropout_prob = None

# Data split and training settings.
# NOTE(review): lr is presumably the optimizer learning rate — confirm against the optimizer cell.
test_size = None
lr = None
num_epochs = None
batch_size = None
random_state = None

In [None]:
# Coerce the ViT hyperparameters to numeric types before they reach the model config.
# NOTE(review): presumably needed because injected parameters can arrive as strings — confirm.

# Integer-valued settings (patch geometry and layer sizes).
(
    patch_x,
    patch_y,
    hidden_size,
    num_hidden_layers,
    num_attention_heads,
    intermediate_size,
) = (
    int(patch_x),
    int(patch_y),
    int(hidden_size),
    int(num_hidden_layers),
    int(num_attention_heads),
    int(intermediate_size),
)

# Float-valued settings (dropout probabilities).
hidden_dropout_prob, attention_probs_dropout_prob = (
    float(hidden_dropout_prob),
    float(attention_probs_dropout_prob),
)

In [None]:
# array data manipulation and plotting
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import ViTConfig, ViTForImageClassification

# machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, \
    accuracy_score, \
    f1_score, \
    auc, \
    recall_score, \
    precision_score, \
    precision_recall_curve, \
    roc_curve
from sklearn.metrics import confusion_matrix

In [None]:
# IPython shell escape: run the `nvidia-smi` command and show its output in the cell.
!nvidia-smi

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
# Resolve the data settings from the notebook parameters. A parameter left as None falls
# back to the value this cell previously hard-coded, so an un-parameterised run behaves
# as before while injected parameters are no longer silently overwritten.
feature_path = feature_file if feature_file is not None else "dbr/mel.npy"
target_path = target_file if target_file is not None else "dbr/y.npy"
test_size = test_size if test_size is not None else 0.2
batch_size = int(batch_size) if batch_size is not None else 100
random_state = int(random_state) if random_state is not None else 42

# Load the feature array and the label vector.
X = np.load(feature_path)
y = np.load(target_path)

# Encode the raw labels as consecutive integer class ids; `le` is kept so the class names
# can be recovered for reporting later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Convert to tensors. unsqueeze(1) inserts a singleton axis at position 1.
# NOTE(review): assumes X is (n_samples, height, width), making axis 1 the channel axis — confirm.
y_tensor = torch.tensor(y_encoded, dtype=torch.long)
X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(1)

# Stratified split so both partitions keep the class proportions of the full label set.
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor, y_tensor, test_size=test_size, stratify=y_tensor, random_state=random_state
)

# Wrap the splits in datasets/loaders. Only the training loader is shuffled.
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
# Display the training-split label tensor as the cell output.
y_train

In [None]:
# Derive the input geometry and the class count from the loaded data.
sample_shape = X[0].shape
input_size = (sample_shape[0], sample_shape[1])
n_classes = len(np.unique(y_encoded))

# Collect the ViT settings in one place, then build the configuration from them.
vit_settings = {
    "image_size": input_size,
    "patch_size": (patch_x, patch_y),
    "num_channels": 1,  # Adjust for your input data (e.g., 1 for grayscale, 3 for RGB)
    "hidden_size": hidden_size,
    "num_hidden_layers": num_hidden_layers,
    "num_attention_heads": num_attention_heads,
    "intermediate_size": intermediate_size,
    "hidden_dropout_prob": hidden_dropout_prob,
    "attention_probs_dropout_prob": attention_probs_dropout_prob,
    "num_labels": n_classes,
}
config = ViTConfig(**vit_settings)

# Instantiate the classifier from the configuration (not from a pretrained checkpoint).
model = ViTForImageClassification(config)

In [None]:
# Cross-entropy loss over the class logits.
criterion = torch.nn.CrossEntropyLoss()

# Use the `lr` notebook parameter when it was supplied (coerced to float in case it was
# injected as a string); otherwise keep the previous default of 1e-4.
learning_rate = float(lr) if lr is not None else 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop: one optimizer step per mini-batch, mean batch loss reported per epoch.

# Coerce the epoch count like the other integer hyperparameters so that a string-injected
# value works with range(). A missing (None) value still fails here, before any training.
num_epochs = int(num_epochs)

model.to(device)

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0

    for inputs, labels in train_loader:

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Pass to GPU (or whichever device was selected)
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(inputs)

        # Compute the loss from the raw logits and the integer class labels
        loss = criterion(outputs.logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate the loss
        running_loss += loss.item()

    # Print the average loss for this epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(train_loader):.4f}")

In [None]:
# Persist the entire model object (not just its weights) to the path in `model_file`.
torch.save(obj=model, f=model_file)

In [None]:
# Reload the full model object from `model_file`.
# map_location=device remaps the stored tensors onto the device selected for this session,
# so a checkpoint saved on a GPU can still be opened on a CPU-only machine.
# SECURITY: weights_only=False unpickles arbitrary Python objects — only load files you trust.
model = torch.load(model_file, map_location=device, weights_only=False)

In [None]:
# Evaluate the model on the test loader and report accuracy plus per-class metrics.
model.to(device)

model.eval()  # Switch to evaluation mode
all_preds = []
all_labels = []

with torch.no_grad():  # No gradients are needed for inference
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Take the arg-max over the class axis with torch so it works on any device;
        # the result is a tensor, which the .cpu() call below requires.
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")

# Print classification report, labelled with the original class names from the encoder
print(classification_report(all_labels, all_preds, target_names=le.classes_))

In [None]:
# Confusion matrix of true vs. predicted classes, drawn as an annotated heatmap.
cm = confusion_matrix(all_labels, all_preds)
class_names = le.classes_

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()