In [None]:
# Parameters
# Placeholder values only. Later cells call int()/float() on several of these,
# so they must be overwritten with real values before the notebook is run
# (presumably injected by a papermill-style runner — TODO confirm).

# --- File paths ---
feature_file = None  # path handed to np.load() to read the feature array X
target_file = None  # path handed to np.load() to read the label array y
model_file = None  # path the trained model is saved to and reloaded from

# --- Model architecture ---
patch_x = None  # first component of the (patch_x, patch_y) patch size
patch_y = None  # second component of the (patch_x, patch_y) patch size
embed_dim = None  # token width: conv output channels and transformer d_model
depth = None  # number of transformer encoder layers
num_heads = None  # attention heads per encoder layer
dropout = None  # dropout probability passed to each encoder layer

# --- Data split / training ---
test_size = None  # forwarded to train_test_split(test_size=...)
lr = None  # learning-rate setting (float expected — TODO confirm where it is consumed)
num_epochs = None  # number of passes over the training loader
batch_size = None  # batch size for both the train and test DataLoader
random_state = None  # forwarded to train_test_split(random_state=...)

In [None]:
# Coerce the injected parameters to the numeric types the later cells need
# (values may arrive as strings, e.g. when supplied from a command line).

# Model architecture
patch_x = int(patch_x)
patch_y = int(patch_y)
embed_dim = int(embed_dim)
depth = int(depth)
num_heads = int(num_heads)
dropout = float(dropout)

# Training parameters get the same treatment: range(num_epochs) and
# DataLoader(batch_size=...) both reject string / float values.
num_epochs = int(num_epochs)
batch_size = int(batch_size)

# lr is optional, so only coerce it when a value was actually provided.
if lr is not None:
    lr = float(lr)

# test_size and random_state are intentionally left as-is: train_test_split
# treats int and float test_size differently, and random_state may be None.

In [None]:
# array data manipulation and plotting
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import ViTConfig, ViTForImageClassification

# machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, \
    accuracy_score, \
    f1_score, \
    auc, \
    recall_score, \
    precision_score, \
    precision_recall_curve, \
    roc_curve
from sklearn.metrics import confusion_matrix

In [None]:
# Shell out to nvidia-smi to show the GPUs visible to this machine; the output
# is only displayed, nothing below reads it.
!nvidia-smi

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# --- Read the feature and label arrays from disk ---
X = np.load(feature_file)
y = np.load(target_file)

# --- Turn the raw labels into integer class ids ---
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# --- Build tensors; the features gain a singleton channel axis at dim 1 ---
X_tensor = torch.unsqueeze(torch.tensor(X, dtype=torch.float32), dim=1)
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

# --- Stratified train / test split (class proportions follow y_tensor) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor,
    y_tensor,
    test_size=test_size,
    random_state=random_state,
    stratify=y_tensor,
)

# --- Wrap each split in a dataset and a loader; only training is shuffled ---
train_data, test_data = TensorDataset(X_train, y_train), TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
class AcousticViT(nn.Module):
    """Compact Vision-Transformer-style classifier for single-channel 2-D inputs.

    Pipeline: strided ``Conv2d`` patch embedding -> additive learned encoding
    -> ``nn.TransformerEncoder`` -> mean pooling over patches -> linear head.

    Args:
        num_classes: Number of output logits.
        patch_size: Kernel size and stride of the patch-embedding convolution
            (an int or an ``(h, w)`` tuple), i.e. patches do not overlap.
        embed_dim: Width of each patch token; also used as the encoder's
            ``d_model`` and ``dim_feedforward``.
        num_heads: Attention heads per encoder layer.
        depth: Number of stacked encoder layers.
        dropout: Dropout probability inside each encoder layer.
        num_patches: Optional number of patches produced for the expected
            input size. When given, a separate learned vector is kept for every
            patch position (shape ``(1, num_patches, embed_dim)``). When left
            as ``None`` the original behaviour is kept: a single
            ``(1, embed_dim)`` vector is added to every patch, which acts as a
            shared bias and carries no position information.
    """

    def __init__(self, num_classes, patch_size, embed_dim, num_heads, depth, dropout,
                 num_patches=None):
        super().__init__()
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.depth = depth

        # Patch embedding: one output vector per non-overlapping patch.
        self.patch_embedding = nn.Conv2d(1, embed_dim, kernel_size=patch_size, stride=patch_size)

        # Learned encoding added to the patch tokens (see `num_patches` above).
        if num_patches is None:
            pos_shape = (1, embed_dim)
        else:
            pos_shape = (1, int(num_patches), embed_dim)
        self.positional_encoding = nn.Parameter(torch.randn(*pos_shape))

        # Transformer encoder operating on (batch, tokens, embed_dim).
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim,
                                                   nhead=num_heads,
                                                   dim_feedforward=embed_dim,
                                                   dropout=dropout,
                                                   batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        # Classification head applied to the pooled token representation.
        self.classification_head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)``.

        Args:
            x: Batch of single-channel inputs, ``(B, 1, H, W)``.

        Raises:
            ValueError: If a per-patch encoding was requested and the input
                yields a different number of patches than it was built for.
        """
        # Patch embedding
        x = self.patch_embedding(x)  # (B, embed_dim, H', W')
        x = x.flatten(2)  # (B, embed_dim, num_patches)
        x = x.permute(0, 2, 1)  # (B, num_patches, embed_dim)

        # Learned encoding. The check reads the parameter's own shape so that
        # instances created without `num_patches` keep working unchanged.
        pos = self.positional_encoding
        if pos.dim() == 3 and pos.size(1) != x.size(1):
            raise ValueError(
                f"input produces {x.size(1)} patches but the positional encoding "
                f"was built for {pos.size(1)}"
            )
        # Out-of-place add: `x` is a view of the conv output, so avoid mutating it.
        x = x + pos

        # Transformer encoder
        x = self.transformer_encoder(x)

        # Classification head on the mean over patches
        x = x.mean(dim=1)  # (B, embed_dim)
        return self.classification_head(x)

In [None]:
# Instantiate the classifier from the notebook parameters; the output width is
# the number of distinct encoded labels.
model = AcousticViT(
    num_classes=np.unique(y_encoded).size,
    patch_size=(patch_x, patch_y),
    embed_dim=embed_dim,
    depth=depth,
    num_heads=num_heads,
    dropout=dropout,
)

In [None]:
# Cross-entropy loss over the logits returned by the model.
criterion = torch.nn.CrossEntropyLoss()

# Adam driven by the notebook's `lr` parameter; 1e-4 is used only as the
# fallback when `lr` was left unset (None).
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=float(lr) if lr is not None else 1e-4,
)

In [None]:
# Training loop

model.to(device)

for epoch in range(num_epochs):
    # Training mode for this epoch
    model.train()
    epoch_loss = 0.0

    for inputs, labels in train_loader:
        # Move the batch onto the device the model lives on
        inputs, labels = inputs.to(device), labels.to(device)

        # Reset the parameter gradients before the new step
        optimizer.zero_grad()

        # Forward pass, then the loss against the targets
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backpropagate and update the weights
        loss.backward()
        optimizer.step()

        # Keep a running total of the per-batch losses
        epoch_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    mean_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss:.4f}")

In [None]:
# Persist the whole model object (not only its state_dict) to model_file.
torch.save(model, model_file)

In [None]:
# Reload the full model object from disk. map_location remaps the stored
# tensors onto the device selected for this session, so a checkpoint written
# on a GPU can still be restored on a CPU-only machine.
# NOTE: weights_only=False unpickles arbitrary Python objects — only load
# files from a trusted source.
model = torch.load(model_file, map_location=device, weights_only=False)

In [None]:
# Evaluate on the held-out split.
model.to(device)

model.eval()  # inference mode: disables dropout
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Take the arg-max with torch so it runs on whatever device holds the
        # logits; NumPy cannot read CUDA tensors directly.
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")

# Print classification report
print(classification_report(all_labels, all_preds, target_names=le.classes_))

In [None]:
# Confusion matrix of true vs. predicted classes, drawn as an annotated heatmap.
cm = confusion_matrix(all_labels, all_preds)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()