# Team Billie Jean
**DS203 Project, Autumn 2024**

Team Members:
1. Chinmay Kale, 23B1849 **[Anchor]**
2. Gokularamanan R S, 23B1854
3. Arya Sameer Joshi, 23B1853
4. Arash Dev Ahlawat, 23B1817

In [14]:
import os
import tabulate
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import librosa

# Configuration
NUM_CLASSES = 6  # Updated number of classes based on given labels
BATCH_SIZE = 20
EPOCHS = 40
LEARNING_RATE = 0.005
GRAD_CLIP = 5
DATA_FOLDER = r"/kaggle/input/train-songs/Train_data_songs/All_CSV_Files"

# Define labels based on keywords in filenames
LABEL_MAP = {
    "asha bhosale": 0,
    "jana gana mana": 1,
    "kishore kumar": 2,
    "marathi bhavgeet": 3,
    "marathi lavni": 4,
    "michael jackson": 5
}

# Specify the length of each chunk
CHUNK_LENGTH = 500

def load_data_from_csv(folder_path):
    mfcc_data = []
    labels = {0:[],1:[],2:[],3:[],4:[],5:[]}

    # Iterate through all CSV files in the folder
    for file_path in glob.glob(os.path.join(folder_path, '*.csv')):
        # Load MFCC data from CSV
        mfcc = np.loadtxt(file_path, delimiter=',')
        
        # Calculate delta MFCC
        delta_mfcc = librosa.feature.delta(mfcc)

        # Concatenate MFCC and delta MFCC along the first axis (number of coefficients)
        mfcc_with_delta = np.concatenate((mfcc, delta_mfcc), axis=0)

        # Determine label based on filename
        file_name = os.path.basename(file_path).lower()
        label = -1  # Default label if no keyword matches
        for keyword, lbl in LABEL_MAP.items():
            if keyword in file_name:
                label = lbl

                break
        
        # If no keyword matches, skip this file
        if label == -1:
            continue

        # Calculate total columns and divide into chunks of specified length
        total_columns = mfcc_with_delta.shape[1]  # Get the number of columns (time series)
        timme = total_columns/(60*86)
        labels[lbl].append(timme)
        # Loop to divide data into chunks of CHUNK_LENGTH
        for start_idx in range(0, total_columns, CHUNK_LENGTH):
            end_idx = start_idx + CHUNK_LENGTH
            # Ignore the last chunk if it's less than CHUNK_LENGTH
            if end_idx > total_columns:
                break
            
            # Create a chunk and add to the data
            chunk = mfcc_with_delta[:, start_idx:end_idx]  # Slice along columns
            mfcc_data.append(chunk)

    return mfcc_data, labels


# Load data and split into train/test sets
mfcc_data, labels = load_data_from_csv(DATA_FOLDER)
for i in labels:
    labels[i] = np.mean(labels[i])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(mfcc_data, labels_encoded, test_size=0.3, random_state=341)

In [15]:
for i in labels:
    labels[i] = np.mean(labels[i])
print(labels)

{0: 4.725175043760941, 1: 1.5278985507246376, 2: 4.710581395348838, 3: 4.308885658914728, 4: 4.540010002500624, 5: 4.604638659664917}


In [2]:
# Define custom dataset
class MFCCDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        mfcc = torch.tensor(self.data[idx], dtype=torch.float32).squeeze()  # (128, time)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return mfcc, label, mfcc.shape[1]  # Include length of each sequence

# Collate function to handle variable-length sequences
# Collate function to handle variable-length sequences
# Collate function to handle variable-length sequences

# def collate_fn(batch):
#     data, labels, lengths = zip(*batch)
    
#     # Find the maximum length in the batch
#     max_length = max([d.shape[1] for d in data])

#     # Pad each tensor to the maximum length and stack them
#     data_padded = [torch.nn.functional.pad(d, (0, 0, 0, max_length - d.shape[1])) for d in data]
    
#     # Stack padded data
#     data_padded = torch.stack(data_padded)

#     # Convert lengths and labels to tensors
#     lengths = torch.tensor([min(d.shape[1], max_length) for d in data], dtype=torch.int64)
#     labels = torch.tensor(labels, dtype=torch.long)

#     return data_padded, labels, lengths

def collate_fn(batch):
    data, labels, lengths = zip(*batch)
    
    # Stack data and labels directly, as all sequences are the same length
    data_stacked = torch.stack(data)
    labels = torch.tensor(labels, dtype=torch.long)
    lengths = torch.tensor(lengths, dtype=torch.int64)  # Optional if lengths are all the same

    return data_stacked, labels, lengths


# Data loaders
train_dataset = MFCCDataset(X_train, y_train)
test_dataset = MFCCDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [3]:
print(labels_encoded)
#print(mfcc_data)

[5 5 5 ... 4 4 4]


In [4]:
class CNNRNNModel(nn.Module):
    def __init__(self, num_classes):
        super(CNNRNNModel, self).__init__()
        
        # Convolutional layers with batch normalization
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)  # Make sure this is defined
        self.pool = nn.MaxPool2d(2, 2)
        
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)  # Make sure this is defined
        
        # LSTM layer
        self.lstm = nn.LSTM(640, 128, batch_first=True)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.01)

    def forward(self, x, lengths):
        x = x.unsqueeze(1)  # Add channel dimension
        
        # Pass through convolutional and batch normalization layers
        x = self.pool(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool(torch.relu(self.bn2(self.conv2(x))))
        
        # Reshape for LSTM input
        batch_size, _, conv_height, conv_width = x.shape
        x = x.permute(0, 3, 1, 2).reshape(batch_size, conv_width, conv_height * 64)
        
        # Pack sequences and pass through LSTM
        max_seq_length = x.size(1)
        lengths = torch.clamp(lengths, max=max_seq_length)
        x = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        
        # Global average pooling and fully connected layers
        x = torch.mean(x, dim=1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [5]:
# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on GPU" if torch.cuda.is_available() else "Running on CPU")
model = CNNRNNModel(num_classes=NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
# Training function
def train(model, train_loader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    for inputs, labels, lengths in train_loader:
        # Move data to GPU
        inputs, labels, lengths = inputs.to(device), labels.to(device), lengths.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, test_loader):
    model.eval()
    correct, total = 0, 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels, lengths in test_loader:
            # Move data to GPU
            inputs, labels, lengths = inputs.to(device), labels.to(device), lengths.to(device)

            outputs = model(inputs, lengths)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            y_true.extend(labels.cpu().tolist())
            y_pred.extend(predicted.cpu().tolist())

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return acc, prec, rec, f1

print("Vinmay")

Running on GPU
Vinmay


In [6]:
# Initialize best_f1 before the training loop
best_acc = 0.0  # or float('-inf') if you want to allow for any positive f1 score initially
patience = 5
epochs_without_improvement = 0

for epoch in range(EPOCHS):
    train_loss = train(model, train_loader, criterion, optimizer)
    acc, prec, rec, f1 = evaluate(model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {train_loss:.4f}, Acc: {acc:.8f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}')
    
    # Update the learning rate scheduler
    scheduler.step()

    # Save the best model
    
    if acc > best_acc:
        best_acc = acc
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Best model saved with Accuracy: {best_acc:.5f}")
    else:
        epochs_without_improvement += 1
torch.save(model.state_dict(), 'final_model_state.pth')
print("Training complete.")
print("Final model saved.")

Epoch 1/40, Loss: 1.0820, Acc: 0.65649166, Prec: 0.7018, Rec: 0.6565, F1: 0.6516
Best model saved with Accuracy: 0.65649
Epoch 2/40, Loss: 0.8306, Acc: 0.73748474, Prec: 0.7373, Rec: 0.7375, F1: 0.7294
Best model saved with Accuracy: 0.73748
Epoch 3/40, Loss: 0.7002, Acc: 0.74603175, Prec: 0.7563, Rec: 0.7460, F1: 0.7437
Best model saved with Accuracy: 0.74603
Epoch 4/40, Loss: 0.6085, Acc: 0.78021978, Prec: 0.7917, Rec: 0.7802, F1: 0.7785
Best model saved with Accuracy: 0.78022
Epoch 5/40, Loss: 0.5196, Acc: 0.81400081, Prec: 0.8233, Rec: 0.8140, F1: 0.8148
Best model saved with Accuracy: 0.81400
Epoch 6/40, Loss: 0.3648, Acc: 0.84940985, Prec: 0.8515, Rec: 0.8494, F1: 0.8496
Best model saved with Accuracy: 0.84941
Epoch 7/40, Loss: 0.3210, Acc: 0.85958486, Prec: 0.8608, Rec: 0.8596, F1: 0.8597
Best model saved with Accuracy: 0.85958
Epoch 8/40, Loss: 0.2960, Acc: 0.86894587, Prec: 0.8710, Rec: 0.8689, F1: 0.8688
Best model saved with Accuracy: 0.86895
Epoch 9/40, Loss: 0.2728, Acc: 0

In [7]:
import os
import glob
import numpy as np
import torch
import librosa
from collections import Counter

# Load the trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNRNNModel(num_classes=NUM_CLASSES)  # Ensure NUM_CLASSES is defined
model.load_state_dict(torch.load('best_model.pth'))
model = model.to(device)
model.eval()

# Set the chunk length

CHUNK_LENGTH = 500 
# Function to load and split data into chunks of a specified length, with delta MFCC appended
def load_unknown_data(file_path):
    # Load MFCC data from the CSV
    mfcc = np.loadtxt(file_path, delimiter=',')

    # Ensure 20 coefficients
    if mfcc.shape[0] != 20:
        raise ValueError(f"Expected 20 MFCC coefficients, but got {mfcc.shape[0]}.")

    # Calculate delta MFCC
    delta_mfcc = librosa.feature.delta(mfcc)

    # Concatenate MFCC and delta MFCC along the first axis
    mfcc_with_delta = np.concatenate((mfcc, delta_mfcc), axis=0)

    # Calculate the total columns along the time dimension
    total_columns = mfcc_with_delta.shape[1]
    
    # Split into chunks of specified length
    chunks = []
    for start_idx in range(0, total_columns, CHUNK_LENGTH):
        end_idx = start_idx + CHUNK_LENGTH
        
        # Ignore the last chunk if it is shorter than CHUNK_LENGTH
        if end_idx > total_columns:
            break
        
        chunk = mfcc_with_delta[:, start_idx:end_idx]
        chunks.append(chunk)

    return chunks  # List of (40, CHUNK_LENGTH) arrays

# Predict function for each file, using majority voting across chunks
def predict_label(file_path):
    chunks = load_unknown_data(file_path)
    chunk_predictions = []

    for chunk in chunks:
        # Prepare the chunk for model input
        chunk_tensor = torch.tensor(chunk, dtype=torch.float32).unsqueeze(0).to(device)  # Shape: (1, 40, CHUNK_LENGTH)
        length = torch.tensor([chunk_tensor.shape[2]], dtype=torch.int64).to(device)  # Use the time dimension

        with torch.no_grad():
            output = model(chunk_tensor, length)  # Pass both data and length to the model
            _, predicted_label = torch.max(output, 1)
            chunk_predictions.append(predicted_label.item())

    # Find the most common prediction among the chunks
    most_common_prediction = Counter(chunk_predictions).most_common(1)[0][0]
    return most_common_prediction

# Folder path containing unknown files for prediction
UNKNOWN_FOLDER = "/kaggle/input/mfcc-official/MFCC-files-v2"
predictions = {"file":[],"label":[]}
# Iterate over each file in the unknown folder and print the majority prediction
for file_path in glob.glob(os.path.join(UNKNOWN_FOLDER, '*.csv')):
    label = predict_label(file_path)
    file_name = os.path.basename(file_path)
    predictions["file"].append(file_name)
    predictions["label"].append(label)
    print(f"File: {file_name} -> Predicted Label: {label}")

import datetime as dt
import pandas as pd
pr = pd.DataFrame(predictions).sort_values(by=['file'])
pr.to_csv(f"predictions_{dt.datetime.now()}.csv", index = False)

  model.load_state_dict(torch.load('best_model.pth'))


File: 70-MFCC.csv -> Predicted Label: 4
File: 104-MFCC.csv -> Predicted Label: 4
File: 67-MFCC.csv -> Predicted Label: 3
File: 110-MFCC.csv -> Predicted Label: 0
File: 92-MFCC.csv -> Predicted Label: 3
File: 33-MFCC.csv -> Predicted Label: 4
File: 86-MFCC.csv -> Predicted Label: 1
File: 29-MFCC.csv -> Predicted Label: 3
File: 65-MFCC.csv -> Predicted Label: 2
File: 16-MFCC.csv -> Predicted Label: 1
File: 63-MFCC.csv -> Predicted Label: 2
File: 01-MFCC.csv -> Predicted Label: 1
File: 44-MFCC.csv -> Predicted Label: 0
File: 03-MFCC.csv -> Predicted Label: 4
File: 111-MFCC.csv -> Predicted Label: 2
File: 72-MFCC.csv -> Predicted Label: 4
File: 48-MFCC.csv -> Predicted Label: 2
File: 53-MFCC.csv -> Predicted Label: 4
File: 25-MFCC.csv -> Predicted Label: 4
File: 91-MFCC.csv -> Predicted Label: 4
File: 10-MFCC.csv -> Predicted Label: 4
File: 23-MFCC.csv -> Predicted Label: 0
File: 94-MFCC.csv -> Predicted Label: 4
File: 74-MFCC.csv -> Predicted Label: 4
File: 64-MFCC.csv -> Predicted Label: