# Malayalam

In [1]:
!nvidia-smi

Sat Mar 29 17:01:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000000:41:00.0 Off |                    0 |
| N/A   38C    P0             45W /  300W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import os
import pickle
import numpy as np
import librosa
import torch
import warnings
from sklearn.preprocessing import LabelEncoder

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to clear GPU memory
def clear_gpu_memory():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Function to extract MFCC features
def extract_mfcc_features(audio, sr, n_mfcc=39, n_fft=480, hop_length=512):
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    
    # Extract MFCC features
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    
    # Calculate deltas and deltadeltas
    delta_mfcc = librosa.feature.delta(mfcc)
    delta_delta_mfcc = librosa.feature.delta(mfcc, order=2)
    
    # Concatenate MFCC, deltas, and deltadeltas
    mfcc_features = np.concatenate((mfcc, delta_mfcc, delta_delta_mfcc), axis=0)
    
    print(f"MFCC features shape: {mfcc_features.shape}")  # Debugging statement
    
    return mfcc_features  # Feature dimension should be [n_coefficients, n_frames]

# Function to pool features
def pool_features(features):
    print(f"Original features shape: {features.shape}")  # Debugging statement

    if len(features.shape) == 2:  # [n_coefficients, n_frames]
        mean_features = np.mean(features, axis=1)  # Mean over time frames
        std_features = np.std(features, axis=1)  # Std over time frames
        print(f"Mean features shape: {mean_features.shape}")
        print(f"Std features shape: {std_features.shape}")

        meanstd_features = np.concatenate((mean_features, std_features), axis=0)
    else:
        raise ValueError("Unexpected features shape.")
    
    return mean_features, std_features, meanstd_features

# Function to process a single audio file
def process_audio_file(audio_file):
    audio, sr = librosa.load(audio_file, sr=None)
    mfcc_features = extract_mfcc_features(audio, sr)
    mean_mfcc, std_mfcc, meanstd_mfcc = pool_features(mfcc_features)
    clear_gpu_memory()  # Clear GPU memory after each file
    return audio_file, mean_mfcc, std_mfcc, meanstd_mfcc

# Function to process audio files in parallel and batch-wise
def process_audio_files_parallel(main_folder, batch_size=50):
    audio_files = []
    for class_label in os.listdir(main_folder):
        class_path = os.path.join(main_folder, class_label)
        if os.path.isdir(class_path):
            for audio_file in os.listdir(class_path):
                if audio_file.endswith('.wav'):
                    audio_files.append(os.path.join(class_path, audio_file))
    
    processed_batches = set()
    if os.path.exists('Malayalam-MFCC-Features/processed_batches.pkl'):
        with open('Malayalam-MFCC-Features/processed_batches.pkl', 'rb') as f:
            processed_batches = pickle.load(f)
        print(f"Resuming from batch {len(processed_batches) + 1}")
    else:
        print("Starting from the beginning")
        os.makedirs('Malayalam-MFCC-Features', exist_ok=True)  # Create the directory if it doesn't exist

    num_batches = len(audio_files) // batch_size + 1
    for i in range(num_batches):
        if i in processed_batches:
            continue
        
        clear_gpu_memory()  # Clear GPU memory before each batch

        batch_files = audio_files[i*batch_size:(i+1)*batch_size]
        features_dict = {}
        for audio_file in batch_files:
            audio_path, mean_mfcc, std_mfcc, meanstd_mfcc = process_audio_file(audio_file)
            class_label = os.path.basename(os.path.dirname(audio_path))
            features_dict[audio_path] = {
                'label': class_label,
                'mean_mfcc': mean_mfcc,
                'std_mfcc': std_mfcc,
                'meanstd_mfcc': meanstd_mfcc
            }
        
        # Save intermediate results
        with open(f'Malayalam-MFCC-Features/features_batch_{i}.pkl', 'wb') as f:
            pickle.dump(features_dict, f)
        
        processed_batches.add(i)
        with open('Malayalam-MFCC-Features/processed_batches.pkl', 'wb') as f:
            pickle.dump(processed_batches, f)
        
        print(f"Processed batch {i+1}/{num_batches}")

        # Clear memory after each batch
        clear_gpu_memory()
    
    return num_batches

# Load and concatenate results
def load_and_concatenate_batches(num_batches):
    features_dict = {}
    for i in range(num_batches):
        try:
            with open(f'Malayalam-MFCC-Features/features_batch_{i}.pkl', 'rb') as f:
                batch_features = pickle.load(f)
                features_dict.update(batch_features)
        except FileNotFoundError:
            print(f"Warning: Malayalam-MFCC-Features/features_batch_{i}.pkl not found. Skipping this batch.")
    return features_dict

# Prepare dataset
def prepare_dataset(features_dict):
    X = []
    y = []
    label_encoder = LabelEncoder()
    
    for audio_path, features in features_dict.items():
        mean_mfcc = features['mean_mfcc']
        label = features['label']
        
        X.append(mean_mfcc)
        y.append(label)
    
    X = np.array(X)
    y = np.array(y)
    y = label_encoder.fit_transform(y)
    
    num_classes = len(label_encoder.classes_)
    
    print(f"Features shape: {X.shape}, Labels shape: {y.shape}, Number of classes: {num_classes}")
    
    return X, y, num_classes

# Example Usage:
# Process and save features in batches
num_batches = process_audio_files_parallel(main_folder='/dist_home/jairam/Malayalam/', batch_size=100)

# Load and concatenate all features
features_mfcc = load_and_concatenate_batches(num_batches)

# Prepare the dataset for training
X, y, num_classes = prepare_dataset(features_mfcc)

Starting from the beginning
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features sh

In [5]:
import torch
import torch.nn as nn

# Define the DNN model
class AudioClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AudioClassifier, self).__init__()
        self.fc1 = nn.Linear(117, 256)  # Change input size to match feature size
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)  # Output layer for classification
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        return x

def train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=1e-4):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    best_val_accuracy = 0.0
    best_model_state = None
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                _, predicted = torch.max(outputs, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
        
        val_accuracy = correct / total
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}")
    
    if best_model_state is None:
        print("Warning: Best model state was never updated. Returning the model without loading any state.")
    else:
        model.load_state_dict(best_model_state)
    
    return model

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create DataLoaders
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=100, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# Initialize model
model = AudioClassifier(num_classes)

# Train the model
model = train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=1e-4)

# Save the trained model
torch.save(model.state_dict(), 'XLSR_Tamil_classifier_50.pth')


Epoch 1/50, Loss: 1.2022, Validation Accuracy: 0.5760
Epoch 2/50, Loss: 0.8878, Validation Accuracy: 0.5760
Epoch 3/50, Loss: 0.8317, Validation Accuracy: 0.5818
Epoch 4/50, Loss: 0.8072, Validation Accuracy: 0.6165
Epoch 5/50, Loss: 0.7915, Validation Accuracy: 0.6182
Epoch 6/50, Loss: 0.7736, Validation Accuracy: 0.6264
Epoch 7/50, Loss: 0.7592, Validation Accuracy: 0.6388
Epoch 8/50, Loss: 0.7468, Validation Accuracy: 0.6570
Epoch 9/50, Loss: 0.7289, Validation Accuracy: 0.6545
Epoch 10/50, Loss: 0.7104, Validation Accuracy: 0.6579
Epoch 11/50, Loss: 0.6842, Validation Accuracy: 0.7091
Epoch 12/50, Loss: 0.6675, Validation Accuracy: 0.7372
Epoch 13/50, Loss: 0.6454, Validation Accuracy: 0.7545
Epoch 14/50, Loss: 0.6308, Validation Accuracy: 0.7702
Epoch 15/50, Loss: 0.6092, Validation Accuracy: 0.7587
Epoch 16/50, Loss: 0.5939, Validation Accuracy: 0.7686
Epoch 17/50, Loss: 0.5833, Validation Accuracy: 0.7826
Epoch 18/50, Loss: 0.5723, Validation Accuracy: 0.7826
Epoch 19/50, Loss: 

In [8]:
from sklearn.metrics import classification_report

# Evaluate the model
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        
        all_labels.extend(batch_y.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

# Calculate the accuracy
correct = sum(p == l for p, l in zip(all_preds, all_labels))
total = len(all_labels)
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate and print the classification report
report = classification_report(all_labels, all_preds)
print("Classification Report:")
print(report)

Test Accuracy: 0.8258
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.79      0.75       258
           1       0.94      0.96      0.95       405
           2       0.75      0.79      0.77       301
           3       0.87      0.68      0.77       247

    accuracy                           0.83      1211
   macro avg       0.82      0.81      0.81      1211
weighted avg       0.83      0.83      0.83      1211



# Tamil

In [10]:
import os
import pickle
import numpy as np
import librosa
import torch
import warnings
from sklearn.preprocessing import LabelEncoder

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to clear GPU memory
def clear_gpu_memory():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Function to extract MFCC features
def extract_mfcc_features(audio, sr, n_mfcc=39, n_fft=480, hop_length=512):
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    
    # Extract MFCC features
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    
    # Calculate deltas and deltadeltas
    delta_mfcc = librosa.feature.delta(mfcc)
    delta_delta_mfcc = librosa.feature.delta(mfcc, order=2)
    
    # Concatenate MFCC, deltas, and deltadeltas
    mfcc_features = np.concatenate((mfcc, delta_mfcc, delta_delta_mfcc), axis=0)
    
    print(f"MFCC features shape: {mfcc_features.shape}")  # Debugging statement
    
    return mfcc_features  # Feature dimension should be [n_coefficients, n_frames]

# Function to pool features
def pool_features(features):
    print(f"Original features shape: {features.shape}")  # Debugging statement

    if len(features.shape) == 2:  # [n_coefficients, n_frames]
        mean_features = np.mean(features, axis=1)  # Mean over time frames
        std_features = np.std(features, axis=1)  # Std over time frames
        print(f"Mean features shape: {mean_features.shape}")
        print(f"Std features shape: {std_features.shape}")

        meanstd_features = np.concatenate((mean_features, std_features), axis=0)
    else:
        raise ValueError("Unexpected features shape.")
    
    return mean_features, std_features, meanstd_features

# Function to process a single audio file
def process_audio_file(audio_file):
    audio, sr = librosa.load(audio_file, sr=None)
    mfcc_features = extract_mfcc_features(audio, sr)
    mean_mfcc, std_mfcc, meanstd_mfcc = pool_features(mfcc_features)
    clear_gpu_memory()  # Clear GPU memory after each file
    return audio_file, mean_mfcc, std_mfcc, meanstd_mfcc

# Function to process audio files in parallel and batch-wise
def process_audio_files_parallel(main_folder, batch_size=50):
    audio_files = []
    for class_label in os.listdir(main_folder):
        class_path = os.path.join(main_folder, class_label)
        if os.path.isdir(class_path):
            for audio_file in os.listdir(class_path):
                if audio_file.endswith('.wav'):
                    audio_files.append(os.path.join(class_path, audio_file))
    
    processed_batches = set()
    if os.path.exists('Tamil-MFCC-Features/processed_batches.pkl'):
        with open('Tamil-MFCC-Features/processed_batches.pkl', 'rb') as f:
            processed_batches = pickle.load(f)
        print(f"Resuming from batch {len(processed_batches) + 1}")
    else:
        print("Starting from the beginning")
        os.makedirs('Tamil-MFCC-Features', exist_ok=True)  # Create the directory if it doesn't exist

    num_batches = len(audio_files) // batch_size + 1
    for i in range(num_batches):
        if i in processed_batches:
            continue
        
        clear_gpu_memory()  # Clear GPU memory before each batch

        batch_files = audio_files[i*batch_size:(i+1)*batch_size]
        features_dict = {}
        for audio_file in batch_files:
            audio_path, mean_mfcc, std_mfcc, meanstd_mfcc = process_audio_file(audio_file)
            class_label = os.path.basename(os.path.dirname(audio_path))
            features_dict[audio_path] = {
                'label': class_label,
                'mean_mfcc': mean_mfcc,
                'std_mfcc': std_mfcc,
                'meanstd_mfcc': meanstd_mfcc
            }
        
        # Save intermediate results
        with open(f'Tamil-MFCC-Features/features_batch_{i}.pkl', 'wb') as f:
            pickle.dump(features_dict, f)
        
        processed_batches.add(i)
        with open('Tamil-MFCC-Features/processed_batches.pkl', 'wb') as f:
            pickle.dump(processed_batches, f)
        
        print(f"Processed batch {i+1}/{num_batches}")

        # Clear memory after each batch
        clear_gpu_memory()
    
    return num_batches

# Load and concatenate results
def load_and_concatenate_batches(num_batches):
    features_dict = {}
    for i in range(num_batches):
        try:
            with open(f'Tamil-MFCC-Features/features_batch_{i}.pkl', 'rb') as f:
                batch_features = pickle.load(f)
                features_dict.update(batch_features)
        except FileNotFoundError:
            print(f"Warning: Tamil-MFCC-Features/features_batch_{i}.pkl not found. Skipping this batch.")
    return features_dict

# Prepare dataset
def prepare_dataset(features_dict):
    X = []
    y = []
    label_encoder = LabelEncoder()
    
    for audio_path, features in features_dict.items():
        mean_mfcc = features['mean_mfcc']
        label = features['label']
        
        X.append(mean_mfcc)
        y.append(label)
    
    X = np.array(X)
    y = np.array(y)
    y = label_encoder.fit_transform(y)
    
    num_classes = len(label_encoder.classes_)
    
    print(f"Features shape: {X.shape}, Labels shape: {y.shape}, Number of classes: {num_classes}")
    
    return X, y, num_classes

# Example Usage:
# Process and save features in batches
num_batches = process_audio_files_parallel(main_folder='/dist_home/jairam/Tamil/', batch_size=100)

# Load and concatenate all features
features_mfcc = load_and_concatenate_batches(num_batches)

# Prepare the dataset for training
X, y, num_classes = prepare_dataset(features_mfcc)


Starting from the beginning
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features sh

In [11]:
import torch
import torch.nn as nn

# Define the DNN model
class AudioClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AudioClassifier, self).__init__()
        self.fc1 = nn.Linear(117, 256)  # Change input size to match feature size
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)  # Output layer for classification
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        return x

def train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=1e-4):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    best_val_accuracy = 0.0
    best_model_state = None
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                _, predicted = torch.max(outputs, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
        
        val_accuracy = correct / total
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}")
    
    if best_model_state is None:
        print("Warning: Best model state was never updated. Returning the model without loading any state.")
    else:
        model.load_state_dict(best_model_state)
    
    return model

In [12]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create DataLoaders
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=100, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# Initialize model
model = AudioClassifier(num_classes)

# Train the model
model = train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=1e-4)

# Save the trained model
torch.save(model.state_dict(), 'MFCC_tamil.pth')


Epoch 1/50, Loss: 1.1564, Validation Accuracy: 0.5632
Epoch 2/50, Loss: 0.7441, Validation Accuracy: 0.5480
Epoch 3/50, Loss: 0.6555, Validation Accuracy: 0.5955
Epoch 4/50, Loss: 0.6140, Validation Accuracy: 0.7471
Epoch 5/50, Loss: 0.5684, Validation Accuracy: 0.7623
Epoch 6/50, Loss: 0.5461, Validation Accuracy: 0.7623
Epoch 7/50, Loss: 0.5271, Validation Accuracy: 0.7785
Epoch 8/50, Loss: 0.4942, Validation Accuracy: 0.7973
Epoch 9/50, Loss: 0.4760, Validation Accuracy: 0.7695
Epoch 10/50, Loss: 0.4834, Validation Accuracy: 0.7821
Epoch 11/50, Loss: 0.4637, Validation Accuracy: 0.7928
Epoch 12/50, Loss: 0.4428, Validation Accuracy: 0.8117
Epoch 13/50, Loss: 0.4329, Validation Accuracy: 0.8072
Epoch 14/50, Loss: 0.4385, Validation Accuracy: 0.8260
Epoch 15/50, Loss: 0.4268, Validation Accuracy: 0.8350
Epoch 16/50, Loss: 0.4024, Validation Accuracy: 0.8341
Epoch 17/50, Loss: 0.3964, Validation Accuracy: 0.8430
Epoch 18/50, Loss: 0.3977, Validation Accuracy: 0.8377
Epoch 19/50, Loss: 

In [13]:
from sklearn.metrics import classification_report

# Evaluate the model
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        
        all_labels.extend(batch_y.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

# Calculate the accuracy
correct = sum(p == l for p, l in zip(all_preds, all_labels))
total = len(all_labels)
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate and print the classification report
report = classification_report(all_labels, all_preds)
print("Classification Report:")
print(report)

Test Accuracy: 0.8719
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.69      0.76       285
           1       0.96      0.96      0.96       279
           2       0.96      0.96      0.96       288
           3       0.72      0.88      0.79       264

    accuracy                           0.87      1116
   macro avg       0.88      0.87      0.87      1116
weighted avg       0.88      0.87      0.87      1116



# Kannada

In [15]:
import os
import pickle
import numpy as np
import librosa
import torch
import warnings
from sklearn.preprocessing import LabelEncoder

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to clear GPU memory
def clear_gpu_memory():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Function to extract MFCC features
def extract_mfcc_features(audio, sr, n_mfcc=39, n_fft=480, hop_length=512):
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000
    
    # Extract MFCC features
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    
    # Calculate deltas and deltadeltas
    delta_mfcc = librosa.feature.delta(mfcc)
    delta_delta_mfcc = librosa.feature.delta(mfcc, order=2)
    
    # Concatenate MFCC, deltas, and deltadeltas
    mfcc_features = np.concatenate((mfcc, delta_mfcc, delta_delta_mfcc), axis=0)
    
    print(f"MFCC features shape: {mfcc_features.shape}")  # Debugging statement
    
    return mfcc_features  # Feature dimension should be [n_coefficients, n_frames]

# Function to pool features
def pool_features(features):
    print(f"Original features shape: {features.shape}")  # Debugging statement

    if len(features.shape) == 2:  # [n_coefficients, n_frames]
        mean_features = np.mean(features, axis=1)  # Mean over time frames
        std_features = np.std(features, axis=1)  # Std over time frames
        print(f"Mean features shape: {mean_features.shape}")
        print(f"Std features shape: {std_features.shape}")

        meanstd_features = np.concatenate((mean_features, std_features), axis=0)
    else:
        raise ValueError("Unexpected features shape.")
    
    return mean_features, std_features, meanstd_features

# Function to process a single audio file
def process_audio_file(audio_file):
    audio, sr = librosa.load(audio_file, sr=None)
    mfcc_features = extract_mfcc_features(audio, sr)
    mean_mfcc, std_mfcc, meanstd_mfcc = pool_features(mfcc_features)
    clear_gpu_memory()  # Clear GPU memory after each file
    return audio_file, mean_mfcc, std_mfcc, meanstd_mfcc

# Function to process audio files in parallel and batch-wise
def process_audio_files_parallel(main_folder, batch_size=50):
    audio_files = []
    for class_label in os.listdir(main_folder):
        class_path = os.path.join(main_folder, class_label)
        if os.path.isdir(class_path):
            for audio_file in os.listdir(class_path):
                if audio_file.endswith('.wav'):
                    audio_files.append(os.path.join(class_path, audio_file))
    
    processed_batches = set()
    if os.path.exists('Kannada-MFCC-Features/processed_batches.pkl'):
        with open('Kannada-MFCC-Features/processed_batches.pkl', 'rb') as f:
            processed_batches = pickle.load(f)
        print(f"Resuming from batch {len(processed_batches) + 1}")
    else:
        print("Starting from the beginning")
        os.makedirs('Kannada-MFCC-Features', exist_ok=True)  # Create the directory if it doesn't exist

    num_batches = len(audio_files) // batch_size + 1
    for i in range(num_batches):
        if i in processed_batches:
            continue
        
        clear_gpu_memory()  # Clear GPU memory before each batch

        batch_files = audio_files[i*batch_size:(i+1)*batch_size]
        features_dict = {}
        for audio_file in batch_files:
            audio_path, mean_mfcc, std_mfcc, meanstd_mfcc = process_audio_file(audio_file)
            class_label = os.path.basename(os.path.dirname(audio_path))
            features_dict[audio_path] = {
                'label': class_label,
                'mean_mfcc': mean_mfcc,
                'std_mfcc': std_mfcc,
                'meanstd_mfcc': meanstd_mfcc
            }
        
        # Save intermediate results
        with open(f'Kannada-MFCC-Features/features_batch_{i}.pkl', 'wb') as f:
            pickle.dump(features_dict, f)
        
        processed_batches.add(i)
        with open('Kannada-MFCC-Features/processed_batches.pkl', 'wb') as f:
            pickle.dump(processed_batches, f)
        
        print(f"Processed batch {i+1}/{num_batches}")

        # Clear memory after each batch
        clear_gpu_memory()
    
    return num_batches

# Load and concatenate results
def load_and_concatenate_batches(num_batches):
    features_dict = {}
    for i in range(num_batches):
        try:
            with open(f'Kannada-MFCC-Features/features_batch_{i}.pkl', 'rb') as f:
                batch_features = pickle.load(f)
                features_dict.update(batch_features)
        except FileNotFoundError:
            print(f"Warning: Kannada-MFCC-Features/features_batch_{i}.pkl not found. Skipping this batch.")
    return features_dict

# Prepare dataset
def prepare_dataset(features_dict):
    X = []
    y = []
    label_encoder = LabelEncoder()
    
    for audio_path, features in features_dict.items():
        mean_mfcc = features['mean_mfcc']
        label = features['label']
        
        X.append(mean_mfcc)
        y.append(label)
    
    X = np.array(X)
    y = np.array(y)
    y = label_encoder.fit_transform(y)
    
    num_classes = len(label_encoder.classes_)
    
    print(f"Features shape: {X.shape}, Labels shape: {y.shape}, Number of classes: {num_classes}")
    
    return X, y, num_classes

# Example Usage:
# Process and save features in batches
num_batches = process_audio_files_parallel(main_folder='/dist_home/jairam/Kannada/', batch_size=100)

# Load and concatenate all features
features_mfcc = load_and_concatenate_batches(num_batches)

# Prepare the dataset for training
X, y, num_classes = prepare_dataset(features_mfcc)

Starting from the beginning
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features shape: (117,)
MFCC features shape: (117, 313)
Original features shape: (117, 313)
Mean features shape: (117,)
Std features sh

In [16]:
import torch
import torch.nn as nn

# Define the DNN model
class AudioClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AudioClassifier, self).__init__()
        self.fc1 = nn.Linear(117, 256)  
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)  # num_classes will be defined based on the data
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        return x

def train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=1e-4):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    best_val_accuracy = 0.0
    best_model_state = None
    
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                outputs = model(batch_X)
                _, predicted = torch.max(outputs, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()
        
        val_accuracy = correct / total
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}")
    
    if best_model_state is None:
        print("Warning: Best model state was never updated. Returning the model without loading any state.")
    else:
        model.load_state_dict(best_model_state)
    
    return model

In [17]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

# Split data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create DataLoaders
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=100, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

# Initialize model
model = AudioClassifier(num_classes)

# Train the model
model = train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=1e-4)

# Save the trained model
torch.save(model.state_dict(), 'MFCC_Kannada_classifier_50.pth')


Epoch 1/50, Loss: 1.0974, Validation Accuracy: 0.3711
Epoch 2/50, Loss: 1.0541, Validation Accuracy: 0.6971
Epoch 3/50, Loss: 1.0056, Validation Accuracy: 0.5209
Epoch 4/50, Loss: 0.9341, Validation Accuracy: 0.6707
Epoch 5/50, Loss: 0.8668, Validation Accuracy: 0.6652
Epoch 6/50, Loss: 0.8195, Validation Accuracy: 0.7104
Epoch 7/50, Loss: 0.7738, Validation Accuracy: 0.7236
Epoch 8/50, Loss: 0.7304, Validation Accuracy: 0.7346
Epoch 9/50, Loss: 0.6943, Validation Accuracy: 0.7291
Epoch 10/50, Loss: 0.6794, Validation Accuracy: 0.7478
Epoch 11/50, Loss: 0.6421, Validation Accuracy: 0.7555
Epoch 12/50, Loss: 0.6319, Validation Accuracy: 0.7643
Epoch 13/50, Loss: 0.6013, Validation Accuracy: 0.7742
Epoch 14/50, Loss: 0.5843, Validation Accuracy: 0.7753
Epoch 15/50, Loss: 0.5687, Validation Accuracy: 0.7896
Epoch 16/50, Loss: 0.5484, Validation Accuracy: 0.7919
Epoch 17/50, Loss: 0.5492, Validation Accuracy: 0.8073
Epoch 18/50, Loss: 0.5182, Validation Accuracy: 0.7952
Epoch 19/50, Loss: 

In [18]:
from sklearn.metrics import classification_report

# Evaluate the model
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        
        all_labels.extend(batch_y.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

# Calculate the accuracy
correct = sum(p == l for p, l in zip(all_preds, all_labels))
total = len(all_labels)
test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate and print the classification report
report = classification_report(all_labels, all_preds)
print("Classification Report:")
print(report)

Test Accuracy: 0.8711
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.81      0.85       279
           1       0.87      0.87      0.87       334
           2       0.86      0.92      0.89       295

    accuracy                           0.87       908
   macro avg       0.87      0.87      0.87       908
weighted avg       0.87      0.87      0.87       908

