In [1]:
pip install openai-whisper --no-cache-dir

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->openai-whisper)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->openai-whisper)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->openai-whisper)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->openai-whisper)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-non

In [2]:
import time
import torch
import whisper
import librosa
import datetime
import requests
import pandas as pd
import numpy as np
import torch.nn as nn
from tqdm import tqdm
from pydub import AudioSegment
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [3]:
class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(64 * 32 * 70, 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.dropout(x)
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.dropout(x)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [4]:
classificatorModel = AudioCNN()
classificatorModel.load_state_dict(torch.load('/kaggle/input/speechmusicclassificator/pytorch/default/2/speech_music_classificator.pth', map_location='cpu', weights_only=True))
classificatorModel.eval()

AudioCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=143360, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [5]:
languageIdentifierModel = whisper.load_model('medium')

100%|█████████████████████████████████████| 1.42G/1.42G [00:18<00:00, 81.4MiB/s]


In [6]:
def language_identifier(array, sr=48000):
    array = np.int16(array / np.max(np.abs(array)) * 32767)
    audio_segment = AudioSegment(
        array.tobytes(), 
        frame_rate=sr,
        sample_width=2,
        channels=1
    )
    audio_segment.export('piece.mp3', format='mp3')
    audio = whisper.load_audio('/kaggle/working/piece.mp3')
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio, n_mels=languageIdentifierModel.dims.n_mels)
    _, probs = languageIdentifierModel.detect_language(mel)
    return max(probs, key=probs.get)

In [7]:
def preprocess(audio, sr=48000):
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    log_mel_spectrogram = (log_mel_spectrogram - np.mean(log_mel_spectrogram)) / np.std(log_mel_spectrogram)
    return log_mel_spectrogram # (128, 282)

In [8]:
def speech_music_classificator(array):
    array = preprocess(array)
    array = torch.tensor(array).unsqueeze(0).unsqueeze(0).float()
    with torch.no_grad():
        output = classificatorModel(array)
    prediction = output.item()
    prediction = 'music' if prediction < 0.5 else 'speech'
    return prediction

In [9]:
data = pd.read_parquet('/kaggle/input/speechandmusicinlanguagestestdataset/speechAndMusicWLanguagesTestSplit.parquet')

In [10]:
data.value_counts(['language_code'])

language_code
en               40
es               40
fr               40
id               40
ja               40
lt               40
nl               40
pt               40
ro               40
Name: count, dtype: int64

In [11]:
audios = data['array'].values.tolist()
labels = data['label'].values.tolist()
language_codes = data['language_code'].values.tolist()

In [12]:
print(len(audios), len(labels), len(language_codes))

360 360 360


In [13]:
def test_system(audios, labels, language_codes):
    language_preds = []
    content_preds = []
    for audio in tqdm(audios):
        language_pred = language_identifier(audio)
        language_preds.append(language_pred)
        content_pred = speech_music_classificator(audio)
        content_preds.append(content_pred)
    return language_preds, content_preds

In [14]:
language_preds, content_preds = test_system(audios, labels, language_codes)

100%|██████████| 360/360 [1:17:11<00:00, 12.87s/it]


In [15]:
language_accuracy = accuracy_score(language_codes, language_preds)
content_accuracy = accuracy_score(labels, content_preds)

In [16]:
language_precision = precision_score(language_codes, language_preds, average='micro')
content_precision = precision_score(labels, content_preds, average='micro')

In [17]:
language_recall = recall_score(language_codes, language_preds, average='micro')
content_recall = recall_score(labels, content_preds, average='micro')

In [18]:
print(f'Test Results - Language identification accuracy: {language_accuracy*100:.5f}%, Content classification accuracy: {content_accuracy*100:.5f}%, Language identification precision: {content_accuracy:.5f}, Content classification precision: {content_precision:.5f}, Language identification recall: {language_recall:.5f}, Content classification recall: {content_recall:.5f}')

Test Results - Language identification accuracy: 63.05556%, Content classification accuracy: 98.05556%, Language identification precision: 0.98056, Content classification precision: 0.98056, Language identification recall: 0.63056, Content classification recall: 0.98056
