In [2]:
pip install openai-whisper --no-cache-dir

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Downloading triton-3.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (156.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.4/156.4 MB[0m [31m242.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20240930-py3-none-an

In [49]:
import re
import time
import torch
import whisper
import librosa
import datetime
import requests
import pandas as pd
import numpy as np
import torch.nn as nn

In [8]:
class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(64 * 32 * 70, 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.dropout(x)
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.dropout(x)
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [21]:
classificatorModel = AudioCNN()
classificatorModel.load_state_dict(torch.load('/kaggle/input/speechmusicclassificator/pytorch/default/2/speech_music_classificator.pth', map_location='cpu', weights_only=True))
classificatorModel.eval()

AudioCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=143360, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [59]:
languageIdentifierModel = whisper.load_model('medium')

100%|█████████████████████████████████████| 1.42G/1.42G [00:25<00:00, 60.4MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [55]:
streams = [
    {'url': 'https://stream22.tmwradio.com/tmw.mp3',
     'name': 'RMC Sport Network',
     'country': 'Italy'}
]

In [36]:
def recorder(stream_url):
    with open('piece.mp3', 'wb') as f:
        r = requests.get(stream_url, stream=True)
        for block in r.iter_content(50000):
            f.write(block)
            break
    return 'piece.mp3'

In [37]:
def language_identifier(audio):
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio, n_mels=languageIdentifierModel.dims.n_mels)
    _, probs = languageIdentifierModel.detect_language(mel)
    return max(probs, key=probs.get)

In [46]:
def preprocess(audio, sr=48000):
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    log_mel_spectrogram = (log_mel_spectrogram - np.mean(log_mel_spectrogram)) / np.std(log_mel_spectrogram)
    return log_mel_spectrogram # (128, 282)

In [57]:
def speech_music_classificator(audio):
    array, sampling_rate = librosa.load(audio, dtype='float32', sr=48000)
    array = preprocess(array[:144000])
    print(array.shape)
    array = torch.tensor(array).unsqueeze(0).unsqueeze(0).float()
    with torch.no_grad():
        output = classificatorModel(array)
    prediction = output.item()
    prediction = 'music' if prediction < 0.5 else 'speech'
    return prediction

In [58]:
for stream in streams:
    print(stream['name'], ' from ', stream['country'])
    try:
        while(True):
            audio = recorder(stream['url'])
            language = language_identifier(audio)
            content = speech_music_classificator(audio)
            print(f'i can hear {content} in {language}')
            time.sleep(10)
    except KeyboardInterrupt:
        pass

RMC Sport Network  from  Italy
(128, 282)
i can hear speech in it
(128, 282)
i can hear speech in ro
(128, 282)
i can hear speech in ro
(128, 282)
i can hear speech in hi
