In [None]:
pip install openai-whisper --no-cache-dir

In [None]:
pip install supabase

In [None]:
import re
import time
import torch
import whisper
import librosa
import datetime
import requests
import pandas as pd
import numpy as np
import torch.nn as nn
import plotly.graph_objects as go
from collections import Counter
#from supabase import create_client, Client

In [None]:
class AudioCNN(nn.Module):
    """Two-stage convolutional binary classifier over single-channel 2-D inputs.

    Each stage is conv -> ReLU -> 2x2 max-pool -> dropout. The flattened features
    go through a 128-unit hidden layer and a single sigmoid output unit.

    The first linear layer is sized for 64 * 32 * 70 features, so the spatial input
    must shrink to 32 x 70 after the two poolings (for example a 128 x 282 input).
    """

    def __init__(self):
        super().__init__()
        # Attribute names are also the state_dict keys, so they must stay as they are
        # for previously saved weights to load.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.dropout = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(in_features=64 * 32 * 70, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, 1, H, W) tensor to a (batch, 1) tensor of values in [0, 1]."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = self.dropout(self.pool(torch.relu(conv(features))))
        flattened = features.view(features.size(0), -1)
        hidden = self.dropout(torch.relu(self.fc1(flattened)))
        return self.sigmoid(self.fc2(hidden))

In [None]:
# Location of the trained speech/music classifier weights.
CLASSIFIER_WEIGHTS_PATH = '/kaggle/input/speechmusicclassificator/pytorch/default/2/speech_music_classificator.pth'

# Build the network, restore its weights on CPU and switch to inference mode
# (eval() disables dropout). The last expression also displays the module summary.
classificatorModel = AudioCNN()
classificatorModel.load_state_dict(
    torch.load(CLASSIFIER_WEIGHTS_PATH, map_location='cpu', weights_only=True)
)
classificatorModel.eval()

In [None]:
# Load the Whisper 'medium' checkpoint once at notebook scope; language_identifier()
# reads this global for both its mel configuration and language detection.
languageIdentifierModel = whisper.load_model('medium')

In [None]:
# URL of the MP3 audio stream to analyse; it is the `stream` argument passed to system() below.
stream = 'https://kpradio.hostingradio.ru:8000/irkutsk.radiokp128.mp3'

In [None]:
def recorder(stream_url, out_path='piece.mp3', chunk_size=50000, timeout=10):
    """Save the first chunk of an HTTP audio stream to a local file.

    Args:
        stream_url: URL of the stream to sample.
        out_path: File the chunk is written to (overwritten on every call).
        chunk_size: Number of bytes requested from the stream.
        timeout: Seconds to wait for the server before giving up, so a stalled
            stream cannot hang the caller forever.

    Returns:
        ``out_path``, the path of the file that was written.

    Raises:
        requests.RequestException: On connection problems, timeouts or an HTTP
            error status.
    """
    # The context manager closes the streamed response; without it every call
    # would leave an open connection behind.
    with requests.get(stream_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # Only the first block is needed; b'' covers a stream that yields nothing.
        block = next(response.iter_content(chunk_size), b'')
    # Write only after the download succeeded so a failed request does not
    # truncate the previous recording.
    with open(out_path, 'wb') as f:
        f.write(block)
    return out_path

In [None]:
def language_identifier(audio):
    """Return the most probable language code for an audio file, as detected by Whisper.

    Args:
        audio: Path of an audio file readable by ``whisper.load_audio``.

    Returns:
        The key with the highest probability in the distribution returned by
        ``languageIdentifierModel.detect_language``.
    """
    samples = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(samples, n_mels=languageIdentifierModel.dims.n_mels)
    # The spectrogram is built on CPU; move it to wherever the model lives so
    # detection also works when Whisper was loaded onto a GPU.
    mel = mel.to(languageIdentifierModel.device)
    _, probs = languageIdentifierModel.detect_language(mel)
    return max(probs, key=probs.get)

In [None]:
def preprocess(audio, sr=48000):
    """Convert a waveform into a standardised log-mel spectrogram.

    Args:
        audio: 1-D array of audio samples.
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        The log-mel spectrogram with its global mean subtracted and divided by its
        global standard deviation. A constant spectrogram (e.g. digital silence)
        has zero standard deviation and is returned as all zeros instead of NaN.
    """
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    centered = log_mel_spectrogram - np.mean(log_mel_spectrogram)
    spread = np.std(log_mel_spectrogram)
    if spread > 0:
        return centered / spread  # (128, 282)
    # Zero variance: dividing would give NaN, which the classifier's 0.5 threshold
    # would silently read as 'speech'. The centred array is already all zeros here.
    return centered

In [None]:
def speech_music_classificator(audio):
    """Label an audio file as 'speech' or 'music' using the CNN classifier.

    Args:
        audio: Path of an audio file readable by ``librosa.load``.

    Returns:
        'music' when the classifier output is below 0.5, otherwise 'speech'.
    """
    sample_rate = 48000
    # The network's first linear layer has a fixed input size, so every clip must be
    # exactly this many samples (the (128, 282) spectrogram produced by preprocess).
    clip_samples = 144000

    waveform, _ = librosa.load(audio, dtype='float32', sr=sample_rate)
    clip = waveform[:clip_samples]
    if clip.shape[0] < clip_samples:
        # Zero-pad short recordings; otherwise the spectrogram would have too few
        # frames and the model would fail with a shape mismatch.
        clip = np.pad(clip, (0, clip_samples - clip.shape[0]), mode='constant')

    features = preprocess(clip, sr=sample_rate)
    batch = torch.tensor(features).unsqueeze(0).unsqueeze(0).float()  # (1, 1, mels, frames)
    with torch.no_grad():
        score = classificatorModel(batch).item()
    return 'music' if score < 0.5 else 'speech'

In [None]:
# Language code -> English language name. system() uses this table to turn the code
# returned by language_identifier() into a readable chart label.
# NOTE(review): this looks like a copy of Whisper's own language table — confirm before editing by hand.
LANGUAGES = {'en': 'english', 'zh': 'chinese', 'de': 'german', 'es': 'spanish', 'ru': 'russian', 'ko': 'korean', 'fr': 'french', 'ja': 'japanese', 'pt': 'portuguese', 'tr': 'turkish', 'pl': 'polish', 'ca': 'catalan', 'nl': 'dutch', 'ar': 'arabic', 'sv': 'swedish', 'it': 'italian', 'id': 'indonesian', 'hi': 'hindi', 'fi': 'finnish', 'vi': 'vietnamese', 'he': 'hebrew', 'uk': 'ukrainian', 'el': 'greek', 'ms': 'malay', 'cs': 'czech', 'ro': 'romanian', 'da': 'danish', 'hu': 'hungarian', 'ta': 'tamil', 'no': 'norwegian', 'th': 'thai', 'ur': 'urdu', 'hr': 'croatian', 'bg': 'bulgarian', 'lt': 'lithuanian', 'la': 'latin', 'mi': 'maori', 'ml': 'malayalam', 'cy': 'welsh', 'sk': 'slovak', 'te': 'telugu', 'fa': 'persian', 'lv': 'latvian', 'bn': 'bengali', 'sr': 'serbian', 'az': 'azerbaijani', 'sl': 'slovenian', 'kn': 'kannada', 'et': 'estonian', 'mk': 'macedonian', 'br': 'breton', 'eu': 'basque', 'is': 'icelandic', 'hy': 'armenian', 'ne': 'nepali', 'mn': 'mongolian', 'bs': 'bosnian', 'kk': 'kazakh', 'sq': 'albanian', 'sw': 'swahili', 'gl': 'galician', 'mr': 'marathi', 'pa': 'punjabi', 'si': 'sinhala', 'km': 'khmer', 'sn': 'shona', 'yo': 'yoruba', 'so': 'somali', 'af': 'afrikaans', 'oc': 'occitan', 'ka': 'georgian', 'be': 'belarusian', 'tg': 'tajik', 'sd': 'sindhi', 'gu': 'gujarati', 'am': 'amharic', 'yi': 'yiddish', 'lo': 'lao', 'uz': 'uzbek', 'fo': 'faroese', 'ht': 'haitian creole', 'ps': 'pashto', 'tk': 'turkmen', 'nn': 'nynorsk', 'mt': 'maltese', 'sa': 'sanskrit', 'lb': 'luxembourgish', 'my': 'myanmar', 'bo': 'tibetan', 'tl': 'tagalog', 'mg': 'malagasy', 'as': 'assamese', 'tt': 'tatar', 'haw': 'hawaiian', 'ln': 'lingala', 'ha': 'hausa', 'ba': 'bashkir', 'jw': 'javanese', 'su': 'sundanese', 'yue': 'cantonese'}

In [None]:
def plot_drawer(statistics, radioname, plot_num, include_plotlyjs=False):
    """Render the speech/music tallies as a Plotly sunburst and return it as HTML.

    Args:
        statistics: Mapping with 'speech' and 'music' keys, each holding an iterable
            of labels; occurrences of each label are counted per category.
        radioname: Text used as the chart title.
        plot_num: Accepted for call compatibility; the function does not use it.
        include_plotlyjs: Forwarded unchanged to ``Figure.to_html``.

    Returns:
        The HTML fragment produced by ``to_html(full_html=False, ...)``.
    """
    tallies = {kind: Counter(statistics[kind]) for kind in ('speech', 'music')}

    # One (id, label, parent, value) row per sunburst node: each category root,
    # immediately followed by one child per label seen in that category.
    nodes = []
    for kind, tally in tallies.items():
        nodes.append((kind, kind, '', sum(tally.values())))
        nodes.extend((f"{kind}-{name}", name, kind, hits) for name, hits in tally.items())

    sunburst = go.Sunburst(
        labels=[label for _, label, _, _ in nodes],
        parents=[parent for _, _, parent, _ in nodes],
        values=[value for _, _, _, value in nodes],
        ids=[node_id for node_id, _, _, _ in nodes],
        branchvalues='total',
        hoverinfo='label+value+percent parent',
    )
    fig = go.Figure(sunburst)
    fig.update_layout(title=radioname, margin=dict(t=40, l=10, r=10, b=10))

    return fig.to_html(full_html=False, include_plotlyjs=include_plotlyjs)

In [None]:
def system(stream, minutes=1, plots=2):
    """Monitor an audio stream and write sunburst charts of what was heard to ``charts.html``.

    For ``minutes`` minutes the loop repeatedly records a short piece of ``stream``,
    detects its language and whether it is speech or music, and tallies the result.
    Snapshot charts are scheduled every ``minutes / plots`` minutes, measured from the
    start, and a final chart covering the whole run is always appended. A snapshot
    that falls exactly on the end of the run may be skipped, since the final chart
    shows the same data.

    Args:
        stream: Stream URL handed to ``recorder``.
        minutes: Total monitoring time in minutes.
        plots: Number of evenly spaced snapshot charts to schedule; ``0`` or less
            produces only the final chart.

    Returns:
        None. The combined page is written to ``charts.html`` in the working directory.
    """
    html_start = '''
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Combined Plotly Charts</title>
    </head>
    <body>
    
    '''
    html_end = '''
    </body>
    </html>
    '''
    statistics = {
        'speech': [],
        'music': []
    }
    charts = []

    def add_chart(title, plot_num):
        # plotly.js has to be loaded exactly once per page, so whichever chart comes
        # first pulls it from the CDN, including the final chart if no snapshot was drawn.
        include_plotlyjs = False if charts else 'cdn'
        charts.append(plot_drawer(statistics, title, plot_num, include_plotlyjs))

    start_time = time.time()
    duration = 60 * minutes
    finish_time = start_time + duration
    # With no snapshots requested the interval is infinite, which avoids dividing by zero.
    plot_interval = duration / plots if plots > 0 else float('inf')
    plots_shown = 0
    while time.time() < finish_time:
        current_time = time.time()
        # Schedule against the start time so snapshots stay evenly spaced and do not
        # drift with the moment the previous one happened to be drawn.
        next_plot_time = start_time + plot_interval * (plots_shown + 1)
        if plots_shown < plots and current_time >= next_plot_time:
            add_chart(f'Plot {plots_shown + 1}', plots_shown + 1)
            plots_shown += 1
        audio = recorder(stream)
        language = language_identifier(audio)
        content = speech_music_classificator(audio)
        # Fall back to the raw code if the detector returns one missing from the table.
        statistics[content].append(LANGUAGES.get(language, language))
    add_chart('Final plot', 'final')

    full_html = html_start + ''.join(chart + '<br/>\n' for chart in charts) + html_end
    # Write as UTF-8 to match the charset declared in the page's <meta> tag.
    with open('charts.html', 'w', encoding='utf-8') as html_f:
        html_f.write(full_html)

In [None]:
# Run the monitor on the configured stream for 31 minutes with 3 scheduled snapshot
# plots; system() writes the resulting page to charts.html.
system(stream, minutes=31, plots=3)