# Building ML Web Apps: Part 2

## Table of Contents

1. Overview
2. Tools
3. Architecture
4. Front-End
5. ML Microservices
6. Recommendations
7. Testing
8. Final Thoughts
9. Exercises

Use Case
Connect to an external API and do all sorts of cool things to a song
Use the timeline component of NiceGUI if possible.

## 1. Overview

## 2. Tools

## 3. Architecture

## 4. Front-End

## 5. ML Microservices

### Music Splitter

In [8]:
import numpy as np
import requests
from mlserver.codecs import NumpyCodec
from pedalboard.io import AudioFile

In [9]:
# Decode every frame of the local MP3 into memory and keep its sample rate.
with AudioFile("05mUf9x3V3RIqafuY4H54E.mp3", "r") as audio_file:
    first_sample_rate = audio_file.samplerate
    first_song = audio_file.read(audio_file.frames)

In [21]:
first_song.shape

(2, 1323648)

In [22]:
endpoint = "http://localhost:5070/v2/models/music_splitter/infer"

In [23]:
input_request = {
    "inputs": [
        NumpyCodec.encode_input(name="song", payload=first_song).dict()
    ]
}

In [24]:
res = requests.post(endpoint, json=input_request)

In [25]:
res.json()

{'model_name': 'music_splitter',
 'id': 'd4c6406f-c6f2-4809-961b-a691ec56ca8f',
 'parameters': {},
 'outputs': [{'name': 'output-0',
   'shape': [4, 1323648],
   'datatype': 'FP32',
   'parameters': {'content_type': 'np'},
   'data': [4.395841096993536e-05,
    2.9174894734751433e-05,
    1.648487159400247e-05,
    2.9442926461342722e-05,
    6.330572796287015e-05,
    6.936577119631693e-05,
    3.973202910856344e-05,
    5.1435887144180015e-05,
    8.276110020233318e-05,
    7.395581633318216e-05,
    1.8706668925005943e-05,
    3.7751531635876745e-05,
    3.1225819839164615e-05,
    4.0067825466394424e-05,
    3.124810609733686e-05,
    4.472468208405189e-05,
    1.3653887435793877e-05,
    1.7195776308653876e-05,
    3.234206451452337e-05,
    4.248674667906016e-05,
    1.9263705326011404e-05,
    3.977862434112467e-05,
    6.147629756014794e-05,
    8.747967513045296e-05,
    8.781741780694574e-05,
    0.00011165242904098704,
    0.00013858692545909435,
    0.00013663197751156986,


### Transcriber

In [38]:
from transformers import pipeline
import librosa

In [34]:
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [41]:
def pre_process(song: np.ndarray, sample_rate, target_sr=None) -> np.ndarray:
    """Resample one channel of a song to the rate the speech model expects.

    Parameters
    ----------
    song : np.ndarray
        Audio samples. A 2-D array is reduced to its first row (``song[0]``);
        a 1-D array is used as-is.
    sample_rate : int or float
        Sampling rate of ``song``.
    target_sr : int, optional
        Desired output sampling rate. When omitted, it is read from the
        notebook-level ``pipe.feature_extractor.sampling_rate``, which keeps
        existing two-argument calls working unchanged.

    Returns
    -------
    np.ndarray
        The resampled 1-D signal returned by ``librosa.resample``.
    """
    if target_sr is None:
        # `pipe` is rebound several times in this notebook; pass `target_sr`
        # explicitly to avoid depending on whichever pipeline is current.
        target_sr = pipe.feature_extractor.sampling_rate
    # Accept mono input too instead of indexing a scalar out of it.
    mono = song if song.ndim == 1 else song[0]
    return librosa.resample(mono, orig_sr=sample_rate, target_sr=target_sr)

In [42]:
new_first = pre_process(first_song, first_sample_rate)

In [43]:
new_first

array([ 1.1289497e-08, -3.3608849e-09,  2.4956615e-10, ...,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32)

In [45]:
trans = pipe(new_first, max_new_tokens=2000)['text']
trans

" Life is an automatic, hey as you go You won't get too far if you ain't got the dough Way down in death, your chips have been fried There ain't no place to run And nowhere to hide Needs mud, the devil drives"

In [29]:
import numpy as np

In [46]:
endpoint = "http://localhost:5060/v2/models/music_transcriber/infer"

In [47]:
input_request = {
    "inputs": [
        NumpyCodec.encode_input(name="song", payload=first_song).dict(),
        NumpyCodec.encode_input(name="sample_rate", payload=np.array([first_sample_rate])).dict(),
    ]
}

In [48]:
res = requests.post(endpoint, json=input_request)

In [49]:
res.json()

{'model_name': 'music_transcriber',
 'id': 'f321c625-f3e4-4954-bb1f-ce4dda550c1e',
 'parameters': {},
 'outputs': [{'name': 'output-0',
   'shape': [1, 1],
   'datatype': 'BYTES',
   'parameters': {'content_type': 'str'},
   'data': [" Life is an automatic, hey as you go You won't get too far if you ain't got the dough Way down in death, your chips have been fried There ain't no place to run And nowhere to hide Needs mud, the devil drives"]}]}

### Text Embeddings

In [51]:
from mlserver.codecs import StringCodec

In [73]:
endpoint = "http://localhost:4080/v2/models/text_embedding/infer"

In [74]:
input_request = {
    "inputs": [
        StringCodec.encode_input(name="lyrics", payload=res.json()['outputs'][0]['data'], use_bytes=False).dict()
    ]
}

In [75]:
res.json()['outputs'][0]['data'][0]

" Life is an automatic, hey as you go You won't get too far if you ain't got the dough Way down in death, your chips have been fried There ain't no place to run And nowhere to hide Needs mud, the devil drives"

In [76]:
StringCodec.encode_input(name="lyrics", payload=res.json()['outputs'][0]['data'], use_bytes=False).dict()

{'name': 'lyrics',
 'shape': [1, 1],
 'datatype': 'BYTES',
 'parameters': {'content_type': 'str'},
 'data': [" Life is an automatic, hey as you go You won't get too far if you ain't got the dough Way down in death, your chips have been fried There ain't no place to run And nowhere to hide Needs mud, the devil drives"]}

In [77]:
embs = requests.post(endpoint, json=input_request)

In [79]:
embs.json()

{'model_name': 'text_embedding',
 'id': '535e9a67-c7d9-4184-9dc1-74ae4f07bd38',
 'parameters': {},
 'outputs': [{'name': 'output-0',
   'shape': [1, 384],
   'datatype': 'FP32',
   'parameters': {'content_type': 'np'},
   'data': [-0.00262596202082932,
    -0.005708673503249884,
    -0.005277496296912432,
    -0.0024855020456016064,
    0.032794393599033356,
    0.028131598606705666,
    0.04496707394719124,
    -0.05128021165728569,
    0.018897423520684242,
    -0.02569490671157837,
    0.023357322439551353,
    -0.02439548820257187,
    0.033512815833091736,
    -0.007919454015791416,
    -0.11746230721473694,
    -0.01635747216641903,
    0.0265838410705328,
    -0.06279219686985016,
    -0.034084219485521317,
    0.050643909722566605,
    0.011130168102681637,
    0.0675671175122261,
    0.028123805299401283,
    0.02282983437180519,
    -0.1462325155735016,
    0.0852329358458519,
    -0.01007222943007946,
    0.006036548875272274,
    -0.08327685296535492,
    0.0081616872921586

### Audio Embeddings

In [84]:
endpoint = "http://localhost:4030/v2/models/audio_embedding/infer"

In [85]:
first_song[0][None].shape

(1, 1323648)

In [86]:
input_request = {
    "inputs": [
        NumpyCodec.encode_input(name="song", payload=first_song[0][None]).dict(),
    ]
}

In [87]:
aembs = requests.post(endpoint, json=input_request)

In [89]:
aembs.json()

{'model_name': 'audio_embedding',
 'id': '636641d5-2574-4bd0-9f34-067c0cbf7902',
 'parameters': {},
 'outputs': [{'name': 'output-0',
   'shape': [1, 2048],
   'datatype': 'FP32',
   'parameters': {'content_type': 'np'},
   'data': [0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.4533822536468506,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.15135128796100616,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.2049483060836792,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.5735266804695129,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,

### Sentiment Classification

https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english

In [None]:
%%writefile servers/sentiment/model-settings.json
{
    "name": "transformer",
    "implementation": "mlserver_huggingface.HuggingFaceRuntime",
    "parameters": {
        "extra": {
            "task": "text-classification",
            "device": 0
        }
    }
}

In [None]:
%%writefile servers/sentiment/settings.json
{
    "http_port": 5010,
    "grpc_port": 5020,
    "metrics_port": 5018
}

In [3]:
from IPython.display import Audio
import pandas as pd

In [2]:
df = pd.read_csv("payload.csv")
df.head()

Unnamed: 0,index,ids,artist,genre,name,subgenres,urls,artist_song,photos
0,0,01MWMVIJ5PDKOLuoZm3AL7,Michael Burks,blues,Make It Rain,['blues---electric blues'],https://datalakerpg.s3.ap-southeast-2.amazonaw...,Michael Burks - Make It Rain,https://tse4.explicit.bing.net/th?id=OIP.8JRZs...
1,1,01PS1areNdzaORljuLQTAR,Elmore James,blues,Sinful Woman,['blues---electric blues'],https://datalakerpg.s3.ap-southeast-2.amazonaw...,Elmore James - Sinful Woman,https://tse4.mm.bing.net/th?id=OIP.gzSoL3TUBEZ...
2,2,01ZOelFcLgOBpYg3xoagoR,John Lee Hooker,blues,How Can You Do It,['blues---electric blues'],https://datalakerpg.s3.ap-southeast-2.amazonaw...,John Lee Hooker - How Can You Do It,https://tse3.mm.bing.net/th?id=OIP.DEcSQ9z7dJc...
3,3,02P3Thwk7K52CgvznzMxz2,Dave Van Ronk,blues,Buckets of Rain,['blues---country blues'],https://datalakerpg.s3.ap-southeast-2.amazonaw...,Dave Van Ronk - Buckets of Rain,https://tse1.mm.bing.net/th?id=OIP.oZDeYMp4_Oo...
4,4,02egLiBBNLyww0dI0s02LB,Jessie Mae Hemphill,blues,"Baby, Please Don't Go",['blues---country blues'],https://datalakerpg.s3.ap-southeast-2.amazonaw...,"Jessie Mae Hemphill - Baby, Please Don't Go",https://tse3.mm.bing.net/th?id=OIP.h3eDgzNytlf...


In [4]:
Audio(url=df['urls'][10])

In [5]:
from transformers import pipeline


ModuleNotFoundError: No module named 'transformers'

In [None]:
pipe = pipeline("audio-classification", model="ramonpzg/wav2musicgenre")

In [29]:
import requests

In [30]:
from pedalboard.io import AudioFile

In [71]:
url = 'https://datalakerpg.s3.ap-southeast-2.amazonaws.com/ludwig_music_data/mp3/blues/01MWMVIJ5PDKOLuoZm3AL7.mp3'

In [86]:
import librosa
import torchaudio
import io

In [88]:
# Fetch the whole MP3 into memory. `stream=True` was dropped because the body
# is read in full through `.content` anyway, and a timeout keeps a stalled
# connection from hanging the kernel.
with requests.get(url, timeout=30) as music:
    # Fail loudly on 4xx/5xx instead of handing an error page to the decoder.
    music.raise_for_status()
    content = music.content
    fil = io.BytesIO(content)

# Decode from the in-memory buffer; nothing is written to disk.
with AudioFile(fil, "r") as f:
    song = f.read(f.frames)
    sample_rate = f.samplerate

In [92]:
song[0].shape

(1323648,)

In [90]:
# NOTE(review): `y` and `sr` are not assigned by any executable statement visible
# in this notebook, so this cell raises NameError on a fresh kernel; the recorded
# output presumably comes from an earlier session — TODO restore or drop them.
song.shape, sample_rate, y.shape, sr

((2, 1323648), 44100, torch.Size([2, 1322496]), 44100)

In [72]:
with AudioFile("05mUf9x3V3RIqafuY4H54E.mp3", "r") as f:
    song = f.read(f.frames)
    sample_rate = f.samplerate

ValueError: Failed to open audio file: file does not exist: https://datalakerpg.s3.ap-southeast-2.amazonaws.com/ludwig_music_data/mp3/blues/01MWMVIJ5PDKOLuoZm3AL7.mp3

In [34]:
song[0]

array([ 0.0000000e+00, -1.6823791e-09, -9.7171571e-10, ...,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32)

In [69]:
from mlserver.codecs import NumpyCodec

In [None]:
input_request = {
    "inputs": [
        NumpyCodec.encode_input(name='song', payload=song[0][None]).dict()
    ]
}
input_request

In [36]:
endpoint = "http://localhost:5080/v2/models/music_classifier/infer"

In [37]:
res = requests.post(endpoint, json=input_request)

In [40]:
from mlserver.types import InferenceResponse

In [65]:
res.json()['outputs'][1]

{'name': 'label',
 'shape': [5, 1],
 'datatype': 'BYTES',
 'parameters': {'content_type': 'str'},
 'data': ['rock', 'electronic', 'pop', 'funk _ soul', 'hip hop']}

In [96]:
next(zip(res.json()['outputs'][0]['data'], res.json()['outputs'][1]['data']))

(0.6089968085289001, 'rock')

In [97]:
# Parse the response body once instead of on every access.
# outputs[0] holds the scores and outputs[1] the matching genre labels.
outputs = res.json()['outputs']
rows = [
    {'genre': label, 'score': score}
    for score, label in zip(outputs[0]['data'], outputs[1]['data'])
]
rows

[{'genre': 'rock', 'score': 0.6089968085289001},
 {'genre': 'electronic', 'score': 0.22553586959838867},
 {'genre': 'pop', 'score': 0.049627259373664856},
 {'genre': 'funk _ soul', 'score': 0.02759118564426899},
 {'genre': 'hip hop', 'score': 0.018503567203879356}]

In [62]:
InferenceResponse(**res.json()).outputs[1]

ResponseOutput(name='label', shape=[5, 1], datatype='BYTES', parameters=Parameters(content_type='str', headers=None), data=TensorData(__root__=['rock', 'electronic', 'pop', 'funk _ soul', 'hip hop']))

## 6. Recommendations

## 7. Testing

## 8. Final Thoughts

## 9. Exercises

In [42]:
import torch
from transformers import AutoProcessor, WhisperForConditionalGeneration, pipeline

In [78]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# NOTE(review): `device` is computed above but never handed to the pipeline
# (the keyword is commented out below), so the pipeline is built with its
# default device — TODO confirm this is intentional.
# This also rebinds the notebook-level name `pipe` to a whisper-large pipeline.
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large", #device=device
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [67]:
with AudioFile("test_split/05mUf9x3V3RIqafuY4H54E/vocals.wav", "r") as f:
    song = f.read(f.frames)
    sample_rate = f.samplerate

In [51]:
song.shape, sample_rate

((2, 1322496), 44100)

In [79]:
pipe.feature_extractor.sampling_rate

16000

In [None]:
from pedalboard import Resample

In [64]:
from datasets import Audio as ft_audio
import numpy as np
import librosa

In [74]:
first_resampled_audio = librosa.resample(first_song[0], orig_sr=first_sample_rate, target_sr=pipe.feature_extractor.sampling_rate)
first_resampled_audio.shape

(480236,)

In [None]:
resampled_audio = librosa.resample(song[0], orig_sr=sample_rate, target_sr=pipe.feature_extractor.sampling_rate)
resampled_audio.shape

(479818,)

In [82]:
text_out = pipe(
    resampled_audio,
    max_new_tokens=2000,
    generate_kwargs={"task": "transcribe"},
    # chunk_length_s=30,
    # batch_size=8,
    # return_timestamps=True
)
text_out

{'text': " It's an automatic pay as you go You won't get too far if you ain't got the dough Weighed down in debt, your chips have been fried There ain't no place to run And no way to hurt Needs much, the devil drives"}

In [81]:
text_out = pipe(
    first_resampled_audio,
    max_new_tokens=1000,
    generate_kwargs={"task": "transcribe"},
    # chunk_length_s=30,
    # batch_size=8,
    # return_timestamps=True
)
text_out

{'text': " Life is an automatic, hey, as you go You won't get too far if you ain't got the dough Weighed down in death, your chips have been fried There ain't no place to run And nowhere to hide Needs much, the devil drives"}

In [1]:
import nussl

2023-10-18 23:58:51.719362: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-18 23:58:51.724379: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-10-18 23:58:51.724397: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
signal1 = nussl.AudioSignal("05mUf9x3V3RIqafuY4H54E.mp3")

In [3]:
signal1.embed_audio()
print(signal1)

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

AudioSignal (unlabeled): 29.989 sec @ 05mUf9x3V3RIqafuY4H54E.mp3, 44100 Hz, 2 ch.


In [4]:
print("Duration: {} seconds".format(signal1.signal_duration))
print("Duration in samples: {} samples".format(signal1.signal_length))
print("Number of channels: {} channels".format(signal1.num_channels))
print("File name: {}".format(signal1.file_name))
print("Full path to input: {}".format(signal1.path_to_input_file))
print("Root mean square energy: {:.4f}".format(signal1.rms().mean()))

Duration: 29.98857142857143 seconds
Duration in samples: 1322496 samples
Number of channels: 2 channels
File name: 05mUf9x3V3RIqafuY4H54E.mp3
Full path to input: 05mUf9x3V3RIqafuY4H54E.mp3
Root mean square energy: 0.2054


In [5]:
signal1.audio_data

array([[ 0.0000000e+00, -1.6823536e-09, -9.7169939e-10, ...,
         1.9913574e-01,  1.9168098e-01,  1.8395220e-01],
       [ 0.0000000e+00, -2.1263804e-09, -1.7413624e-09, ...,
         3.0822849e-01,  3.0062255e-01,  2.9158875e-01]], dtype=float32)

In [6]:
signal1.audio_data.shape

(2, 1322496)

In [None]:
nussl.

In [7]:
separator = nussl.separate_harmonic_percussive(signal1)

AttributeError: module 'nussl' has no attribute 'separate_harmonic_percussive'

Yes, it is possible to use librosa to separate vocal and instrumental sounds from mp3 audio files. Here is one approach:

1. Load the mp3 file using librosa and convert to mono:

```python
import librosa

audio, sr = librosa.load('song.mp3', mono=True) 
```

2. Use the librosa onset detection to find note onsets:

```python
onsets = librosa.onset.onset_detect(y=audio, sr=sr, units='frames')
```

3. Segment the audio into notes based on the detected onsets: 

```python 
onsets_times = librosa.frames_to_time(onsets, sr=sr)

notes_segments = []
for i in range(len(onsets_times) - 1):
    onset_start_sample = int(onsets_times[i] * sr)
    onset_end_sample = int(onsets_times[i + 1] * sr)
    note_segment = audio[onset_start_sample:onset_end_sample]
    notes_segments.append(note_segment)
```

4. Compute spectrograms for each note segment:

```python
spectrograms = [librosa.stft(n) for n in notes_segments]
```

5. Analyze the harmonic and percussive elements of each spectrogram to separate into vocals and instruments:

```python
vocals = [librosa.decompose.hpss(s)[0] for s in spectrograms]
instruments = [librosa.decompose.hpss(s)[1] for s in spectrograms]
```

6. Resynthesize the separated vocal and instrument signals:

```python
vocal_audio = librosa.effects.remix(vocals)
inst_audio = librosa.effects.remix(instruments)
```

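This gives you two output audio signals: one intended to hold the vocals and one the instrumental backing track. Note that `librosa.decompose.hpss` separates harmonic from percussive content rather than voice from instruments, so treat the result as a rough approximation of a vocal/instrumental split.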

Let me know if you need any help implementing this approach!

In [3]:
import demucs.api

In [4]:
separator = demucs.api.Separator()

AttributeError: module 'demucs' has no attribute 'api'

Create bucket
open policy
Copy directory with server settings to the bucket 

In [10]:
endpoint = 'http://172.19.255.2:80/v2/models/model/infer'

In [25]:
import json
input_test = json.loads('{"inputs": [{"name": "lyrics", "shape": [1, 1], "datatype": "BYTES", "parameters": {"content_type": "str"}, "data": ["Lorem Ipsum has been the industrys standard dummy text ever since the 1500s when an unknown printer took a galley of type and scrambled it to make a type specimen book."]}]}')
input_test

{'inputs': [{'name': 'lyrics',
   'shape': [1, 1],
   'datatype': 'BYTES',
   'parameters': {'content_type': 'str'},
   'data': ['Lorem Ipsum has been the industrys standard dummy text ever since the 1500s when an unknown printer took a galley of type and scrambled it to make a type specimen book.']}]}

In [26]:
headers = {"Content-Type": "application/json", "seldon-model": 'text-embeddings.model'}

In [27]:
import requests
test_seldon = requests.post(endpoint, json=input_test, headers=headers)

In [28]:
test_seldon.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

## Splitter

In [98]:
import requests, torch, numpy as np

In [122]:
endpoint = "http://localhost:5070/v2/models/music_splitter/infer"

In [125]:
song.dtype

dtype('float32')

In [126]:
# Request body written out by hand instead of through a codec helper: a single
# input named "song", tagged with content type "np" and datatype "FP32".
# "datatype" is hard-coded; it matches the float32 dtype reported for `song`
# in the preceding cell. `song.tolist()` turns the array into nested Python
# lists so that it can be JSON-serialised.
input_request = {
        "inputs": [{
            "name": "song", "parameters": {"content_type": "np"}, "datatype": "FP32",
            "shape": song.shape, "data": song.tolist()
        }]
    }

In [127]:
res = requests.post(endpoint, json=input_request)

In [134]:
song.shape

(2, 1323648)

In [136]:
np.array(res.json()['outputs'][0]['data']).reshape([4, 1323648]).shape

(4, 1323648)

In [137]:
import tempfile

In [148]:
def create_tmp_audio(audio_data, sr, num_channels=1):
    """Write audio samples to a temporary MP3 file and return its path.

    The file is created with ``delete=False`` so that it still exists after
    this function returns. The caller is responsible for removing it
    (for example with ``os.remove(path)``) once it is no longer needed.

    Parameters
    ----------
    audio_data : array-like
        Samples handed unchanged to ``AudioFile.write``.
    sr : int or float
        Sample rate declared for the output file.
    num_channels : int, default 1
        Channel count declared for the output file.

    Returns
    -------
    str
        Filesystem path of the written ``.mp3`` file.
    """
    # Reserve a unique path and close the handle before the encoder opens it.
    # Previously the NamedTemporaryFile context manager deleted the file on
    # exit, so the returned path pointed at a file that no longer existed.
    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmpf:
        tmp_path = tmpf.name
    with AudioFile(tmp_path, 'w', samplerate=sr, num_channels=num_channels) as func:
        func.write(audio_data)
    return tmp_path

In [149]:
song

array([[-4.8602793e-05, -4.8606558e-05, -4.8610309e-05, ...,
        -4.8602793e-05, -4.8602793e-05, -4.8602793e-05],
       [ 3.6379788e-12,  1.1641532e-10, -8.5492502e-10, ...,
         3.6379788e-12,  3.6379788e-12,  3.6379788e-12]], dtype=float32)

In [150]:
create_tmp_audio(audio_data=song[0], sr=44100)

'/tmp/tmpqpz2_1lx.mp3'

In [143]:
tmpf.name

'/tmp/tmp6z9pxtji.mp3'

In [141]:
tempfile.NamedTemporaryFile(suffix='.mp3').name

'/tmp/tmpr97269wn.mp3'

In [142]:
with tempfile.NamedTemporaryFile(suffix='.mp3') as tmpf:
    with AudioFile(tmpf.name, 'w', samplerate=44100, num_channels=1) as func:
        func.write(song[0][None])

In [139]:
create_tmp_audio(np.array(res.json()['outputs'][0]['data']).reshape([4, 1323648])[0], 44100)

<tempfile._TemporaryFileWrapper at 0x7f2b7942e8d0>

In [129]:
separator.samplerate

44100