Fairly complete version of the common, factored-out audio async handler for both the stdin and SDL stream versions (next up is making one stream.cpp to rule them all, via a factory method and CLI params to choose the source)
shanelenagh · Feb 15, 2024 · 59a1906 · 59a1906
1 parent 9ee53a7
commit 59a1906
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 46 deletions.
diff --git a/examples/audio-stdin.cpp b/examples/audio-stdin.cpp
@@ -4,13 +4,6 @@
 #include <cstring>
 #include <cassert>
 
-// Because the original happened to handle OS signals in the same library as
-// handled the audio, this is implemented here.
-// TODO: split this out to something a bit more coherent
-
-namespace stream_constants {
-  float S16_TO_F32_SCALE_FACTOR = 0.000030517578125f;
-}
 
 audio_stdin::audio_stdin(int len_ms) : audio_async(len_ms) { }
 
@@ -25,8 +18,7 @@ passed in needs to already be open, and that the destructor doesn't close it.
 bool audio_stdin::init(whisper_params params, int sample_rate) {
 
   audio_async::init(params, sample_rate);
-  m_audio.resize(0);  // resize to reclaim this memory, as it isn't needed (floats buffer filled on the fly)
-  m_in_buffer.resize((m_sample_rate*m_len_ms)/1000);
+  m_audio.resize((m_sample_rate*m_len_ms)/1000);
 
   return true;
 }
@@ -49,18 +41,13 @@ void audio_stdin::get(int ms, std::vector<float> & result) {
 
         size_t n_samples = (m_sample_rate * ms) / 1000;
 
-        assert(n_samples <= m_in_buffer.size()/sizeof(int16_t));
+        assert(n_samples <= m_audio.size()/sizeof(int16_t));
         // stdin is PCM mono 16khz in s16le format.  Use ffmpeg to make that happen.
-        int nread = read(STDIN_FILENO, m_in_buffer.data(), n_samples*sizeof(int16_t) /*m_in_buffer.size()*/);
+        int nread = read(STDIN_FILENO, m_audio.data(), n_samples*sizeof(int16_t) /*m_in_buffer.size()*/);
         if (nread <= 0) { 
           m_running = false;
           return; 
-        }
-
-        int float_sample_count = nread / sizeof(int16_t);
-        result.resize(float_sample_count);
-        for (int i = 0; i < float_sample_count; i++) {
-            result[i] = m_in_buffer[i] * stream_constants::S16_TO_F32_SCALE_FACTOR;
-        }
+        } 
+        transfer_buffer(result, 0, nread / sizeof(int16_t));
     }
 }
diff --git a/examples/audio-stdin.h b/examples/audio-stdin.h
@@ -10,7 +10,7 @@
 //
 // Stdin wav capture
 //
-class audio_stdin  : public audio_async {
+class audio_stdin  : public audio_async<int16_t> {
 public:
     audio_stdin(int len_ms);
     ~audio_stdin();
@@ -20,10 +20,6 @@ class audio_stdin  : public audio_async {
     // get audio data from the circular buffer
     // Returns false if the stream's closed.
     void get(int ms, std::vector<float> & audio) override;
-
-private:
-    // Since the data we plan on receiving needs converting, we need somewhere to hold it while we do that
-    std::vector<int16_t> m_in_buffer;    
 };
 
 // Return false if need to quit - goes false at eof?

diff --git a/examples/common-audioasync.h b/examples/common-audioasync.h
@@ -8,6 +8,10 @@
 #include <thread>
 #include <cstring>
 
+namespace audioasync_constants {
+  // Scale factor 1/32768 (2^-15): multiplying a signed 16-bit PCM sample by it
+  // yields a float in [-1.0, 1.0). constexpr so it is a true compile-time constant.
+  constexpr float S16_TO_F32_SCALE_FACTOR = 0.000030517578125f;
+}
+
 // command-line parameters
 struct whisper_params {
     int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -37,9 +41,9 @@ struct whisper_params {
 };
 
 //
-// Abstract interface for audio capture
+// Abstract class for audio capture
 //
-class audio_async {
+template <typename T = float> class audio_async {
 public:
     audio_async(int len_ms) { 
         m_len_ms = len_ms;
@@ -55,17 +59,17 @@ class audio_async {
         return true; 
     }
 
-    virtual bool resume() {
+    bool resume() {
         m_running = true;
         return true;
     }
 
-    virtual bool pause() {
+    bool pause() {
         m_running = false;
         return true;
     }
 
-    virtual bool clear() {
+    bool clear() {
         {
             std::lock_guard<std::mutex> lock(m_mutex);
 
@@ -75,7 +79,7 @@ class audio_async {
         return true;
     }
 
-    virtual bool is_running() {
+    bool is_running() {
         return m_running;
     }
 
@@ -107,19 +111,12 @@ class audio_async {
                 s0 += m_audio.size();
             }
 
-            if (s0 + n_samples > m_audio.size()) {
-                const size_t n0 = m_audio.size() - s0;
-
-                memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-                memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
-            } else {
-                memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
-            }
+            transfer_buffer(result, s0, n_samples);
         }        
     }
 
-    // callback to be called by audio source
-    virtual void callback(uint8_t * stream, int len) {
+    // callback invoked by the audio source ingester to store incoming samples in the circular buffer
+    void callback(uint8_t * stream, int len) {
         if (!m_running) {
             return;
         }
@@ -129,7 +126,7 @@ class audio_async {
         if (n_samples > m_audio.size()) {
             n_samples = m_audio.size();
 
-            stream += (len - (n_samples * sizeof(float)));
+            stream += (len - (n_samples * sizeof(T)));
         }
 
         //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
@@ -140,13 +137,13 @@ class audio_async {
             if (m_audio_pos + n_samples > m_audio.size()) {
                 const size_t n0 = m_audio.size() - m_audio_pos;
 
-                memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-                memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
+                memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(T));
+                memcpy(&m_audio[0], stream + n0 * sizeof(T), (n_samples - n0) * sizeof(T));
 
                 m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
                 m_audio_len = m_audio.size();
             } else {
-                memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
+                memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(T));
 
                 m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
                 m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
@@ -161,7 +158,41 @@ class audio_async {
     std::atomic_bool m_running;
     std::mutex       m_mutex;
 
-    std::vector<float> m_audio;
+    std::vector<T> m_audio;
     size_t             m_audio_pos = 0;
     size_t             m_audio_len = 0;
-};
+
+    void transfer_buffer(std::vector<float> & result, int start, int sampleCount);
+};
+
+// Specialization for float-backed buffers: the samples are already in the
+// output format, so they are copied out of the circular buffer with memcpy.
+//   result      - resized to sampleCount and overwritten
+//   start       - index into m_audio of the first sample to copy
+//   sampleCount - number of samples to copy; the read wraps to index 0 at the
+//                 end of m_audio. A non-positive count yields an empty result.
+template <> inline void audio_async<float>::transfer_buffer(std::vector<float> & result, int start, int sampleCount) {
+    if (sampleCount <= 0) {
+        // nothing requested: do not form addresses into the buffers at all
+        result.clear();
+        return;
+    }
+
+    // do the index arithmetic in size_t so comparisons against m_audio.size() are not mixed-sign
+    const size_t first = static_cast<size_t>(start);
+    const size_t count = static_cast<size_t>(sampleCount);
+
+    result.resize(count);
+    if (first + count > m_audio.size()) {
+        // requested span crosses the end of the buffer: copy the tail, then continue from index 0
+        const size_t n0 = m_audio.size() - first;
+
+        memcpy(result.data(), &m_audio[first], n0 * sizeof(float));
+        memcpy(result.data() + n0, m_audio.data(), (count - n0) * sizeof(float));
+    } else {
+        memcpy(result.data(), &m_audio[first], count * sizeof(float));
+    }
+}
+
+// Specialization for s16le-backed buffers: each int16_t sample is scaled to a
+// float by audioasync_constants::S16_TO_F32_SCALE_FACTOR while being copied out
+// of the circular buffer.
+//   result      - resized to sampleCount and overwritten
+//   start       - index into m_audio of the first sample to convert
+//   sampleCount - number of samples to convert; the read wraps to index 0 at the
+//                 end of m_audio. A non-positive count yields an empty result.
+template <> inline void audio_async<int16_t>::transfer_buffer(std::vector<float> & result, int start, int sampleCount) {
+    if (sampleCount <= 0) {
+        result.clear();
+        return;
+    }
+
+    const size_t count    = static_cast<size_t>(sampleCount);
+    const size_t capacity = m_audio.size();
+    result.resize(count);
+
+    // The source index wraps independently of the destination index: once the
+    // end of m_audio is reached, reading continues from index 0 while writing
+    // into result keeps advancing.
+    size_t src = static_cast<size_t>(start);
+    for (size_t dst = 0; dst < count; ++dst) {
+        result[dst] = m_audio[src] * audioasync_constants::S16_TO_F32_SCALE_FACTOR;
+        if (++src == capacity) {
+            src = 0;
+        }
+    }
+}
diff --git a/examples/common-sdl.h b/examples/common-sdl.h
@@ -13,7 +13,7 @@
 //
 // SDL Audio capture
 //
-class audio_async_sdl : public audio_async {
+class audio_async_sdl : public audio_async<float> {
 public:
     audio_async_sdl(int len_ms);
     ~audio_async_sdl();