diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 2dcab10..d3a2543 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -3,10 +3,13 @@ run-name: ${{ github.actor }} is building wheels on: [push] jobs: build_wheels: - runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest, windows-latest] + # os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.10"] + #xcode: [13.2] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - name: Checkout submodules @@ -30,6 +33,15 @@ jobs: fi shell: bash + - name: Install MSBuild + if: runner.os == 'windows' + uses: microsoft/setup-msbuild@v1.0.2 + + - name: Set up C++ environment + if: runner.os == 'windows' + run: | + "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x86 + - name: Build wheel run: python -m cibuildwheel --output-dir dist/ env: diff --git a/.gitignore b/.gitignore index b6e4761..408faf8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,10 @@ __pycache__/ # C extensions *.so +# Project specific +/local +whispercpp.cpp + # Distribution / packaging .Python build/ diff --git a/README.md b/README.md index f6fe283..f5393a8 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,22 @@ Python bindings for whisper.cpp =============================== +This project provides Python bindings for the whisper.cpp library, which is a C++ implementation of a speech-to-text engine. The bindings are implemented in Cython, a language that allows easy integration between Python and C/C++ code. +# Installation `pip install git+https://github.com/o4dev/whispercpp.py` +# Examples +Once the package is installed, you can use it to transcribe speech files. 
Here's an example: +```python +from whispercpp import Whisper + +w = Whisper('tiny') + +result = w.transcribe("myfile.mp3", language='en') +text = w.extract_text(result) +``` +This code creates a Whisper object using the 'tiny' model and transcribes the audio file "myfile.mp3" using English language. The resulting result object is a byte string that can be decoded into a text string using the extract_text method. +If you don't specify a language, Whisper will try to determine the language of the audio: ```python from whispercpp import Whisper @@ -14,3 +28,25 @@ text = w.extract_text(result) Note: default parameters might need to be tweaked. See Whispercpp.pyx. + +# Available Models +The following models are available for use with Whispercpp.py: +| Model | Disk | Mem | SHA | +|-----------|---------|-----------|------------------------------------------------------------------| +| tiny | 75 MB | ~390 MB | bd577a113a864445d4c299885e0cb97d4ba92b5f | +| tiny.en | 75 MB | ~390 MB | c78c86eb1a8faa21b369bcd33207cc90d64ae9df | +| base | 142 MB | ~500 MB | 465707469ff3a37a2b9b8d8f89f2f99de7299dac | +| base.en | 142 MB | ~500 MB | 137c40403d78fd54d454da0f9bd998f78703390c | +| small | 466 MB | ~1.0 GB | 55356645c2b361a969dfd0ef2c5a50d530afd8d5 | +| small.en | 466 MB | ~1.0 GB | db8a495a91d927739e50b3fc1cc4c6b8f6c2d022 | +| medium | 1.5 GB | ~2.6 GB | fd9727b6e1217c2f614f9b698455c4ffd82463b4 | +| medium.en | 1.5 GB | ~2.6 GB | 8c30f0e44ce9560643ebd10bbe50cd20eafd3723 | +| large-v1 | 2.9 GB | ~4.7 GB | b1caaf735c4cc1429223d5a74f0f4d0b9b59a299 | +| large | 2.9 GB | ~4.7 GB | 0f4c8e34f21cf1a914c59d8b3ce882345ad349d6 | + +To use a specific model with Whispercpp.py, specify the model name when creating a Whisper object: +```python +from whispercpp import Whisper + +w = Whisper('base.en') +``` \ No newline at end of file diff --git a/whisper.cpp b/whisper.cpp index 1d716d6..b597c5a 160000 --- a/whisper.cpp +++ b/whisper.cpp @@ -1 +1 @@ -Subproject commit 
1d716d6e34f3f4ba57bd9706a9258a0bdb008153 +Subproject commit b597c5a779e1086bded7d06e0f46112b2f688989 diff --git a/whispercpp.pxd b/whispercpp.pxd index 1a033db..d37205c 100644 --- a/whispercpp.pxd +++ b/whispercpp.pxd @@ -71,7 +71,7 @@ cdef extern from "whisper.h" nogil: whisper_encoder_begin_callback encoder_begin_callback void* encoder_begin_callback_user_data whisper_full_params whisper_full_default_params(whisper_sampling_strategy) - cdef whisper_context* whisper_init(char*) + cdef whisper_context* whisper_init_from_file(char*) cdef void whisper_free(whisper_context*) cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int) cdef int whisper_set_mel(whisper_context*, float*, int, int) diff --git a/whispercpp.pyx b/whispercpp.pyx index fba2eff..43869ff 100644 --- a/whispercpp.pyx +++ b/whispercpp.pyx @@ -7,34 +7,47 @@ import requests import os from pathlib import Path -MODELS_DIR = str(Path('~/.ggml-models').expanduser()) +MODELS_DIR = str(Path('~/.cache/ggml-models').expanduser()) print("Saving models to:", MODELS_DIR) cimport numpy as cnp +from cpython.mem cimport PyMem_Malloc, PyMem_Free cdef int SAMPLE_RATE = 16000 cdef char* TEST_FILE = 'test.wav' cdef char* DEFAULT_MODEL = 'tiny' -cdef char* LANGUAGE = b'fr' +cdef char* LANGUAGE = NULL cdef int N_THREADS = os.cpu_count() MODELS = { 'ggml-tiny.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin', + 'ggml-tiny.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin', 'ggml-base.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin', + 'ggml-base.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin', 'ggml-small.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.bin', + 'ggml-small.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin', 'ggml-medium.bin': 
'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin', + 'ggml-medium.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin', 'ggml-large.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large.bin', + 'ggml-large-v1.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin', } def model_exists(model): return os.path.exists(Path(MODELS_DIR).joinpath(model)) +def sampling_strategy_from_string(strategy_string): + strategy_map = { + 'GREEDY': whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY, + 'BEAM_SEARCH': whisper_sampling_strategy.WHISPER_SAMPLING_BEAM_SEARCH + } + return strategy_map[strategy_string.upper()] + def download_model(model): if model_exists(model): return - print(f'Downloading {model}...') + url = MODELS[model] r = requests.get(url, allow_redirects=True) os.makedirs(MODELS_DIR, exist_ok=True) @@ -43,6 +56,7 @@ def download_model(model): cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr = SAMPLE_RATE): + print("Sampling rate:", sr) try: out = ( ffmpeg.input(file, threads=0) @@ -57,7 +71,7 @@ cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr capture_stderr=True ) )[0] - except: + except ffmpeg.Error: raise RuntimeError(f"File '{file}' not found") cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = ( @@ -68,9 +82,10 @@ cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr return frames -cdef whisper_full_params default_params() nogil: +cdef whisper_full_params default_params(strategy='GREEDY'): + strategy_value = sampling_strategy_from_string(strategy) cdef whisper_full_params params = whisper_full_default_params( - whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY + strategy_value ) params.print_realtime = True params.print_progress = True @@ -81,27 +96,57 @@ cdef whisper_full_params default_params() nogil: cdef class 
Whisper: + """ + This class provides an interface for speech recognition using the Whisper library. + + Parameters: + ----------- + model (str): Model to use for transcription. One of ['ggml-tiny', 'ggml-tiny.en', 'ggml-base', + 'ggml-base.en', 'ggml-small', 'ggml-small.en', 'ggml-medium', 'ggml-medium.en', 'ggml-large', + 'ggml-large-v1']. Defaults to 'ggml-base'. + **kwargs: optional + Additional arguments to override the default parameters for speech recognition. Accepts the following arguments: + - strategy (str): Sampling strategy to use. Choose from 'GREEDY' or 'BEAM_SEARCH'. Default: 'GREEDY'. + - print_progress (bool): Whether to print progress messages during transcription. Default: True. + - print_realtime (bool): Whether to print transcription results in real time. Default: True. + + Attributes: + ----------- + ctx: whisper_context * + The pointer to the Whisper context used for speech recognition. + params: whisper_full_params + The parameters used for speech recognition. + """ cdef whisper_context * ctx cdef whisper_full_params params - def __init__(self, model=DEFAULT_MODEL, pb=None): - model_fullname = f'ggml-{model}.bin'.encode('utf8') + def __init__(self, model='tiny', **kwargs): + model_fullname = f'ggml-{model}.bin' download_model(model_fullname) model_path = Path(MODELS_DIR).joinpath(model_fullname) cdef bytes model_b = str(model_path).encode('utf8') - self.ctx = whisper_init(model_b) - self.params = default_params() + self.ctx = whisper_init_from_file(model_b) + self.params = default_params(kwargs.get('strategy', 'GREEDY')) whisper_print_system_info() + # Override default params + self.params.print_progress = kwargs.get('print_progress', True) + self.params.print_realtime = kwargs.get('print_realtime', True) + def __dealloc__(self): whisper_free(self.ctx) - def transcribe(self, filename=TEST_FILE): - print("Loading data..") - cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(filename) - - print("Transcribing..") - return 
whisper_full(self.ctx, self.params, &frames[0], len(frames)) + def transcribe(self, filename=TEST_FILE, language=None): + print("Transcribing...") + cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(filename, SAMPLE_RATE) + if language: + print("Language:", language) + LANGUAGE = language.encode('utf-8') + self.params.language = LANGUAGE + else: + self.params.language = NULL + transcript = whisper_full(self.ctx, self.params, &frames[0], len(frames)) + return transcript def extract_text(self, int res): print("Extracting text...")