~~~
Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
~~~

# Quick start with Hugging Face

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/google-health/hear/blob/master/notebooks/quick_start_with_hugging_face.ipynb">
      <img alt="Google Colab logo" src="https://www.tensorflow.org/images/colab_logo_32px.png" width="32px"><br> Run in Google Colab
    </a>
  </td>  
  <td style="text-align: center">
    <a href="https://github.com/google-health/hear/blob/master/notebooks/quick_start_with_hugging_face.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://huggingface.co/google/hear">
      <img alt="HuggingFace logo" src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" width="32px"><br> View on HuggingFace
    </a>
  </td>
</tr></tbody></table>

This Colab notebook provides a basic usage example of the HeAR encoder, which generates machine learning representations (known as "embeddings") from health-related sounds (2-second audio clips sampled at 16 kHz). These embeddings can be used to develop custom machine learning models for health acoustic use cases with less data and compute than traditional model development methods.

Learn more about embeddings and their benefits on [this page](https://developers.google.com/health-ai-developer-foundations/hear).

# Authenticate with Hugging Face (skip if you have an HF_TOKEN secret)

In [None]:
# `HfFolder` is deprecated in huggingface_hub; `get_token()` is the supported
# way to look up an already-configured access token.
from huggingface_hub import get_token

# Only prompt for an interactive login when no token is available yet.
if get_token() is None:
    from huggingface_hub import notebook_login
    notebook_login()

## Load and play cough audio recording

In [None]:
# Audio format used throughout this notebook: 2-second clips sampled at 16 kHz.
SAMPLE_RATE = 16_000  # Hz (samples per second)
CLIP_DURATION = 2  # Seconds of audio per clip
CLIP_LENGTH = CLIP_DURATION * SAMPLE_RATE  # Samples per clip

In [None]:
import numpy as np
from scipy.io import wavfile
from scipy import signal
from IPython.display import Audio, display


def resample_audio_and_convert_to_mono(
    audio_array: np.ndarray,
    sampling_rate: float,
    new_sampling_rate: float = SAMPLE_RATE,
) -> np.ndarray:
  """Converts audio to mono and resamples it to `new_sampling_rate`.

  Args:
    audio_array: A numpy array of audio samples, either 1-D (mono) or 2-D with
      shape (num_samples, num_channels).
    sampling_rate: The original sampling rate of the audio, in Hz. Must be
      positive.
    new_sampling_rate: Target sampling rate, in Hz. Must be positive.

  Returns:
    A numpy array holding the mono audio resampled to `new_sampling_rate`.

  Raises:
    ValueError: If `sampling_rate` or `new_sampling_rate` is not positive.
  """
  # Fail early with a clear message instead of a ZeroDivisionError or a
  # nonsensical (negative) sample count further down.
  if sampling_rate <= 0 or new_sampling_rate <= 0:
    raise ValueError(
        "Sampling rates must be positive, got "
        f"sampling_rate={sampling_rate}, new_sampling_rate={new_sampling_rate}."
    )

  # Convert to mono if it's multi-channel by averaging across channels.
  if audio_array.ndim > 1:
    audio_mono = np.mean(audio_array, axis=1)
  else:
    audio_mono = audio_array

  # Resample: scale the sample count by the ratio of the two rates so the
  # duration of the audio is preserved.
  original_sample_count = audio_mono.shape[0]
  new_sample_count = int(round(original_sample_count * (new_sampling_rate / sampling_rate)))
  resampled_audio_mono = signal.resample(audio_mono, new_sample_count)

  return resampled_audio_mono


In [None]:
# Download the sample cough recording. `-nc` (no-clobber) skips the download
# when the file already exists, so this cell is safe to re-run.
!wget -nc https://upload.wikimedia.org/wikipedia/commons/b/be/Woman_coughing_three_times.wav

In [None]:
# Read the downloaded WAV file; `wavfile.read` yields the file's sample rate
# together with the sample data as a numpy array.
with open('Woman_coughing_three_times.wav', 'rb') as wav_file:
  original_sampling_rate, audio_array = wavfile.read(wav_file)

# Summarize what was loaded before any resampling takes place.
for summary_line in (
    f"Sample Rate: {original_sampling_rate} Hz",
    f"Data Shape: {audio_array.shape}",
    f"Data Type: {audio_array.dtype}",
):
  print(summary_line)


In [None]:
# Convert the recording to mono at SAMPLE_RATE, then render an inline audio
# player so the result can be listened to.
audio_array = resample_audio_and_convert_to_mono(
    audio_array=audio_array,
    sampling_rate=original_sampling_rate,
    new_sampling_rate=SAMPLE_RATE,
)
audio_player = Audio(audio_array, rate=SAMPLE_RATE)
display(audio_player)

## Compute embeddings

In [None]:
from huggingface_hub import from_pretrained_keras
import tensorflow as tf
import matplotlib.pyplot as plt

# This index corresponds to a cough and was determined by hand. In practice, you
# would need a detector.
START = 0

# Cut out one clip of (at most) CLIP_LENGTH samples starting at START.
clip = audio_array[START: START + CLIP_LENGTH]

# The encoder works on fixed-length clips, so if the recording ends before a
# full clip is available, zero-pad the tail (append silence) to CLIP_LENGTH.
if len(clip) < CLIP_LENGTH:
  clip = np.pad(clip, (0, CLIP_LENGTH - len(clip)))

# Add batch dimension
input_tensor = np.expand_dims(clip, axis=0)

# Load the model directly from Hugging Face Hub
loaded_model = from_pretrained_keras("google/hear")


def infer(audio_array):
  """Runs the model's `serving_default` signature on a batch of audio clips."""
  return loaded_model.signatures["serving_default"](x=audio_array)


# Call inference
output = infer(tf.constant(input_tensor, dtype=tf.float32))

# Extract the embedding vector
embedding_vector = output['output_0'].numpy().flatten()
print("Size of embedding vector:", len(embedding_vector))

# Plot the embedding vector
plt.figure(figsize=(12, 4))
plt.plot(embedding_vector)
plt.title('Embedding Vector')
plt.xlabel('Index')
plt.ylabel('Value')
plt.grid(True)
plt.show()

# Next steps

Explore the other [notebooks](https://github.com/google-health/hear/blob/master/notebooks) to learn what else you can do with the model.