<a href="https://colab.research.google.com/github/sudomason/ADSBexchange-MLAT/blob/main/Whisper_atcosim3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Install required libraries
!pip install transformers torch librosa soundfile

import os
import torch
from transformers import pipeline
import librosa
import numpy as np
from google.colab import files
import soundfile as sf

# Build the speech-recognition pipeline once; the functions below use `pipe`.
try:
    # Run on the GPU when CUDA is available, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    pipe = pipeline(
        "automatic-speech-recognition",
        model="luigisaetta/whisper-atcosim3",
        device=device,
    )
    print(f"Pipeline set up successfully on device: {device}")
except Exception as e:
    # Report the failure, then re-raise: nothing below works without `pipe`.
    print(f"Error setting up pipeline: {str(e)}")
    raise

def transcribe_audio(file_path):
    """Transcribe one audio file with the module-level ``pipe``.

    The file is read with soundfile, averaged down to a single channel when
    it has more than one, resampled to 16 kHz when its sample rate differs,
    peak-normalised, and handed to the speech-recognition pipeline.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        The recognised text with leading/trailing whitespace removed; the
        ``str()`` of the pipeline result if it has an unexpected shape; or
        ``None`` if any step raised (the error is printed, not propagated).
    """
    try:
        # Load audio using soundfile
        audio, sr = sf.read(file_path)

        if len(audio) == 0:
            raise ValueError("The audio file is empty or could not be read correctly.")

        print(f"Processing {file_path}")
        print(f"Audio shape: {audio.shape}, Sample rate: {sr}")

        # Multi-channel data is (frames, channels): average the channels.
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)

        # Resample to 16kHz if necessary
        if sr != 16000:
            print(f"Resampling from {sr} Hz to 16000 Hz")
            audio = librosa.resample(y=audio, orig_sr=sr, target_sr=16000)
            sr = 16000

        print(f"Processed audio shape: {audio.shape}, Sample rate: {sr}")

        # Peak-normalise. A silent clip has a peak of 0; dividing by it would
        # fill the signal with NaN, so such a clip is passed through as-is.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak
        audio = audio.astype(np.float32)

        # Transcribe using the pipeline
        transcription = pipe({"raw": audio, "sampling_rate": sr})

        print(f"Raw pipeline output: {transcription}")

        # The pipeline may answer with a dict or a list of dicts. The text can
        # carry long runs of padding spaces, so trim it before returning.
        if isinstance(transcription, dict) and "text" in transcription:
            return transcription["text"].strip()
        if isinstance(transcription, list) and len(transcription) > 0 and "text" in transcription[0]:
            return transcription[0]["text"].strip()

        print(f"Unexpected transcription format: {type(transcription)}")
        return str(transcription)

    except Exception as e:
        # Best-effort: report the problem and let the caller see None.
        print(f"An error occurred processing {file_path}: {str(e)}")
        print(f"Error type: {type(e).__name__}")
        return None

def upload_and_process_files():
    """Prompt for an upload and transcribe every uploaded ``.wav`` file.

    Returns:
        A dict mapping each ``.wav`` filename to its transcription, or to a
        failure message when ``transcribe_audio`` returned a falsy value.
        Files without a ``.wav`` extension are ignored.
    """
    print("Please upload your .wav files:")
    uploaded = files.upload()

    results = {}

    for filename, payload in uploaded.items():
        # Only .wav uploads are processed (extension check is case-insensitive).
        if not filename.lower().endswith('.wav'):
            continue

        # Write the uploaded bytes to disk so they can be read back by path.
        with open(filename, 'wb') as out:
            out.write(payload)

        text = transcribe_audio(filename)
        results[filename] = text or "Transcription failed or returned null"

    return results

# Main execution: transcribe the uploads, write one report, offer it for download.
try:
    transcriptions = upload_and_process_files()

    # One three-line record per file: name, transcription, separator rule.
    with open('transcriptions.txt', 'w') as f:
        for filename, transcription in transcriptions.items():
            f.writelines((
                f"File: {filename}\n",
                f"Transcription: {transcription}\n",
                "-" * 50 + "\n",
            ))

    print("Transcriptions saved to 'transcriptions.txt'")
    files.download('transcriptions.txt')
except Exception as e:
    # Top-level boundary: report the failure instead of propagating it.
    print(f"An error occurred during processing: {str(e)}")
    print(f"Error type: {type(e).__name__}")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Pipeline set up successfully on device: cuda
Please upload your .wav files:


Saving segment_1.wav to segment_1 (4).wav
Processing segment_1 (4).wav
Audio shape: (330750,), Sample rate: 11025
Resampling from 11025 Hz to 16000 Hz
Processed audio shape: (480000,), Sample rate: 16000


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Raw pipeline output: {'text': 'november quebec x-ray downwind touch and go november quebec x-ray number one two eight left number one two eight left november quebec x-ray quebec x-ray if you want to make it to a wide circuit you got the vehicle there to get rid of the bird wide                                                                                                                                                                                                                                                                                                                                                                                  '}
Transcriptions saved to 'transcriptions.txt'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>