In [18]:
import os
import pickle
import glob
import librosa
import numpy as np
from transformers import pipeline
import torch
import tqdm
import json


In [19]:

class AudioProcessor:
    def __init__(self, sample_rate=16000, duration=30, n_mels=80, n_fft=400, hop_length=160, mono=True):
        self.sample_rate = sample_rate
        self.duration = duration
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.mono = mono
        self.num_expected_samples = int(sample_rate * duration)
        self.expected_spec_shape = (n_mels, 3000)  # Whisper expects (80, 3000) shape

    def load_audio(self, file_path):
        signal, _ = librosa.load(file_path, sr=self.sample_rate, mono=self.mono)
        return np.pad(signal, (0, max(0, self.num_expected_samples - len(signal))), mode="constant")

    def extract_features(self, signal):
        mel_spec = librosa.feature.melspectrogram(
            y=signal, sr=self.sample_rate, n_fft=self.n_fft,
            hop_length=self.hop_length, n_mels=self.n_mels
        )
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Convert to log scale
        
        # Fix shape to (80, 3000)
        log_mel_spec = self._fix_spectrogram_shape(log_mel_spec)

        # Normalize for Whisper (-1 to 1)
        norm_feature = self._normalize(log_mel_spec)

        # Convert to PyTorch tensor
        return torch.tensor(norm_feature, dtype=torch.float32)

    def _fix_spectrogram_shape(self, spectrogram):
        """Ensure the spectrogram has shape (80, 3000) by padding/truncating."""
        current_shape = spectrogram.shape[1]

        if current_shape < self.expected_spec_shape[1]:  # Pad if too short
            pad_width = self.expected_spec_shape[1] - current_shape
            spectrogram = np.pad(spectrogram, ((0, 0), (0, pad_width)), mode="constant")
        elif current_shape > self.expected_spec_shape[1]:  # Truncate if too long
            spectrogram = spectrogram[:, :self.expected_spec_shape[1]]

        return spectrogram

    def _normalize(self, feature):
        """Normalize spectrogram to range [-1, 1] (Whisper requirement)."""
        feature = (feature - feature.min()) / (feature.max() - feature.min())  # Normalize to [0, 1]
        return 2 * feature - 1  # Scale to range [-1, 1]

    def save_data(self, feature, file_path, save_dir):
        os.makedirs(save_dir, exist_ok=True)
        np.save(os.path.join(save_dir, os.path.basename(file_path) + ".npy"), feature.numpy())  # Save as NumPy array

    def save_min_max(self, min_max_values, save_path):
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, "wb") as f:
            pickle.dump(min_max_values, f)

    def process_files(self, file_paths, feature_dir, min_max_path):
        min_max_values = {}

        if not file_paths:
            raise FileNotFoundError("No audio files found! Check the FILES_DIR path.")

        print(f" Processing {len(file_paths)} audio files...")

        for file_path in file_paths:
            signal = self.load_audio(file_path)
            feature_tensor = self.extract_features(signal)  # Returns a PyTorch tensor
            self.save_data(feature_tensor, file_path, feature_dir)
            min_max_values[file_path] = {"min": feature_tensor.min().item(), "max": feature_tensor.max().item()}
            print(f"✔ Processed: {os.path.basename(file_path)}")

        self.save_min_max(min_max_values, min_max_path)
        print("All files processed successfully!")

if __name__ == "__main__":
    FILES_DIR = "/Users/puravgupta/Desktop/python/stt-Whisper2002/Speaker_0000/*.wav"
    FEATURE_DIR = "./datasets/spectrograms/"
    MIN_MAX_PATH = "./datasets/minmax/min_max_values.pkl"
    
    audio_files = glob.glob(FILES_DIR, recursive=True)

    processor = AudioProcessor()
    processor.process_files(audio_files, FEATURE_DIR, MIN_MAX_PATH)


 Processing 11 audio files...
✔ Processed: Speaker_0000_00009.wav
✔ Processed: Speaker_0000_00008.wav
✔ Processed: Speaker_0000_00005.wav
✔ Processed: Speaker_0000_00004.wav
✔ Processed: Speaker_0000_00010.wav
✔ Processed: Speaker_0000_00006.wav
✔ Processed: Speaker_0000_00007.wav
✔ Processed: Speaker_0000_00003.wav
✔ Processed: Speaker_0000_00002.wav
✔ Processed: Speaker_0000_00000.wav
✔ Processed: Speaker_0000_00001.wav
All files processed successfully!


In [20]:
processor = AudioProcessor()
audio_path = "/Users/puravgupta/Desktop/python/stt-Whisper2002/Speaker_0000/Speaker_0000_00001.wav"
signal = processor.load_audio(audio_path)
input_tensor = processor.extract_features(signal) 
print("Tensor Shape:", input_tensor.shape)
print("Tensor Data:\n", input_tensor)

Tensor Shape: torch.Size([80, 3000])
Tensor Data:
 tensor([[-0.3635, -0.5583, -0.6897,  ..., -0.4162, -0.4014, -0.5088],
        [-0.2329, -0.3242, -0.5107,  ..., -0.0677, -0.0268, -0.0108],
        [-0.2264, -0.3017, -0.4451,  ...,  0.2028,  0.2091,  0.2257],
        ...,
        [-0.7672, -1.0000, -1.0000,  ..., -0.5798, -0.3596, -0.4839],
        [-0.7760, -0.9979, -1.0000,  ..., -0.5979, -0.4271, -0.5276],
        [-0.7870, -1.0000, -1.0000,  ..., -0.7436, -0.5475, -0.6940]])


In [16]:
# def transcribe_audio(audio_file):
#     """Transcribes audio using Whisper model."""
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
#     transcription = whisper_pipe(audio_file)
#     return transcription

whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small" , device=-1)
processor = AudioProcessor(sample_rate=16000)

waveform = processor.load_audio(audio_path)

waveform = np.array(waveform, dtype=np.float32)
transcription = whisper_pipe(waveform,return_timestamps=True)

transcription_data = {
    "audio_file": audio_path,
    "transcription": transcription["text"]
}


Device set to use cpu
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


In [17]:
# def save_to_json(data, filename="transcription.json"):
#     """Saves transcription output to a JSON file."""
#     with open(filename, "w") as f:
#         json.dump(data, f, indent=4)
#     return filename


json_file_path = "transcription_output.json"
with open(json_file_path, "w") as json_file:
    json.dump(transcription_data, json_file, indent=4)

print(f"Transcription saved to {json_file_path}")

Transcription saved to transcription_output.json
