In [None]:
!pip install git+https://github.com/openai/whisper.git

import whisper

model = whisper.load_model("base")
result = model.transcribe("audio.mp3")
print(result["text"])


In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Install necessary libraries if not already installed
!pip install transformers datasets

# Load the model and processor
model_id = "openai/whisper-base"  # Or any other whisper model
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Move the model to the GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


# Load audio file
# Replace with your actual audio file path
audio_filepath = "audio.mp3"

# Use librosa or other library to load audio if necessary
import librosa
audio_input, sr = librosa.load(audio_filepath)

# Preprocess the audio
inputs = processor(audio_input, sampling_rate=sr, return_tensors="pt").to(device)

# Perform inference
with torch.no_grad():
  generated_ids = model.generate(**inputs)

# Decode the generated IDs to text
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Print the transcribed text
transcription


In [None]:
import whisper
import torch
from transformers import pipeline

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create a pipeline for transcription.
# The Transformers pipeline needs a Hub model id (or a Transformers model
# object); a model returned by `whisper.load_model` comes from the separate
# openai-whisper package and cannot be passed as `model=`.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)

# Transcribe the audio file
result = pipe("audio.mp3")

# Print the transcribed text
print(result["text"])


In [None]:
!pip install faster-whisper

from faster_whisper import WhisperModel

model_size = "large-v2"

# Run on GPU with FP16
device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperModel(model_size, device=device, compute_type="float16")

# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")

segments, info = model.transcribe("audio.mp3", beam_size=5)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))


In [None]:
!pip install transformers datasets librosa soundfile

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
import librosa

# Load the model and processor
model_id = "facebook/seamless_m4t_large"  # Or another SeamlessM4T model
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained(model_id)

# Load audio file
audio_filepath = "audio.mp3"
audio_input, sr = librosa.load(audio_filepath, sr=16000) # Ensure consistent sample rate

# Preprocess the audio
inputs = processor(audio_input, sampling_rate=sr, return_tensors="pt").to(model.device)


# Perform inference
with torch.no_grad():
    generated_ids = model.generate(**inputs)

# Decode the generated IDs to text
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

transcription


In [None]:
import torch
from transformers import AutoModelForCTC, AutoModelForSpeechSeq2Seq, AutoProcessor
import librosa

# Load the model and processor.
# Wav2Vec 2.0 ASR checkpoints are CTC models (encoder + per-frame classifier),
# not encoder-decoder models, so they are loaded with AutoModelForCTC.
model_id = "facebook/wav2vec2-base-960h"  # Replace with your desired Wav2Vec 2.0 model
model = AutoModelForCTC.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")
processor = AutoProcessor.from_pretrained(model_id)

# Load audio file
audio_filepath = "audio.mp3"  # Replace with your audio file path
audio_input, sr = librosa.load(audio_filepath, sr=16000) # Ensure consistent sample rate

# Preprocess the audio
inputs = processor(audio_input, sampling_rate=sr, return_tensors="pt").to(model.device)

# Perform inference: a CTC model has no autoregressive `generate`; a single
# forward pass yields per-frame logits over the vocabulary.
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy CTC decoding: take the most likely token per frame, then let the
# processor turn the token ids into text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]

transcription


In [None]:
import torch
from transformers import pipeline

model_path = "hindi_models/whisper-medium-hi_alldata_multigpu"
# Fall back to the CPU when no GPU is present instead of failing on "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
lang_code = "hi"

whisper_asr = pipeline(
    "automatic-speech-recognition", model=model_path, device=device,
)

# Special case to handle odia since odia is not supported by whisper model:
# for 'or' no language is passed when building the decoder prompt; every other
# code is forwarded unchanged.
prompt_language = None if lang_code == 'or' else lang_code
whisper_asr.model.config.forced_decoder_ids = (
    whisper_asr.tokenizer.get_decoder_prompt_ids(
        language=prompt_language, task="transcribe"
    )
)

result = whisper_asr("audio.mp3")
print(result["text"])