In [1]:
! pip install SpeechRecognition pyaudio pydub
! pip install torch



In [2]:
!pip install sounddevice
!pip install scipy
!pip install transformers
!pip install librosa
!pip install torch   



In [20]:
import os
import torch
import sounddevice as sd
from scipy.io.wavfile import write
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa

SCRIPT_DIR = os.getcwd()
MODEL_PATH = os.path.join(SCRIPT_DIR, "wav2vec2_tiny_quantized")

# Create directory
os.makedirs(MODEL_PATH, exist_ok=True)

# Load or download small + quantized model
def load_or_download_model():
    quantized_path = os.path.join(MODEL_PATH, "pytorch_model_quantized.pt")

    if not os.path.exists(quantized_path):
        print("🌐 Downloading small model and applying quantization...")

        # Load a tiny model
        processor = Wav2Vec2Processor.from_pretrained("patrickvonplaten/wav2vec2_tiny_random_robust")
        model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2_tiny_random_robust")

        # Apply dynamic quantization
        model.cpu()
        model_quantized = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )

        # Save quantized model
        torch.save(model_quantized.state_dict(), quantized_path)
        processor.save_pretrained(MODEL_PATH)

        print(f"✅ Quantized model saved to: {quantized_path}")
    else:
        print("📦 Loading quantized model from disk...")

        processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)
        model = Wav2Vec2ForCTC.from_pretrained("patrickvonplaten/wav2vec2_tiny_random_robust")
        model.load_state_dict(torch.load(quantized_path))
        model.eval()

    return processor, model

# Record from mic
def record_audio(duration=5, filename="mic_audio.wav"):
    print("🎤 Recording...")
    fs = 16000
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    write(filename, fs, audio)
    print(f"✅ Saved: {filename}")
    return filename

# Transcribe audio file
def transcribe_wav2vec2(file_path):
    processor, model = load_or_download_model()
    audio, _ = librosa.load(file_path, sr=16000)
    input_values = processor(audio, return_tensors="pt", sampling_rate=16000).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    print("📝 Transcription:", transcription)

# ---------- Run ----------
# file = record_audio()
# transcribe_wav2vec2(file)

# Just ensure model is ready
load_or_download_model()


🌐 Downloading small model and applying quantization...


Downloading (…)rocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at patrickvonplaten/wav2vec2_tiny_random_robust and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'lm_head.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Quantized model saved to: d:\ai\wav2vec2_tiny_quantized\pytorch_model_quantized.pt


(Wav2Vec2Processor:
 - feature_extractor: Wav2Vec2FeatureExtractor {
   "do_normalize": true,
   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
   "processor_class": "Wav2Vec2Processor",
   "return_attention_mask": false,
   "sampling_rate": 16000
 }
 
 - tokenizer: Wav2Vec2CTCTokenizer(name_or_path='patrickvonplaten/wav2vec2_tiny_random_robust', vocab_size=12, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),
 Wav2Vec2ForCTC(
   (wav2vec2): Wav2Vec2Model(
     (feature_extractor): Wav2Vec2FeatureEncoder(
       (conv_layers): ModuleList(
         (0): Wav2Vec2LayerNormConvLayer(
           (conv): Conv1d(1, 64, kernel_size=(40,), stride=(30,), bias=False)
           (layer_norm): LayerNorm((64,), eps=1e-05, 