# Speech to Text Generator

In [None]:
import whisper
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav

# Load the "base" Whisper model once at module level; transcribe_audio()
# reads this global instead of reloading the model on every call.
whisper_model = whisper.load_model("base")

# Function to record audio and save as WAV file
def record_audio(duration=5, sample_rate=44100, output_path="input_audio.wav"):
    """Record mono 16-bit audio with sounddevice and save it as a WAV file.

    Args:
        duration: Length of the recording in seconds; must be positive.
        sample_rate: Sampling rate in Hz; must be positive.
        output_path: Destination WAV file. The default, "input_audio.wav",
            is the file name ``transcribe_audio`` reads.

    Returns:
        The path the recording was written to.

    Raises:
        ValueError: If ``duration`` or ``sample_rate`` is not positive, or if
            together they yield zero frames to record.
    """
    # Validate before touching the audio device so bad arguments fail fast
    # instead of producing an empty or unreadable WAV file.
    if duration <= 0:
        raise ValueError(f"duration must be positive, got {duration!r}")
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")

    frame_count = int(duration * sample_rate)
    if frame_count == 0:
        raise ValueError("duration * sample_rate must yield at least one frame")

    print("🎤 Recording... Speak now!")
    audio = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype=np.int16)
    # sd.rec() returns immediately; wait until the buffer has been filled.
    sd.wait()
    wav.write(output_path, sample_rate, audio)
    print("✅ Recording complete!")
    return output_path

# Function to transcribe the recorded audio
def transcribe_audio(audio_path="input_audio.wav"):
    """Transcribe an audio file with the module-level ``whisper_model``.

    Args:
        audio_path: Path of the audio file to transcribe. The default,
            "input_audio.wav", is the file ``record_audio`` writes by default.

    Returns:
        The recognized text, lower-cased and with surrounding whitespace
        removed.
    """
    result = whisper_model.transcribe(audio_path)
    # Strip padding whitespace so the text prints and compares cleanly.
    return result["text"].strip().lower()

# Record a clip with the default settings, then transcribe the saved file.
# The lower-cased transcript is kept in text_command for the class lookup below.
record_audio()
text_command = transcribe_audio()
print(f"🗣️ Recognized Speech: {text_command}")


In [None]:
# CIFAR-10 class names; a name's position in this list is the label index
# returned by map_to_cifar10.
cifar10_classes = ["airplane", "automobile", "bird", "cat", "deer",
                   "dog", "frog", "horse", "ship", "truck"]

# Function to find the CIFAR-10 class mentioned in a text command
def map_to_cifar10(text_command):
    """Return the CIFAR-10 class named in ``text_command``.

    Matching is case-insensitive and works on whole words, so "category" or
    "friendship" no longer match "cat" or "ship". A trailing "s" is accepted
    so simple plurals ("dogs", "horses") still match. Compound words such as
    "battleship" are intentionally not matched.

    Args:
        text_command: Transcribed speech. ``None`` or an empty string yields
            no match.

    Returns:
        ``(index, name)`` of the first class in ``cifar10_classes`` order that
        appears in the text, or ``(None, None)`` if none does.
    """
    if not text_command:
        return None, None

    # Replace punctuation with spaces so "cat," and "truck!" become clean words.
    normalized = "".join(
        ch if ch.isalnum() else " " for ch in text_command.lower()
    )
    spoken_words = set(normalized.split())

    for index, cls in enumerate(cifar10_classes):
        if cls in spoken_words or f"{cls}s" in spoken_words:
            return index, cls  # Return index & class name
    return None, None  # No match found

# Find the corresponding CIFAR-10 class for the transcribed command.
# map_to_cifar10 returns (None, None) when nothing matches. The check below
# tests label_name rather than label_index because index 0 is a valid class
# but is falsy.
label_index, label_name = map_to_cifar10(text_command)

if label_name:
    print(f"🎯 Mapped Class: {label_name} ({label_index})")
else:
    print("⚠️ No valid CIFAR-10 class detected!")


In [None]:
import torchvision.utils as vutils

def generate_from_voice(generator, label_index, label_name):
    """Generate one image for the given class label and display it.

    Relies on names defined elsewhere in the notebook: ``torch``, ``plt``,
    ``device``, ``noise_dim`` and ``get_random_noise``.

    Args:
        generator: Model called as ``generator(noise, label)``; it is switched
            to evaluation mode and left in that mode.
        label_index: Class index wrapped into a one-element label tensor.
        label_name: Class name shown in the plot title.
    """
    generator.eval()

    # One noise sample and its matching one-element label tensor.
    latent = get_random_noise(1, noise_dim)
    class_id = torch.tensor([label_index]).to(device)

    # Inference only, so gradients are not tracked.
    with torch.no_grad():
        sample = generator(latent, class_id)

    # Map from [-1, 1] to [0, 1] for display
    # (assumes the generator output lies in [-1, 1] -- TODO confirm).
    rescaled = (sample + 1) / 2

    # Take the first (only) image and move its leading axis to the end,
    # presumably channels-first to channels-last for imshow -- verify.
    pixels = rescaled[0].cpu().numpy()
    image = np.transpose(pixels, (1, 2, 0))

    plt.figure(figsize=(3, 3))
    plt.imshow(image)
    plt.axis("off")
    plt.title(f"Generated Image: {label_name}")
    plt.show()

# Generate image if a valid label was detected.
# label_name is tested rather than label_index because index 0 is a valid
# class but is falsy.
# NOTE(review): `generator` is not defined in this section; it must already
# exist in the session -- confirm.
if label_name:
    generate_from_voice(generator, label_index, label_name)
else:
    print("⚠️ No valid image can be generated.")
