<a href="https://colab.research.google.com/github/project-2-2-2/mental/blob/main/voices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install flask pydub librosa scikit-learn speechrecognition pytorch transformers soundfile tqdm gradio


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting speechrecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting pytorch
  Downloading pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py) ... done
Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting python-multipart>=0.0.18

In [2]:
!pip install torch torchvision torchaudio





In [4]:
!pip install SpeechRecognition




Collecting SpeechRecognition
  Using cached SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Using cached SpeechRecognition-3.11.0-py2.py3-none-any.whl (32.8 MB)
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.11.0


In [6]:
!pip install gradio


Collecting gradio
  Using cached gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Using cached fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Using cached gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Using cached MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Using cached python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Using cached ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.

In [19]:
import os
import librosa
import numpy as np
import pickle
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import speech_recognition as sr
import gradio as gr

# Initialize directories
# Directory created below for registered voice samples.
# NOTE(review): no function visible in this cell writes into it — confirm it is still needed.
REGISTERED_DIR = "/content/registered_voices/"
# Pickle file that register_voice() writes and recognize_speakers() reads;
# it holds a dict mapping speaker name -> feature vector.
PROFILES_PATH = "/content/voice_profiles.pkl"

# exist_ok=True makes re-running this cell harmless when the directory already exists.
os.makedirs(REGISTERED_DIR, exist_ok=True)

# Utility functions
def extract_features(audio_path):
    """Summarise an audio file as one MFCC vector.

    The file is loaded with librosa at a 16 kHz sampling rate, 40 MFCCs are
    computed, and each coefficient is averaged over all frames.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The per-coefficient mean of the MFCC matrix (40 values).
    """
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=40)
    # Transpose so frames run along axis 0, then average the frames away.
    per_frame = coefficients.T
    return np.mean(per_frame, axis=0)

def register_voice(name, audio_path):
    """Add (or overwrite) a speaker's voice profile in the pickle store.

    Args:
        name: Key under which the speaker's feature vector is stored.
        audio_path: Path of the audio sample to extract features from.

    Returns:
        A confirmation message containing ``name``.
    """
    # Start from the profiles already on disk when the store exists.
    if os.path.exists(PROFILES_PATH):
        with open(PROFILES_PATH, "rb") as store:
            known_profiles = pickle.load(store)
    else:
        known_profiles = {}

    # normalize() works on 2-D input, so wrap the vector in a one-row batch
    # and take that row back out afterwards.
    raw_vector = extract_features(audio_path)
    scaled_vector = normalize([raw_vector])[0]

    # Persist the whole mapping again with the new entry included.
    known_profiles[name] = scaled_vector
    with open(PROFILES_PATH, "wb") as store:
        pickle.dump(known_profiles, store)

    return f"Voice registered for: {name}"

def recognize_speakers(audio_path, threshold=0.6):
    """Identify which registered speaker best matches an audio file.

    Args:
        audio_path: Path of the audio file to identify.
        threshold: Cosine similarity the best match must exceed to be
            accepted. Defaults to 0.6.

    Returns:
        The name of the best-matching registered speaker, "unknown speaker"
        when the best similarity does not exceed ``threshold``, or
        "No registered speakers found." when there are no profiles to
        compare against.
    """
    if not os.path.exists(PROFILES_PATH):
        return "No registered speakers found."

    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # profile store that this notebook wrote itself.
    with open(PROFILES_PATH, "rb") as f:
        profiles = pickle.load(f)

    # An existing but empty store would otherwise make the max lookup fail.
    if not profiles:
        return "No registered speakers found."

    names = list(profiles)
    features = extract_features(audio_path)

    # One batched call: row 0 holds the similarity to each profile, in `names` order.
    scores = cosine_similarity([features], [profiles[n] for n in names])[0]

    # argmax returns the first maximum, i.e. the earliest-registered name on ties.
    best = int(np.argmax(scores))
    return names[best] if scores[best] > threshold else "unknown speaker"

def generate_subtitles(audio_path):
    """Transcribe an audio file with Google's speech recognition.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript, or a fallback message when the audio could not be
        understood or the recognition request failed.
    """
    engine = sr.Recognizer()

    # Read the complete file into memory before sending it for recognition.
    with sr.AudioFile(audio_path) as clip:
        recording = engine.record(clip)

    try:
        return engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError:
        return "Speech-to-text API unavailable"

def process_audio(name, audio_file):
    """Process the uploaded audio: register, recognize, and generate subtitles.

    Args:
        name: Speaker name to register the sample under.
        audio_file: Uploaded file object exposing ``.name`` and ``.read()``.

    Returns:
        A multi-line summary with the registration result, the recognized
        speaker and the transcript.
    """
    # Read the upload BEFORE opening the destination: if both refer to the
    # same file, opening it with "wb" first would truncate the data.
    payload = audio_file.read()

    # Keep only the final path component. Upload objects may report an
    # absolute path as their name, and plain concatenation would then point
    # at a directory that does not exist (or outside /content).
    audio_path = os.path.join("/content", os.path.basename(audio_file.name))
    with open(audio_path, "wb") as f:
        f.write(payload)

    # Register the voice
    register_result = register_voice(name, audio_path)

    # Recognize speaker and generate subtitles
    speaker = recognize_speakers(audio_path)
    transcript = generate_subtitles(audio_path)

    return f"Registration Result: {register_result}\n\nSpeaker: {speaker}\nTranscript: {transcript}"

# Define the Gradio interface for recording or uploading audio
def _to_audio_data(samples, sample_rate):
    """Convert a numpy waveform into mono 16-bit PCM ``sr.AudioData``.

    Float input is assumed to lie in [-1.0, 1.0]; integer input is assumed to
    already be in the 16-bit range. Multi-channel input shaped
    (samples, channels) is averaged down to one channel.
    """
    samples = np.asarray(samples)

    # Decide on scaling from the ORIGINAL dtype, before averaging channels
    # turns integer data into floats.
    if np.issubdtype(samples.dtype, np.floating):
        pcm = np.clip(samples, -1.0, 1.0) * 32767.0
    else:
        pcm = samples.astype(np.float64)

    if pcm.ndim > 1:
        pcm = pcm.mean(axis=1)

    # Clip to avoid wrap-around, then force little-endian 16-bit samples.
    pcm = np.clip(pcm, -32768, 32767).astype("<i2")
    return sr.AudioData(pcm.tobytes(), int(sample_rate), 2)


def transcribe_audio(audio_file):
    """Transcribe the audio to text using Google's Speech Recognition.

    Args:
        audio_file: One of
            * ``None`` (nothing recorded or uploaded),
            * a ``(sample_rate, ndarray)`` tuple, which is what
              ``gr.Audio(type="numpy")`` passes to its callback,
            * a bare ``ndarray`` (treated as 16 kHz audio),
            * a path (``str`` / ``os.PathLike``), or
            * an object whose ``.name`` attribute is a path.

    Returns:
        The transcript, or a message describing why none was produced.
    """
    if audio_file is None:
        return "No audio provided"

    recognizer = sr.Recognizer()

    if isinstance(audio_file, tuple):
        # Recorded or uploaded through a numpy-typed Gradio Audio component.
        sample_rate, samples = audio_file
        audio_data = _to_audio_data(samples, sample_rate)
    elif isinstance(audio_file, np.ndarray):
        # A bare array carries no sample rate, so assume 16 kHz.
        audio_data = _to_audio_data(audio_file, 16000)
    else:
        # Audio is given as a file on disk.
        if isinstance(audio_file, (str, os.PathLike)):
            path = os.fspath(audio_file)
        else:
            path = audio_file.name
        with sr.AudioFile(path) as source:
            audio_data = recognizer.record(source)

    try:
        print("Transcribing...")
        text = recognizer.recognize_google(audio_data)
        print(f"Transcribed text: {text}")
        return text
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand the audio")
        return "Could not understand the audio"
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return "Error with the speech recognition service"


def _transcribe_for_speaker(speaker_name, audio):
    """Gradio callback: takes one argument per input component.

    The speaker name is collected by the form but is not used for
    transcription; only the audio is forwarded.
    """
    return transcribe_audio(audio)


# Gradio interface definition
iface = gr.Interface(
    fn=_transcribe_for_speaker,  # Must accept as many arguments as there are inputs
    inputs=[
        gr.Textbox(label="Speaker Name"),
        gr.Audio(type="numpy", label="Record Audio"),  # Callback receives (sample_rate, ndarray)
    ],
    outputs="text",
)

iface.launch()




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5731836604fb3de7b0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [24]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): the saved output of this cell is a MessageError, and no code
# visible in this notebook reads from /content/drive — confirm the cell is needed.
from google.colab import drive
drive.mount('/content/drive')




MessageError: Error: credential propagation was unsuccessful