In [1]:
pip install pyaudio speechrecognition

Note: you may need to restart the kernel to use updated packages.


**1. Importing Libraries**

In [2]:
import cv2
import os
import pyaudio
import wave
import speech_recognition as sr
import threading
import pyttsx3

**2. Recording audio input with PyAudio**

In [3]:
# Artifact file names. In this notebook only audio_file is read by name
# (transcribe_audio opens it); the others are kept for reference.
audio_file, video_file, response_audio = "audio_input.wav", "webcam_output.avi", "response.mp3"

# Pipeline results: transcription_text is overwritten by transcribe_audio(),
# response_text by the main cell.
transcription_text = response_text = ""

def record_audio_with_pyaudio(start_event, filename="audio_input.wav", duration=10, rate=16000, channels=1):
    """Record microphone audio to a WAV file once ``start_event`` is set.

    Args:
        start_event: Event-like object; the function blocks on
            ``start_event.wait()`` before opening the input stream.
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        rate: Sample rate in Hz.
        channels: Number of input channels.

    Returns:
        None. The captured frames are written to ``filename``.

    Raises:
        Whatever PyAudio raises while opening or reading the stream. The
        stream and the PyAudio instance are released before the exception
        propagates, and no file is written in that case.
    """
    print("Waiting for video initialization...")
    start_event.wait()
    print("Starting audio recording...")

    chunk = 1024  # frames read per buffer
    sample_format = pyaudio.paInt16
    # Only whole buffers are read, so a fractional final buffer is dropped.
    num_chunks = int(rate / chunk * duration)

    p = pyaudio.PyAudio()
    try:
        # Look up the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(sample_format)
        stream = p.open(format=sample_format,
                        channels=channels,
                        rate=rate,
                        input=True,
                        frames_per_buffer=chunk)
        try:
            frames = [stream.read(chunk) for _ in range(num_chunks)]
            print("Audio recording complete.")
        finally:
            # Release the device even if a read failed part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(b''.join(frames))


**3. Taking video input with the help of OpenCV**

In [4]:
# Function to capture video from the webcam
def record_webcam_video(start_event, output_file="webcam_output.avi", duration=10):
    """Record webcam frames to ``output_file`` and signal ``start_event``.

    The function opens camera index 0, creates an XVID writer sized to the
    camera's reported frame dimensions, sets ``start_event`` so the audio
    recorder can begin, then writes and displays frames until
    ``frame_rate * duration`` frames have been captured, a read fails, or
    'q' is pressed. Because the stop condition counts frames, the wall-clock
    length depends on the camera delivering frames at its reported rate.

    Args:
        start_event: Event-like object that is set once the camera and
            writer are ready. It is also set on every exit path, including
            failures, so a thread waiting on it can never block forever.
        output_file: Path of the video file to write.
        duration: Target recording length in seconds.

    Returns:
        None.

    Raises:
        Whatever OpenCV raises during setup or capture; the capture device,
        writer and preview windows are released before it propagates.
    """
    print("Accessing webcam...")
    cap = None
    out = None
    try:
        cap = cv2.VideoCapture(0)
        if not cap.isOpened():
            # Avoid creating a 0x0 writer when there is no usable camera.
            print("Error: Unable to access webcam.")
            return

        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Fall back to 30 fps when the reported rate truncates to 0.
        frame_rate = int(cap.get(cv2.CAP_PROP_FPS)) or 30

        out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'XVID'), frame_rate, (frame_width, frame_height))

        start_event.set()  # Signal audio recording to start
        print("Starting video recording...")

        max_frames = frame_rate * duration
        frame_count = 0
        while cap.isOpened() and frame_count < max_frames:
            ret, frame = cap.read()
            if not ret:
                print("Error: Unable to capture video.")
                break
            out.write(frame)  # Save the frame to file
            cv2.imshow("Webcam Feed", frame)
            frame_count += 1

            # Break on 'q' key
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always release the waiter and the devices, even after a failure.
        start_event.set()
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()
    print("Video recording complete.")

**4. Transcribing text with the help of SpeechRecognition**

In [5]:
def transcribe_audio():
    """Transcribe the file named by ``audio_file`` into ``transcription_text``.

    The whole file is loaded through ``sr.AudioFile`` and passed to
    ``recognize_google``. On success the recognised text is stored in the
    module-level ``transcription_text`` and printed. If the recogniser raises
    ``sr.UnknownValueError`` or ``sr.RequestError``, a human-readable message
    is stored there instead, so callers cannot tell success from failure by
    truthiness alone.
    """
    global transcription_text
    speech_engine = sr.Recognizer()
    print("Transcribing audio...")

    # Load the whole recording first; the file is closed before recognition runs.
    with sr.AudioFile(audio_file) as wav_source:
        captured = speech_engine.record(wav_source)

    try:
        transcription_text = speech_engine.recognize_google(captured)
    except sr.UnknownValueError:
        transcription_text = "Sorry, I couldn't understand the audio."
    except sr.RequestError as err:
        transcription_text = f"Speech recognition service error: {err}"
    else:
        print("Transcription:", transcription_text)


**5. Using a hosted GPT-3.5 Turbo model to generate a response**

In [6]:
import openai

# The API key is read from the environment so that no secret is stored in the
# notebook. Set OPENAI_API_KEY before running this cell.
# NOTE(review): a key was previously committed on this line; treat it as
# compromised and rotate it.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; response generation will fail.")

# Endpoint of the hosted, OpenAI-compatible service; override with OPENAI_API_BASE.
openai.api_base = os.environ.get("OPENAI_API_BASE", "https://api.pawan.krd/cosmosrp/v1/")

# Ask the chat-completion endpoint for a reply to the transcribed prompt
def generate_response(prompt):
    """Return the model's reply to ``prompt``, or a fallback sentence on error.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The content of the first choice in the completion. If any exception
        is raised while calling the API or reading the result, the error is
        printed and the string "I'm sorry, I couldn't generate a response."
        is returned instead.
    """
    print("Generating response from AI...")

    system_message = {
        "role": "system",
        "content": "You are a straightforward assistant who says concise answers without mentioning your tone or action ",
    }
    user_message = {"role": "user", "content": prompt}

    try:
        reply = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[system_message, user_message],
        )
        answer = reply.choices[0].message.content
        print("Generated Response:", answer)
        return answer
    except Exception as err:
        # Best-effort: report the problem and hand back a fallback sentence.
        print(f"Error generating response: {err}")
        return "I'm sorry, I couldn't generate a response."


**6. Using pyttsx3 for text to speech**

In [7]:
# Convert text to speech using pyttsx3
def text_to_speech(text):
    """Speak ``text`` through a newly initialised pyttsx3 engine.

    The engine's 'rate' is set to 150 and its 'volume' to 0.9 before the text
    is queued and the engine is run. Any exception is caught and printed
    rather than raised. Returns None.
    """
    print("Converting text to speech...")
    engine_settings = (
        ('rate', 150),    # speech speed
        ('volume', 0.9),  # volume (0.0 to 1.0)
    )
    try:
        speaker = pyttsx3.init()
        for prop_name, prop_value in engine_settings:
            speaker.setProperty(prop_name, prop_value)
        speaker.say(text)      # queue the text
        speaker.runAndWait()   # run the speech engine
        # Printed only once runAndWait() has returned.
        print("Speech playback started.")
    except Exception as err:
        print(f"Error in text-to-speech conversion: {err}")

**7. Main Program**

In [9]:
if __name__ == "__main__":
    # End-to-end pipeline: record audio and video together, transcribe the
    # audio, ask the model for a reply, then speak the reply.
    duration = 10  # Duration (seconds) for synchronized recording

    # Event to synchronize audio and video start
    start_event = threading.Event()

    # Start audio and video recording threads. The recorders are given the
    # module-level file names so the audio path always matches the one
    # transcribe_audio() reads.
    audio_thread = threading.Thread(target=record_audio_with_pyaudio, args=(start_event, audio_file, duration))
    video_thread = threading.Thread(target=record_webcam_video, args=(start_event, video_file, duration))

    video_thread.start()  # Start video recording (includes camera initialization)
    audio_thread.start()  # Audio waits for the camera to be ready

    video_thread.join()
    audio_thread.join()

    # Transcribe the audio; the result lands in the global transcription_text
    transcribe_audio()
    if transcription_text:
        print("\nFinal Transcription:", transcription_text)

        # Generate response from transcription
        response_text = generate_response(transcription_text)
        if response_text:
            print("\nGenerated AI Response:", response_text)

            # Convert generated response to speech
            text_to_speech(response_text)

    print("\nAudio, Video, and Response Processing Completed Successfully.")

Accessing webcam...
Waiting for video initialization...
Starting video recording...Starting audio recording...

Video recording complete.
Audio recording complete.
Transcribing audio...
Transcription: what is the capital of Kerala

Final Transcription: what is the capital of Kerala
Generating response from AI...
Generated Response: *Thiruvananthapuram is the capital of Kerala.*

Generated AI Response: *Thiruvananthapuram is the capital of Kerala.*
Converting text to speech...
Speech playback started.

Audio, Video, and Response Processing Completed Successfully.
