# ASR + Reasoning (Qwen2 + llama3.1)

In [1]:
# Import libraries
import os
import sys
import base64
import IPython
import yt_dlp
from dotenv import load_dotenv
from langchain_community.chat_models import ChatSambaNovaCloud
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers import OutputFixingParser
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import Any, Dict, List, Optional

# Resolve the kit directory (parent of the working directory) and the
# repository root (parent of the kit directory) as absolute paths.
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))

# Put the repository root on the Python module search path.
sys.path.append(repo_dir)

# Load the .env file located at the repository root (override=True is passed to load_dotenv).
load_dotenv(os.path.join(repo_dir, '.env'), override=True)

True

## Utils functions

In [2]:
def encode_to_base64(content: bytes) -> str:
    """Return ``content`` base64-encoded, as a text string decoded with UTF-8."""
    encoded_bytes = base64.b64encode(content)
    return str(encoded_bytes, "utf-8")

def load_encode_audio(path: str) -> str:
    """Read the file at ``path`` in binary mode and return its content base64-encoded as text."""
    with open(path, 'rb') as audio_file:
        raw_bytes = audio_file.read()
    return encode_to_base64(content=raw_bytes)

class FileSizeExceededError(Exception):
    """Signals that a download is larger than the allowed maximum file size."""

def download_youtube_audio(
    url: str,
    output_path: Optional[str] = None,
    max_filesize: int = 25 * 1024 * 1024,  # 25 MB in bytes
) -> Optional[str]:
    """Download the audio track of a video with yt-dlp and extract it as mp3.

    Args:
        url: URL of the video to download.
        output_path: Directory where the audio file is written. Defaults to
            the ``data`` folder inside ``kit_dir``.
        max_filesize: Maximum allowed download size in bytes (25 MB by default).

    Returns:
        The path reported by yt-dlp for the downloaded file, replaced by the
        matching ``.mp3`` path when that file exists on disk. ``None`` is
        returned when the download is skipped for exceeding ``max_filesize``
        or fails; those errors are reported with ``print`` instead of raised.
    """
    if output_path is None:
        output_path = os.path.join(kit_dir, 'data')
    downloaded_filename = None

    def progress_hook(d: Dict[str, Any]) -> None:
        """Record the finished file name and abort downloads above ``max_filesize``."""
        nonlocal downloaded_filename
        status = d.get('status')
        if status == 'finished':
            downloaded_filename = d['filename']
        elif status == 'downloading':
            # yt-dlp documents `total_bytes` as None when the size is unknown.
            # Comparing None with an int raises TypeError, so fall back to the
            # number of bytes received so far (an exact lower bound).
            size = d.get('total_bytes')
            if size is None:
                size = d.get('downloaded_bytes') or 0
            if size > max_filesize:
                tmp_file = d.get('tmpfilename')
                if tmp_file:
                    try:
                        os.remove(tmp_file)
                        print(f"Deleted temporary file: {tmp_file}")
                    except OSError as e:
                        print(f'Error deleting temporary file: {e}')
                raise FileSizeExceededError(f'File size exceeds {max_filesize/1024/1024:.2f} MB limit')

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }
        ],
        # os.path.join copes with a trailing separator in output_path.
        'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
        'progress_hooks': [progress_hook],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f'Successfully downloaded audio from: {url}')

        # The hook reports the file as downloaded (e.g. .webm); after the
        # audio extraction step the file on disk carries the .mp3 extension.
        if downloaded_filename and not downloaded_filename.endswith('.mp3'):
            new_filename = os.path.splitext(downloaded_filename)[0] + '.mp3'
            if os.path.exists(new_filename):
                downloaded_filename = new_filename

        return downloaded_filename

    except FileSizeExceededError as e:
        print(f'Skipped downloading {url}: {str(e)}')
    except yt_dlp.utils.DownloadError as e:
        print(f'An error occurred while downloading {url}: {str(e)}')
    except Exception as e:
        # Best-effort helper: report the failure and return None.
        print(f'An unexpected error occurred while downloading {url}: {str(e)}')

    return None

## Model setting

### Audio model
Add the "QWEN2_URL" and "QWEN2_API_KEY" environment variables to the .env file in the root of the repo.

In [3]:
# Audio model client; endpoint URL and API key are read from the environment.
audio_model = ChatSambaNovaCloud(
    sambanova_url=os.getenv("QWEN2_URL"),
    sambanova_api_key=os.getenv("QWEN2_API_KEY"),
    model='Qwen2-Audio-7B-Instruct',
    max_tokens=1200,
    streaming=False,
)

In [4]:
# Clip used to test the audio model; it is rendered with IPython's Audio display as the cell output.
audio_path = os.path.join(kit_dir, 'data', 'base.mp3')
IPython.display.Audio(audio_path)

In [5]:
# Send the base64-encoded clip as a single audio message; the model reply is the cell output.
audio_model.invoke(
    [
        HumanMessage(
            content=[
                {
                    "type": "audio_content",
                    "audio_content": {
                        "content": f"data:audio/mp3;base64,{load_encode_audio(audio_path)}"
                    },
                }
            ]
        )
    ]
)

AIMessage(content='I am a large language model created by Alibaba Cloud. I am called QianWen.', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'completion_tokens': 19, 'completion_tokens_after_first_per_sec': 15.938705497327486, 'completion_tokens_after_first_per_sec_first_ten': 672.4065568514288, 'completion_tokens_per_sec': 14.751775486275731, 'end_time': 1733873051.7118666, 'is_last_response': True, 'prompt_tokens': 108, 'start_time': 1733873050.423886, 'time_to_first_token': 0.15865421295166016, 'total_latency': 1.287980556488037, 'total_tokens': 127, 'total_tokens_per_sec': 98.60397298721146}, 'model_name': 'Qwen2-Audio-7B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1733873050}, id='2272a3e6-ebd8-4a97-a708-296151743c7d')

### LLM

Add your SambaNova Cloud API key as the "SAMBANOVA_API_KEY" environment variable in the .env file in the root of the repo.

In [None]:
# Text LLM client; no URL or API key is passed explicitly here.
llm = ChatSambaNovaCloud(
    model='Meta-Llama-3.3-70B-Instruct',
    temperature=0.01,
)

In [7]:
# test llm: send a plain-text greeting; the returned message is the cell output
llm.invoke('hi, who are you?')

AIMessage(content='I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."', additional_kwargs={}, response_metadata={'finish_reason': 'stop', 'usage': {'acceptance_rate': 7, 'completion_tokens': 22, 'completion_tokens_after_first_per_sec': 504.0740312584055, 'completion_tokens_after_first_per_sec_first_ten': 584.6534708670198, 'completion_tokens_per_sec': 105.12519139628144, 'end_time': 1733873055.6752837, 'is_last_response': True, 'prompt_tokens': 41, 'start_time': 1733873055.461978, 'time_to_first_token': 0.1716451644897461, 'total_latency': 0.2092742919921875, 'total_tokens': 63, 'total_tokens_per_sec': 301.04032081662416}, 'model_name': 'Meta-Llama-3.1-70B-Instruct', 'system_fingerprint': 'fastcoe', 'created': 1733873055}, id='7360d655-9998-49e4-bd6a-81917bc83a70')

## ASR Pipeline

In [8]:
# Recording used by the ASR and reasoning cells below; rendered with IPython's Audio display.
audio_path = os.path.join(kit_dir, 'data', 'sample_record.mp3')
IPython.display.Audio(audio_path)

### Simple transcription

In [9]:
# Schema that the basic transcription reply is parsed into.
class Transcript(BaseModel):
    # Text of the transcribed audio.
    transcript: str = Field(description="audio transcription")

def simple_asr(audio_path):
    """Transcribe an audio file with the audio model and return the text.

    The audio is sent base64-encoded, followed by an instruction to answer as
    a fenced JSON object. The reply is parsed into ``Transcript`` through an
    output-fixing parser backed by ``llm``, and the ``transcript`` field of
    the parsed object is returned.
    """
    transcript_parser = PydanticOutputParser(pydantic_object=Transcript)
    fixing_parser = OutputFixingParser.from_llm(parser=transcript_parser, llm=llm)

    audio_b64 = load_encode_audio(audio_path)

    role_message = AIMessage("You are Automatic Speech Recognition tool")
    audio_message = HumanMessage(
        content=[
            {
                "type": "audio_content",
                "audio_content": {"content": f"data:audio/mp3;base64,{audio_b64}"},
            }
        ]
    )
    instruction_message = HumanMessage(
        f'''Please transcribe the previous audio in the following format
            
            ```
                {{
                    "transcript":"<audio transcription>"
                }}
            ```
            
            Always return your response enclosed by ``` and using double quotes
            '''
    )

    pipeline = audio_model | fixing_parser
    parsed = pipeline.invoke([role_message, audio_message, instruction_message])
    return parsed.transcript

In [10]:
# Transcribe the audio at audio_path with simple_asr and display the resulting text
result = simple_asr(audio_path)
result

"Hi, who is going? Not bad, just go back from my meeting. How about you? I'm good, just got some work done. So what was the meeting about? It was about the new project we're working on. We are going to be using a new software tool. Oh, cool. I hear of that tool before. It is going to be easy to use? Yeah, it's pretty user-friendly. I think we will be able to get up and running quickly. Great, I'm looking forward to learning more about it."

In [11]:
# Transcribe audio from a YouTube video: download the audio track, then run simple_asr on the returned path.
# NOTE: download_youtube_audio returns None when the download is skipped or fails;
# simple_asr would then fail when trying to open the path.
yt_audio_path = download_youtube_audio(url = 'https://www.youtube.com/watch?v=L-HCCaLe35w')
result = simple_asr(yt_audio_path)
result

[youtube] Extracting URL: https://www.youtube.com/watch?v=L-HCCaLe35w
[youtube] L-HCCaLe35w: Downloading webpage
[youtube] L-HCCaLe35w: Downloading ios player API JSON
[youtube] L-HCCaLe35w: Downloading web creator player API JSON
[youtube] L-HCCaLe35w: Downloading m3u8 information
[info] L-HCCaLe35w: Downloading 1 format(s): 251
[download] Destination: /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/sambanova_scribe/data/An Ensemble of AI Models.webm
[download] 100% of  371.51KiB in 00:00:00 at 3.90MiB/s     
[ExtractAudio] Destination: /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/sambanova_scribe/data/An Ensemble of AI Models.mp3
Deleting original file /Users/jorgep/Documents/ask_public_own/ai-starter-kit-snova/sambanova_scribe/data/An Ensemble of AI Models.webm (pass -k to keep)
Successfully downloaded audio from: https://www.youtube.com/watch?v=L-HCCaLe35w


"and something you said, i think, is so important to this discussion is the idea of an ensemble of models of the future. i really think we're going to see more and more of this. models are best for this, but i think we need two or three more to get to the best answer. to find best, as i say, the first time somebody uses a large language model to put something in the president's daily briefing book and it's wrong will be the last time somebody puts something in the president's daily briefing book. so getting it right is extraordinarily important here."

### Augmented ASR

Advanced transcription methods to get extra information, such as the speaker and gender for each turn in the conversation.

In [12]:
# One speaker turn of the augmented transcription.
class TurnTranscription(BaseModel):
    # Number identifying the speaker within the conversation.
    speaker: int = Field(description="numerated speaker in the conversation")
    # Gender attributed to the speaker.
    gender: str = Field(description="gender of the speaker")
    # Sentiment attributed to the speaker for this turn.
    sentiment: str = Field(description="sentiment of the speaker")
    # Text spoken during this turn.
    transcription: str = Field(description="transcription")
    
# Top-level schema that the augmented transcription reply is parsed into.
class ASROutput(BaseModel):
    # Turns of the conversation; individual entries are allowed to be None.
    output: List[Optional[TurnTranscription]] = Field(description="list turns transcriptions")

In [13]:
def complex_asr_few_shot_long(audio_path):
    """Transcribe an audio file turn by turn with speaker, gender and sentiment.

    A one-shot example (``sample_record_2.mp3`` from the kit ``data`` folder
    plus a hand-written answer) is placed before the target audio so the audio
    model sees the expected output format. The reply is parsed into
    ``ASROutput`` through an output-fixing parser backed by ``llm``.

    Args:
        audio_path: Path of the mp3 file to transcribe.

    Returns:
        The ``output`` list of the parsed ``ASROutput``.
    """
    conversation_parser = PydanticOutputParser(pydantic_object=ASROutput)
    format_instructions = conversation_parser.get_format_instructions()
    autofix_conversation_parser = OutputFixingParser.from_llm(parser=conversation_parser, llm=llm)

    b64_audio = load_encode_audio(audio_path)
    example_b64_audio = load_encode_audio(os.path.join(kit_dir, 'data', 'sample_record_2.mp3'))

    # The same instruction follows both the example audio and the target audio.
    task_prompt = f'''Please transcribe, diarize,  extract sentiment and gender of the previous audio 
            {format_instructions}
            '''

    # Plain (non-f) string, so braces are written once. The example must be
    # valid JSON matching ASROutput: enclosing object, balanced quotes and an
    # integer `speaker`, otherwise the model is shown a malformed answer.
    example_answer = '''
            ```
            {
                "output":[
                    {
                        "speaker":1,
                        "gender":"male",
                        "sentiment":"angry",
                        "transcription":"they told me you didn't do the homework, whats wrong with you?"
                    },
                    {
                        "speaker":2,
                        "gender":"female",
                        "sentiment":"neutral",
                        "transcription":"I was sick"
                    },
                    {
                        "speaker":1,
                        "gender":"male",
                        "sentiment":"angry",
                        "transcription":"dont make excuses, you are a liar"
                    }
                ]
            }
            ```
            '''

    conversation = [
        AIMessage("You are Automatic Speech Recognition tool"),
        # One-shot example: audio, instruction, expected answer.
        HumanMessage(
            content=[{
                "type": "audio_content",
                "audio_content": {
                    "content": f"data:audio/mp3;base64,{example_b64_audio}"
                }
            }]
        ),
        HumanMessage(task_prompt),
        AIMessage(example_answer),
        # Actual request: target audio followed by the same instruction.
        HumanMessage(
            content=[{
                "type": "audio_content",
                "audio_content": {
                    "content": f"data:audio/mp3;base64,{b64_audio}"
                }
            }]
        ),
        HumanMessage(task_prompt),
    ]

    chain = audio_model | autofix_conversation_parser

    return chain.invoke(conversation).output

In [14]:
# Run the augmented transcription on audio_path and display the list of parsed turns
result = complex_asr_few_shot_long(audio_path)
result

[TurnTranscription(speaker=1, gender='male', sentiment='neutral', transcription='hi who is going'),
 TurnTranscription(speaker=1, gender='male', sentiment='neutral', transcription='not bad just go back from a meeting how about you'),
 TurnTranscription(speaker=1, gender='male', sentiment='angry', transcription='im good just got some work done so'),
 TurnTranscription(speaker=2, gender='female', sentiment='neutral', transcription='what was the meeting about'),
 TurnTranscription(speaker=1, gender='male', sentiment='neutral', transcription="it was about the new project we're working on"),
 TurnTranscription(speaker=1, gender='male', sentiment='neutral', transcription='we are going to be using a new software tool'),
 TurnTranscription(speaker=2, gender='female', sentiment='neutral', transcription='oh cool'),
 TurnTranscription(speaker=1, gender='male', sentiment='neutral', transcription='i hear of that tool before'),
 TurnTranscription(speaker=1, gender='male', sentiment='neutral', transc

## Reasoning / chaining

### Audio Query
Method for querying the audio model directly about an audio file.

In [16]:
def query_audio(audio_path: str, query: str):
    """Ask the audio model a question about an audio file.

    The conversation sent to ``audio_model`` is made of an assistant persona
    message, the base64-encoded audio, and the query with a request to
    explain the response. The ``content`` of the model reply is returned.
    """
    encoded_audio = load_encode_audio(audio_path)

    persona_message = AIMessage(
        "You are helpful assistant called Scribe developed by SambaNova Systems, you are helping users in general purpose tasks"
    )
    audio_part = {
        "type": "audio_content",
        "audio_content": {"content": f"data:audio/mp3;base64,{encoded_audio}"},
    }
    question_message = HumanMessage(f'{query}, explain your response')

    reply = audio_model.invoke(
        [persona_message, HumanMessage(content=[audio_part]), question_message]
    )
    return reply.content

In [17]:
# Ask the audio model directly about the speaker's mood and display its answer
response = query_audio(audio_path,"whats the mood of the person working on the project?")
response

"The mood of the person working on the project seems to be positive and enthusiastic. This can be inferred from their casual greeting, the use of words like 'cool' and 'great', and the anticipation of getting started with the new software tool."

In [18]:
# Ask the audio model directly for an interesting fact about the conversation and display its answer
response = query_audio(audio_path,"tell me an interesting fact about the conversation?")
response

'One interesting fact is that the tools they are going to use for the new project were discussed in a meeting.'

In [19]:
# Ask the audio model directly how many people speak in the audio and display its answer
response = query_audio(audio_path,"how many people talk in the audio?")
response

'two people are talking in the audio.'

### Audio + LLM query

chained function

1- Use ASR function to get the audio transcript

2- Use query_audio function to get an intermediate response using audio model

3- Use the Llama model (`llm`) to generate a final answer from the transcription and the intermediate audio answer

In [20]:
def query_audio_pipeline(audio_path: str, query: str):
    """Answer a query about an audio file by chaining the audio model and the LLM.

    The audio is first transcribed with ``simple_asr`` and queried with
    ``query_audio``. The transcript, that intermediate answer and the query
    are then passed to ``llm``, whose reply is returned as a string.
    """
    transcription = simple_asr(audio_path)
    audio_result = query_audio(audio_path, query)

    system_prompt = '''
                      You are helpful assistant called Scribe developed by SambaNova Systems.
                      the user will ask information about an audio.
                      You will get the user query, the audio transcription and an intermediate response generated by a model capable of listening the audio
                      Whit those give a final response to the user query
                      '''
    user_prompt = f'''
            Transcript: {transcription}
            Intermediate Audio Response: {audio_result}
            Query: {query}
            '''

    messages = [SystemMessage(system_prompt), HumanMessage(user_prompt)]
    return (llm | StrOutputParser()).invoke(messages)
    

In [21]:
# Same mood question, answered through the transcription + audio model + LLM pipeline
response = query_audio_pipeline(audio_path,"whats the mood of the person working on the project?")
response

'The mood of the person working on the project appears to be positive and enthusiastic. They seem to be looking forward to learning more about the new software tool and express optimism about getting started with it quickly.'

In [22]:
# Same interesting-fact question, answered through the transcription + audio model + LLM pipeline
response = query_audio_pipeline(audio_path,"tell me an interesting fact about the conversation?")
response

'One interesting fact about the conversation is that the tools they are going to use for the new project were discussed in a meeting, specifically a new software tool that is user-friendly and expected to get them up and running quickly.'

In [23]:
# Same speaker-count question, answered through the transcription + audio model + LLM pipeline
response = query_audio_pipeline(audio_path,"how many people talk in the audio?")
response

'There are 2 people talking in the audio.'