<a href="https://colab.research.google.com/github/saurabh2086/whisper_use_case/blob/dev/whisper_speech_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/openai/whisper.git 

In [None]:
# on Ubuntu or Debian
!sudo apt update && sudo apt install ffmpeg

In [None]:
import whisper
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

List the available models

In [None]:
# One-column table ("Models") of the model names reported by whisper.available_models().
pd.DataFrame({"Models": whisper.available_models()})

Let's load the base model.

In [None]:
# Load the "base" checkpoint; `model` is reused by the detection/decoding cells below.
model = whisper.load_model(name='base')

Check if the model is running on the GPU. The output should be `device(type='cuda', index=0)`

In [None]:
# Bare expression so the notebook displays the device the model currently lives on.
model.device

In [None]:
from IPython.display import Audio
# Render an inline player for the sample clip (expects mary.mp3 in the working directory).
Audio("mary.mp3")

Load audio and pad/trim it to fit 30 seconds


In [None]:
# `mary` keeps the full loaded clip; `audio` is the padded/trimmed copy that is
# turned into a spectrogram in the next step.
mary = whisper.load_audio("mary.mp3")
audio = whisper.pad_or_trim(mary)

Make log-Mel spectrogram and move to the same device as the model


In [None]:
# Compute the log-Mel spectrogram and move it to the model's device so both
# are on the same device when the model consumes it.
mel = whisper.log_mel_spectrogram(audio).to(model.device)

In [None]:
# Bare expression so the notebook displays the spectrogram's dimensions.
mel.shape

Detect the spoken language

In [None]:
# detect_language returns a pair; only the second element (a mapping whose keys
# are compared by their associated value below) is used here. The first element
# is bound to a named throwaway instead of `_`, because assigning to `_` in
# IPython stops `_` from tracking the most recent cell output.
_language_tokens, probs = model.detect_language(mel)
# The key with the largest value in `probs` is reported as the detected language.
print(f"Detected language: {max(probs,key=probs.get)}")

Decode the audio

In [None]:
# Decoding options with the task set to "transcribe"; every other option keeps its default.
options = whisper.DecodingOptions(task="transcribe") 
# Decode the spectrogram computed above; the result exposes the text as `result.text`.
result = whisper.decode(model=model,mel=mel,options=options)

In [None]:
# Show the decoded text on its own line below the heading.
print(f"The Transcription : \n{result.text}")

Using the transcribe method

In [None]:
# Inline player for the second sample clip (expects daisy_HAL_9000.mp3 in the working directory).
Audio("daisy_HAL_9000.mp3")

In [None]:
# model.transcribe is given the file path directly, without the manual
# load / pad / spectrogram steps used in the cells above.
# Note: `result` is rebound here to an object indexed with ["text"], unlike the
# attribute-style `result.text` returned by whisper.decode earlier.
result = model.transcribe("daisy_HAL_9000.mp3")
print(result["text"])

# spaCy - Extracting information

# Recording live audio

In [None]:
! pip install gradio -q

In [None]:
import gradio as gr
import pandas as pd
import datetime
import time

In [None]:
import threading

# Whisper models already loaded by transcribe(), keyed by model name.
_WHISPER_MODELS = {}
# Serializes loading and inference on the shared model.
# NOTE(review): Gradio may run callbacks concurrently depending on its queue
# settings; the lock keeps two callbacks from driving the same model at once.
_WHISPER_LOCK = threading.Lock()


def transcribe(file_name):
  """Transcribe an audio file with the Whisper "base" model.

  The model is loaded on the first call and reused afterwards, instead of
  being reloaded for every recording.

  Args:
    file_name: Path of the audio file to transcribe. May be empty or None
      (presumably what the UI passes when no recording is present - confirm).

  Returns:
    The transcribed text, or None when `file_name` is empty or None.
  """
  if not file_name:
    return None
  with _WHISPER_LOCK:
    if 'base' not in _WHISPER_MODELS:
      _WHISPER_MODELS['base'] = whisper.load_model(name='base')
    return _WHISPER_MODELS['base'].transcribe(file_name)["text"]

def gather_information(df_base,customer_id,
                       meeting_date,
                       short_term_goals, 
                       long_term_goals, 
                       happy_about, 
                       not_happy_about, 
                       next_steps):
  """Append one meeting report to the table of reports collected so far.

  Args:
    df_base: DataFrame of previously submitted reports (may be empty).
    customer_id: Value stored in the 'customer_ids' column.
    meeting_date: Value stored in the 'meeting_date' column.
    short_term_goals: Value stored in the 'short_term_goals' column.
    long_term_goals: Value stored in the 'long_term_goals' column.
    happy_about: Value stored in the 'happy_about' column.
    not_happy_about: Value stored in the 'not_happy_about' column.
    next_steps: Value stored in the 'next_steps' column.

  Returns:
    A dict mapping the `rm_df` and `df_out` components to the same updated
    DataFrame (previous rows followed by the new one, re-indexed from 0).
  """
  new_row = pd.DataFrame({'customer_ids':[customer_id],
             'meeting_date':[meeting_date],
             'short_term_goals':[short_term_goals], 
             'long_term_goals':[long_term_goals],
             'happy_about':[happy_about],
             'not_happy_about':[not_happy_about],
             'next_steps':[next_steps]
  })
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
  # and likewise leaves df_base untouched.
  df = pd.concat([df_base, new_row], ignore_index=True)
  # rm_df / df_out are module-level names looked up at call time: the Gradio
  # components created in the UI cell that wires this function to a button.
  return {rm_df: df,
          df_out: df}
  

In [None]:
# Single-recording demo: employee header, customer/meeting fields, one microphone
# recording that is transcribed into an editable notes box, and a Clear button.
with gr.Blocks(theme=gr.themes.Default()) as demo:
  # State component seeded with an empty DataFrame; no event in this demo reads or writes it.
  rm_df = gr.State(pd.DataFrame())
  gr.Markdown("## Emp Name: Miss Nobody")
  with gr.Row():
    gr.Markdown("Emp ID: 324")
    gr.Markdown("Branch: Holland Village")
  gr.Markdown("---")
  with gr.Row():
    with gr.Column():
      customer_id = gr.Textbox(label="Customer ID",type="text", interactive=True)
      customer_name = gr.Textbox(label="Customer Name",type="text", interactive=True)
    with gr.Column():
      # Today's date / the current time are shown as placeholder hints only.
      # The date is converted to text explicitly since a placeholder is a string.
      meeting_date = gr.Textbox(placeholder=str(datetime.date.today()),label="Meeting date", type="text",interactive=True)
      meeting_time = gr.Textbox(placeholder=datetime.datetime.now().strftime("%H:%M"), label="Meeting time", type="text", interactive=True)
  
  with gr.Row():
    audio_record = gr.Audio(source="microphone",label="Record the meeting",type="filepath")
    transcription = gr.Textbox(label="Meeting notes", type="text", interactive=True)
  clear_button = gr.Button("Clear")

  # Whenever the recording changes, transcribe it into the notes box.
  audio_record.change(transcribe,inputs=audio_record,outputs=transcription)
  # Clear resets the customer ID, customer name, recording and notes (one update
  # per listed output); the meeting date/time fields are left untouched.
  clear_button.click(lambda: (gr.update(value=None), 
                              gr.update(value=None), 
                              gr.update(value=None),
                              gr.update(value=None)), 
                     None, 
                     [customer_id,
                      customer_name, 
                      audio_record, 
                      transcription])
demo.launch(debug=False)

In [None]:
# Meeting-report form: five microphone recordings, each transcribed into its own
# editable text box, plus a submit button that appends the report to a table.
with gr.Blocks(title="Voice To Text") as demo:
  # Reports submitted so far; passed to and returned from gather_information.
  rm_df = gr.State(pd.DataFrame())
  with gr.Row():
    customer_id = gr.Textbox(label="Customer ID",type="text")
    # Pre-filled with today's date, converted to text explicitly.
    meeting_date = gr.Textbox(value=str(datetime.date.today()),label="Meeting date", type="text")
  short_term_goals = gr.Markdown("## Short Term Goals")
  with gr.Row():
    input_audio1 = gr.Audio(source='microphone',label="Short Term Goals", type="filepath")
    text_output1 = gr.Textbox(label="Short term goals",interactive=True,type="text")
  
  long_term_goals = gr.Markdown("## Long Term Goals")
  with gr.Row():
    input_audio2 = gr.Audio(source='microphone',label="Long Term Goals", type="filepath")
    text_output2 = gr.Textbox(label="Long term goals",interactive=True,type="text")
  
  happy_about = gr.Markdown("## Things Client is happy about :)")
  with gr.Row():
    input_audio3 = gr.Audio(source='microphone',label="Happy About", type="filepath")
    text_output3 = gr.Textbox(label="Things client is happy about",interactive=True, type="text")

  # Each heading gets its own name instead of rebinding `happy_about`.
  not_happy_about = gr.Markdown("## Things Client is not happy about :(")
  with gr.Row():
    input_audio4 = gr.Audio(source='microphone',label="Not Happy About", type="filepath")
    text_output4 = gr.Textbox(label="Things client is not happy about",interactive=True, type="text")

  next_steps = gr.Markdown("## Next Steps ...")
  with gr.Row():
    input_audio5 = gr.Audio(source='microphone',label="Next Steps", type="filepath")
    text_output5 = gr.Textbox(label="Next steps ...",interactive=True, type="text")
  submit_button = gr.Button("Submit report")
  # Table that displays the accumulated reports after each submit.
  df_out = gr.Dataframe()

  # Each recording is transcribed into the text box next to it whenever it changes.
  input_audio1.change(transcribe,inputs=input_audio1,outputs=text_output1)
  input_audio2.change(transcribe,inputs=input_audio2,outputs=text_output2)
  input_audio3.change(transcribe,inputs=input_audio3,outputs=text_output3)
  input_audio4.change(transcribe,inputs=input_audio4,outputs=text_output4)
  input_audio5.change(transcribe,inputs=input_audio5,outputs=text_output5)
  # The input order must match gather_information's positional parameters.
  submit_button.click(gather_information,
                      inputs=[rm_df, customer_id,meeting_date,text_output1, text_output2, text_output3, text_output4, text_output5],
                      outputs=[rm_df,df_out]
                      )


# NOTE(review): share=True is used here without any `auth`; confirm the app is
# meant to be reachable without a login.
demo.launch(inbrowser=True, share=True)

In [None]:
# Minimal single-function UI: microphone in, transcription out. live=True re-runs
# `transcribe` when the input changes instead of waiting for a submit click.
# gr.Audio replaces the deprecated gr.inputs.Audio, matching the Blocks demos
# elsewhere in this notebook.
gr.Interface(
    title = "Whisper AI app",
    fn = transcribe,
    inputs = [gr.Audio(source="microphone", type="filepath")],
    outputs=["textbox"],
    live=True
).launch()

In [None]:
!pip install git+https://github.com/openai/whisper.git 
!sudo apt update && sudo apt install ffmpeg

import whisper
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import gradio as gr
import pandas as pd
import datetime
import time

import threading

# Whisper models already loaded by transcribe(), keyed by model name.
_WHISPER_MODELS = {}
# Serializes loading and inference on the shared model.
# NOTE(review): Gradio may run callbacks concurrently depending on its queue
# settings; the lock keeps two callbacks from driving the same model at once.
_WHISPER_LOCK = threading.Lock()


def transcribe(file_name):
  """Transcribe an audio file with the Whisper "base" model.

  The model is loaded on the first call and reused afterwards, instead of
  being reloaded for every recording.

  Args:
    file_name: Path of the audio file to transcribe. May be empty or None
      (presumably what the UI passes when no recording is present - confirm).

  Returns:
    The transcribed text, or None when `file_name` is empty or None (the
    original version passed such a value straight to the model).
  """
  if not file_name:
    return None
  with _WHISPER_LOCK:
    if 'base' not in _WHISPER_MODELS:
      _WHISPER_MODELS['base'] = whisper.load_model(name='base')
    return _WHISPER_MODELS['base'].transcribe(file_name)["text"]

def gather_information(df_base,customer_id,
                       meeting_date,
                       short_term_goals, 
                       long_term_goals, 
                       happy_about, 
                       not_happy_about, 
                       next_steps):
  """Append one meeting report to the table of reports collected so far.

  Args:
    df_base: DataFrame of previously submitted reports (may be empty).
    customer_id: Value stored in the 'customer_ids' column.
    meeting_date: Value stored in the 'meeting_date' column.
    short_term_goals: Value stored in the 'short_term_goals' column.
    long_term_goals: Value stored in the 'long_term_goals' column.
    happy_about: Value stored in the 'happy_about' column.
    not_happy_about: Value stored in the 'not_happy_about' column.
    next_steps: Value stored in the 'next_steps' column.

  Returns:
    A dict mapping the `rm_df` and `df_out` components to the same updated
    DataFrame (previous rows followed by the new one, re-indexed from 0).
  """
  new_row = pd.DataFrame({'customer_ids':[customer_id],
             'meeting_date':[meeting_date],
             'short_term_goals':[short_term_goals], 
             'long_term_goals':[long_term_goals],
             'happy_about':[happy_about],
             'not_happy_about':[not_happy_about],
             'next_steps':[next_steps]
  })
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
  # and likewise leaves df_base untouched.
  df = pd.concat([df_base, new_row], ignore_index=True)
  # rm_df / df_out are module-level names looked up at call time: the Gradio
  # components created in the Blocks layout that wires this function to a button.
  return {rm_df: df,
          df_out: df}

# Meeting-report form: five microphone recordings, each transcribed into its own
# editable text box, plus a submit button that appends the report to a table.
with gr.Blocks(title="Voice To Text") as demo:
  # Reports submitted so far; passed to and returned from gather_information.
  rm_df = gr.State(pd.DataFrame())
  with gr.Row():
    customer_id = gr.Textbox(label="Customer ID",type="text")
    # Pre-filled with today's date, converted to text explicitly.
    meeting_date = gr.Textbox(value=str(datetime.date.today()),label="Meeting date", type="text")
  short_term_goals = gr.Markdown("## Short Term Goals")
  with gr.Row():
    input_audio1 = gr.Audio(source='microphone',label="Short Term Goals", type="filepath")
    text_output1 = gr.Textbox(label="Short term goals",interactive=True,type="text")
  
  long_term_goals = gr.Markdown("## Long Term Goals")
  with gr.Row():
    input_audio2 = gr.Audio(source='microphone',label="Long Term Goals", type="filepath")
    text_output2 = gr.Textbox(label="Long term goals",interactive=True,type="text")
  
  happy_about = gr.Markdown("## Things Client is happy about :)")
  with gr.Row():
    input_audio3 = gr.Audio(source='microphone',label="Happy About", type="filepath")
    text_output3 = gr.Textbox(label="Things client is happy about",interactive=True, type="text")

  # Each heading gets its own name instead of rebinding `happy_about`.
  not_happy_about = gr.Markdown("## Things Client is not happy about :(")
  with gr.Row():
    input_audio4 = gr.Audio(source='microphone',label="Not Happy About", type="filepath")
    text_output4 = gr.Textbox(label="Things client is not happy about",interactive=True, type="text")

  next_steps = gr.Markdown("## Next Steps ...")
  with gr.Row():
    input_audio5 = gr.Audio(source='microphone',label="Next Steps", type="filepath")
    text_output5 = gr.Textbox(label="Next steps ...",interactive=True, type="text")
  submit_button = gr.Button("Submit report")
  # Table that displays the accumulated reports after each submit.
  df_out = gr.Dataframe()

  # Each recording is transcribed into the text box next to it whenever it changes.
  input_audio1.change(transcribe,inputs=input_audio1,outputs=text_output1)
  input_audio2.change(transcribe,inputs=input_audio2,outputs=text_output2)
  input_audio3.change(transcribe,inputs=input_audio3,outputs=text_output3)
  input_audio4.change(transcribe,inputs=input_audio4,outputs=text_output4)
  input_audio5.change(transcribe,inputs=input_audio5,outputs=text_output5)
  # The input order must match gather_information's positional parameters.
  submit_button.click(gather_information,
                      inputs=[rm_df, customer_id,meeting_date,text_output1, text_output2, text_output3, text_output4, text_output5],
                      outputs=[rm_df,df_out]
                      )


import os
from getpass import getpass

# The app is launched with share=True, so its login must not be hardcoded in the
# notebook. Credentials are read from the environment, falling back to an
# interactive prompt (the password prompt does not echo).
_auth_user = os.environ.get("GRADIO_AUTH_USER") or input("Gradio username: ")
_auth_password = os.environ.get("GRADIO_AUTH_PASSWORD") or getpass("Gradio password: ")
demo.launch(inbrowser=True, share=True, auth=(_auth_user, _auth_password))

