<a href="https://colab.research.google.com/github/saurabh2086/whisper_use_case/blob/dev/whisper_speech_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/openai/whisper.git 

In [None]:
# on Ubuntu or Debian
!sudo apt update && sudo apt install ffmpeg

In [None]:
import whisper
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

List the available models

In [None]:
# Show every model name Whisper reports as a one-column table.
pd.DataFrame({"Models": whisper.available_models()})

Let's load the base model. (The loading itself happens inside `transcribe`, defined further down.)

# Recording live audio

In [None]:
! pip install gradio -q

In [None]:
import gradio as gr
import pandas as pd
import datetime
import spacy
from spacy.matcher import Matcher, PhraseMatcher
import time

In [None]:
def transcribe(file_name):
  """Transcribe an audio file to text with Whisper's ``base`` model.

  Args:
    file_name: Path of the audio file to transcribe. A falsy value (``None`` or
      an empty string) means there is nothing to transcribe.

  Returns:
    The ``"text"`` entry of Whisper's transcription result, or ``None`` when
    ``file_name`` is falsy.
  """
  if not file_name:
    return None

  # Loading the checkpoint is expensive, so do it on the first call only and
  # keep the model on the function object for every later call.
  model = getattr(transcribe, "_model", None)
  if model is None:
    model = whisper.load_model(name='base')
    transcribe._model = model

  return model.transcribe(file_name)["text"]

# def gather_information(df_base,customer_id,
#                        meeting_date,
#                        short_term_goals, 
#                        long_term_goals, 
#                        happy_about, 
#                        not_happy_about, 
#                        next_steps):
#   df = pd.DataFrame({'customer_ids':[customer_id],
#              'meeting_date':[meeting_date],
#              'short_term_goals':[short_term_goals], 
#              'long_term_goals':[long_term_goals],
#              'happy_about':[happy_about],
#              'not_happy_about':[not_happy_about],
#              'next_steps':[next_steps]
#   })
#   df = df_base.append(df,ignore_index=True)
#   return {rm_df: df,
#           df_out: df}

def find_information(text_transcript):
  """Split a meeting transcript into labelled report sections.

  Each section starts at the first occurrence of its heading phrase (for
  example "short term goals") and runs up to the next heading found, or to the
  end of the transcript. Sections are labelled by the phrase that opened them,
  so headings that are missing, repeated or spoken in a different order do not
  shift the labels.

  Args:
    text_transcript: The transcript text. ``None``, empty or whitespace-only
      input is accepted.

  Returns:
    A list of ``(text, label)`` tuples: first the text before any heading with
    label ``None``, then the sections found in transcript order, then an
    ``("", label)`` entry for every section whose heading was not found. Every
    label therefore appears exactly once.
  """
  # One pattern per section label. The optional "-" lets "short-term" match as
  # well as "short term"; "steps" is accepted alongside "step".
  section_patterns = {
      "ShortTermGoals": [{"LOWER": "short"}, {"ORTH": "-", "OP": "?"},
                         {"LOWER": "term"}, {"LEMMA": "goal"}],
      "LongTermGoals": [{"LOWER": "long"}, {"ORTH": "-", "OP": "?"},
                        {"LOWER": "term"}, {"LEMMA": "goal"}],
      "HappyAbout": [{"LEMMA": "be"}, {"LOWER": "happy"}, {"LOWER": "about"}],
      "NotHappyAbout": [{"LEMMA": "be"}, {"LOWER": "not"}, {"LOWER": "happy"}, {"LOWER": "about"}],
      "NextSteps": [{"LOWER": "next"}, {"LOWER": {"IN": ["step", "steps"]}}],
  }

  # Nothing to parse: return the empty report without loading spaCy.
  if not text_transcript or not text_transcript.strip():
    return [("", None)] + [("", label) for label in section_patterns]

  # Load the pipeline once and keep it on the function object for later calls.
  nlp = getattr(find_information, "_nlp", None)
  if nlp is None:
    nlp = spacy.load("en_core_web_sm")
    find_information._nlp = nlp

  doc = nlp(text_transcript)
  matcher = Matcher(nlp.vocab)
  for label, pattern in section_patterns.items():
    # Registering each pattern under its own key lets a match report its label.
    matcher.add(label, [pattern])

  # Keep only the earliest heading per label; later repeats stay in the body
  # of whichever section they fall into.
  first_start = {}
  for match_id, start, _end in matcher(doc):
    label = nlp.vocab.strings[match_id]
    if start < first_start.get(label, len(doc)):
      first_start[label] = start

  # Sections in the order they occur; each one ends where the next begins.
  ordered = sorted(first_start.items(), key=lambda item: item[1])
  stops = [start for _label, start in ordered[1:]] + [len(doc)]
  first_heading = ordered[0][1] if ordered else len(doc)

  sections = [(doc[0:first_heading].text, None)]
  for (label, start), stop in zip(ordered, stops):
    sections.append((doc[start:stop].text, label))

  # Callers index the result by label, so report missing sections as empty.
  sections.extend(("", label) for label in section_patterns if label not in first_start)
  return sections

def gather_information(df_base,customer_id,
                       customer_name,
                       meeting_date,
                       meeting_time,
                       transcription):
  """Parse a transcript and append it as one report row to the running table.

  Args:
    df_base: Table of the reports submitted so far; may be empty or ``None``.
    customer_id: Customer identifier entered in the form.
    customer_name: Customer name entered in the form.
    meeting_date: Meeting date entered in the form.
    meeting_time: Meeting time entered in the form.
    transcription: Meeting notes, split into sections by ``find_information``.

  Returns:
    A dict mapping the module-level ``rm_df`` and ``df_out`` components to the
    updated table.
  """
  # find_information yields (text, label) pairs; index the texts by label.
  information = {label: section for (section, label) in find_information(transcription)}

  df = pd.DataFrame({'customer_ids':[customer_id],
             'customer_name':[customer_name],
             'meeting_date':[meeting_date],
             'meeting_time':[meeting_time],
             'short_term_goals':[information.get("ShortTermGoals")],
             'long_term_goals':[information.get("LongTermGoals")],
             'happy_about':[information.get("HappyAbout")],
             'not_happy_about':[information.get("NotHappyAbout")],
             'next_steps':[information.get("NextSteps")],
             'updated_on': [datetime.datetime.today()]
  })
  # DataFrame.append was removed in pandas 2.0; concatenate instead. An empty
  # base table is skipped so it cannot influence the column dtypes.
  if df_base is not None and not df_base.empty:
    df = pd.concat([df_base, df], ignore_index=True)
  return {rm_df: df,
          df_out: df}



In [None]:
# Single-recording flow: dictate the whole meeting once, parse the transcript
# into sections, then submit the report to the table shown on the "Data" tab.
with gr.Blocks(theme=gr.themes.Default()) as demo:
  with gr.Tab("Record Meeting"):
    gr.Markdown("## Emp Name: Miss Nobody")
    with gr.Row():
      gr.Markdown("Emp ID: 324")
      gr.Markdown("Branch: Holland Village")
    gr.Markdown("---")
    with gr.Row():
      with gr.Column():
        customer_id = gr.Textbox(label="Customer ID",type="text", interactive=True)
        customer_name = gr.Textbox(label="Customer Name",type="text", interactive=True)
      with gr.Column():
        # Placeholders are hints only (the boxes start empty); they must be strings.
        meeting_date = gr.Textbox(placeholder=str(datetime.date.today()),label="Meeting date", type="text",interactive=True)
        meeting_time = gr.Textbox(placeholder=datetime.datetime.now().strftime("%H:%M"), label="Meeting time", type="text", interactive=True)
    # NOTE(review): `source=` is the Gradio 3.x spelling of this argument — confirm
    # against the installed Gradio version.
    audio_record = gr.Audio(source="microphone",label="Record the meeting",type="filepath")
    transcription = gr.Textbox(label="Meeting notes", type="text", interactive=True, max_lines=5)
    with gr.Row():
      clear_button = gr.Button("Clear")
      parse_button = gr.Button("Parse Report")

    with gr.Accordion("See parsed report"):
      highlight_text = gr.HighlightedText()
    submit_button = gr.Button("Submit Report")

  with gr.Tab("Data"):
    # rm_df carries the table between submissions; df_out displays it.
    rm_df = gr.State(pd.DataFrame())
    df_out = gr.Dataframe()

  # A new or changed recording is transcribed straight into the notes box.
  audio_record.change(transcribe,inputs=audio_record,outputs=transcription)
  # Clear resets the customer fields, the recording and the notes.
  clear_button.click(lambda: (gr.update(value=None), 
                              gr.update(value=None), 
                              gr.update(value=None),
                              gr.update(value=None)), 
                    None, 
                    [customer_id,
                      customer_name, 
                      audio_record, 
                      transcription])
  parse_button.click(find_information,inputs=transcription,outputs=[highlight_text])
  submit_button.click(gather_information,inputs=[rm_df,customer_id, 
                                                 customer_name,
                                                 meeting_date,
                                                 meeting_time, 
                                                 transcription],outputs=[rm_df,df_out],show_progress=True)


demo.launch(debug=False)

In [None]:
def gather_section_information(df_base, customer_id, meeting_date,
                               short_term_goals, long_term_goals,
                               happy_about, not_happy_about, next_steps):
  """Append one report, dictated section by section, to the running table.

  This form records every section separately, so it cannot use
  ``gather_information``, which takes a single transcript and a different
  number of arguments.

  Args:
    df_base: Table of the reports submitted so far; may be empty or ``None``.
    customer_id: Customer identifier entered in the form.
    meeting_date: Meeting date entered in the form.
    short_term_goals: Text of the "Short term goals" box.
    long_term_goals: Text of the "Long term goals" box.
    happy_about: Text of the "Things client is happy about" box.
    not_happy_about: Text of the "Things client is not happy about" box.
    next_steps: Text of the "Next steps ..." box.

  Returns:
    The updated table twice: once for the state component and once for the
    displayed dataframe.
  """
  df = pd.DataFrame({'customer_ids':[customer_id],
             'meeting_date':[meeting_date],
             'short_term_goals':[short_term_goals],
             'long_term_goals':[long_term_goals],
             'happy_about':[happy_about],
             'not_happy_about':[not_happy_about],
             'next_steps':[next_steps]
  })
  # DataFrame.append was removed in pandas 2.0; concatenate instead.
  if df_base is not None and not df_base.empty:
    df = pd.concat([df_base, df], ignore_index=True)
  return df, df


# Section-by-section flow: one microphone recording and one editable text box
# per report section.
with gr.Blocks(title="Voice To Text") as demo:
  # These components get their own names so that this cell does not rebind
  # `rm_df` / `df_out`, which `gather_information` looks up when it returns.
  sections_rm_df = gr.State(pd.DataFrame())
  with gr.Row():
    customer_id = gr.Textbox(label="Customer ID",type="text")
    meeting_date = gr.Textbox(value=datetime.date.today(),label="Meeting date", type="text")
  short_term_goals = gr.Markdown("## Short Term Goals")
  with gr.Row():
    input_audio1 = gr.Audio(source='microphone',label="Short Term Goals", type="filepath")
    text_output1 = gr.Textbox(label="Short term goals",interactive=True,type="text")

  long_term_goals = gr.Markdown("## Long Term Goals")
  with gr.Row():
    input_audio2 = gr.Audio(source='microphone',label="Long Term Goals", type="filepath",)
    text_output2 = gr.Textbox(label="Long term goals",interactive=True,type="text")

  happy_about = gr.Markdown("## Things Client is happy about :)")
  with gr.Row():
    input_audio3 = gr.Audio(source='microphone',label="Happy About", type="filepath",)
    text_output3 = gr.Textbox(label="Things client is happy about",interactive=True, type="text")

  not_happy_about = gr.Markdown("## Things Client is not happy about :(")
  with gr.Row():
    input_audio4 = gr.Audio(source='microphone',label="Not Happy About", type="filepath",)
    text_output4 = gr.Textbox(label="Things client is not happy about",interactive=True, type="text")

  next_steps = gr.Markdown("## Next Steps ...")
  with gr.Row():
    input_audio5 = gr.Audio(source='microphone',label="Next Steps", type="filepath",)
    text_output5 = gr.Textbox(label="Next steps ...",interactive=True, type="text")
  submit_button = gr.Button("Submit report")
  sections_df_out = gr.Dataframe()

  # Each recording is transcribed into the text box next to it.
  input_audio1.change(transcribe,inputs=input_audio1,outputs=text_output1)
  input_audio2.change(transcribe,inputs=input_audio2,outputs=text_output2)
  input_audio3.change(transcribe,inputs=input_audio3,outputs=text_output3)
  input_audio4.change(transcribe,inputs=input_audio4,outputs=text_output4)
  input_audio5.change(transcribe,inputs=input_audio5,outputs=text_output5)
  submit_button.click(gather_section_information,
                      inputs=[sections_rm_df, customer_id,meeting_date,text_output1, text_output2, text_output3, text_output4, text_output5],
                      outputs=[sections_rm_df,sections_df_out]
                      )


# NOTE(review): share=True asks Gradio for a public link — confirm that is
# intended for a form that collects customer meeting notes.
demo.launch(inbrowser=True, share=True)

In [None]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher

In [None]:
# Sample meeting transcript used to try out the section patterns in the cells below.
text = """Hi, today I met a client at the Revo City branch and we had a discussion for about one hour. 
The name of the client was Mr. XYZ and we talked about various financial aspects of the client. 
Based on our discussion the short term goals of the client were to buy a house in the near future 
and he has been saving for that for quite a long time and now he has enough money for the down payment 
and he is looking for somebody, some bank to fund the loan. The long term goals of this client to the few 
long term plans like my sending his kids to college in the US and buying a few more investment properties as well as 
he wants to invest in bond. He is also concerned about insurance so currently he has medical insurance but he 
wants to take a life insurance as well that will cover him for life. He is looking for a term for maybe around 
99 years or something. The client is happy about  with HSBC that HSBC is in multiple countries so he wants to
fund the EGA account using which he can have his money in various currencies. He also likes the network of HSBC and the 
services we provide. The thing he is not happy about the latency between fund transfers. Sometimes it takes one or 
two days for funds to reflect. He was asking if we have any plans to change that. On the next step I suggest 
that we should plan the insurance part of this client since the other plans are more merely short terms and 
for the long term plans we need to put the right insurance in place, and he requires more FX services in the future"""

In [None]:
# Load the spaCy pipeline named "en_core_web_sm".
nlp = spacy.load("en_core_web_sm")

In [None]:
# Run the loaded pipeline over the sample transcript.
doc = nlp(text)

In [None]:
# Create a token matcher on the pipeline's vocabulary; patterns are added in a later cell.
matcher = Matcher(nlp.vocab)

In [None]:
# Token patterns for the phrases that open each section of the report.
# LOWER compares the lower-cased token text; LEMMA compares the token's lemma.
short_term_pattern = [
    {"LOWER": "short"},
    {"LOWER": "term"},
    {"LEMMA": "goal"},
]
long_term_pattern = [
    {"LOWER": "long"},
    {"LOWER": "term"},
    {"LEMMA": "goal"},
]
happy_pattern = [
    {"LEMMA": "be"},
    {"LOWER": "happy"},
    {"LOWER": "about"},
]
not_happy_pattern = [
    {"LEMMA": "be"},
    {"LOWER": "not"},
    {"LOWER": "happy"},
    {"LOWER": "about"},
]
next_steps_pattern = [
    {"LOWER": "next"},
    {"LOWER": "step"},
]

In [None]:
# All section patterns, in the order the sections are expected in a transcript.
whisper_pattern = [
    short_term_pattern,
    long_term_pattern,
    happy_pattern,
    not_happy_pattern,
    next_steps_pattern,
]

In [None]:
# Register the five section patterns (collected in `whisper_pattern`) under one key.
matcher.add("WhisperPattern", whisper_pattern)

In [None]:
# Apply the matcher to the parsed transcript; a later cell unpacks each result
# as (match_id, start, end).
matches = matcher(doc)

In [None]:
# Display the raw match results.
matches

In [None]:
# Print one line per match: key hash, key name, token range and matched text.
for match_id, start, end in matches:
    span = doc[start:end]  # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # name the key hash resolves to
    print(match_id, string_id, start, end, span.text)
    

In [None]:
# Second field of the first match — the value the loop above unpacks as `start`.
matches[0][1]

In [None]:
# NOTE: this cell defines `find_information` again; an earlier cell of this
# notebook defines a function with the same name, which this one replaces.
def find_information(text_transcript):
  """Split a meeting transcript into labelled report sections.

  Each section starts at the first occurrence of its heading phrase (for
  example "short term goals") and runs up to the next heading found, or to the
  end of the transcript. Sections are labelled by the phrase that opened them,
  so headings that are missing, repeated or spoken in a different order do not
  shift the labels.

  Args:
    text_transcript: The transcript text. ``None``, empty or whitespace-only
      input is accepted.

  Returns:
    A list of ``(text, label)`` tuples: first the text before any heading with
    label ``None``, then the sections found in transcript order, then an
    ``("", label)`` entry for every section whose heading was not found. Every
    label therefore appears exactly once.
  """
  # One pattern per section label. The optional "-" lets "short-term" match as
  # well as "short term"; "steps" is accepted alongside "step".
  section_patterns = {
      "ShortTermGoals": [{"LOWER": "short"}, {"ORTH": "-", "OP": "?"},
                         {"LOWER": "term"}, {"LEMMA": "goal"}],
      "LongTermGoals": [{"LOWER": "long"}, {"ORTH": "-", "OP": "?"},
                        {"LOWER": "term"}, {"LEMMA": "goal"}],
      "HappyAbout": [{"LEMMA": "be"}, {"LOWER": "happy"}, {"LOWER": "about"}],
      "NotHappyAbout": [{"LEMMA": "be"}, {"LOWER": "not"}, {"LOWER": "happy"}, {"LOWER": "about"}],
      "NextSteps": [{"LOWER": "next"}, {"LOWER": {"IN": ["step", "steps"]}}],
  }

  # Nothing to parse: return the empty report without loading spaCy.
  if not text_transcript or not text_transcript.strip():
    return [("", None)] + [("", label) for label in section_patterns]

  # Load the pipeline once and keep it on the function object for later calls.
  nlp = getattr(find_information, "_nlp", None)
  if nlp is None:
    nlp = spacy.load("en_core_web_sm")
    find_information._nlp = nlp

  doc = nlp(text_transcript)
  matcher = Matcher(nlp.vocab)
  for label, pattern in section_patterns.items():
    # Registering each pattern under its own key lets a match report its label.
    matcher.add(label, [pattern])

  # Keep only the earliest heading per label; later repeats stay in the body
  # of whichever section they fall into.
  first_start = {}
  for match_id, start, _end in matcher(doc):
    label = nlp.vocab.strings[match_id]
    if start < first_start.get(label, len(doc)):
      first_start[label] = start

  # Sections in the order they occur; each one ends where the next begins.
  ordered = sorted(first_start.items(), key=lambda item: item[1])
  stops = [start for _label, start in ordered[1:]] + [len(doc)]
  first_heading = ordered[0][1] if ordered else len(doc)

  sections = [(doc[0:first_heading].text, None)]
  for (label, start), stop in zip(ordered, stops):
    sections.append((doc[start:stop].text, label))

  # Callers index the result by label, so report missing sections as empty.
  sections.extend(("", label) for label in section_patterns if label not in first_start)
  return sections

In [None]:
from zipfile import ZipFile

import gradio as gr


def zip_to_json(file_obj):
    """List the members of an uploaded ZIP archive.

    Args:
        file_obj: The uploaded archive, given either as an object whose
            ``name`` attribute is the archive's path on disk or as the path
            itself (``str`` or ``os.PathLike``). ``None`` (nothing uploaded)
            is accepted.

    Returns:
        One dict per archive member with its ``name``, ``file_size`` and
        ``compressed_size``; an empty list when ``file_obj`` is ``None``.
    """
    import os

    if file_obj is None:
        return []

    # Paths are used as they are; anything else is expected to carry the path
    # in `.name`. Path objects are checked first because their own `.name` is
    # only the final path component.
    if isinstance(file_obj, (str, os.PathLike)):
        archive_path = file_obj
    else:
        archive_path = file_obj.name

    with ZipFile(archive_path) as zfile:
        return [
            {
                "name": zinfo.filename,
                "file_size": zinfo.file_size,
                "compressed_size": zinfo.compress_size,
            }
            for zinfo in zfile.infolist()
        ]


# Minimal UI: upload a file and get the archive listing back as JSON.
demo = gr.Interface(fn=zip_to_json, inputs="file", outputs="json")

# Start the app only when this code runs as the main module.
if __name__ == "__main__":
    demo.launch()
