In [None]:
%%writefile stream.py
import streamlit as st
import get_speakers as gs
import transcribe as t
import chatgpt as gpt
import os
import yt_dlp

# Streamlit front end: accept either an uploaded .wav file or a YouTube link,
# transcribe the conversation with speaker labels, and show a rewritten
# conversation suggested by ChatGPT next to the transcript.

# set the layout wide
st.set_page_config(layout="wide")

# title of the webpage
st.title("Hi! We recommend a better conversation for you!")

# get an audio file; .wav are allowed to be uploaded
audiofile = st.file_uploader("Upload an audio file! For now, only .wav is allowed", type=["wav"])

# or get a YouTube link
url = st.text_input("Or copy and paste a YouTube link!")


def _analyse(source, is_file):
    """Run the shared pipeline on one audio source.

    source  -- uploaded file object (is_file=True) or a .wav path (is_file=False)
    is_file -- forwarded to transcribe.Transcribe

    Returns (transcript, suggestion): the speaker-labelled transcript produced
    by Transcribe.get_results() and the reply from ChatGPT_part.ChatGPT().
    """
    num_speakers = gs.get_num_speakers(source)
    transcript = t.Transcribe(source, num_speakers, is_file).get_results()
    # Keep the reply in its own name: rebinding `gpt` would shadow the
    # chatgpt module alias imported at the top of this script.
    suggestion = gpt.ChatGPT_part(transcript).ChatGPT()
    return transcript, suggestion


def _show_results(transcript_title, transcript, suggestion, render_suggestion):
    """Show the transcript (left) and the suggested conversation (right).

    render_suggestion is the Streamlit call used for the right-hand column, so
    each input mode keeps the renderer it has always used.
    """
    with st.container():
        col1, col2 = st.columns(2, gap="large")

        # output the transcript of the given audio
        with col1:
            st.header(transcript_title)
            st.write(transcript)

        # get a better conversation transcription from ChatGPT
        with col2:
            st.header("Here is a better conversation you may try!")
            render_suggestion(suggestion)


has_file = audiofile is not None
has_url = bool(url)

if has_file and has_url:
    # Previously this combination matched neither branch and the page stayed
    # silent; tell the user what to do instead.
    st.warning("Please provide either an audio file or a YouTube link, not both.")

# when audio file is received
elif has_file:
    # when an audio file is given, show the name of it
    st.write(f"We got \"{audiofile.name}\" file from you. Will be right back with a better conversation!")

    result, suggestion = _analyse(audiofile, True)
    _show_results("Transcription of your file", result, suggestion, st.write)

# when a YouTube link is received
elif has_url:
    st.write(f"We got the following link: {url}. Will be right back with a better conversation!")

    # extract audio part from the given url(YouTube video)
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        "outtmpl": 'conversation',
        'postprocessors': [{  # Extract audio using ffmpeg
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
    }

    download_failed = False
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # download() takes a list of URLs and returns a non-zero code on failure
            download_failed = bool(ydl.download([url]))
    except yt_dlp.utils.DownloadError:
        download_failed = True

    if download_failed:
        st.error("Could not download the audio from the given link.")
        st.stop()

    result, suggestion = _analyse("conversation.wav", False)
    _show_results("Transcription of your link", result, suggestion, st.text)
    

In [None]:
%%writefile transcribe.py

from get_speakers import get_num_speakers
import whisper
import datetime

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Speaker-embedding model shared by every Transcribe instance; loaded once at
# import time. Use the GPU when one is available and fall back to the CPU
# otherwise, so that importing this module does not require CUDA.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Replace locale.getpreferredencoding with a stub that always reports UTF-8.
# NOTE(review): presumably a workaround for environments whose default locale
# is not UTF-8 -- confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"


class Transcribe():
    '''
    Transcribe a conversation recording and label every segment with a speaker.

    stream.py supplies the audio source and the number of speakers. The
    speaker count comes from get_speakers.get_num_speakers, which also writes
    the working copy "conversation.wav". Transcription, the duration lookup
    and the per-segment cropping all read that one working copy, so segment
    timestamps always refer to the file that is being cropped.
    '''

    # Working copy of the recording, exported by get_speakers.get_num_speakers.
    WORKING_WAV = "conversation.wav"

    # Length of one speaker-embedding vector stored per segment.
    EMBEDDING_DIM = 192

    def __init__(self, audio, num_speakers, is_file):
        '''
        audio        -- uploaded file object (is_file=True) or a path string
                        (is_file=False)
        num_speakers -- number of speakers the segments are clustered into
        is_file      -- True when `audio` is an uploaded file object
        '''
        if is_file:
            # when given an audio file
            self.audio = audio
            # Kept for callers that read it. The upload's name is only a
            # label and need not exist on disk, so audio is never read from it.
            self.path = audio.name
        else:
            # when given a link (already downloaded to a .wav path)
            self.path = audio

        self.num_speakers = num_speakers
        self.language = "any"
        self.model_size = 'large-v2'
        self._duration = None  # filled lazily by _audio_duration()

    # available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large']
    def load_whisper_model(self):
        '''Load and return the Whisper model named by self.model_size.'''
        return whisper.load_model(self.model_size)

    # execute the transcription
    def execute(self):
        '''Transcribe the working copy and return Whisper's list of segments.'''
        model = self.load_whisper_model()
        result = model.transcribe(self.WORKING_WAV)
        return result["segments"]

    def clustering(self):
        '''
        Return a fresh pyannote Audio helper used to crop segments.

        The name is historical; no clustering happens here.
        '''
        return Audio()

    def _audio_duration(self):
        '''Return the working copy's length in seconds, reading the header once.'''
        duration = getattr(self, "_duration", None)
        if duration is None:
            with contextlib.closing(wave.open(self.WORKING_WAV, 'r')) as f:
                duration = f.getnframes() / float(f.getframerate())
            self._duration = duration
        return duration

    def segment_embedding(self, segment):
        '''
        Return the speaker embedding of one Whisper segment.

        segment -- mapping with "start" and "end" timestamps in seconds
        '''
        audio = self.clustering()
        start = segment["start"]
        # Whisper overshoots the end timestamp in the last segment
        end = min(self._audio_duration(), segment["end"])
        clip = Segment(start, end)
        # Crop from the same file that was transcribed so timestamps line up.
        waveform, sample_rate = audio.crop(self.WORKING_WAV, clip)

        return embedding_model(waveform[None])

    def get_results(self):
        '''
        Transcribe the recording and return one "SPEAKER n: text" string per
        segment, in segment order. Returns an empty list when Whisper finds
        no segments.
        '''
        segments = self.execute()
        if not segments:
            return []

        # Clustering needs at least two samples and cannot produce more
        # clusters than samples, so clamp the requested speaker count.
        n_clusters = min(self.num_speakers, len(segments))

        if n_clusters < 2:
            # Zero or one speaker, or a single segment: everything gets the
            # same label and no embeddings are needed.
            labels = [0] * len(segments)
        else:
            embeddings = np.zeros(shape=(len(segments), self.EMBEDDING_DIM))
            for i, segment in enumerate(segments):
                embeddings[i] = self.segment_embedding(segment)

            embeddings = np.nan_to_num(embeddings)
            labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_

        for segment, label in zip(segments, labels):
            segment["speaker"] = 'SPEAKER ' + str(label + 1)

        return [seg["speaker"] + ": " + seg["text"] for seg in segments]

In [None]:
%%writefile chatgpt.py
import openai
# Placeholder credential: replace with a real OpenAI API key before running.
# NOTE(review): a real key should not be committed with this file.
openai.api_key = "your ChatGPT API key here"

class ChatGPT_part():
    '''
    Ask ChatGPT to analyse a speaker-labelled transcript and rewrite it.

    transcript -- iterable of strings, one per utterance
    '''

    # Korean instructions sent ahead of the transcript. They ask the model to
    # (1) guess the speakers' relationship, (2) guess the conflict in the
    # conversation, (3) improve the conversation using I-messages and
    # (4) return the improved conversation in dialogue form.
    INSTRUCTIONS = '''다음의 대화에서 1.화자들이 어떤 관계인지 추측해보고 2.대화에 어떤 갈등이 있는지 추측해봐
    3. i-message로 주어진 대화를 개선해봐 4. 개선된 대화를 대화형식으로 반환해줘.'''

    # The reply is expected to number its answers; item 4 is the rewrite.
    ANSWER_MARKER = "4."

    def __init__(self, transcript):
        self.transcript = transcript

    def _build_prompt(self):
        '''Return the instructions followed by one transcript line per row.'''
        return self.INSTRUCTIONS + "".join(line + "\n" for line in self.transcript)

    @staticmethod
    def _extract_improved_dialogue(chat_response):
        '''
        Return the text after the "4." marker in the reply.

        When the reply does not contain the marker, the whole reply is
        returned instead of raising ValueError.
        '''
        marker_at = chat_response.find(ChatGPT_part.ANSWER_MARKER)
        if marker_at == -1:
            return chat_response
        # Skip the marker plus the single character that follows it.
        return chat_response[marker_at + 3:]

    def ChatGPT(self):
        '''Send the prompt to gpt-3.5-turbo and return the improved conversation.'''
        messages = [{"role": "user", "content": self._build_prompt()}]
        completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
        chat_response = completion.choices[0].message.content
        return self._extract_improved_dialogue(chat_response)

In [None]:
%%writefile get_speakers.py
'''
Count the speakers in a conversation recording.

The recording is trimmed and saved as conversation.wav, then padded with
2 seconds of leading silence before diarization; the padding is meant to give
a better diarization result.
'''

from huggingface_hub import login
# Log in to the Hugging Face Hub at import time. The token below is a
# placeholder and has to be replaced before running.
login("your huggingface API here")

from pydub import AudioSegment
from pyannote.audio import Pipeline

# Load the pretrained speaker-diarization pipeline once, at import time.
# The authorization key should not be exposed.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token="your huggingface API here")


def get_num_speakers(original_audio, *, max_minutes=10, silence_ms=2000):
    '''
    Return the number of distinct speakers found in a .wav recording.

    original_audio -- path or file object accepted by AudioSegment.from_wav
    max_minutes    -- only this many minutes from the start are analysed
    silence_ms     -- milliseconds of silence put in front of the audio
                      before diarization

    Side effects: writes the trimmed recording to "conversation.wav" and the
    silence-padded copy to "audio.wav", and prints one line per speaker turn.
    '''
    clip_end_ms = max_minutes * 60 * 1000  # pydub slices in milliseconds

    # Keep only the leading part of the recording and store it as the working
    # copy used by the rest of the application.
    clip = AudioSegment.from_wav(original_audio)[:clip_end_ms]
    clip.export("conversation.wav", format="wav")

    # Prepend the silence to the clip that is already in memory.
    spacer = AudioSegment.silent(duration=silence_ms)
    padded = spacer.append(clip, crossfade=0)
    padded.export('audio.wav', format='wav')

    # apply pretrained pipeline
    diarization = pipeline("audio.wav")

    # Count distinct speaker labels as they are. Parsing the last two
    # characters as an integer fails on labels without a numeric suffix and
    # merges speakers once the suffix grows beyond two digits.
    speakers = set()
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
        speakers.add(speaker)

    return len(speakers)