# core

> Transcribe multilingual audio files with multiple speakers into a pandas DataFrame.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import os
import tempfile

import pandas as pd
import whisper
from pyannote.audio import Pipeline
from pydub import AudioSegment
# Pretrained pyannote speaker-diarization pipeline. It is created once, when
# this module is imported, and shared by every `Dictation.transcribe` call.
diarize_pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization')

ModuleNotFoundError: No module named 'whisper'

In [None]:
#| export
class Dictation:
    """Diarize an audio file and transcribe each speaker segment with Whisper.

    After `transcribe` has run, `results` holds a pandas DataFrame with one
    row per speaker segment and the columns `start`, `speaker`, `end`,
    `text` and `language`.
    """

    # An overlapping turn from another speaker must last longer than this
    # many seconds to open a new segment; shorter ones are ignored.
    min_overlap_seconds = 2.0

    def __init__(self):
        # Filled in by `transcribe`; stays None until it has been called.
        self.results = None

    @staticmethod
    def _merge_turns(turns, min_overlap_seconds=2.0):
        """Merge raw diarization turns into one segment per speaker change.

        Args:
            turns: Iterable of `(start, end, speaker)` tuples, in the order
                the diarization produced them.
            min_overlap_seconds: A turn that starts before the current
                segment has ended is only kept when it is longer than this
                and ends after the current segment does.

        Returns:
            A list of dicts with the keys `start`, `speaker` and `end`
            (in that order), one per merged segment. Empty when `turns` is
            empty.
        """
        segments = []
        for start, end, speaker in turns:
            if not segments:
                segments.append({'start': start, 'speaker': speaker, 'end': end})
                continue

            current = segments[-1]
            if speaker == current['speaker']:
                # Same speaker keeps talking: stretch the open segment.
                current['end'] = end
                continue

            if start < current['end']:
                # The new speaker talks over the current one.
                long_enough = (end - start) > min_overlap_seconds
                if not (long_enough and end > current['end']):
                    # Short interjection, or fully inside the current
                    # segment: drop it and leave the open segment untouched.
                    continue
                # Start the new segment where the overlap ends.
                start = current['end']

            segments.append({'start': start, 'speaker': speaker, 'end': end})
        return segments

    def transcribe(self, file, model="medium", task="transcribe"):
        """Diarize `file`, transcribe every speaker segment, store the table.

        Args:
            file: MP3 input, passed to `AudioSegment.from_mp3`.
            model: Whisper model name passed to `whisper.load_model`.
            task: Passed to the Whisper model's `transcribe` call as `task`.

        Returns:
            None. The result is stored in `self.results` as a DataFrame with
            the columns `start`, `speaker`, `end`, `text`, `language`; it has
            no rows when the diarization yields no turns.
        """
        audio = AudioSegment.from_mp3(file)
        transcriber = whisper.load_model(model)

        # Intermediate WAV files live in a private temporary directory so
        # nothing is left behind in (or clobbered inside) the working
        # directory.
        with tempfile.TemporaryDirectory() as workdir:
            wav_path = os.path.join(workdir, 'audio.wav')
            # `export` hands back an open file handle; close it right away.
            audio.export(wav_path, format='wav').close()

            diarization = diarize_pipeline(wav_path)
            turns = [
                (turn.start, turn.end, speaker)
                for turn, _, speaker in diarization.itertracks(yield_label=True)
            ]
            segments = self._merge_turns(turns, self.min_overlap_seconds)

            clip_path = os.path.join(workdir, 'for_transcription.wav')
            for segment in segments:
                # Segment bounds are multiplied by 1000 because pydub slices
                # in milliseconds.
                clip = audio[(segment['start'] * 1000):(segment['end'] * 1000)]
                clip.export(clip_path, format='wav').close()
                result = transcriber.transcribe(clip_path, task=task)
                segment['text'] = result['text']
                segment['language'] = result['language']

        self.results = pd.DataFrame(
            segments,
            columns=['start', 'speaker', 'end', 'text', 'language'],
        )


In [None]:
#| hide
import nbdev

# Run nbdev's notebook-to-module export.
nbdev.nbdev_export()