# Transcribe Recordings
The paid service [Goodtape](http://goodtape.io) is used for the transcriptions.

## Imports

In [1]:
# (Re)load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [283]:
import requests
import pandas as pd
import os
import shutil
import io
import json
import math

from requests.utils import quote

# audio
from pydub import AudioSegment

# local imports
from lib_henryk.logger import *
from lib_henryk.config import *

## Params

In [3]:
# pull secrets from a local .env file into the process environment
from dotenv import load_dotenv
_ = load_dotenv()

# Goodtape API key, read from the environment (None when the variable is unset)
api_key = os.environ.get("API_KEY_GOODTAPE")

## Load Dataset

In [188]:
# Read the audio statistics table and derive the transcription log from a copy
# of it, with an empty `transcription_id` column to be filled in on submission.
df = pd.read_csv(FILE_AUDIO_STATS_CSV)
df_transcription = df.copy().assign(transcription_id='')

## Run Transcriptions

In [259]:
# re-read transcription log
if os.path.exists(FILE_TRANSCRIPTION_CSV):
    df_transcription = pd.read_csv(FILE_TRANSCRIPTION_CSV)

# set up callback
token_id = '301e3cc2-1072-4d10-8662-75ad29978cde'
callback_url = f'https://webhook.site/{token_id}'

# load just few records
for index, row in df_transcription[35:100].iterrows():
    file_path = DIR_WIADOMOSCI_DO_HENRYCZKA + '/' + row['path']
    file_name = row['file']
    transcription_id = row['transcription_id']

    # copy file to /tmp, curl has issue with some paths
    temp_ext = file_name.split('.')[-1]
    temp_path = f'/tmp/recording.{temp_ext}'
    shutil.copy(file_path, temp_path)

    # run transcription API if needed
    if transcription_id == '' \
    or type(transcription_id) != str:
        print(f'processing [{index}]: {file_name}')
        
        response = !curl -X POST "https://api.goodtape.io/transcribe" \
        -H "Authorization: {api_key}" \
        -F "audio=@{temp_path}" \
        -F "callbackUrl={callback_url}" \
        -F "languageCode=pl" \
        -F "speakerLabels=true" \
        -F "timeStamps=false"

        try:
            response_json = json.loads(response[-1])
            transcription_id = response_json['transcriptionId']
            df_transcription.loc[index, 'transcription_id'] = transcription_id
        except:
            print(response)
            break
    else:
        print(f'file [{index}]: {file_name} already submitted for transcription: {transcription_id}')

# save transcription log
df_transcription.to_csv(FILE_TRANSCRIPTION_CSV)
print(f'submitting transcriptions completed')

file [35]: Henryk 2022-07-04 Bajka o Wiewiórce.m4a already submitted for transcription: e7f97d32-9fea-494c-9dab-a3ac57156c6b
file [36]: Henryk 2022-07-05 Bajka o pogodowym królestwie.m4a already submitted for transcription: 1a9dbe63-8252-40f4-9c67-3ee5e87b7606
file [37]: Henryk 2022-07-06 Bajka o Choince.m4a already submitted for transcription: b6391df4-9d17-4a2b-b9e6-f2325797f80c
file [38]: Henryk 2022-07-07 Bajka o Ciuchci.m4a already submitted for transcription: d2bc1452-4d68-4071-97ac-0cb85fcc89a9
processing [39]: Henryk 2022-07-08 Bajka o jelonku.m4a
processing [40]: Henryk 2022-07-11 Bajka o Filemonie i Bonifacym.m4a
processing [41]: Henryk 2022-07-12 Puchatek i pszczoły.m4a
processing [42]: Henryk 2022-07-13 Puchatek utknął.m4a
processing [43]: Henryk 2022-07-14 Ja Bonifacy.m4a
processing [44]: Henryk 2022-07-15 Nazywam sie Filemon.m4a
processing [45]: Henryk 2022-07-16 Filemon Czarne myszy.m4a
processing [46]: Henryk 2022-07-18 Jak wygląda Filemon.m4a
processing [47]: Henry

## Fetch Transcriptions

In [290]:
# create new column if needed
if 'processed' not in df_transcription.columns:
    df_transcription['processed'] = False

# Use the webhook.site API to pull the latest requests list. The list is read
# with the same `token_id` that is used for the per-request calls below.
headers = {}
r = requests.get(f'https://webhook.site/token/{token_id}/requests?sorting=newest', headers=headers)
r.raise_for_status()
webhook_requests = r.json()['data']
print(f'found {len(webhook_requests)} requests to fetch')

# process requests one by one
for i, request in enumerate(webhook_requests):
    request_url = f'https://webhook.site/token/{token_id}/request/{request["uuid"]}'

    # fetch the data
    response = requests.get(f'{request_url}/raw')
    json_content = response.json()
    transcription_id = json_content['transcription_id']
    json_filename = f'{DIR_TRANSCRIPTIONS_JSON}/{transcription_id}.json'

    # write json file
    with open(json_filename, 'w', encoding='utf-8') as out_file:
        json.dump(json_content, out_file, indent=4)

    # find the log row with this transcription id; leave the webhook request
    # in place when the id is unknown so nothing is lost
    matches = df_transcription.index[df_transcription['transcription_id'] == transcription_id]
    if len(matches) == 0:
        print(f'skipping request [{i}] {transcription_id}: not found in the transcription log')
        continue
    row_index = matches[0]

    # get the content from json and build the output file name
    file_name = df_transcription.loc[row_index, 'file']
    file_name_without_ext = os.path.splitext(file_name)[0]
    transcription_filename = f'{DIR_TRANSCRIPTIONS}/{file_name_without_ext}.txt'
    transcription_text = json_content['content']['text']

    # write to the final destination (explicit UTF-8: the text is not ASCII-only)
    with open(transcription_filename, 'w', encoding='utf-8') as file:
        file.write(transcription_text)

    # mark the matching row (not a leftover loop index) as done in the log
    df_transcription.loc[row_index, 'processed'] = True
    print(f'processed request [{i}] {transcription_id}: {transcription_filename.split("/")[-1]}')

    # clean up this request as it was fetched
    response = requests.delete(request_url)

# save transcription log; index=False avoids accumulating "Unnamed: 0" columns
df_transcription.to_csv(FILE_TRANSCRIPTION_CSV, index=False)

# print message that we are done
print(f'transcriptions were processed, there are {(df_transcription["processed"] == True).sum()} transcriptions available')

found 0 requests to fetch
transcriptions were processed, there are 39 transcriptions available


## See how we are doing