# Transcribe Recordings
The paid service [Goodtape](http://goodtape.io) is used for the transcriptions.

## Imports

In [1]:
# (re)load IPython's autoreload extension so that edits to imported
# modules are picked up without restarting the kernel
%reload_ext autoreload
# mode 2: reload all modules before each cell is executed
%autoreload 2

In [240]:
import requests
import pandas as pd
import os
import shutil
import io
import json
import math

from requests.utils import quote

# audio
from pydub import AudioSegment

# local imports
from lib_henryk.logger import *
from lib_henryk.config import *

## Params

In [3]:
# load api keys into the process environment via python-dotenv
from dotenv import load_dotenv
_ = load_dotenv()

# Goodtape API key; stays None when API_KEY_GOODTAPE is not defined
api_key = os.environ.get("API_KEY_GOODTAPE")

## Load Dataset

In [188]:
# Read the audio statistics table and derive the transcription log from it:
# an independent copy of the table with an (initially empty)
# 'transcription_id' column.
df = pd.read_csv(FILE_AUDIO_STATS_CSV)
df_transcription = df.assign(transcription_id='')

## Run Transcriptions

In [254]:
# Submit a batch of recordings to the Goodtape transcription API and record
# the returned transcription ids in the transcription log.
#
# Reads:  FILE_TRANSCRIPTION_CSV (if present), DIR_WIADOMOSCI_DO_HENRYCZKA, api_key
# Writes: df_transcription['transcription_id'], FILE_TRANSCRIPTION_CSV
#
# The loop stops at the first failed submission; the log is saved in every
# case so ids that were already obtained are not lost.

# re-read transcription log
if os.path.exists(FILE_TRANSCRIPTION_CSV):
    # keep the ids as text: a numeric-looking id parsed as a number would fail
    # the "is a non-empty string" check below and be submitted a second time
    df_transcription = pd.read_csv(FILE_TRANSCRIPTION_CSV, dtype={'transcription_id': str})

    # logs saved with the row index come back with 'Unnamed: N' columns,
    # one more per save/load round trip; drop them
    leaked_index_columns = [
        column for column in df_transcription.columns
        if str(column).startswith('Unnamed:')
    ]
    df_transcription = df_transcription.drop(columns=leaked_index_columns)

# set up callback
callback_url = 'https://webhook.site/d3c707b2-c8cb-44cf-97a7-a27e9cfda38c'

transcribe_url = 'https://api.goodtape.io/transcribe'

# (connect, read) timeouts in seconds, so a stalled request cannot hang the cell
request_timeout = (30, 600)

# text fields of the multipart form
form_fields = {
    'callbackUrl': callback_url,
    'languageCode': 'pl',
    # NOTE(review): hard-coded value carried over unchanged; purpose unclear
    'transcription_id': '121231312',
    'speakerLabels': 'true',
    'timeStamps': 'false',
}

try:
    # load just few records
    for index, row in df_transcription.iloc[100:150].iterrows():
        file_name = row['file']
        transcription_id = row['transcription_id']

        # only a non-empty string counts as "already submitted"
        # (missing ids come back from the CSV as NaN)
        if isinstance(transcription_id, str) and transcription_id != '':
            print(f'file [{index}]: {file_name} already submitted for transcription: {transcription_id}')
            continue

        # without a key the request would be sent with "Authorization: None"
        if not api_key:
            raise RuntimeError('API_KEY_GOODTAPE is not set, cannot submit transcriptions')

        print(f'processing [{index}]: {file_name}')

        file_path = DIR_WIADOMOSCI_DO_HENRYCZKA + '/' + row['path']
        # upload under a neutral ASCII name; the real names contain spaces and
        # non-ASCII characters
        upload_name = 'recording' + os.path.splitext(file_name)[1]

        response = None
        try:
            with open(file_path, 'rb') as audio_file:
                response = requests.post(
                    transcribe_url,
                    headers={'Authorization': api_key},
                    data=form_fields,
                    files={'audio': (upload_name, audio_file, 'application/octet-stream')},
                    timeout=request_timeout,
                )
            transcription_id = str(response.json()['transcriptionId'])
        except (OSError, requests.RequestException, ValueError, KeyError, TypeError) as error:
            # unreadable file, network failure, non-JSON body or missing id
            print(f'submission failed for [{index}]: {error!r}')
            if response is not None:
                print(response.status_code, response.text)
            break

        df_transcription.loc[index, 'transcription_id'] = transcription_id
finally:
    # save transcription log (without the row index, see above)
    df_transcription.to_csv(FILE_TRANSCRIPTION_CSV, index=False)

print('submitting transcriptions completed')

processing [100]: Henryk 2022-10-04 Kubuś Puchatek utknął w drzwiach frontowych u Królika i przyjaciele go później uratowali.m4a
processing [101]: Henryk 2022-10-06 Kubuś Puchatek z Prosiaczkiem tropią łasicę.m4a
processing [102]: Henryk 2022-10-08 Kubuś Puchatek i Prosiaczek spotykają Krzysia podczas gdy tropią zwierze.m4a
processing [103]: Henryk 2022-10-10 Kubuś Puchatek odwiedza Kłapouchego i zauważa, że ten zgubił swój ogon.m4a
processing [104]: Henryk 2022-10-12 Urodziny Henryka i bajka w której Puchatek znajduje ogon Kłapouchego.m4a
processing [105]: Henryk 2022-10-13 Kubuś Puchatek razem z Prosiaczkiem planują złapać słonia.m4a
processing [106]: Henryk 2022-10-14 Puchatek i Prosiaczek budują pułapkę na słonia.m4a
processing [107]: Henryk 2022-10-15 Puchatek i Prosiaczek chcą sprawdzić czy udało im się złapać słonia.m4a
processing [108]: Henryk 2022-10-16 Puchatek dał się złapać w pułapkę na słonie.m4a
processing [109]: Henryk 2022-10-19 Kłapouchy ozn

## Reconcile Transcriptions

In [255]:
# Match finished transcriptions to their recordings: for every record whose
# '<transcription_id>.json' exists in DIR_WHCLI, write the transcription text
# to DIR_TRANSCRIPTIONS under the recording's name and flag the record as
# processed in the transcription log.
#
# Reads:  df_transcription, DIR_WHCLI/<transcription_id>.json
# Writes: DIR_TRANSCRIPTIONS/<recording name>.txt, df_transcription['processed'],
#         FILE_TRANSCRIPTION_CSV

# create new column if needed
if 'processed' not in df_transcription.columns:
    df_transcription['processed'] = False

try:
    # process records
    for index, row in df_transcription.iloc[0:100].iterrows():
        # skip if already done (explicit comparison: a NaN flag would be truthy)
        if row['processed'] == True:
            continue

        # records that were never submitted carry no id
        transcription_id = row['transcription_id']
        if pd.isna(transcription_id) or transcription_id == '':
            continue

        # the transcription exists only once its JSON file has shown up
        transcription_id_filename = f'{DIR_WHCLI}/{transcription_id}.json'
        if not os.path.exists(transcription_id_filename):
            continue

        # load json and extract text component
        try:
            with open(transcription_id_filename, 'r', encoding='utf-8') as json_file:
                json_content = json.load(json_file)
            transcription_text = json_content['content']['text']
        except (ValueError, KeyError, TypeError) as error:
            # a malformed file must not abort the remaining records
            print(f'skipping [{index}]: cannot read {transcription_id_filename}: {error!r}')
            continue

        # write to the final destination, named after the recording;
        # explicit UTF-8 because the text contains Polish characters
        file_name_without_ext = os.path.splitext(row['file'])[0]
        transcription_filename = f'{DIR_TRANSCRIPTIONS}/{file_name_without_ext}.txt'
        with open(transcription_filename, 'w', encoding='utf-8') as text_file:
            text_file.write(transcription_text)

        # mark as done
        df_transcription.loc[index, 'processed'] = True
        print(f'processed [{index}]: {transcription_filename.split("/")[-1]}')
finally:
    # save transcription log even if a record failed; without the row index,
    # so reloading the CSV does not add an 'Unnamed: N' column
    df_transcription.to_csv(FILE_TRANSCRIPTION_CSV, index=False)

processed [0]: Henryk 2022-05-26 Bajka o nowym samochodziku.txt
processed [1]: Henryk 2022-05-27 Bajka o Eryku r1.txt
processed [2]: Henryk 2022-05-28 Bajka o swince peppie i domu rebeki.txt
processed [3]: Henryk 2022-05-30 Bajka o parku dinozaurow.txt
processed [4]: Henryk 2022-05-31 Bajka o drużynie koszykowki.txt
processed [5]: Henryk 2022-06-01 Bajka o bananach.txt
processed [6]: Henryk 2022-06-02 Bajka o kotkach i majówce.txt
processed [7]: Henryk 2022-06-03 Bajka o jeżyku cyprianie.txt
processed [8]: Henryk 2022-06-04 W starym gaju.txt
processed [9]: Henryk 2022-06-05 Bajka o okruchach słońca.txt
processed [10]: Henryk 2022-06-06 Bajka po peppie i perfumach.txt
processed [11]: Henryk 2022-06-07 Bajka o robotach drogowych.txt
processed [12]: Henryk 2022-06-08 Bajka o gimnastyce.txt
processed [13]: Henryk 2022-06-09 Bajka o peppie i misiu.txt
processed [14]: Henryk 2022-06-10 Bajka o czkawce.txt
processed [15]: Henryk 2022-06-13 Bajka o sloneczku.txt
processed [16]: Henryk 2022