# Silver Layer
- Conversion of m4a files to mp3 files
- Calculating audio duration in seconds
- Generating transcripts using [OpenAI Whisper](https://openai.com/index/whisper/)

In [0]:
%pip install pydub mutagen openai-whisper numpy>=1.24
dbutils.library.restartPython()

In [0]:
%run "./resources/init" 

In [0]:
from pydub import AudioSegment
import os

# Bronze table of source recordings; only its `file_path` column is read below.
file_reference_df = spark.table(f"{CATALOG}.{SCHEMA}.recordings_file_reference_bronze")

mp3_path = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/mp3_audio_recordings/"
# Create the target directory unconditionally: the conversion below has to run
# whether or not the directory already existed, so it must not be gated on the
# return value of mkdirs.
dbutils.fs.mkdirs(mp3_path)

# Convert each file to mp3 and save to the new volume
for row in file_reference_df.collect():
    file_path = row['file_path']
    # Swap only the trailing extension. Using str.replace on the basename would
    # rewrite every occurrence of the extension text, and an empty extension
    # would insert ".mp3" between every character of the name.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    new_file_path = os.path.join(mp3_path, stem + ".mp3")
    audio = AudioSegment.from_file(file_path)
    audio.export(new_file_path, format="mp3")

In [0]:
from mutagen.mp3 import MP3
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

# List the mp3 output directory and turn the listing into a DataFrame.
mp3_dir_listing = dbutils.fs.ls(f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}/mp3_audio_recordings")
mp3_file_reference_df = spark.createDataFrame(mp3_dir_listing)

# `file_path` is `path` from its 6th character onward, i.e. without its first
# five characters (presumably a "dbfs:" scheme prefix -- confirm against the listing).
mp3_file_reference_df = mp3_file_reference_df.withColumn(
    "file_path", F.expr("substring(path, 6, length(path))")
)

def get_audio_duration(file_path):
    """Return the `info.length` value mutagen reports for the MP3 at *file_path*."""
    return MP3(file_path).info.length

# Expose the duration helper to Spark as a UDF producing a FloatType column.
get_audio_duration_udf = F.udf(get_audio_duration, returnType=FloatType())

# Compute the duration per file and round it to 0 decimal places.
rounded_duration = F.round(get_audio_duration_udf("file_path"), 0)
mp3_file_reference_df = mp3_file_reference_df.withColumn("audio_duration", rounded_duration)

display(mp3_file_reference_df)

In [0]:
import whisper

# Name of the Whisper checkpoint to load
# (choose "small" for CPU, "medium" or "large" for GPU)
whisper_model_name = "small"
model = whisper.load_model(whisper_model_name)
print("Model loaded successfully!")

In [0]:
def transcribe_audio(file_path: str, model: whisper.Whisper) -> str:
    """
    Run a Whisper model over one audio file and return the recognised text.

    Args:
        file_path (str): Location of the audio file to transcribe.
        model (whisper.Whisper): Loaded Whisper model whose ``transcribe``
            method is called with ``file_path``.

    Returns:
        str: The ``"text"`` entry of the result returned by ``model.transcribe``.
    """
    return model.transcribe(file_path)["text"]

In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

def _transcribe_with_loaded_model(file_path):
    """Transcribe one file using the notebook-level Whisper `model`."""
    return transcribe_audio(file_path, model)


# Spark UDF returning the transcription as a string column.
transcribe_udf = udf(_transcribe_with_loaded_model, StringType())

# Columns kept in the result, in output order.
output_columns = ["path", "modificationTime", "file_path", "transcription", "audio_duration"]

transcriptions_df = (
    mp3_file_reference_df
    .withColumn("transcription", transcribe_udf("file_path"))
    .select(*output_columns)
)

display(transcriptions_df)

In [0]:
# existing_transcriptions_df = spark.table(f"{CATALOG}.{SCHEMA}.simulated_transcriptions")

# combined_transcriptions_df = existing_transcriptions_df.unionByName(transcriptions_df)

# display(combined_transcriptions_df)

# combined_transcriptions_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(f"{CATALOG}.{SCHEMA}.transcriptions_silver")

In [0]:
# Fully qualified name of the silver table, built once and reused below.
silver_table_name = f"{CATALOG}.{SCHEMA}.transcriptions_silver"

if spark.catalog.tableExists(silver_table_name):
    # Merge the new transcriptions into the existing rows and drop exact duplicate rows.
    transcriptions_silver_df = spark.table(silver_table_name)
    combined_transcriptions_df = transcriptions_silver_df.unionByName(transcriptions_df).dropDuplicates()
else:
    # First run: the table does not exist yet, so it is seeded from the new transcriptions only.
    combined_transcriptions_df = transcriptions_df.dropDuplicates()

combined_transcriptions_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(silver_table_name)

# Display the persisted table rather than `combined_transcriptions_df`: the
# DataFrame is lazy, so displaying it would re-evaluate the whole plan and
# re-run the Whisper transcription UDF over every file.
display(spark.table(silver_table_name))