# Imports

In [122]:
import json, re, os, wave, contextlib
import pandas as pd

import vertexai
from vertexai import generative_models, init
from vertexai.generative_models import GenerativeModel, GenerationConfig, Part

In [123]:
# Temporary secrets manager
with open("../../../sun/secrets/configs.json", 'r') as secrets_file:
    configs = json.load(secrets_file)
    
project_id = configs.get('gcp_project_id')
location = configs.get('gcp_prjct_location')

vertexai.init(project=project_id, location=location)

# Tranlate: Gemini 1.5 Pro

In [124]:
# Initialize model
model = generative_models.GenerativeModel("gemini-1.5-flash-002")
# model = GenerativeModel("gemini-1.5-pro")

generation_config = {
    "temperature": 0.2,
    "max_output_tokens": 2048,
    "top_p": 1.0,
    "top_k": 40
}

# Transcipt: Translate

### Agent: Language Detection

In [125]:
def detect_caption_language(caption_dict: dict[str, str]) -> str:
    prompt = f"""
    You are a language detection agent.

    Given the JSON dictionary of captions (from a call), detect the language:
    - If all captions are English, respond with: "all_english"
    - If all are Spanish, respond with: "all_spanish"
    - If it's a mix, respond with: "partial_spanish"

    Captions JSON: {json.dumps(caption_dict, ensure_ascii=False)}
    """

    response = model.generate_content(prompt, generation_config=generation_config)
    detected = response.text.strip().lower()

    # Normalize
    if "all_english" in detected:
        return "all_english"
    elif "all_spanish" in detected:
        return "all_spanish"
    elif "partial_spanish" in detected:
        return "partial_spanish"
    else:
        return "unknown"

### Agent: Translation

In [126]:
def translate_caption_dict(caption_dict: dict[str, str], target_language="English") -> dict:
    prompt = f"""
    You are a highly accurate AI translator specialized in correcting faulty or mis-transcribed Spanish text.

    Translate the following **badly transcribed Spanish captions** into **fluent, grammatically correct English**.

    Some words or phrases may be:
    - Misspelled
    - Incomplete
    - Mixed with background noise artifacts

    Your job:
    - **Interpret what the speaker most likely intended to say in Spanish**
    - Use your best logical judgment to "guess and correct" transcription errors
    - Translate only the **intended meaning** into **natural English**

    --- INPUT FORMAT ---
    A JSON dictionary:
    {{
      "0": "transcribed Spanish caption",
      "1": "transcribed Spanish caption",
      ...
    }}

    --- OUTPUT FORMAT ---
    Return only a clean JSON dictionary with **English translations**, same structure:
    {{
      "0": "Fluent English translation",
      "1": "Fluent English translation",
      ...
    }}

    --- RULES ---
    1. Translate each caption individually and intelligently
    2. Output must be in **English only**
    3. No Spanish words should remain
    4. Do not include any comments, explanations, or markdown
    5. Keep the structure: {{"index": "translated caption", ...}} — exactly as shown
    6. Always return valid JSON

    --- INPUT CAPTIONS ---
    {json.dumps(caption_dict, ensure_ascii=False)}
    """


    response = model.generate_content(prompt, generation_config=generation_config)

    try:
        match = re.search(r'```json\s*([\s\S]*?)\s*```', response.text)
        if match:
            json_str = match.group(1)
        else:
            json_str = response.strip()

        return json.loads(json_str)
    except Exception as e:
        print("Translation JSON parse error:", e)
        return {}

### Extract JSON

In [127]:
def extract_json(response):
    """Extract valid JSON from response"""
    try:
        match = re.search(r'```json\s*([\s\S]*?)\s*```', response)
        if match:
            json_str = match.group(1)
        else:
            json_str = response.strip()
        return json.loads(json_str)

    except Exception as e:
        raise RuntimeError(f"extract_json() failed: {str(e)}")

In [128]:
def process_call_transcript(df_call, caption_col="CAPTION"):
    caption_dict = {str(i): v for i, v in df_call[caption_col].items()}

    lang_status = detect_caption_language(caption_dict)
    print(f"[Language] Status: {lang_status}")

    if lang_status == "all_english":
        df_call["CAPTION_EN"] = df_call[caption_col]
        return df_call

    elif lang_status in ["all_spanish", "partial_spanish"]:
        translated_dict = translate_caption_dict(caption_dict)
        df_call["CAPTION_EN"] = df_call.index.astype(str).map(translated_dict)
        return df_call

    else:
        df_call["CAPTION_EN"] = None
        return df_call

## Main

In [157]:
df_transcript = pd.DataFrame()

In [158]:
df = pd.read_csv("es_df_intra_calls.csv")
df = df[['CONTACT_ID','SPEAKER_TAG','CAPTION']]
print(len(df))
print()
print(df.groupby('CONTACT_ID').size().reset_index(name='count'))
print()
df.head(2)

208

                             CONTACT_ID  count
0  20b2738a-12f7-40aa-b0dd-e6f6c39fe931     37
1  9944cf65-823e-4c24-83bc-789a33b9d599      9
2  c937dfd1-4d37-4e8f-a7fa-6684f8ebfce8     26
3  ce245ccd-afa1-4e07-8848-4291f4c2b50b      3
4  df3baa03-7ead-4268-864d-4ffd639df1ed    133



Unnamed: 0,CONTACT_ID,SPEAKER_TAG,CAPTION
0,df3baa03-7ead-4268-864d-4ffd639df1ed,AGENT,When I started for passage.
1,df3baa03-7ead-4268-864d-4ffd639df1ed,CUSTOMER,blando porque no mega bill de los panel solare...


In [159]:
df.CONTACT_ID.unique().tolist()[0]

'df3baa03-7ead-4268-864d-4ffd639df1ed'

In [164]:
for contact_id in df.CONTACT_ID.unique().tolist():
    df_call = df[df.CONTACT_ID==contact_id]
    df_translated = process_call_transcript(df_call)

    # # Get the index mask where CONTACT_ID matches
    # mask = (df_transcript["CONTACT_ID"] == contact_id)
    # # Assign translated values
    # df_transcript.loc[mask, "CAPTION_EN"] = df_translated["CAPTION_EN"].values
    
    df_translated.columns = df_translated.columns.str.upper()

    df_transcript = pd.concat([df_transcript, df_translated], ignore_index=True)

[Language] Status: partial_spanish


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_call["CAPTION_EN"] = df_call.index.astype(str).map(translated_dict)


[Language] Status: partial_spanish


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_call["CAPTION_EN"] = df_call.index.astype(str).map(translated_dict)


[Language] Status: partial_spanish


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_call["CAPTION_EN"] = df_call.index.astype(str).map(translated_dict)


[Language] Status: partial_spanish


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_call["CAPTION_EN"] = df_call.index.astype(str).map(translated_dict)


[Language] Status: partial_spanish


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_call["CAPTION_EN"] = df_call.index.astype(str).map(translated_dict)


In [171]:
print(df_transcript.CONTACT_ID.unique())
df_transcript

['df3baa03-7ead-4268-864d-4ffd639df1ed'
 '9944cf65-823e-4c24-83bc-789a33b9d599'
 'c937dfd1-4d37-4e8f-a7fa-6684f8ebfce8'
 'ce245ccd-afa1-4e07-8848-4291f4c2b50b'
 '20b2738a-12f7-40aa-b0dd-e6f6c39fe931']


Unnamed: 0,CONTACT_ID,SPEAKER_TAG,CAPTION,CAPTION_EN
0,df3baa03-7ead-4268-864d-4ffd639df1ed,AGENT,When I started for passage.,When I started the process.
1,df3baa03-7ead-4268-864d-4ffd639df1ed,CUSTOMER,blando porque no mega bill de los panel solare...,Softly because I didn't get the mega bill for ...
2,df3baa03-7ead-4268-864d-4ffd639df1ed,AGENT,Gula el nombrelaimfor.,Gula is the name of the information.
3,df3baa03-7ead-4268-864d-4ffd639df1ed,CUSTOMER,The Adolfo Gomez in Maria Sandovalla direction...,The address is Adolfo Gomez and Maria Sandoval...
4,df3baa03-7ead-4268-864d-4ffd639df1ed,AGENT,"OK, let's confirmation.","OK, let's confirm."
...,...,...,...,...
469,20b2738a-12f7-40aa-b0dd-e6f6c39fe931,AGENT,Mhm.,Mhm.
470,20b2738a-12f7-40aa-b0dd-e6f6c39fe931,CUSTOMER,Well.,Well.
471,20b2738a-12f7-40aa-b0dd-e6f6c39fe931,AGENT,Mhm. Is there one there.,Mhm. Is there one there?
472,20b2738a-12f7-40aa-b0dd-e6f6c39fe931,CUSTOMER,Go it. We ain't getting away what I say that.,Got it. We're not getting away with what I said.


In [172]:
# Write to Excel file with a named sheet
# with pd.ExcelWriter("SpanishCalls-POC.xlsx", engine="xlsxwriter") as writer:
#     df_transcript.to_excel(writer, sheet_name="UsingTranscript", index=False)

with pd.ExcelWriter("SpanishCalls-POC.xlsx", engine="openpyxl", mode="a", if_sheet_exists="new") as writer:
    df_transcript.to_excel(writer, sheet_name="FromTranscript", index=False)

# Audio Files: Transcribe and Translate

In [25]:
import json, os
import io
from pydub import AudioSegment
from vertexai.generative_models import GenerativeModel, Part
from vertexai import init

In [5]:
def get_all_wav_files(folder_path: str) -> list[str]:
    wav_files = []
    for file in os.listdir(folder_path):
        if file.lower().endswith(".wav"):
            # full_path = os.path.join(folder_path, file)
            wav_files.append(file)
    return wav_files

In [6]:
def get_audio_duration(audio_path: str) -> float:
    with contextlib.closing(wave.open(audio_path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
    return duration

In [30]:
model = GenerativeModel("gemini-1.5-pro")

generation_config = {
    "temperature": 0.2,
    "max_output_tokens": 2048,
    "top_p": 1.0,
    "top_k": 40
}

In [110]:
def transcribe_audio_with_gemini(
    audio_path: str
) -> str:
    """
    Transcribes a full audio file using Gemini 1.5 model via Vertex AI.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        str: Transcription text in the format speaker_tag<->caption<->caption_en.
    """

    # Load and preprocess the full audio
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)

    # Export audio to in-memory buffer
    buffer = io.BytesIO()
    audio.export(buffer, format="wav")
    buffer.seek(0)

    # Prepare prompt and audio blob
    prompt = Part.from_text(
        """
        You are a Transcribing Agent with capabilities to recognize the words spoken by the speaker in the language spoken.
        You can also translate the captions to English.

        Some words or phrases may be:
        - Misspelled
        - Incomplete
        - Mixed with background noise artifacts

        Your job:
        - **Interpret what the speaker most likely intended to say in Spanish**
        - Use your best logical judgment to "guess and correct" transcription errors
        - Translate only the **intended meaning** into **natural English**

        Transcribe the spoken words from this audio, using <-> as separator: start_time<->end_time<->speaker_tag<->caption<->caption_en.
        start_time<->end_time<->Agent<->caption<->caption_en
        start_time<->end_time<->Customer<->caption<->caption_en
        start_time<->end_time<->Agent<->caption<->caption_en
        start_time<->end_time<->Customer<->caption<->caption_en
        """
    )

    audio_blob = Part.from_data(data=buffer.read(), mime_type="audio/wav")

    # Run Gemini transcription
    model = GenerativeModel("gemini-1.5-pro-preview-0409")
    try:
        response = model.generate_content([prompt, audio_blob])
        return response.text.strip()
    except Exception as e:
        print(f"Transcription failed: {e}")
        return ""

In [111]:
def convert_transcript_to_df(text_list):
    # Split each line into a list of elements
    data_list = []
    for line in text_list:
        elements = line.split("<->")
        if len(elements) == 5:
            data_list.append({
                "start_time": elements[0],
                "end_time": elements[1],
                "speaker_tag": elements[2],
                "caption": elements[3],
                "caption_en": elements[4]
            })

    # Create DataFrame from list of dictionaries
    df_transcription = pd.DataFrame(data_list)
    return df_transcription

## Main

In [115]:
wav_file_list = get_all_wav_files(os.getcwd())

df_master = pd.DataFrame()

wav_file_list

['9944cf65-823e-4c24-83bc-789a33b9d599_20250313T14_06_UTC.wav',
 'c937dfd1-4d37-4e8f-a7fa-6684f8ebfce8_20250314T22_24_UTC.wav',
 '20b2738a-12f7-40aa-b0dd-e6f6c39fe931_20250304T14_18_UTC.wav',
 'df3baa03-7ead-4268-864d-4ffd639df1ed_20250304T19_44_UTC.wav',
 'ce245ccd-afa1-4e07-8848-4291f4c2b50b_20250317T15_34_UTC.wav']

In [117]:
for audio_path in wav_file_list:
    print(audio_path)
    contact_id = audio_path.split('_')[0]
    print(f"Processing call: {contact_id}")

    transcript = transcribe_audio_with_gemini(audio_path)

    df = convert_transcript_to_df(transcript.split('\n'))
    df.columns = df.columns.str.upper()
    df.insert(0, "CONTACT_ID", contact_id)

    df_master = pd.concat([df_master, df], ignore_index=True)

9944cf65-823e-4c24-83bc-789a33b9d599_20250313T14_06_UTC.wav
Processing call: 9944cf65-823e-4c24-83bc-789a33b9d599
c937dfd1-4d37-4e8f-a7fa-6684f8ebfce8_20250314T22_24_UTC.wav
Processing call: c937dfd1-4d37-4e8f-a7fa-6684f8ebfce8
20b2738a-12f7-40aa-b0dd-e6f6c39fe931_20250304T14_18_UTC.wav
Processing call: 20b2738a-12f7-40aa-b0dd-e6f6c39fe931
df3baa03-7ead-4268-864d-4ffd639df1ed_20250304T19_44_UTC.wav
Processing call: df3baa03-7ead-4268-864d-4ffd639df1ed
ce245ccd-afa1-4e07-8848-4291f4c2b50b_20250317T15_34_UTC.wav
Processing call: ce245ccd-afa1-4e07-8848-4291f4c2b50b


In [170]:
print(df_master.shape)

(116, 6)


In [121]:
# Write to Excel file with a named sheet
with pd.ExcelWriter("SpanishCalls-POC.xlsx", engine="openpyxl", mode="a", if_sheet_exists="new") as writer:
    df_master.to_excel(writer, sheet_name="UsingAudio", index=False)

In [169]:
df_master.CONTACT_ID.unique()

array(['9944cf65-823e-4c24-83bc-789a33b9d599',
       'c937dfd1-4d37-4e8f-a7fa-6684f8ebfce8',
       '20b2738a-12f7-40aa-b0dd-e6f6c39fe931',
       'df3baa03-7ead-4268-864d-4ffd639df1ed',
       'ce245ccd-afa1-4e07-8848-4291f4c2b50b'], dtype=object)