# 1. Setup and Libraries

### 1.1 Installing Libraries

In [1]:
!pip uninstall -qqy jupyterlab  # Remove unused conflicting packages
!pip install -U -q "google-genai==1.7.0"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

### 1.2 Importing Libraries

In [2]:
import pandas as pd
import os
import base64
from IPython.display import display, Markdown, HTML, Audio
import librosa
import soundfile as sf
import io

from google import genai
import vertexai
from vertexai.generative_models import GenerativeModel, Part    
from kaggle_secrets import UserSecretsClient

### 1.3 Preparing Static and Environment Variables

In [3]:
# Environment flag for Hugging Face Hub downloads; nothing visible in this
# notebook uses the Hub — NOTE(review): confirm this is still needed.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Root location of the audio clips; get_audio_and_path appends
# "<mapped domain>/<AudioFile>" to it.
BASE_URL = "/kaggle/input/audio-data/audio/audio/audio_samples"
# Translates a `Domain` value from the dataset into the folder name used under
# BASE_URL; domains that are not listed are used unchanged.
DOMAIN_MAP = {'telco': 'telecom'}

In [4]:
# Path to the service account key file within the Kaggle environment
key_path = '/kaggle/input/bpi-internship-key/bpi-internship-462819-ece668c2abd2.json'

# Fail fast if the key dataset is not attached: pointing the variable at a
# missing file would only surface later as an authentication error.
if not os.path.isfile(key_path):
    raise FileNotFoundError(f"Service account key file not found: {key_path}")

# Google client libraries read the key location from this environment variable.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path

# Echo the configured location so it is visible in the notebook output
print(f"GOOGLE_APPLICATION_CREDENTIALS set to: {os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')}")

GOOGLE_APPLICATION_CREDENTIALS set to: /kaggle/input/bpi-internship-key/bpi-internship-462819-ece668c2abd2.json


### 1.4 Helper Functions

In [5]:
def display_df(df: pd.DataFrame, max_rows: int = 300, max_height: str = "300px") -> None:
    """
    Render a preview of a DataFrame inside a bordered, scrollable HTML box.

    Args:
        df: The DataFrame to preview.
        max_rows: Number of leading rows to render (passed to ``DataFrame.head``).
            Defaults to 300 so large frames do not bloat the notebook output.
        max_height: CSS length used as the box's ``max-height``; content taller
            than this scrolls vertically. Defaults to ``"300px"``.

    Returns:
        None. The HTML is sent to the notebook via ``display``.
    """
    # The index is omitted from the rendered table.
    html = df.head(max_rows).to_html(index=False)

    # `white-space: nowrap` keeps each cell on one line; `overflow: auto`
    # adds scrollbars when the table is larger than the box.
    scrollable = f'''
    <div style="
        max-height: {max_height};
        max-width: 100%;
        overflow: auto;
        border: 1px solid #ccc;
        white-space: nowrap;
    ">
        {html}
    </div>
    '''

    display(HTML(scrollable))

In [6]:
def get_audio_and_path(df, idx):
    """
    Locate the audio clip for one row and return a player, its location and its bytes.

    The location is built as ``BASE_URL/<mapped_domain>/<AudioFile>``, where the
    row's ``Domain`` value is translated through ``DOMAIN_MAP`` (domains without
    a mapping are used unchanged).

    Args:
        df: DataFrame with 'AudioFile' and 'Domain' columns.
        idx: Index label of the row to read (used with ``df.at``).

    Returns:
        tuple: ``(player, full_path, audio_base64)`` where ``player`` is an
        IPython ``Audio`` widget, ``full_path`` is the path or URL of the clip,
        and ``audio_base64`` is the clip's content as a Base64 string.

    Raises:
        KeyError: If a required column is missing from ``df``.
        OSError: If the clip cannot be read from disk or downloaded.
    """
    # Ensure required columns exist
    if 'AudioFile' not in df.columns or 'Domain' not in df.columns:
        raise KeyError("DataFrame must contain both 'AudioFile' and 'Domain' columns")

    # Grab the raw domain and map it if needed
    raw_domain = df.at[idx, 'Domain']
    mapped_domain = DOMAIN_MAP.get(raw_domain, raw_domain)

    # Relative path to the audio file
    rel_path = df.at[idx, 'AudioFile']

    # Construct full path/URL; stripping the slashes avoids "//" at the joins
    full_path = f"{BASE_URL.rstrip('/')}/{mapped_domain}/{rel_path.lstrip('/')}"

    if BASE_URL.startswith(("http://", "https://")):
        # Remote clip: open() cannot read a URL, so download the bytes instead.
        from urllib.request import urlopen  # only needed for a remote BASE_URL

        player = Audio(url=full_path)
        with urlopen(full_path, timeout=30) as response:
            audio_bytes = response.read()
    else:
        # Local clip: read straight from disk.
        player = Audio(filename=full_path)
        with open(full_path, "rb") as f:
            audio_bytes = f.read()

    # Base64-encode the raw bytes
    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

    return player, full_path, audio_base64

# 2. Loading the Dataset

In [7]:
# Load the table of utterances (transcript text plus audio metadata) and show
# a scrollable preview; display_df renders at most the first 300 rows.
df = pd.read_csv('/kaggle/input/audio-data/synthetic_audio_bert.csv')
display_df(df)

case_id,case_text_value_type,case_text,case_source,case_type,case_transaction_type,case_status,case_priority_level,date_time_created,date_time_closed,customer_tier,RecordingId,SpeakerId,SegmentDuration,Domain,SampleRate,BitDepth,Role,IsNative,Age,Gender,Country,Accent,MainAudioFile,AudioFile
328659,Single message,"Maraming salamat sa pag tawag sa ""Bangko ng Makati"" ako po si ""Teresita"" ano pong pwedeng maitulong ko sa inyo",Phone,Deposits,Inquiry,New,Medium,2024-01-26T08:00:00+08:00,,Mid,12a95507-8727-497f-a599-a65d12c3ba42,E1EBF390-D319-46CE-96D2-1CEBC0F7396C,00:00:5.735,banking,8000,16,agent,True,33,Female,Philippines,National Capital Region (Metro Manila),Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328659.wav
328660,Single message,Teresita [filler/] may tanong lang ako tungkol sa perang nalikom ko para sa kawang-gawa,Phone,Deposits,Inquiry,New,Low,2024-01-26T08:01:00+08:00,,Low,12a95507-8727-497f-a599-a65d12c3ba42,6106C194-2F98-4C04-B0DC-666E2B3D10B9,00:00:6.024,banking,8000,16,customer,True,28,Female,Philippines,Cavite,Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328660.wav
328661,Single message,[filler/] Ganun ho ba sige ho [filler/] susubukan ko po kayong tulungan tungkol dyan [filler/] marami [filler/] maaari ko po bang ma-tanong muna kung may [eng_start/] account [eng_end/] na po ba kayo sa bangko,Phone,Deposits,Inquiry,New,Low,2024-01-26T08:02:00+08:00,,Low,12a95507-8727-497f-a599-a65d12c3ba42,E1EBF390-D319-46CE-96D2-1CEBC0F7396C,00:00:13.792,banking,8000,16,agent,True,33,Female,Philippines,National Capital Region (Metro Manila),Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328661.wav
328662,Single message,Oo meron naman na akong [eng_start/] account [eng_end/] sa inyong bangko,Phone,Deposits,Inquiry,New,Low,2024-01-26T08:03:00+08:00,,High,12a95507-8727-497f-a599-a65d12c3ba42,6106C194-2F98-4C04-B0DC-666E2B3D10B9,00:00:3.162,banking,8000,16,customer,True,28,Female,Philippines,Cavite,Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328662.wav
328663,Single message,"[filler/] Ganun ho ba pe- maaari ko po bang mahingi ang buo ninyong pangalan maraming salamat ho maaari ko rin po bang mahingi ang numero po ng [eng_start/] ""National Insurance"" [eng_end/]",Phone,Deposits,Inquiry,New,Medium,2024-01-26T08:04:00+08:00,,Low,12a95507-8727-497f-a599-a65d12c3ba42,E1EBF390-D319-46CE-96D2-1CEBC0F7396C,00:00:11.985,banking,8000,16,agent,True,33,Female,Philippines,National Capital Region (Metro Manila),Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328663.wav
328664,Single message,"Ang pangalan ko ay ""Doris Kelly""",Phone,Deposits,Request,New,High,2024-01-26T08:05:00+08:00,,High,12a95507-8727-497f-a599-a65d12c3ba42,6106C194-2F98-4C04-B0DC-666E2B3D10B9,00:00:2.928,banking,8000,16,customer,True,28,Female,Philippines,Cavite,Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328664.wav
328665,Single message,"Ang numero ng aking [eng_start/] ""National Insurance"" [eng_end/] ay [eng_start/] one three two four five six nine eight [eng_end/]",Phone,Credit Cards,Inquiry,New,Low,2024-01-26T08:06:00+08:00,,Low,12a95507-8727-497f-a599-a65d12c3ba42,6106C194-2F98-4C04-B0DC-666E2B3D10B9,00:00:5.617,banking,8000,16,customer,True,28,Female,Philippines,Cavite,Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328665.wav
328666,Single message,Maraming salamat ho [n_s/] Maaari niyo rin po bang pakiberipika sa akin ang petsa ng inyong kapanganakan,Phone,Deposits,Request,New,Low,2024-01-26T08:07:00+08:00,,Mid,12a95507-8727-497f-a599-a65d12c3ba42,E1EBF390-D319-46CE-96D2-1CEBC0F7396C,00:00:5.752,banking,8000,16,agent,True,33,Female,Philippines,National Capital Region (Metro Manila),Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328666.wav
328667,Single message,"Ang petsa ng aking kapanganakan ay [eng_start/] ""January"" twenty-seven nineteen eighty-six [eng_end/]",Phone,Deposits,Inquiry,New,High,2024-01-26T08:08:00+08:00,,Low,12a95507-8727-497f-a599-a65d12c3ba42,6106C194-2F98-4C04-B0DC-666E2B3D10B9,00:00:5.792,banking,8000,16,customer,True,28,Female,Philippines,Cavite,Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328667.wav
328668,Single message,Maraming salamat po sa pag-beberepika ng inyong [eng_start/] account [eng_end/] ngayon ho pwede ko po bang malaman kung ano po ba yung partikular na kailangan niyo pong detalye tungkol nga po dun sa organisasyong kawanggawa na gusto niyong [filler/] mang- [filler/] hiling,Phone,Deposits,Request,New,Low,2024-01-26T08:09:00+08:00,,Low,12a95507-8727-497f-a599-a65d12c3ba42,E1EBF390-D319-46CE-96D2-1CEBC0F7396C,00:00:13.536,banking,8000,16,agent,True,33,Female,Philippines,National Capital Region (Metro Manila),Audio/12a95507-8727-497f-a599-a65d12c3ba42.wav,sliced_audio/banking_328668.wav


# 3. Google [Gemini](https://ai.google.dev/api?lang=python) Pipeline
> Using `Gemini 2.5 Pro Preview 06-05`

### 3.1 Prepare Model and Credentials

In [8]:
# Model identifier passed to client.models.generate_content by the
# transcription and translation helpers below.
model = "gemini-2.5-pro-preview-06-05"

In [9]:
# Read the Gemini API key from Kaggle Secrets.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

# Validate the key itself: a constructed client object is always truthy, so
# testing `client` could never report a missing key.
if not GOOGLE_API_KEY:
    raise RuntimeError("Error Getting API Key")

client = genai.Client(api_key=GOOGLE_API_KEY)

# Note: no request has been sent yet; this only confirms a key was found.
print("Authenticated")

Authenticated


### 3.2 Voice Input

In [10]:
# Row of `df` used as the running example for the rest of section 3.
audio_idx = 124

# Player widget, file location and Base64 payload for the selected clip.
sample_audio, sample_audio_url, sample_audio_base64 = get_audio_and_path(df, audio_idx)

print("File Path: ", sample_audio_url)

display(Markdown("---\n### Audio:"))
display(sample_audio)

# Reference transcript from the dataset, for comparison with the model output.
display(Markdown("---\n### Actual Transcription:"))
display(Markdown(df.loc[audio_idx, 'case_text']))

/kaggle/input/audio-data/audio/audio/audio_samples/banking/sliced_audio/banking_330051.wav
File Path:  /kaggle/input/audio-data/audio/audio/audio_samples/banking/sliced_audio/banking_330051.wav


---
### Audio:

---
### Actual Transcription:

[n_s/] [filler/] Magandang araw po gusto ko sanang i-activate ang isang bagong [eng_start/] credit card [eng_end/]

### 3.3 Transcribe

In [11]:
def transcribe_audio(audio_b64):
    """
    Send base64-encoded WAV audio to the model and return the transcription.

    Uses the notebook-level ``client`` and ``model``.

    Args:
        audio_b64: The WAV file content as a Base64 string.

    Returns:
        str: The transcription with leading/trailing whitespace removed.

    Raises:
        RuntimeError: If the response contains no text (for example when the
            model returns no candidates), instead of failing on ``None.strip()``.
    """
    # Audio is passed inline in the request as Base64 data.
    audio_part = {
        "inlineData": {
            "mimeType": "audio/wav",
            "data": audio_b64
        }
    }
    text_part = {
        "text": (
            "You are a world-class transcription engine. "
            "Transcribe the following audio to plain text only, with no extra formatting:\n\n"
            "(Begin audio input)"
        )
    }

    resp = client.models.generate_content(
        model=model,
        contents=[audio_part, text_part]
    )

    # `resp.text` is None when the response carries no text parts.
    text = resp.text
    if text is None:
        raise RuntimeError("Model returned no text for the transcription request")
    return text.strip()


In [12]:
# Transcribe the sample clip selected in section 3.2.
transcription = transcribe_audio(sample_audio_base64)

# Show the model's transcription. The heading says "Tagalog" because the
# sample clip is Tagalog; the prompt itself does not fix the language.
display(Markdown(f"**Tagalog transcription:**  \n\n{transcription}"))

**Tagalog transcription:**  

ah magandang araw po gusto ko sanang i-activate ang isang bagong credit card

### 3.4 Translate

In [13]:
def translate_to_english(text):
    """
    Ask the model to detect the language of ``text`` and translate it into English.

    Uses the notebook-level ``client`` and ``model``.

    Args:
        text: The text to translate.

    Returns:
        str: The English translation with leading/trailing whitespace removed,
        or ``""`` when ``text`` is ``None``, empty or only whitespace (no
        request is sent in that case).

    Raises:
        RuntimeError: If the response contains no text, instead of failing on
            ``None.strip()``.
    """
    # Nothing to translate: skip the API call rather than sending a prompt
    # with no payload, which would get an arbitrary answer back.
    if text is None or not str(text).strip():
        return ""

    prompt = (
        "You are a world-class translation engine. "
        "Detect the language of the following text and translate it into English. "
        "Return ONLY the translated English text with no extra commentary:\n\n"
        f"{text}"
    )
    resp = client.models.generate_content(
        model=model,
        contents=prompt
    )

    # `resp.text` is None when the response carries no text parts.
    translated = resp.text
    if translated is None:
        raise RuntimeError("Model returned no text for the translation request")
    return translated.strip()

In [14]:
# Translate the model's transcription (not the dataset's reference transcript).
translation = translate_to_english(transcription)

# Show the English translation
display(Markdown(f"**English translation:**  \n\n{translation}"))

**English translation:**  

Hello, I would like to activate a new credit card.

### 3.5 Detect Audio Emotions

In [15]:
def predict_tone_from_audio(
    audio_file_path,
    project_id="bpi-internship-462819",
    endpoint_id="9129549610145021952",
    location="us-central1",
):
    """
    Send an audio file to a deployed, fine-tuned Gemini endpoint for tone analysis.

    Args:
        audio_file_path: Path of the audio file to analyse.
        project_id: Google Cloud project that hosts the endpoint.
        endpoint_id: ID of the Vertex AI endpoint serving the tuned model.
        location: Region of the endpoint.

    Returns:
        str | None: The model's answer with surrounding whitespace removed, or
        ``None`` if the audio could not be prepared or the response had no
        readable text (a message is printed in both cases).

    Raises:
        Exception: Errors raised by the Vertex AI request itself are not
            caught and propagate to the caller.
    """
    # --- 1. Load and prepare the audio file ---
    # Done first so an unreadable file is reported before any SDK setup.
    try:
        # sr=None asks librosa not to resample the audio.
        audio, sr = librosa.load(audio_file_path, sr=None)

        # Re-encode the samples as a WAV file held in memory
        buffer = io.BytesIO()
        sf.write(buffer, audio, sr, format='wav')

        # Wrap the WAV bytes in a Part object for the request
        audio_part = Part.from_data(data=buffer.getvalue(), mime_type="audio/wav")

    except Exception as e:
        print(f"Error processing audio file: {e}")
        return None

    # --- 2. Initialize the Vertex AI SDK ---
    vertexai.init(project=project_id, location=location)

    # --- 3. Load the fine-tuned model from the endpoint ---
    tuned_model = GenerativeModel(f"projects/{project_id}/locations/{location}/endpoints/{endpoint_id}")

    # --- 4. Prepare the prompt and make the prediction request ---
    # The structure [audio_part, text_part] mirrors the training data.
    prompt = "Detect the emotion of the tone: one of euphoric, joyfully, sad, surprised."
    response = tuned_model.generate_content([audio_part, prompt])

    # --- 5. Return the result ---
    # Reading `.text` may fail (handled below); the label is stripped so it
    # compares cleanly against the expected class names.
    try:
        return response.text.strip()
    except Exception as e:
        print(f"Failed to get text from response ({e}). Full response: {response}")
        return None

In [16]:
# Classify the tone of the sample clip; the helper takes the file path (not
# the Base64 payload) and returns None after printing a message on failure.
tone_prediction = predict_tone_from_audio(sample_audio_url)

# Only report a prediction when one was returned.
if tone_prediction:
    print(f"Predicted Tone: {tone_prediction}")

Predicted Tone: joyfully


### 3.6 Consolidated Output

In [17]:
# One-row summary comparing the dataset's reference transcript with the
# transcription, translation and tone produced above (built column by column).
result_df = pd.DataFrame({
    'Actual Transcription': [df.loc[audio_idx, 'case_text']],
    'Gemini Transcription': [transcription],
    'Gemini Translation': [translation],
    'Gemini Tone Emotion': [tone_prediction],
})

display_df(result_df)

Actual Transcription,Gemini Transcription,Gemini Translation,Gemini Tone Emotion
[n_s/] [filler/] Magandang araw po gusto ko sanang i-activate ang isang bagong [eng_start/] credit card [eng_end/],ah magandang araw po gusto ko sanang i-activate ang isang bagong credit card,"Hello, I would like to activate a new credit card.",joyfully
