# Inferencing

## Library

In [1]:
#Library
import pandas as pd
import numpy as np
from datasets import load_dataset, Audio, DatasetDict, concatenate_datasets
import seaborn as sns
import matplotlib.pyplot as plt
import os
import librosa
from IPython.display import Audio
from scipy.signal import medfilt
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperTokenizer, WhisperFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration
#!pip install librosa soundfile (if not installed)

  from .autonotebook import tqdm as notebook_tqdm


## Set Compute Device (CPU)

In [2]:
import torch

# Inference in this notebook is pinned to the CPU on purpose.
# To use a GPU when one is present, switch to:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
# `device` is kept as a plain string so it can be passed to `.to(device)`;
# CUDA availability is reported separately instead of being packed into a tuple.
device = "cpu"
cuda_available = torch.cuda.is_available()
print(f"Using device: {device} (CUDA available: {cuda_available})")

# Query GPUtil for the GPUs it reports as available; being the last expression,
# the returned value is shown as the cell's output.
import GPUtil
GPUtil.getAvailable()

Using device: ('cpu', True)


[]

## Load Data


In [3]:
# Load the pre-split CSV manifests, one file per split.
split_csv_files = {
    'train': 'F:\\AI Portfolio Project\\Project-3\\datasets_split\\minds_traindf.csv',
    'test': 'F:\\AI Portfolio Project\\Project-3\\datasets_split\\minds_testdf.csv',
    'validation': 'F:\\AI Portfolio Project\\Project-3\\datasets_split\\minds_valdf.csv',
}
minds_us_data = load_dataset('csv', data_files=split_csv_files)

# Show which splits were actually loaded.
print("Available splits:", minds_us_data.keys())

# Re-key the splits into a single DatasetDict; 'validation' is exposed as 'valid'.
# .get() yields None (instead of raising KeyError) when a split is absent.
ds = DatasetDict({
    'train': minds_us_data['train'],
    'test': minds_us_data.get('test'),
    'valid': minds_us_data.get('validation'),
})

# Warn when the test split is missing.
if ds['test'] is None:
    print("Warning: 'test' split not found. Please check your dataset.")


Generating train split: 450 examples [00:00, 43765.64 examples/s]
Generating test split: 57 examples [00:00, 18996.85 examples/s]
Generating validation split: 56 examples [00:00, 27982.01 examples/s]

Available splits: dict_keys(['train', 'test', 'validation'])





In [4]:
# Show the splits, their columns and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent'],
        num_rows: 450
    })
    test: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent'],
        num_rows: 57
    })
    valid: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent'],
        num_rows: 56
    })
})


In [5]:
# Defining Load Audio
def load_audio_data(batch, audio_base_path):
    """Attach decoded audio to every row of a batch.

    Args:
        batch: Mapping with a 'filepath' list of paths relative to
            ``audio_base_path``.
        audio_base_path: Directory the relative file paths are joined onto.

    Returns:
        The same ``batch`` object with an added 'audio' list; each entry is a
        dict with the keys 'path', 'array' and 'sampling_rate'.
    """
    audio_entries = []
    for relative_path in batch['filepath']:
        full_path = os.path.join(audio_base_path, relative_path)
        # sr=None is passed through to librosa.load; the rate it returns is
        # stored next to the samples.
        loaded = librosa.load(full_path, sr=None)
        audio_entries.append({
            'path': full_path,
            'array': loaded[0],
            'sampling_rate': loaded[1],
        })

    batch['audio'] = audio_entries
    return batch


In [6]:
# Root directory containing the audio files referenced by the 'filepath' column.
audio_base_path = "F:\\AI Portfolio Project\\Project-3\\datasets\\MInDS-14\\audio"

# Decode the audio for every split, processing rows in batches.
loader_kwargs = {'audio_base_path': audio_base_path}
ds = ds.map(load_audio_data, batched=True, fn_kwargs=loader_kwargs)

print(ds)

Map: 100%|██████████| 450/450 [00:04<00:00, 103.40 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 153.46 examples/s]
Map: 100%|██████████| 56/56 [00:00<00:00, 142.59 examples/s]

DatasetDict({
    train: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent', 'audio'],
        num_rows: 450
    })
    test: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent', 'audio'],
        num_rows: 57
    })
    valid: Dataset({
        features: ['filepath', 'text_asr', 'text_translated', 'intent', 'audio'],
        num_rows: 56
    })
})





In [7]:
def update_audio_paths(batch):
    """Rebuild each 'audio' entry with its path joined onto ``audio_base_path``.

    Reads the module-level ``audio_base_path``. The 'array' and
    'sampling_rate' values of every entry are carried over unchanged.

    Args:
        batch: Mapping with parallel 'filepath' and 'audio' lists.

    Returns:
        The same ``batch`` object with its 'audio' list replaced.
    """
    rebuilt_entries = []
    for relative_path, clip in zip(batch['filepath'], batch['audio']):
        rebuilt_entries.append({
            'path': os.path.join(audio_base_path, relative_path),
            'array': clip['array'],
            'sampling_rate': clip['sampling_rate'],
        })

    batch['audio'] = rebuilt_entries
    return batch

# Apply the preprocessing function to the dataset
# (batched=True: update_audio_paths receives batches of rows, not single rows).
ds = ds.map(update_audio_paths, batched=True)

Map: 100%|██████████| 450/450 [00:15<00:00, 28.17 examples/s]
Map: 100%|██████████| 57/57 [00:01<00:00, 29.08 examples/s]
Map: 100%|██████████| 56/56 [00:01<00:00, 32.28 examples/s]


In [8]:
# Drop the two columns that are not used below; the remaining cells only read
# 'text_asr' and 'audio'.
ds = ds.remove_columns(["text_translated", "intent"])

In [9]:
# minds_fix is a second name for the same DatasetDict object as ds (no copy is made).
minds_fix = ds
print(minds_fix)

DatasetDict({
    train: Dataset({
        features: ['filepath', 'text_asr', 'audio'],
        num_rows: 450
    })
    test: Dataset({
        features: ['filepath', 'text_asr', 'audio'],
        num_rows: 57
    })
    valid: Dataset({
        features: ['filepath', 'text_asr', 'audio'],
        num_rows: 56
    })
})


## Setting WER (Word Error Rate) Metric

In [10]:
# Load the "wer" metric from the evaluate library; `metric` is used below
# by compute_metrics and by the inference cell.
import evaluate
metric = evaluate.load("wer")

In [11]:
def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with ignored positions
            marked as -100).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is 100 times the result of
        ``metric.compute``.

    Note:
        Relies on the module-level ``tokenizer`` and ``metric`` objects.
        NOTE(review): ``tokenizer`` is not defined in the cells visible in
        this notebook -- confirm it exists before this function is called.
    """
    pred_ids = pred.predictions

    # Replace -100 with the pad_token_id on a copy, so the caller's
    # pred.label_ids array is left untouched.
    label_ids = np.asarray(pred.label_ids)
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)

    # Decode both sides to text, dropping special tokens.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Inference Settings

In [12]:
#inferencing
import time
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Define the path to the model checkpoint (For local, using checkpoint-48)
# Raw string: the backslashes are literal path separators. Without the r-prefix,
# "\A", "\P" and "\o" are invalid escape sequences (same value, but warned about).
model_path = r"F:\AI Portfolio Project\Project-3\output"

try:
    model = WhisperForConditionalGeneration.from_pretrained(model_path)
    processor = WhisperProcessor.from_pretrained(model_path)
    print("Model and processor loaded successfully.")
except OSError as e:
    print(f"Error loading model or processor: {e}")
    # Re-raise: continuing would leave `model` / `processor` undefined and the
    # following cells would fail later with a less helpful NameError.
    raise

def transcribe(audio, language="de", task="transcribe"):
    """Transcribe one audio clip with the loaded Whisper model.

    Uses the module-level ``processor`` and ``model``.

    Args:
        audio: Mapping with an 'array' entry (the samples) and a
            'sampling_rate' entry.
        language: Language passed to ``processor.get_decoder_prompt_ids``.
            Defaults to "de", the value this notebook has always used.
            NOTE(review): the reference transcripts printed by this notebook
            are English -- confirm "de" matches how the model was fine-tuned.
        task: Task passed to ``processor.get_decoder_prompt_ids``.

    Returns:
        str: The first decoded transcription, with special tokens removed.
    """
    features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features
    # Put the inputs on whatever device the model lives on, rather than
    # hard-coding "cpu".
    input_features = features.to(model.device)

    forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)

    # Generate predictions without tracking gradients.
    with torch.no_grad():
        predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

    # Decode the predictions into text.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


Model and processor loaded successfully.


In [13]:
# Re-import Audio from datasets: the first cell imports Audio from
# IPython.display after its datasets import, which rebinds the name.
from datasets import Audio

# Cast the 'audio' column to an Audio feature declared with a 16 kHz sampling rate.
minds_fix = minds_fix.cast_column("audio", Audio(sampling_rate=16_000))

## Inferencing

In [15]:
# Run inference on five samples
num_samples = 5

# Keep every prediction/reference pair so the WER below reuses them instead of
# transcribing each sample a second time.
predictions = []
references = []

for i in range(num_samples):
    sample = minds_fix["test"][i]

    start_time = time.time()
    transcription = transcribe(sample["audio"])
    end_time = time.time()

    inference_time = end_time - start_time

    predictions.append(transcription)
    references.append(sample["text_asr"])

    print(f"Sample {i+1}:")
    print(f"Reference: {sample['text_asr']}")
    print(f"Prediction: {transcription}")
    print(f"Inference time: {inference_time:.4f} seconds")
    print()

# Calculate overall WER for these samples (the raw metric value, not scaled to percent)
wer = metric.compute(predictions=predictions, references=references)
print(f"WER for {num_samples} samples: {wer}")

Sample 1:
Reference: hi I can't use my car because it doesn't work I don't know why my payment was declined and you help
Prediction: hi I can't use my car because it doesn't work I don't know why my payment was declined and you help
Inference time: 2.8306 seconds

Sample 2:
Reference: hi I am trying to make a big payment online and it says I'll get a text message from you guys to confirm my identity so I guess I'm standing by whenever you send it I'll be
Prediction: hi I am trying to make a big payment online and it says I'll get a text message from you guys to confirm my identity so I guess I'm standing by whenever you send it I'll be here and I just tell you guys the number right all right thanks
Inference time: 3.9469 seconds

Sample 3:
Reference: transfer money to the account
Prediction: transfer money to the account
Inference time: 2.0175 seconds

Sample 4:
Reference: good morning I am going to be taking a trip to Germany shortly and wanted to know whether my bank card will work w