In [None]:
# Mount Google Drive at /content/drive; the dataset directory read later in
# this notebook (data_dir) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.arpa.bin?download=true

In [None]:
!pip install python_speech_features

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
from python_speech_features import mfcc
from librosa.feature import spectral_centroid, chroma_stft
from sklearn.preprocessing import LabelEncoder
import kenlm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences

In [50]:
def extract_features(audio_file, num_mfcc=13, n_fft=2048, max_length=317):
    """Load an audio file and return a fixed-width MFCC matrix.

    Args:
        audio_file: Path of the audio file, passed to ``librosa.load``.
        num_mfcc: Number of MFCC coefficients, passed as ``n_mfcc``.
        n_fft: FFT window size passed to ``librosa.feature.mfcc``.
        max_length: Size of axis 1 of the returned matrix. Shorter inputs are
            zero-padded and longer inputs truncated. Defaults to 317, the
            value that used to be hard-coded in this function.

    Returns:
        The MFCC matrix after ``pad_or_truncate`` has forced axis 1 to
        ``max_length`` entries.
    """
    # sr=None asks librosa to keep the file's native sampling rate.
    audio, sr = librosa.load(audio_file, sr=None)

    mfcc_features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=num_mfcc, n_fft=n_fft)

    # NOTE(review): librosa documents the MFCC output as (n_mfcc, frames), so
    # axis 0 here is the coefficient axis - confirm that this is the layout
    # the downstream model expects for its (timesteps, features) input.
    # Every clip must end up with the same shape so results can be stacked.
    return pad_or_truncate(mfcc_features, max_length)

def pad_or_truncate(array, target_length):
    """Force axis 1 of a 2-D array to exactly ``target_length`` entries.

    Args:
        array: 2-D NumPy array; only axis 1 is resized.
        target_length: Desired size of axis 1.

    Returns:
        ``array`` itself when it already has the requested width, a slice of
        its leading ``target_length`` columns when it is too wide, or a copy
        right-padded with zeros when it is too narrow.
    """
    shortfall = target_length - array.shape[1]

    # Already the right width: hand back the input unchanged.
    if shortfall == 0:
        return array

    # Too wide: keep only the leading columns.
    if shortfall < 0:
        return array[:, :target_length]

    # Too narrow: append `shortfall` zero columns on the right of axis 1 only.
    return np.pad(array, pad_width=((0, 0), (0, shortfall)), mode='constant', constant_values=0)

# Function to handle OOV words
def handle_oov_words(transcript, language_model, vocabulary=None):
    """Replace out-of-vocabulary (OOV) tokens of a transcript.

    The transcript is split on whitespace. Tokens known to the language model
    are kept; each unknown token is replaced by the best-scoring candidate
    from ``generate_candidates``, or kept as-is when no candidate exists.

    Args:
        transcript: Text to process.
        language_model: Object supporting ``word in language_model`` and
            ``language_model.score(text)`` (e.g. a kenlm model).
        vocabulary: Optional iterable of replacement words, forwarded to
            ``generate_candidates``.

    Returns:
        The processed tokens joined by single spaces (so leading, trailing
        and repeated whitespace of the input is normalised away).
    """
    corrected_transcript = []

    for token in transcript.split():
        # score() returns a log-probability, which is non-zero for unknown
        # words too, so the truthiness of the score cannot detect OOV tokens.
        # Vocabulary membership is the reliable test.
        if token in language_model:
            corrected_transcript.append(token)
            continue

        candidates = generate_candidates(
            token, language_model, num_candidates=1, vocabulary=vocabulary
        )
        # With no candidates there is nothing better than the original token.
        corrected_transcript.append(candidates[0][0] if candidates else token)

    return ' '.join(corrected_transcript)

# Function to generate candidate corrections for OOV word
def generate_candidates(oov_word, language_model, num_candidates=5, vocabulary=None):
    """Rank replacement candidates for an out-of-vocabulary word.

    Args:
        oov_word: The unknown word.
        language_model: Object exposing ``score(text)``.
        num_candidates: Maximum number of candidates to return.
        vocabulary: Optional iterable of candidate words. When omitted, the
            language model itself is iterated if it supports iteration.

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score, at most
        ``num_candidates`` long. Empty when no vocabulary is available.
    """
    if vocabulary is None:
        try:
            vocabulary = iter(language_model)
        except TypeError:
            # NOTE(review): the kenlm Python model does not appear to offer a
            # way to enumerate its vocabulary; without an explicit
            # `vocabulary` there are then no candidates to offer.
            return []

    # NOTE(review): this scores the OOV word followed by the candidate, as the
    # original code did; confirm this is the intended ranking criterion.
    candidates = [
        (word, language_model.score(oov_word + ' ' + word)) for word in vocabulary
    ]

    # Stable sort: candidates with equal scores keep their vocabulary order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return candidates[:num_candidates]


In [62]:
# Directory containing audio files and transcripts
data_dir = '/content/drive/MyDrive/Capstone/TestFiles/'

# Load transcripts: every "<name>.txt" file is paired with "<name>.wav".
transcripts = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore the seeded train/test split) reproducible across runs.
for filename in sorted(os.listdir(data_dir)):
    stem, extension = os.path.splitext(filename)
    if extension != '.txt':
        continue
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
        transcripts.append({
            # Swap only the extension; str.replace would also rewrite a
            # '.txt' occurring in the middle of the name.
            'audio_file': stem + '.wav',
            # Drop the surrounding whitespace / trailing newline so that it
            # does not become part of the label text.
            'transcript': f.read().strip(),
        })

# Explicit columns keep the frame's schema stable even when no file matched.
transcripts_df = pd.DataFrame(transcripts, columns=['audio_file', 'transcript'])

In [None]:
# Fit a label encoder that maps each distinct transcript string to an integer
# class id (fit() returns the fitted encoder itself).
label_encoder = LabelEncoder().fit(transcripts_df['transcript'])

# Load the KenLM language model from the binary file under /content.
language_model = kenlm.LanguageModel('/content/en.arpa.bin')

In [None]:
# Run every raw transcript through the OOV handler and store the results in a
# new column, leaving the original 'transcript' column untouched.
transcripts_df['processed_transcript'] = [
    handle_oov_words(text, language_model) for text in transcripts_df['transcript']
]

In [51]:
# Extract features from audio files
features = [
    extract_features(os.path.join(data_dir, audio_file))
    for audio_file in transcripts_df['audio_file']
]

# Width (axis 1) of the widest feature matrix; 0 when there are no files.
max_length = max((feature.shape[1] for feature in features), default=0)
print(max_length)

X = np.array(features)

# The labels are the *processed* transcripts, so the encoder has to be fitted
# on that same column: transform() raises on any label it has not seen during
# fit, which happens as soon as preprocessing changes a transcript (even just
# its whitespace). fit_transform keeps the encoder and the labels consistent.
y = label_encoder.fit_transform(transcripts_df['processed_transcript'])

317


In [75]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the classifier layer by layer: two stacked bidirectional LSTMs followed
# by a softmax over all classes known to the label encoder.
model = Sequential()
# First recurrent layer emits the full sequence so the next LSTM can consume it.
model.add(
    Bidirectional(
        LSTM(64, return_sequences=True),
        input_shape=(X_train.shape[1], X_train.shape[2]),
    )
)
# Second recurrent layer reduces the sequence to a single vector.
model.add(Bidirectional(LSTM(32)))
# One output unit per encoded transcript class.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

In [76]:
# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs; 20% of the training data is set aside for validation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7cda519c2440>

In [56]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.8.1


In [77]:
from jiwer import wer
from sklearn.metrics import accuracy_score

# Class probabilities for the held-out clips, then the most probable class
# index for each clip.
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Decode class ids back into transcript text, as plain lists of str for jiwer.
y_pred_words = [str(text) for text in label_encoder.inverse_transform(y_pred)]
y_test_words = [str(text) for text in label_encoder.inverse_transform(y_test)]

# Word Error Rate: reference transcripts first, predictions second.
wer_score = wer(y_test_words, y_pred_words)
print("Word Error Rate (WER):", wer_score)

# Fraction of clips whose predicted class id matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Word Error Rate (WER): 0.0205761316872428
Accuracy: 0.9722222222222222


In [79]:
audio_file = "/content/sample_data/Test Audio - K T - U S - 2.wav"

# extract_features yields one 2-D matrix; model.predict expects a leading
# batch axis, so add one in front.
features = extract_features(audio_file)[np.newaxis, ...]

# Class probabilities for this single clip and the index of the best class.
y_pred_prob = model.predict(features)
y_pred = y_pred_prob.argmax(axis=1)

# Map the predicted class id back to its transcript text.
y_pred_words = label_encoder.inverse_transform(y_pred)

print(y_pred_words)


['He is a quality player.']
