In [6]:
# Install required packages
!pip install torchaudio --quiet
!pip install sounddevice --quiet
!pip install soundfile --quiet
!apt-get install libportaudio2

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  libportaudio2
0 upgraded, 1 newly installed, 0 to remove and 1 not upgraded.
Need to get 65.3 kB of archives.
After this operation, 223 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudio2 amd64 19.6.0-1.1 [65.3 kB]
Fetched 65.3 kB in 1s (111 kB/s)
Selecting previously unselected package libportaudio2:amd64.
(Reading database ... 117528 files and directories currently installed.)
Preparing to unpack .../libportaudio2_19.6.0-1.1_amd64.deb ...
Unpacking libportaudio2:amd64 (19.6.0-1.1) ...
Setting up libportaudio2:amd64 (19.6.0-1.1) ...
Processing triggers for libc-bin (2.35-0ubuntu3.11) ...
/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libur_adapter_level_zero.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/l

In [7]:
import os
import torchaudio

# LibriSpeech split to fetch.
# Options: "train-clean-100", "train-clean-360", "train-other-500", "dev-clean", "test-clean", etc.
subset = "train-clean-100"

# Directory the corpus is downloaded into; created up front if it is missing.
DATA_DIR = "/content/LibriSpeech"
os.makedirs(DATA_DIR, exist_ok=True)

# download=True asks torchaudio to fetch the chosen subset into DATA_DIR.
dataset = torchaudio.datasets.LIBRISPEECH(DATA_DIR, url=subset, download=True)

print(f"LibriSpeech {subset} downloaded at {DATA_DIR}")


LibriSpeech train-clean-100 downloaded at /content/LibriSpeech


In [8]:
# ==============================
# 1. IMPORTS
# ==============================
import os
from collections import Counter

import librosa
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv2D, Dense, Input, LSTM, MaxPooling2D, Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

# ==============================
# 2. PARAMETERS
# ==============================
# Sampling rate (Hz) requested from librosa whenever an audio file is loaded.
SAMPLE_RATE = 16000
# Number of MFCC coefficients per frame (rows of each feature matrix).
N_MFCC = 13
# Fixed number of MFCC frames per window: shorter matrices are zero-padded and
# longer ones truncated to this length.
# NOTE(review): presumably a 1.5 s window yields fewer than 100 frames at
# librosa's default hop, so much of each input would be padding — TODO confirm.
MAX_LEN = 100
WINDOW_DURATION = 1.5  # length of each analysis window, in seconds
WINDOW_OVERLAP = 0.5   # overlap between consecutive windows, in seconds
NUM_CLASSES = 50       # vocabulary size: the NUM_CLASSES most frequent words

# ==============================
# 3. PATHS
# ==============================
# Directory that is walked for transcript (.txt) and audio (.flac) files.
# NOTE(review): this repeats the download directory and subset name chosen in
# the download cell — keep them in sync if either changes.
LIBRI_PATH = "/content/LibriSpeech/LibriSpeech/train-clean-100"  # adjust to your downloaded subset

# ==============================
# 4. PARSE TRANSCRIPTS & COLLECT AUDIO FILES
# ==============================
# Each transcript line is parsed as "<utterance-id> <text>"; the matching audio
# is expected at "<utterance-id>.flac" in the same directory.
# file_paths[i] and sentences[i] always describe the same utterance.
file_paths = []
sentences = []

for root, dirs, files in os.walk(LIBRI_PATH):
    # os.walk yields entries in arbitrary filesystem order. Sorting dirs in
    # place (which steers the traversal) and sorting files makes the dataset
    # order, and therefore the seeded train/test split, reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".txt"):
            continue
        txt_path = os.path.join(root, file)
        # Explicit encoding so parsing does not depend on the locale default.
        with open(txt_path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split(" ", 1)
                if len(parts) < 2:
                    # Blank line, or an id with no text after it.
                    continue
                file_id, sentence = parts
                wav_file = os.path.join(root, file_id + ".flac")
                # Keep only utterances whose audio is actually on disk.
                if os.path.exists(wav_file):
                    file_paths.append(wav_file)
                    sentences.append(sentence.lower())

# ==============================
# 5. TOP 50 WORDS
# ==============================
# Split every transcript on whitespace and flatten the result into one word list.
all_text = [token for sentence in sentences for token in sentence.split()]
# The NUM_CLASSES most frequent words form the vocabulary; a word's position
# in CLASSES is its class index.
word_counts = Counter(all_text)
CLASSES = [entry[0] for entry in word_counts.most_common(NUM_CLASSES)]

def word_to_index(word):
    """Return the position of ``word`` in ``CLASSES``, or ``None`` if it is not a class."""
    try:
        return CLASSES.index(word)
    except ValueError:
        # list.index signals "not found" with ValueError.
        return None

# Report the actual vocabulary size rather than a hard-coded "50", so the
# message stays correct if NUM_CLASSES changes or the corpus has fewer words.
print(f"Top {len(CLASSES)} words:", CLASSES)

# ==============================
# 6. AUDIO PREPROCESSING
# ==============================
def preprocess_audio(file_path):
    """Load an audio file at SAMPLE_RATE and peak-normalise it.

    Args:
        file_path: path handed to ``librosa.load``.

    Returns:
        ``(audio, sr)`` on success. Audio with a non-zero peak is divided by
        its largest absolute sample; all-zero audio is returned as loaded.
        ``(None, None)`` if loading or normalising raises; the error is
        printed with a ``[SKIP]`` prefix instead of being propagated.
    """
    try:
        samples, rate = librosa.load(file_path, sr=SAMPLE_RATE)
        peak = np.max(np.abs(samples))
        if peak > 0:
            samples = samples / peak
    except Exception as err:
        print(f"[SKIP] Error loading {file_path}: {err}")
        return None, None
    return samples, rate

def extract_mfcc(audio, sr):
    """Compute an MFCC matrix whose second axis has exactly MAX_LEN columns.

    Args:
        audio: samples passed to ``librosa.feature.mfcc`` as ``y``.
        sr: sampling rate of ``audio``.

    Returns:
        The MFCC matrix (``n_mfcc=N_MFCC``), zero-padded on the right of its
        second axis, or truncated, to MAX_LEN columns.
    """
    features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=N_MFCC)
    n_frames = features.shape[1]
    if n_frames >= MAX_LEN:
        return features[:, :MAX_LEN]
    missing = MAX_LEN - n_frames
    # Pad only the frame axis, and only at the end.
    return np.pad(features, pad_width=((0, 0), (0, missing)))

# ==============================
# 7. CREATE TRAINING DATA
# ==============================
# NOTE(review): the transcripts parsed above carry no word timings, so the k-th
# in-vocabulary word of an utterance is simply paired with the k-th sliding
# window of its audio. Nothing ties a window to the moment its label is spoken.
# The logged run is consistent with uninformative labels: val_accuracy stays at
# 0.1371 from the first epoch and the live test prints only "the". Word-level
# alignments are needed before this model can learn — TODO.
X, y = [], []

# One dict lookup per word replaces the two linear scans of CLASSES
# (membership test + word_to_index) otherwise performed for every word.
class_index = {word: idx for idx, word in enumerate(CLASSES)}

for file_path, transcript in zip(file_paths, sentences):
    audio, sr = preprocess_audio(file_path)
    if audio is None:
        continue  # unreadable file; preprocess_audio already reported it

    window_samples = int(WINDOW_DURATION * sr)
    step = int((WINDOW_DURATION - WINDOW_OVERLAP) * sr)
    start_idx = 0

    for word in transcript.split():
        label = class_index.get(word)
        if label is None:
            # Out-of-vocabulary words are skipped without advancing the window.
            continue
        end_idx = start_idx + window_samples
        if end_idx > len(audio):
            break  # not enough audio left for a full window
        X.append(extract_mfcc(audio[start_idx:end_idx], sr))
        y.append(label)
        start_idx += step

# Fail here with a clear message instead of a cryptic error further down
# (e.g. in train_test_split) when the path is wrong and nothing was collected.
if not X:
    raise RuntimeError(
        f"No training windows were produced from {LIBRI_PATH}; "
        "check that the path points at the extracted LibriSpeech subset."
    )

# Add a trailing channel axis so each sample is (N_MFCC, MAX_LEN, 1) for Conv2D.
X = np.array(X)[..., np.newaxis]
y = to_categorical(y, NUM_CLASSES)
print("Dataset shape:", X.shape)

# ==============================
# 8. TRAIN/TEST SPLIT
# ==============================
# Hold out 20% of the windows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ==============================
# 9. CNN + LSTM MODEL
# ==============================
# Two Conv2D + MaxPooling2D stages over the (N_MFCC, MAX_LEN, 1) feature map,
# then an LSTM over the reshaped conv output and a softmax over the word classes.
model = Sequential([
    # Declare the input with an explicit Input layer instead of passing
    # input_shape to Conv2D, which Keras warns against for Sequential models.
    Input(shape=(N_MFCC, MAX_LEN, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    # Turn the pooled feature map into a sequence of 64-dim vectors for the
    # LSTM; -1 lets Keras infer the sequence length.
    Reshape((-1, 64)),
    LSTM(128),
    Dense(NUM_CLASSES, activation='softmax')
])

# Targets are one-hot encoded (to_categorical), hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ==============================
# 10. TRAIN MODEL
# ==============================
# Train for 20 epochs, evaluating on the held-out split after each one.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=20,
)

# ==============================
# 11. RECORD LIVE AUDIO
# ==============================
def record_live_audio(filename="live.wav", duration=10, device=None):
    """Record mono audio at SAMPLE_RATE and save it as a WAV file.

    Args:
        filename: destination path of the WAV file.
        duration: recording length in seconds.
        device: input device forwarded to ``sd.rec``.

    Returns:
        ``filename`` once the recording has been written, or ``None`` if any
        step raised (the error is printed with a ``[SKIP]`` prefix).
    """
    try:
        print(f"Recording {duration} seconds...")
        recording = sd.rec(
            int(duration * SAMPLE_RATE),
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype='float32',
            device=device,
        )
        sd.wait()  # wait for the recording to complete before saving
        write(filename, SAMPLE_RATE, recording)
        print("Recording finished:", filename)
    except Exception as err:
        print(f"[SKIP] Could not record live audio: {err}")
        return None
    return filename

# ==============================
# 12. PREDICT CONTINUOUS SPEECH
# ==============================
def sliding_windows(audio, sr, window_duration=WINDOW_DURATION, overlap=WINDOW_OVERLAP):
    """Cut ``audio`` into equal-length, overlapping windows.

    Args:
        audio: sequence of samples, sliced along its first axis.
        sr: sampling rate, used to convert the durations into sample counts.
        window_duration: window length in seconds.
        overlap: seconds shared by consecutive windows.

    Returns:
        List of slices of ``audio``, each ``int(window_duration*sr)`` samples
        long. Trailing samples that do not fill a whole window are dropped, so
        audio shorter than one window yields an empty list.
    """
    win = int(window_duration * sr)
    hop = int((window_duration - overlap) * sr)
    last_start = len(audio) - win
    return [audio[begin:begin + win] for begin in range(0, last_start + 1, hop)]

def predict_continuous_speech(file_path):
    """Classify each sliding window of an audio file and join the predicted words.

    Args:
        file_path: audio file to load with ``preprocess_audio``.

    Returns:
        Space-separated string with one entry of ``CLASSES`` per window, in
        window order. Empty string if the file could not be loaded or is
        shorter than one window.
    """
    audio, sr = preprocess_audio(file_path)
    if audio is None:
        return ""
    chunks = sliding_windows(audio, sr)
    if not chunks:
        # Audio shorter than one window: nothing to classify.
        return ""
    # Stack every window's MFCC matrix and add the channel axis so the model is
    # invoked once on the whole batch instead of once per window.
    batch = np.stack([extract_mfcc(chunk, sr) for chunk in chunks])[..., np.newaxis]
    probabilities = model.predict(batch, verbose=0)
    # Highest-scoring class per window (row).
    return " ".join(CLASSES[index] for index in np.argmax(probabilities, axis=1))

# ==============================
# 13. RUN LIVE TEST
# ==============================
# Path of a pre-recorded clip to transcribe.
live_file = '/content/audiootesting.m4a'
# A non-empty string is always truthy, so also require the file to exist;
# otherwise the fallback message below could never be reached.
if live_file and os.path.exists(live_file):
    recognized_text = predict_continuous_speech(live_file)
    print("Recognized Text:", recognized_text)
else:
    print("Live audio skipped. You can use a pre-recorded file for testing.")

Top 50 words: ['the', 'and', 'of', 'to', 'a', 'in', 'i', 'was', 'he', 'that', 'it', 'his', 'had', 'as', 'with', 'you', 'for', 'her', 'but', 'is', 'not', 'she', 'at', 'on', 'be', 'him', 'they', 'by', 'have', 'this', 'my', 'were', 'which', 'all', 'from', 'so', 'said', 'one', 'me', 'we', 'there', 'their', 'no', 'when', 'an', 'or', 'them', 'would', 'if', 'who']
Dataset shape: (322878, 13, 100, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m8072/8072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m432s[0m 53ms/step - accuracy: 0.1361 - loss: 3.5137 - val_accuracy: 0.1371 - val_loss: 3.5040
Epoch 2/20
[1m8072/8072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m421s[0m 52ms/step - accuracy: 0.1355 - loss: 3.5057 - val_accuracy: 0.1371 - val_loss: 3.5035
Epoch 3/20
[1m8072/8072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m411s[0m 51ms/step - accuracy: 0.1366 - loss: 3.5049 - val_accuracy: 0.1371 - val_loss: 3.5030
Epoch 4/20
[1m8072/8072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m421s[0m 52ms/step - accuracy: 0.1351 - loss: 3.5048 - val_accuracy: 0.1371 - val_loss: 3.5027
Epoch 5/20
[1m8072/8072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m416s[0m 51ms/step - accuracy: 0.1342 - loss: 3.5086 - val_accuracy: 0.1371 - val_loss: 3.5029
Epoch 6/20
[1m8072/8072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m409s[0m 51ms/step - accuracy: 0.1365 - loss: 3.5044 - val_accuracy: 0.1371 - val_loss: 3.502

  audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Recognized Text: the the the
