In [5]:
!pip install transformers datasets librosa soundfile numpy




In [6]:
# Attach Google Drive to the Colab VM; every project path used below lives
# underneath this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Central path configuration. Every folder/file name that later cells rely on
# is defined here so the notebook runs top-to-bottom on a fresh kernel.
PROJECT_DIR = "/content/drive/MyDrive/Projects/IntelligentSpeechTherapy_NLP"

# --- inputs -----------------------------------------------------------------
DATA_DIR = f"{PROJECT_DIR}/data"
REFERENCE_AUDIO_DIR = f"{DATA_DIR}/reference_audio"
PHONEME_AUDIO_DIR = f"{REFERENCE_AUDIO_DIR}/phonemes"

# --- outputs ----------------------------------------------------------------
# Folder that receives the stacked embedding matrix and its index JSON.
META_DIR = f"{PROJECT_DIR}/metadata"
PHONEME_METADATA = f"{META_DIR}/phoneme_audio_index.json"
# Folder that receives one <phoneme>.npy embedding per reference recording.
OUT_MODELS = f"{PROJECT_DIR}/models/embeddings"

# The extraction cell reads its .wav files from PHONEME_DIR.
# NOTE(review): assumed to be the same folder as PHONEME_AUDIO_DIR -- confirm.
PHONEME_DIR = PHONEME_AUDIO_DIR

print("Project folder:", PROJECT_DIR)


Project folder: /content/drive/MyDrive/Projects/IntelligentSpeechTherapy_NLP


In [9]:
import torch, numpy as np, soundfile as sf, os, json
import torchaudio

# Run on the GPU when CUDA is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

# Fetch the WAV2VEC2_BASE pipeline from torchaudio, move the network to the
# chosen device and switch it to evaluation mode (eval() returns the module).
bundle = torchaudio.pipelines.WAV2VEC2_BASE
model = bundle.get_model().to(DEVICE).eval()

print("Loaded torchaudio bundle: WAV2VEC2_BASE")

# helpers
def load_audio_as_array(path, target_sr=16000):
    """Read an audio file as a mono float32 waveform at ``target_sr`` Hz.

    Args:
        path: Location of the audio file; passed straight to ``soundfile.read``.
        target_sr: Sample rate (Hz) the returned waveform should have.

    Returns:
        A ``(data, sr)`` tuple: ``data`` is a 1-D float32 array and ``sr`` is
        the sample rate of ``data`` (equal to ``target_sr`` after resampling).
    """
    data, sr = sf.read(path, dtype='float32')

    # Downmix BEFORE resampling. soundfile returns multi-channel audio as
    # (frames, channels) while librosa.resample works along the last axis by
    # default, so resampling the 2-D array would stretch the channel axis
    # instead of the time axis.
    if data.ndim > 1:
        data = data.mean(axis=1)

    if sr != target_sr:
        # Imported lazily: librosa is only needed when the rates differ.
        import librosa
        # Keyword arguments are required by librosa >= 0.10 and are also
        # accepted by older releases.
        data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    return data, sr

def wav2vec_embedding_torchaudio(audio_array, sr=16000):
    """Turn a waveform into one fixed-size vector using the global ``model``.

    The waveform is moved to ``DEVICE``, passed through
    ``model.extract_features``, and the last entry of the returned feature
    list is averaged over dim 1 (the T axis of the [1, T, D] output).

    Args:
        audio_array: NumPy waveform; a 1-D array gets a leading batch axis.
        sr: Sample rate of ``audio_array``. Accepted for call-site symmetry
            but not used inside this function.

    Returns:
        NumPy array holding the pooled embedding, i.e. (D,) for one clip.
    """
    waveform = torch.from_numpy(audio_array).float().to(DEVICE)
    # The model wants a batch axis in front: (samples,) -> (1, samples).
    waveform = waveform.unsqueeze(0) if waveform.dim() == 1 else waveform

    with torch.no_grad():
        layer_outputs, _ = model.extract_features(waveform)
        final_layer = layer_outputs[-1]                # [1, T, D]
        pooled = final_layer.mean(dim=1).squeeze()     # (D,)

    return pooled.cpu().numpy()


Using device: cuda
Loaded torchaudio bundle: WAV2VEC2_BASE


In [15]:
# Collect the reference recordings. Each file stem is used as the phoneme
# label, and the sorted order fixes the row order of the embedding matrix.
files = sorted([f for f in os.listdir(PHONEME_DIR) if f.lower().endswith(".wav")])
print("Processing", len(files), "files")
if not files:
    # Fail here with a clear message; otherwise np.stack below raises an
    # opaque "need at least one array to stack" error.
    raise FileNotFoundError(f"No .wav files found in {PHONEME_DIR}")

# np.save and open(..., "w") do not create missing parent folders.
os.makedirs(OUT_MODELS, exist_ok=True)
os.makedirs(META_DIR, exist_ok=True)

embeddings = []
index_map = {}   # phoneme label -> row index in the stacked matrix

for i, fname in enumerate(files):
    phoneme = os.path.splitext(fname)[0]
    path = os.path.join(PHONEME_DIR, fname)
    print(f"[{i+1}/{len(files)}] {phoneme}")
    audio, sr = load_audio_as_array(path, target_sr=16000)
    emb = wav2vec_embedding_torchaudio(audio, sr=sr)
    embeddings.append(emb)
    index_map[phoneme] = i
    # optional: save per-phoneme numpy
    np.save(os.path.join(OUT_MODELS, f"{phoneme}.npy"), emb)

# Stack the per-phoneme vectors row-wise and persist matrix + label index.
emb_matrix = np.stack(embeddings, axis=0)  # (N, D)
OUT_EMB = os.path.join(META_DIR, "phoneme_embeddings.npy")
OUT_IDX = os.path.join(META_DIR, "phoneme_embeddings_index.json")

np.save(OUT_EMB, emb_matrix)
with open(OUT_IDX, "w") as f:
    json.dump(index_map, f, indent=2)

print("Saved embeddings matrix to:", OUT_EMB)
print("Saved index map to:", OUT_IDX)
print("Embeddings shape:", emb_matrix.shape)


Processing 42 files
[1/42] AA1
[2/42] AE1
[3/42] AH0
[4/42] AH1
[5/42] AO1
[6/42] AW1
[7/42] AY1
[8/42] B
[9/42] CH
[10/42] D
[11/42] DH
[12/42] EH0
[13/42] EH1
[14/42] ER0
[15/42] ER1
[16/42] F
[17/42] G
[18/42] HH
[19/42] IH0
[20/42] IH1
[21/42] IY0
[22/42] IY1
[23/42] JH
[24/42] K
[25/42] L
[26/42] M
[27/42] N
[28/42] NG
[29/42] OY1
[30/42] P
[31/42] R
[32/42] S
[33/42] SH
[34/42] T
[35/42] TH
[36/42] UH1
[37/42] UW1
[38/42] V
[39/42] W
[40/42] Y
[41/42] Z
[42/42] ZH
Saved embeddings matrix to: /content/drive/MyDrive/Projects/IntelligentSpeechTherapy_NLP/metadata/phoneme_embeddings.npy
Saved index map to: /content/drive/MyDrive/Projects/IntelligentSpeechTherapy_NLP/metadata/phoneme_embeddings_index.json
Embeddings shape: (42, 768)


In [16]:
# Round-trip check: read back the matrix and the label index that were just
# written, and report their sizes.
EMB = np.load(OUT_EMB)
with open(OUT_IDX) as f:
    idx = json.loads(f.read())
print("Loaded embedding matrix:", EMB.shape)
print("Number of phonemes in index:", len(idx))

# Spot-check the first six labels: row index and L2 norm of each vector.
for k, i in list(idx.items())[:6]:
    vec = EMB[i]
    print(k, "-> index", i, ", norm=", round(np.linalg.norm(vec), 3))


Loaded embedding matrix: (42, 768)
Number of phonemes in index: 42
AA1 -> index 0 , norm= 7.155
AE1 -> index 1 , norm= 7.587
AH0 -> index 2 , norm= 7.68
AH1 -> index 3 , norm= 7.79
AO1 -> index 4 , norm= 7.127
AW1 -> index 5 , norm= 7.578


In [None]:
from google.colab import files
import shutil

# Download the stacked embedding matrix and its label index.
files.download(OUT_EMB)
files.download(OUT_IDX)

# Zip per-phoneme .npy files for download.
# The archive is written NEXT TO OUT_MODELS rather than inside it: creating the
# zip inside the folder being archived lets the archive pick up itself (or a
# stale copy from an earlier run) as one of its members.
archive_base = os.path.join(os.path.dirname(os.path.normpath(OUT_MODELS)), "phoneme_npy")
shzip = shutil.make_archive(archive_base, 'zip', OUT_MODELS)  # returns the .zip path
files.download(shzip)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
import numpy as np

# Inspect one saved per-phoneme embedding (AH0). The path is derived from
# PROJECT_DIR so it follows the configuration cell instead of repeating the
# absolute Drive location.
v = np.load(f"{PROJECT_DIR}/models/embeddings/AH0.npy")
print("Shape:", v.shape)
print("Norm:", np.linalg.norm(v))
print("First 5 values:", v[:5])


Shape: (768,)
Norm: 7.6802974
First 5 values: [ 0.42316183  0.37638757 -0.02638518  0.20448925  0.27085596]
