In [None]:
import glob
import json
import librosa
import torch
from pathlib import Path
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

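# Install the Hugging Face transformers package (IPython shell command).
# NOTE: on a fresh runtime this must run before the imports above can succeed.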
!pip install transformers

# Load the pretrained processor and CTC model; both come from the same
# checkpoint name, so it is spelled out once.
_checkpoint = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(_checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(_checkpoint)

def process_audio(audio_path, target_duration=83):
    """
    Load an audio file and turn its leading segment into model input values.

    The file is decoded as mono audio at 16 kHz, truncated to at most
    ``target_duration`` seconds, and passed through the module-level
    ``processor``.

    Args:
    audio_path (str): Path to the audio file.
    target_duration (float): Maximum number of seconds to keep, counted from
        the start of the recording. Shorter recordings are used in full.
        Defaults to 83.

    Returns:
    torch.Tensor: Input values for the model, without the batch dimension.
    """
    # Load the audio waveform, resampling to 16 kHz and down-mixing to mono.
    waveform, sample_rate = librosa.load(audio_path, sr=16000, mono=True)

    # Keep only the first `target_duration` seconds of samples.
    segment = waveform[:int(target_duration * sample_rate)]

    # Feed the truncated segment (previously the full waveform was passed and
    # the truncation was silently ignored). Passing the sampling rate lets the
    # processor reject audio that does not match the rate it expects.
    input_values = processor(
        segment, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Drop only the leading batch axis; a bare squeeze() would also collapse
    # a length-1 time axis.
    return input_values.squeeze(0)

def get_phone_posteriors(audio_path):
    """
    Compute per-frame posterior probabilities for one audio file.

    The audio is preprocessed with ``process_audio``, run through the
    module-level ``model`` without gradient tracking, and the resulting
    logits are normalised with a softmax over the last dimension.

    NOTE(review): the posteriors are over the loaded model's output
    vocabulary; whether those units are phones depends on the checkpoint.

    Args:
    audio_path (str): Path to the audio file.

    Returns:
    torch.Tensor: Posteriors for the single item in the batch.
    """
    # The model expects a leading batch axis, so wrap the single example.
    batch = process_audio(audio_path).unsqueeze(0)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        frame_logits = model(batch).logits
        frame_probs = frame_logits.softmax(dim=-1)

    # Strip the batch axis that was added above.
    return frame_probs[0]

# Compute posteriors for every .wav file in the training directory.
# Each entry of `a1` is [label, posteriors-as-nested-lists], where the label
# is the part of the file name (without extension) before the first "_".
a1 = []
for wav_path in glob.glob("/content/drive/MyDrive/DATA/11/train/*.wav"):
    probs_as_lists = get_phone_posteriors(wav_path).squeeze().tolist()
    label = Path(wav_path).stem.split("_")[0]
    a1.append([label, probs_as_lists])

# Write all collected results to a single JSON file.
file_path = '/content/a1.json'
with open(file_path, 'w') as out_handle:
    json.dump(a1, out_handle)