In [None]:
from datasets import load_dataset, Dataset, load_from_disk
import pandas as pd
import os
from pydub import AudioSegment
import json
import torch
from transformers import pipeline
import os
from transformers import pipeline
import torch
from datasets import Dataset
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

In [2]:
# Load the LibriSQA Part I test file with the generic "json" loader and keep the
# split stored under the 'train' key of the returned dataset dict.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
df = load_dataset("json",data_files = "/home/snp2453/slt/LibriSQA-PartI-test.json")['train']

In [None]:
# Display the first record of the freshly loaded dataset.
df[0]

In [4]:
import os
from pydub import AudioSegment
import pandas as pd
from datasets import Dataset
import numpy as np

def merge_audio_files_from_dataset(df, batch_size=10, output_dir="merged_audio_files"):
    """
    Concatenate the audio of consecutive dataset rows and repoint the rows at the merged file.

    Rows are processed in consecutive groups of ``batch_size``. For every complete
    group, the FLAC counterpart of each row's ``speech_path`` is loaded, the clips
    are concatenated in row order and exported as ``merged_XXXX.wav`` (XXXX is the
    zero-padded group number) inside ``output_dir``. The ``speech_path`` of every
    row in the group is then replaced by the path of that merged file.

    Args:
        df: HuggingFace dataset with a ``speech_path`` column of ``.wav`` paths.
        batch_size: Number of files to merge together (default: 10).
        output_dir: Directory that receives the merged WAV files; created if
            missing (default: "merged_audio_files").

    Returns:
        A new HuggingFace dataset built from the updated dataframe.

    Notes:
        * A trailing group with fewer than ``batch_size`` rows is skipped and
          its rows keep their original ``speech_path``.
        * A clip that cannot be loaded is reported and left out of the merged
          audio; the rows of the group are still repointed at the merged file.
    """
    # Work on a pandas copy for easier row-wise manipulation
    frame = df.to_pandas()

    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(output_dir, exist_ok=True)

    # Positional column index, so row updates never depend on the index labels
    path_col = frame.columns.get_loc('speech_path')

    for start in range(0, len(frame), batch_size):
        stop = start + batch_size

        # Skip if batch is incomplete
        if stop > len(frame):
            continue

        merged = None
        for wav_path in frame.iloc[start:stop, path_col].tolist():
            # Swap only the file extension; a '.wav' elsewhere in the path is left alone
            stem, ext = os.path.splitext(wav_path)
            flac_path = stem + '.flac' if ext == '.wav' else wav_path

            try:
                clip = AudioSegment.from_file(flac_path, format='flac')
                merged = clip if merged is None else merged + clip
            except Exception as e:
                # Best-effort: a broken clip must not abort the whole merge
                print(f"Error processing {flac_path}: {str(e)}")
                continue

        # Nothing could be loaded for this group: leave its rows untouched
        if merged is None:
            continue

        merged_name = f"merged_{start // batch_size:04d}.wav"
        merged_path = os.path.join(output_dir, merged_name)
        merged.export(merged_path, format='wav')

        # Repoint every row of the group at the merged file
        frame.iloc[start:stop, path_col] = merged_path

    # Convert back to HuggingFace dataset
    return Dataset.from_pandas(frame)

def process_dataset(df):
    """
    Run the audio-merging stage on a dataset, announcing start and end.

    Args:
        df: HuggingFace dataset to process
    Returns:
        The dataset returned by merge_audio_files_from_dataset (default
        settings), whose rows point at the merged audio files
    """
    start_msg = "Starting audio file merging process..."
    done_msg = "Processing complete!"

    print(start_msg)
    merged_dataset = merge_audio_files_from_dataset(df)
    print(done_msg)

    return merged_dataset

In [None]:
# Merge the audio files (default batch size) and keep the dataset whose
# speech_path values point at the merged files.
updated_dataset = process_dataset(df)

In [None]:
# Display the first record of the original dataset, for comparison with the merged one.
df[0]

In [None]:
# Display the first record of the merged dataset.
updated_dataset[0]

In [12]:
def transcribe_audio_files(df, use_gpu=True, transcript_dir="transcriptions"):
    """
    Transcribe merged audio files using Whisper and update the dataset

    Every distinct ``speech_path`` is transcribed once with the
    ``openai/whisper-tiny`` pipeline. The stripped text is written to
    ``<transcript_dir>/<audio file stem>.txt`` and copied into a new
    ``whisper_transcription`` column for all rows sharing that path.

    Args:
        df: HuggingFace dataset with merged audio paths
        use_gpu: Whether to use GPU for transcription (falls back to CPU when
            CUDA is unavailable)
        transcript_dir: Directory receiving one ``.txt`` file per audio file;
            created if missing (default: "transcriptions")
    Returns:
        Updated dataset with transcriptions; rows whose audio could not be
        transcribed keep an empty string
    """
    # Convert dataset to pandas for easier manipulation
    df_pd = df.to_pandas()

    # Create transcriptions directory (exist_ok avoids a check-then-create race)
    os.makedirs(transcript_dir, exist_ok=True)

    # Initialize Whisper
    device = "cuda:0" if use_gpu and torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    whisper = pipeline("automatic-speech-recognition",
                       "openai/whisper-tiny",
                       return_timestamps=True,
                       device=device)

    # Add new column for Whisper transcriptions
    df_pd['whisper_transcription'] = ''

    # Process each unique audio file exactly once
    for audio_path in tqdm(df_pd['speech_path'].unique(), desc="Transcribing audio files"):
        file_name = os.path.basename(audio_path)
        transcript_path = os.path.join(transcript_dir,
                                       os.path.splitext(file_name)[0] + '.txt')

        # Prefer the path exactly as stored. Only when it does not exist fall
        # back to the historical behaviour of dropping leading slashes, which
        # treats '/merged_audio_files/x.wav' as relative to the working dir.
        local_path = audio_path if os.path.exists(audio_path) else audio_path.lstrip('/')

        try:
            # Transcribe using Whisper
            transcription = whisper(local_path)
            transcript_text = transcription["text"].strip()

            # Save transcription to file
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(transcript_text)

            # Match rows on the ORIGINAL stored value. Comparing against the
            # slash-stripped path would match nothing for paths with a leading '/'.
            mask = df_pd['speech_path'] == audio_path
            df_pd.loc[mask, 'whisper_transcription'] = transcript_text

            print(f"\nProcessed {file_name}")
            print(f"Transcription: {transcript_text[:100]}...")  # Print first 100 chars

        except Exception as e:
            # Best-effort: report the failure and continue with the next file
            print(f"\nError transcribing {audio_path}: {str(e)}")
            continue

    # Convert back to HuggingFace dataset
    return Dataset.from_pandas(df_pd)

def compare_transcriptions(df, output_path='transcription_comparison.csv'):
    """
    Compare original text with Whisper transcriptions

    Builds a three-column table (File, Original Text, Whisper Transcription)
    from the ``speech_path``, ``text`` and ``whisper_transcription`` columns
    of ``df`` and writes it as CSV without an index column.

    Args:
        df: Column-indexable dataset (e.g. the output of transcribe_audio_files)
        output_path: Destination CSV file
            (default: 'transcription_comparison.csv')
    """
    # list() materialises each column, whatever sequence type df[...] returns
    comparison_df = pd.DataFrame({
        'File': list(df['speech_path']),
        'Original Text': list(df['text']),
        'Whisper Transcription': list(df['whisper_transcription']),
    })

    comparison_df.to_csv(output_path, index=False)
    print(f"\nSaved transcription comparison to {output_path}")

In [None]:
# Report whether a CUDA device is visible; the flag is forwarded to the transcriber.
use_gpu = torch.cuda.is_available()
print(f"CUDA available: {use_gpu}")

try:
    # Transcribe every merged audio file and attach the text to the dataset
    updated_dataset_trans = transcribe_audio_files(updated_dataset, use_gpu=use_gpu)

    # Optional follow-up (disabled): write a CSV comparing reference and Whisper text
    # compare_transcriptions(updated_dataset_trans)
except Exception as e:
    # Best-effort: report the failure instead of aborting the notebook run.
    # NOTE: on failure `updated_dataset_trans` is left undefined (or stale from
    # an earlier run), so later cells must not assume it is fresh.
    print(f"Error during processing: {str(e)}")

In [None]:
# Display the first record of the merged dataset.
# NOTE(review): this shows `updated_dataset`, which has no whisper_transcription
# column; the transcribed result lives in `updated_dataset_trans` — confirm which
# one was meant to be inspected here.
updated_dataset[0]

### Doing QA with an LLM (Qwen2.5-1.5B-Instruct)

In [None]:
# Reload a dataset saved on disk; this rebinds `df`, replacing the JSON-loaded
# dataset used by the earlier cells.
# NOTE(review): absolute, user-specific path; the step that saved this dataset
# is not visible in this notebook — confirm how "merged_df" was produced.
df = load_from_disk("/home/snp2453/slt/merged_df")
# Display the first record of the reloaded dataset.
df[0]

In [4]:
# Text-generation pipeline used for question answering below, loading
# Qwen/Qwen2.5-1.5B-Instruct onto the first CUDA device.
# NOTE(review): the device is hard-coded to "cuda:0" — presumably this fails on
# a machine without a GPU; confirm before running elsewhere.
pipe = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B-Instruct",device="cuda:0")

In [None]:
# Single-example check of the QA prompt: a hard-coded context passage, one
# question about it, and a reference answer.
context0 = "A lover to clock at struck. It was a fine clear night that were the only persons on the road, and they sonchered leisurely along, to avoid paying the price of fatigue for the recreation provided for the Toledans in the valley or on the banks of their river. Secure as he thought, in the careful administration of justice in that city, and the character of its well-disposed inhabitants, the good Hidalgo was far from thinking that any disaster could be followed as family. Rudolfo and his companions, with their faces muffled in their cloaks, stared rudely and insulently at the mother, the daughter and the servant made. In a moment, he communicated his thoughts to his companions, and in the next moment, they resolved to turn back and carry her off to please Rudolfo. For the rich who are open-handed, always find parasites ready to encourage their bad propensities. And thus to conceive this wicked design to communicate it, approve it, resolve on ravishing Leo Cadia, and to carry that design into effect was the work of a moment. They drew their swords, hid their faces in the flaps of their cloaks, turned back and soon came in front of the little party, who had not yet done giving thanks to God for their escape from those audacious men. Finally the one party went off exalting, and the other was left in desolation and woe. Rudolfo arrived at his own house without any impediment. And Leo Kadia's parents reached there, his heart broken and despairing. Meanwhile Rudolfo had Leo Kadia's safe in his custody. And in his own apartment, who touches me? Am I in bed? Mother, dear father, do you hear me?"
question = "Where am I in this situation described in the text?"
# Reference answer for manual comparison; not used by any code in this cell.
gold_answer = "You are in bed."
# Chat-format prompt: a single user turn carrying the instruction, the context and the question.
messages = [
    {"role": "user", "content": f"You are an expert in precise question answering task. Given a context and a question, you need to give a direct answer, dont be verbose. Just give the answer directly. Context: {context0}. Question: {question}. Answer: "},
]
# The pipeline result is the cell's displayed output.
# NOTE(review): max_length presumably bounds prompt + generated tokens together —
# confirm against the transformers generation docs.
pipe(messages,max_length=1024)

In [6]:
# Second single-example check of the QA prompt: a hard-coded context passage
# with a different question and reference answer.
context0 = "A lover to clock at struck. It was a fine clear night that were the only persons on the road, and they sonchered leisurely along, to avoid paying the price of fatigue for the recreation provided for the Toledans in the valley or on the banks of their river. Secure as he thought, in the careful administration of justice in that city, and the character of its well-disposed inhabitants, the good Hidalgo was far from thinking that any disaster could be followed as family. Rudolfo and his companions, with their faces muffled in their cloaks, stared rudely and insulently at the mother, the daughter and the servant made. In a moment, he communicated his thoughts to his companions, and in the next moment, they resolved to turn back and carry her off to please Rudolfo. For the rich who are open-handed, always find parasites ready to encourage their bad propensities. And thus to conceive this wicked design to communicate it, approve it, resolve on ravishing Leo Cadia, and to carry that design into effect was the work of a moment. They drew their swords, hid their faces in the flaps of their cloaks, turned back and soon came in front of the little party, who had not yet done giving thanks to God for their escape from those audacious men. Finally the one party went off exalting, and the other was left in desolation and woe. Rudolfo arrived at his own house without any impediment. And Leo Kadia's parents reached there, his heart broken and despairing. Meanwhile Rudolfo had Leo Kadia's safe in his custody. And in his own apartment, who touches me? Am I in bed? Mother, dear father, do you hear me?"
question = "Who is in Rodolfo's custody and where are they being kept?"
# Reference answer for manual comparison; not used by any code in this cell.
gold_answer = "Leocadia is safe in Rodolfo's custody, and they are being kept in his own apartment."
# Chat-format prompt: a single user turn carrying the instruction, the context and the question.
messages = [
    {"role": "user", "content": f"You are an expert in precise question answering task. Given a context and a question, you need to give a direct answer, dont be verbose. Just give the answer directly. Context: {context0}. Question: {question}. Answer: "},
]
# The pipeline result is stored in `ans` rather than displayed.
# NOTE(review): max_length presumably bounds prompt + generated tokens together —
# confirm against the transformers generation docs.
ans = pipe(messages,max_length=1024)

In [None]:
# Answer every question in `df` with the text-generation pipeline, using the
# Whisper transcript of the row's merged audio file as context. Results are
# collected in `answers`, in row order.
answers = []
for i in tqdm(range(len(df))):
    # Decode the row once instead of once per field
    row = df[i]
    question = row['question']
    gt_answer = row['answer']  # reference answer, kept for inspection only

    # Transcript files are named after the audio file's stem
    text_file_path = os.path.splitext(os.path.basename(row['speech_path']))[0]
    full_path = "/home/snp2453/slt/transcriptions/" + text_file_path + ".txt"

    # Read the transcript as one string. readlines() would place a Python list
    # repr ("['...']") into the prompt. The transcriber writes UTF-8.
    with open(full_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()

    messages = [
    {"role": "user", "content": f"You are an expert in precise question answering task. Given a context and a question, you need to give a direct answer, dont be verbose. Just give the answer directly. Context: {content}. Question: {question}. Answer: "},
    ]

    # Bound only the generated tokens. max_length also counts the prompt, so a
    # long transcript could leave no room for the answer.
    answer = pipe(messages, max_new_tokens=256)

    # The pipeline returns the whole chat; the last message is the model's reply
    answer = answer[0]['generated_text'][-1]['content']
    answers.append(answer)

: 

In [None]:
# Attach the generated answers as a new column and rebind `df` to the result.
# NOTE(review): the column is named "LLama_Answers" although the pipeline in this
# notebook loads a Qwen model; presumably `answers` must have one entry per row
# and re-running this cell on the already-extended `df` fails — confirm.
df = df.add_column("LLama_Answers",answers)

In [None]:
# Upload the dataset (including the answers column) to the Hugging Face Hub as a
# private repository.
# NOTE(review): presumably requires an authenticated Hub session — confirm.
df.push_to_hub("SP2001/SLT_merged_df",private=True)

### Retrieval in Text Space

In [33]:
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [37]:
# Split the example context into sentence-like pieces on '.', trimming the
# surrounding whitespace and dropping empty pieces so that no blank text is
# turned into a Document later. Only '.' is a delimiter; '?' and '!' are not.
sentences = [piece.strip() for piece in context0.split(".") if piece.strip()]

In [36]:
def hf_embedding_store(model_name: str, device: str = 'cuda:0', cache_folder: str = "./"):
    """Create the HuggingFace embedding model used when building the index.

    Args:
        model_name: Name of the embedding model to load.
        device: Device string passed to the embedding model (default: 'cuda:0').
        cache_folder: Folder passed as the model cache location (default: "./").

    Returns:
        The constructed HuggingFaceEmbedding instance.
    """
    return HuggingFaceEmbedding(model_name=model_name, device=device, cache_folder=cache_folder)

def hf_embedding_ret(model_name: str, device: str = 'cuda:1', cache_folder: str = "./"):
    """Create the HuggingFace embedding model used at retrieval (query) time.

    Args:
        model_name: Name of the embedding model to load.
        device: Device string passed to the embedding model. The default,
            'cuda:1', names a second GPU; pass another device on machines
            that do not have one.
        cache_folder: Folder passed as the model cache location (default: "./").

    Returns:
        The constructed HuggingFaceEmbedding instance.
    """
    return HuggingFaceEmbedding(model_name=model_name, device=device, cache_folder=cache_folder)

# Instantiate the same embedding checkpoint twice: one instance for indexing
# (store_model) and one for querying (ret_model). Both are module-level objects
# read by retriever().
# NOTE(review): the two helpers place the instances on 'cuda:0' and 'cuda:1'
# respectively, so this cell presumably needs two GPUs — confirm.
store_model = hf_embedding_store(model_name="BAAI/bge-small-en-v1.5")
ret_model = hf_embedding_ret(model_name="BAAI/bge-small-en-v1.5")

def retriever(documents, embedding_type="float", model_name="embed-english-v3.0", similarity_top_k=3):
    """Index ``documents`` and return a vector retriever over them.

    The index is built with the module-level ``store_model`` embedding and
    queries are embedded with the module-level ``ret_model``.

    Args:
        documents: Documents to index.
        embedding_type: Accepted for backward compatibility; not used.
        model_name: Accepted for backward compatibility; not used. The
            embedding models are the module-level instances named above.
        similarity_top_k: Number of most similar documents returned per
            query (default: 3).

    Returns:
        A VectorIndexRetriever bound to the freshly built index.
    """
    index = VectorStoreIndex.from_documents(
        documents,
        embed_model=store_model,
    )
    return VectorIndexRetriever(
        index=index,
        similarity_top_k=similarity_top_k,
        embed_model=ret_model,
    )

In [None]:
# Display the list of pieces obtained by splitting the example context.
sentences

In [None]:
# Retrieval demo 1: index the context pieces and fetch those most similar to the query.
try:
    query = "Where am I in this situation described in the text?"
    # One Document per piece of the split context
    documents = [Document(text=context) for context in sentences]
    # NOTE: "int8" is bound to retriever()'s embedding_type parameter, which the
    # retriever function in this notebook never reads, so it has no effect.
    retriever_int8 = retriever(documents, "int8")
    retrieved_docs = retriever_int8.retrieve(query)
    print(f"retrieved_sentence is {[doc.text for doc in retrieved_docs]}")
except Exception as e:
    # Any failure is only printed; the cell itself does not raise.
    print(f"Error: {e}")


In [None]:
# Retrieval demo 2: same pipeline as the previous demo with a different query;
# the index is rebuilt from scratch.
try:
    query = "Who is in Rodolfo's custody and where are they being kept?"
    # One Document per piece of the split context
    documents = [Document(text=context) for context in sentences]
    # NOTE: "int8" is bound to retriever()'s embedding_type parameter, which the
    # retriever function in this notebook never reads, so it has no effect.
    retriever_int8 = retriever(documents, "int8")
    retrieved_docs = retriever_int8.retrieve(query)
    print(f"retrieved_sentence is {[doc.text for doc in retrieved_docs]}")
except Exception as e:
    # Any failure is only printed; the cell itself does not raise.
    print(f"Error: {e}")

### Calculating Speech Embeddings

In [None]:
# These two classes are not imported anywhere else in the notebook; without this
# import the cell raises NameError on a fresh kernel.
from transformers import Wav2Vec2Model, Wav2Vec2Processor

model_name = "facebook/wav2vec2-large-xlsr-53-german"
feature_extractor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

# Turn the first ten "speech" entries into padded PyTorch tensors.
# NOTE(review): `train_dataset` is not defined in the visible part of this
# notebook — it must be created before this cell can run.
# NOTE(review): feature_size looks like a constructor setting rather than a
# per-call argument — confirm it is accepted here.
i = feature_extractor(train_dataset[:10]["speech"], return_tensors="pt", padding=True,
                      feature_size=1, sampling_rate=16000)

# Forward pass; the model output is the cell's displayed result.
model(**i)