<a href="https://colab.research.google.com/github/sarayu-patel/Task-Assignment-AI-Researcher-Intern--Speech-Audio-Josh-Talks/blob/main/task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import requests
import pandas as pd
from pydub import AudioSegment
from io import BytesIO, StringIO
from tqdm.auto import tqdm

# --- Helper Functions (Modified for clarity) ---

def load_data_from_sheet_url(url):
    """Loads a pandas DataFrame from a Google Sheet share URL."""
    print(f"Attempting to load data from: {url}")
    try:
        # Modify the Google Sheet URL to create a direct CSV download link
        file_id = url.split('/d/')[1].split('/')[0]
        gid = url.split('#gid=')[1] if '#gid=' in url else '0'
        download_url = f'https://docs.google.com/spreadsheets/d/{file_id}/export?format=csv&gid={gid}'

        response = requests.get(download_url)
        response.raise_for_status()

        data = StringIO(response.text)
        df = pd.read_csv(data)
        print("Data loaded successfully from Google Sheet.")
        return df
    except Exception as e:
        print(f"Error loading data from URL: {e}")
        return None

def download_audio_from_url(url):
    """Downloads an audio file from a URL and returns it as a pydub AudioSegment."""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        audio_file = BytesIO(response.content)
        return AudioSegment.from_file(audio_file)
    except Exception as e:
        print(f"Failed to download or process audio from {url}. Error: {e}")
        return None

def download_transcription_json(url):
    """Downloads and parses a transcription JSON file from a URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Failed to download or parse JSON from {url}. Error: {e}")
        return None


# --- Corrected Question 3 Pipeline ---

def question_3_pipeline_corrected(data_url):
    """
    Correctly analyzes segments, detects disfluencies, clips the specific
    audio for that segment, and creates a structured CSV.
    """
    print("\n--- Starting Corrected Question 3 Pipeline: Disfluency Clipping ---\n")

    # 1. Load the main dataset
    print("Step 1: Loading main dataset...")
    df = load_data_from_sheet_url(data_url)
    if df is None or df.empty:
        print("Failed to load data. Aborting pipeline.")
        return

    # Using a small sample for demonstration purposes
    df_sample = df.head(10)
    print(f"Processing a sample of {len(df_sample)} recordings for this demonstration.")

    # 2. Define disfluency patterns
    # FIX: Corrected the regex pattern to avoid the "cannot refer to an open group" error.
    # We now use a non-capturing group (?:...) for the list of filler words.
    filler_words = r'\b(?:मतलब|जैसे|वो|तो|यू नो|बेसिकली)\b'
    # The pattern for repetitions is self-contained and correct.
    repetitions = r'\b(\w+)\s+\1\b'
    # Combine them with a simple OR. No outer capturing group is needed, which resolves the error.
    disfluency_pattern = f"{filler_words}|{repetitions}"
    print(f"Using disfluency pattern: Looking for filler words or repeated words.")

    # 3. Process each recording
    output_data = []
    processed_audio_cache = {} # Cache to avoid re-downloading the same large audio file
    output_clips_dir = "disfluency_clips"
    os.makedirs(output_clips_dir, exist_ok=True)

    print("\nStep 2: Analyzing segments and clipping audio...")
    for index, row in tqdm(df_sample.iterrows(), total=df_sample.shape[0], desc="Processing Recordings"):
        recording_id = row.get('recording_id', f'rec_{index}')
        audio_url = row.get('rec_url_gcp')
        transcription_url = row.get('transcription_url_gcp')

        if not all([audio_url, transcription_url]):
            continue

        # Download the transcription JSON containing segment timestamps
        segments = download_transcription_json(transcription_url)
        if not segments:
            continue

        # Analyze each segment within the recording
        for i, segment in enumerate(segments):
            text = segment.get('text', '')
            start_time = segment.get('start')
            end_time = segment.get('end')

            # Check if segment contains a disfluency and has valid timestamps
            if re.search(disfluency_pattern, text, re.IGNORECASE) and start_time is not None and end_time is not None:

                # Download the full audio only if we haven't already processed this recording
                if recording_id not in processed_audio_cache:
                    print(f"\nDownloading full audio for {recording_id}...")
                    full_audio = download_audio_from_url(audio_url)
                    if not full_audio:
                        break # Move to the next recording if audio download fails
                    processed_audio_cache[recording_id] = full_audio

                full_audio = processed_audio_cache[recording_id]

                # Clip the audio segment using pydub (requires milliseconds)
                start_ms = int(start_time * 1000)
                end_ms = int(end_time * 1000)
                clipped_segment = full_audio[start_ms:end_ms]

                # Save the clipped segment to a unique file
                clip_filename = f"{recording_id}_segment_{i}_{start_time:.2f}s_to_{end_time:.2f}s.wav"
                clip_filepath = os.path.join(output_clips_dir, clip_filename)
                clipped_segment.export(clip_filepath, format="wav")

                # Record the result
                output_data.append({
                    "recording_id": recording_id,
                    "disfluency_text": text,
                    "clipped_audio_path": clip_filepath
                })
                tqdm.write(f"  -> Detected disfluency in segment {i}. Saved clip: {clip_filepath}")

    # 4. Create the structured output CSV
    print("\nStep 3: Creating structured output sheet...")
    if not output_data:
        print("No disfluencies were detected in the sample. The output CSV will be empty.")
        # Create an empty CSV with the correct headers
        output_df = pd.DataFrame(columns=["recording_id", "disfluency_text", "clipped_audio_path"])
    else:
        output_df = pd.DataFrame(output_data)

    output_df.to_csv("disfluency_dataset.csv", index=False)
    print("Output dataset saved to 'disfluency_dataset.csv':")
    print(output_df)


if __name__ == '__main__':
    # URL to the Google Sheet containing the dataset links
    HINDI_ASR_DATA_URL = "https://docs.google.com/spreadsheets/d/1bujiO2NgtHlgqPlNvYAQf5_7ZcXARlIfNX5HNb9f8cI/edit#gid=1786138861"

    # Run the corrected pipeline
    question_3_pipeline_corrected(HINDI_ASR_DATA_URL)



  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):



--- Starting Corrected Question 3 Pipeline: Disfluency Clipping ---

Step 1: Loading main dataset...
Attempting to load data from: https://docs.google.com/spreadsheets/d/1bujiO2NgtHlgqPlNvYAQf5_7ZcXARlIfNX5HNb9f8cI/edit#gid=1786138861
Data loaded successfully from Google Sheet.
Processing a sample of 10 recordings for this demonstration.
Using disfluency pattern: Looking for filler words or repeated words.

Step 2: Analyzing segments and clipping audio...


Processing Recordings:   0%|          | 0/10 [00:00<?, ?it/s]


Downloading full audio for 825780...
  -> Detected disfluency in segment 0. Saved clip: disfluency_clips/825780_segment_0_0.11s_to_14.42s.wav
  -> Detected disfluency in segment 1. Saved clip: disfluency_clips/825780_segment_1_14.42s_to_29.03s.wav
  -> Detected disfluency in segment 4. Saved clip: disfluency_clips/825780_segment_4_52.70s_to_66.83s.wav
  -> Detected disfluency in segment 6. Saved clip: disfluency_clips/825780_segment_6_89.06s_to_103.19s.wav
  -> Detected disfluency in segment 7. Saved clip: disfluency_clips/825780_segment_7_103.19s_to_117.29s.wav
  -> Detected disfluency in segment 9. Saved clip: disfluency_clips/825780_segment_9_130.25s_to_144.83s.wav
  -> Detected disfluency in segment 10. Saved clip: disfluency_clips/825780_segment_10_146.18s_to_158.21s.wav
  -> Detected disfluency in segment 13. Saved clip: disfluency_clips/825780_segment_13_178.34s_to_190.85s.wav
  -> Detected disfluency in segment 16. Saved clip: disfluency_clips/825780_segment_16_224.42s_to_229.