In [1]:
# Install Whisper from GitHub, pinned to the commit that the recorded run of this
# cell resolved, so that re-running the notebook installs the same code.
!pip install --break-system-packages git+https://github.com/openai/whisper.git@c0d2f624c09dc18e709e37c2ad90c039a4eb72a2

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-4zclen_v
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-4zclen_v
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [2]:
# pyannote.audio is pinned to the release that the recorded run of this cell
# downloaded (3.3.2); torchaudio is left unpinned because the recorded output
# does not show which version was installed.
!pip install --break-system-packages pyannote.audio==3.3.2 torchaudio

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18

In [3]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess
from glob import glob
from pathlib import Path
import json
import logging
from typing import List, Dict, Any, Tuple
import numpy as np
from collections import Counter

In [4]:
# Logging: one module-level logger for the whole notebook, with the root
# logger configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
# Configuration
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that the secret never has to be saved inside the notebook. When the
# variable is not set, the previous "hf_" placeholder is kept as the value.
HUGGING_FACE_ACCESS_TOKEN = os.environ.get("HF_TOKEN", "hf_")

In [6]:
class ImprovedTranscriptionSystem:
    def __init__(self, model_size="large"):
        """Load the Whisper model and prepare the output folder.

        Args:
            model_size: Checkpoint name handed to ``whisper.load_model``.
        """
        self.model = whisper.load_model(model_size)
        # The diarization pipeline is created lazily by
        # improved_speaker_diarization(); start with an empty slot.
        self.diarization_pipeline = None
        # All generated files go under ./processed_output (created if missing).
        output_root = Path("processed_output")
        self.output_dir = output_root
        output_root.mkdir(parents=True, exist_ok=True)

    def get_audio_duration(self, audio_path: str) -> float:
        """Get audio duration using ffprobe"""
        try:
            cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                   "-of", "default=noprint_wrappers=1:nokey=1", audio_path]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            return float(result.stdout.strip())
        except Exception as e:
            logger.error(f"Could not get duration: {e}")
            return 0

    def smart_audio_preprocessing(self, input_path: str, output_path: str) -> bool:
        """Improved audio preprocessing with better noise reduction"""
        logger.info("Starting improved audio preprocessing...")

        # Advanced preprocessing command with better parameters for call center audio
        ffmpeg_command = [
            "ffmpeg", "-i", input_path,
            "-acodec", "pcm_s16le",
            "-ac", "1",  # Mono
            "-ar", "16000",  # 16kHz sample rate
            # Improved audio filters for call center quality
            "-af", (
                "highpass=f=300,"  # Remove low frequency noise
                "lowpass=f=3400,"  # Remove high frequency noise (telephone bandwidth)
                "loudnorm=I=-16:TP=-1.5:LRA=11,"  # Normalize loudness
                "afftdn=nr=20:nf=-25,"  # Noise reduction
                "compand=0.3,1:6:-70/-60,-20/-20,0/-6:0.5:0.1"  # Dynamic range compression
            ),
            "-y", output_path
        ]

        try:
            result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
            logger.info("✅ Advanced preprocessing successful")
            return True
        except subprocess.CalledProcessError as e:
            logger.error(f"❌ Advanced preprocessing failed: {e}")
            # Fallback to simpler preprocessing
            return self._fallback_preprocessing(input_path, output_path)

    def _fallback_preprocessing(self, input_path: str, output_path: str) -> bool:
        """Fallback preprocessing method"""
        logger.info("Trying fallback preprocessing...")
        ffmpeg_command = [
            "ffmpeg", "-i", input_path,
            "-acodec", "pcm_s16le",
            "-ac", "1",
            "-ar", "16000",
            "-af", "loudnorm",
            "-y", output_path
        ]

        try:
            subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
            logger.info("✅ Fallback preprocessing successful")
            return True
        except subprocess.CalledProcessError as e:
            logger.error(f"❌ Fallback preprocessing failed: {e}")
            return False

    def detect_language_and_transcribe(self, audio_path: str) -> Dict[str, Any]:
        """Improved transcription with automatic language detection"""
        logger.info("Starting improved transcription with language detection...")

        # First, detect the language
        detection_result = self.model.transcribe(
            audio_path,
            temperature=0.0,
            no_speech_threshold=0.6,
            condition_on_previous_text=False,
            task="transcribe"  # Don't translate yet
        )

        detected_language = detection_result.get('language', 'en')
        logger.info(f"Detected language: {detected_language}")

        # Custom prompt for insurance call context
        insurance_prompt = (
            "This is a customer service call for life insurance. "
            "Keywords: policy, premium, due date, payment, insurance, "
            "sum assured, nominee, health declaration, surrender value, "
            "lapse, coverage, maturity, fund value, late fee charges."
        )

        # Strategy 1: If English detected, transcribe directly
        if detected_language == 'en':
            logger.info("Using direct English transcription")
            result = self.model.transcribe(
                audio_path,
                language="en",
                task="transcribe",
                temperature=0.0,
                beam_size=5,
                patience=1.0,
                condition_on_previous_text=False,
                no_speech_threshold=0.6,
                compression_ratio_threshold=2.4,
                logprob_threshold=-1.0,
                initial_prompt=insurance_prompt,
                word_timestamps=True,
                verbose=False
            )
        else:
            # Strategy 2: For other languages, try both transcribe and translate
            logger.info(f"Trying both transcription and translation for {detected_language}")

            # Get original transcription
            transcribe_result = self.model.transcribe(
                audio_path,
                language=detected_language,
                task="transcribe",
                temperature=0.0,
                beam_size=5,
                patience=1.0,
                condition_on_previous_text=False,
                no_speech_threshold=0.6,
                compression_ratio_threshold=2.4,
                logprob_threshold=-1.0,
                initial_prompt=insurance_prompt,
                word_timestamps=True,
                verbose=False
            )

            # Get translation to English
            translate_result = self.model.transcribe(
                audio_path,
                language=detected_language,
                task="translate",
                temperature=0.0,
                beam_size=5,
                patience=1.0,
                condition_on_previous_text=False,
                no_speech_threshold=0.6,
                compression_ratio_threshold=2.4,
                logprob_threshold=-1.0,
                initial_prompt=insurance_prompt,
                word_timestamps=True,
                verbose=False
            )

            # Choose the result with better quality metrics
            transcribe_score = self._calculate_quality_score(transcribe_result)
            translate_score = self._calculate_quality_score(translate_result)

            if translate_score > transcribe_score:
                logger.info("Using translated result (better quality)")
                result = translate_result
                result['used_translation'] = True
            else:
                logger.info("Using original transcription (better quality)")
                result = transcribe_result
                result['used_translation'] = False

        result['detected_language'] = detected_language
        logger.info("✅ Transcription completed")
        return result

    def _calculate_quality_score(self, result: Dict[str, Any]) -> float:
        """Calculate a quality score for transcription results"""
        if not result.get('segments'):
            return 0.0

        segments = result['segments']

        # Metrics for quality assessment
        total_duration = sum(seg.get('end', 0) - seg.get('start', 0) for seg in segments)
        total_words = sum(len(seg.get('text', '').split()) for seg in segments)

        if total_duration == 0:
            return 0.0

        # Words per second (should be reasonable for speech)
        words_per_second = total_words / total_duration
        wps_score = 1.0 if 0.5 <= words_per_second <= 4.0 else 0.5

        # Check for excessive repetition
        all_text = ' '.join(seg.get('text', '') for seg in segments).lower()
        words = all_text.split()
        if words:
            word_counts = Counter(words)
            max_word_freq = max(word_counts.values())
            repetition_score = 1.0 - min(max_word_freq / len(words), 0.8)
        else:
            repetition_score = 0.0

        # Average confidence (if available)
        confidences = []
        for seg in segments:
            if 'words' in seg:
                confidences.extend([w.get('probability', 0.5) for w in seg['words'] if 'probability' in w])

        avg_confidence = np.mean(confidences) if confidences else 0.5

        # Combined score
        quality_score = (wps_score * 0.3 + repetition_score * 0.4 + avg_confidence * 0.3)
        return quality_score

    def enhanced_repetition_removal(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Enhanced repetition removal with better algorithms"""
        logger.info("🔍 Starting enhanced repetition removal...")

        if not segments:
            return segments

        cleaned_segments = []

        # First pass: Remove obvious repetitions
        for i, segment in enumerate(segments):
            text = segment.get('text', '').strip()
            if not text or len(text) < 3:
                continue

            words = text.split()
            if len(words) < 2:
                continue

            # Skip if too many consecutive repeated words
            max_consecutive_repeats = self._count_max_consecutive_repeats(words)
            if max_consecutive_repeats > 2:
                logger.debug(f"🚫 Skipping segment with {max_consecutive_repeats} consecutive repeats: {text[:30]}...")
                continue

            # Skip if dominated by single word
            word_counts = Counter(word.lower().strip('.,!?') for word in words)
            max_word_count = max(word_counts.values())
            if max_word_count / len(words) > 0.6:
                logger.debug(f"🚫 Skipping word-dominated segment: {text[:30]}...")
                continue

            cleaned_segments.append(segment)

        # Second pass: Remove near-duplicates
        final_segments = []
        for segment in cleaned_segments:
            is_duplicate = False
            current_text = segment.get('text', '').lower().strip()
            current_words = set(current_text.split())

            # Check against recent segments
            for prev_segment in final_segments[-3:]:  # Check last 3 segments
                prev_text = prev_segment.get('text', '').lower().strip()
                prev_words = set(prev_text.split())

                if current_words and prev_words:
                    # Calculate Jaccard similarity
                    intersection = len(current_words.intersection(prev_words))
                    union = len(current_words.union(prev_words))
                    similarity = intersection / union if union > 0 else 0

                    if similarity > 0.8:  # High similarity threshold
                        logger.debug(f"🚫 Removing near-duplicate: {current_text[:30]}...")
                        is_duplicate = True
                        break

            if not is_duplicate:
                final_segments.append(segment)

        removed_count = len(segments) - len(final_segments)
        logger.info(f"📊 Enhanced cleaning: {len(segments)} → {len(final_segments)} segments")
        logger.info(f"🗑️ Removed {removed_count} repetitive segments")

        return final_segments

    def _count_max_consecutive_repeats(self, words: List[str]) -> int:
        """Count maximum consecutive repeated words"""
        if len(words) < 2:
            return 0

        max_consecutive = 0
        current_consecutive = 0

        for i in range(1, len(words)):
            if words[i].lower().strip('.,!?') == words[i-1].lower().strip('.,!?'):
                current_consecutive += 1
                max_consecutive = max(max_consecutive, current_consecutive + 1)
            else:
                current_consecutive = 0

        return max_consecutive

    def advanced_text_cleanup(self, text: str) -> str:
        """Advanced text cleanup for insurance call transcriptions"""
        if not text:
            return ""

        # Insurance-specific corrections
        corrections = {
            'access max life': 'Axis Maxlife',
            'axis max life': 'Axis Maxlife',
            'max life': 'Maxlife',
            'g pay': 'GPay',
            'google pay': 'Google Pay',
            'phone pay': 'PhonePe',
            'phone pe': 'PhonePe',
            'pay tm': 'Paytm',
            'net banking': 'netbanking',
            'some assured': 'sum assured',
            'premium do': 'premium due',
            'do date': 'due date',
            'policy number': 'policy number',
            'nominee': 'nominee',
        }

        # Apply corrections
        text_lower = text.lower()
        for wrong, correct in corrections.items():
            text_lower = re.sub(rf'\b{re.escape(wrong)}\b', correct, text_lower, flags=re.IGNORECASE)

        # Fix currency symbols
        text_lower = re.sub(r'\brs[.]?\s*', '₹', text_lower)
        text_lower = re.sub(r'\brupees?\b', '₹', text_lower)

        # Clean up spacing and punctuation
        text_lower = re.sub(r'\s{2,}', ' ', text_lower)
        text_lower = re.sub(r'\s+([,.!?])', r'\1', text_lower)

        # Capitalize sentences
        text_lower = re.sub(r'(^|[.!?]\s+)([a-z])',
                           lambda m: m.group(1) + m.group(2).upper(),
                           text_lower)

        return text_lower.strip()

    def improved_speaker_diarization(self, audio_path: str) -> Any:
        """Improved speaker diarization with better parameters"""
        logger.info("🔊 Starting improved speaker diarization...")

        try:
            if self.diarization_pipeline is None:
                self.diarization_pipeline = Pipeline.from_pretrained(
                    "pyannote/speaker-diarization-3.1",
                    use_auth_token=HUGGING_FACE_ACCESS_TOKEN
                )

                if torch.cuda.is_available():
                    self.diarization_pipeline.to(torch.device("cuda"))
                    logger.info("✅ Using GPU for diarization")

            # Run diarization with improved parameters
            diarization = self.diarization_pipeline(
                audio_path,
                min_speakers=2,  # Expect at least 2 speakers (agent + customer)
                max_speakers=3   # Max 3 speakers (agent + customer + maybe supervisor)
            )

            logger.info("✅ Speaker diarization completed")
            return diarization

        except Exception as e:
            logger.error(f"⚠️ Diarization failed: {e}")
            return None

    def assign_speakers_intelligently(self, segments: List[Dict[str, Any]],
                                    diarization: Any) -> List[Dict[str, Any]]:
        """Intelligently assign speakers based on content and patterns"""

        def get_dominant_speaker(start: float, end: float) -> str:
            if not diarization:
                return "Speaker_Unknown"

            speakers = {}
            for segment, _, speaker in diarization.itertracks(yield_label=True):
                overlap = max(0, min(end, segment.end) - max(start, segment.start))
                if overlap > 0:
                    speakers[speaker] = speakers.get(speaker, 0) + overlap

            return max(speakers, key=speakers.get) if speakers else "Speaker_Unknown"

        # Assign speakers to segments
        for segment in segments:
            start = segment.get('start', 0)
            end = segment.get('end', 0)
            speaker = get_dominant_speaker(start, end)
            segment['speaker'] = speaker

        # Post-process to identify Agent vs Customer based on content patterns
        agent_keywords = [
            'axis', 'maxlife', 'insurance', 'policy number', 'due date',
            'premium', 'payment', 'calling from', 'renewal', 'health declaration',
            'sir', 'madam', 'mam', 'thank you for', 'can you speak'
        ]

        customer_keywords = [
            'hello', 'yes', 'okay', 'no', 'i will', 'i have', 'i am',
            'thank you', 'my name is'
        ]

        # Score each speaker ID for being agent vs customer
        speaker_scores = {}
        for segment in segments:
            speaker = segment.get('speaker', 'Unknown')
            text = segment.get('text', '').lower()

            if speaker not in speaker_scores:
                speaker_scores[speaker] = {'agent_score': 0, 'customer_score': 0, 'total_words': 0}

            words = text.split()
            speaker_scores[speaker]['total_words'] += len(words)

            # Score based on keywords
            for keyword in agent_keywords:
                if keyword in text:
                    speaker_scores[speaker]['agent_score'] += 2

            for keyword in customer_keywords:
                if keyword in text:
                    speaker_scores[speaker]['customer_score'] += 1

        # Determine which speaker is agent vs customer
        speaker_roles = {}
        for speaker, scores in speaker_scores.items():
            if scores['total_words'] > 5:  # Only consider speakers with substantial content
                agent_ratio = scores['agent_score'] / max(scores['total_words'], 1)
                customer_ratio = scores['customer_score'] / max(scores['total_words'], 1)

                if agent_ratio > customer_ratio:
                    speaker_roles[speaker] = 'Agent'
                else:
                    speaker_roles[speaker] = 'Customer'

        # If we have exactly 2 main speakers, ensure one is Agent and one is Customer
        main_speakers = [s for s, scores in speaker_scores.items()
                        if scores['total_words'] > 10]

        if len(main_speakers) == 2:
            speakers_by_agent_score = sorted(main_speakers,
                                           key=lambda s: speaker_scores[s]['agent_score'],
                                           reverse=True)
            speaker_roles[speakers_by_agent_score[0]] = 'Agent'
            speaker_roles[speakers_by_agent_score[1]] = 'Customer'

        # Apply role assignments
        for segment in segments:
            original_speaker = segment.get('speaker', 'Unknown')
            segment['speaker'] = speaker_roles.get(original_speaker, original_speaker)

        return segments

    def merge_consecutive_segments(self, segments: List[Dict[str, Any]],
                                 max_gap: float = 2.0) -> List[Dict[str, Any]]:
        """Merge consecutive segments from the same speaker"""
        if not segments:
            return segments

        merged = []
        current_group = [segments[0]]

        for segment in segments[1:]:
            last_segment = current_group[-1]

            # Check if same speaker and close in time
            same_speaker = segment.get('speaker') == last_segment.get('speaker')
            time_gap = segment.get('start', 0) - last_segment.get('end', 0)
            close_in_time = time_gap <= max_gap

            if same_speaker and close_in_time:
                current_group.append(segment)
            else:
                # Merge current group
                if current_group:
                    merged_segment = self._merge_segment_group(current_group)
                    merged.append(merged_segment)
                current_group = [segment]

        # Don't forget the last group
        if current_group:
            merged_segment = self._merge_segment_group(current_group)
            merged.append(merged_segment)

        return merged

    def _merge_segment_group(self, segments: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Merge a group of segments into one"""
        if not segments:
            return {}

        if len(segments) == 1:
            return segments[0].copy()

        merged = segments[0].copy()
        texts = []

        for segment in segments:
            text = segment.get('text', '').strip()
            if text:
                texts.append(text)

        merged['text'] = ' '.join(texts)
        merged['start'] = segments[0].get('start', 0)
        merged['end'] = segments[-1].get('end', 0)

        return merged

    def process_audio_file(self, audio_file_path: Path) -> Dict[str, Any]:
        """Process a single audio file with improved pipeline"""
        logger.info(f"\n📁 Processing file: {audio_file_path.name}")

        # Prepare output paths
        clean_audio_path = self.output_dir / f"{audio_file_path.stem}_clean.wav"
        json_output_path = self.output_dir / f"{audio_file_path.stem}_transcription.json"

        # Step 1: Audio preprocessing
        if not self.smart_audio_preprocessing(str(audio_file_path), str(clean_audio_path)):
            logger.error("❌ Preprocessing failed, skipping file.")
            return {}

        try:
            # Step 2: Improved transcription
            whisper_result = self.detect_language_and_transcribe(str(clean_audio_path))

            # Step 3: Enhanced repetition removal
            cleaned_segments = self.enhanced_repetition_removal(whisper_result.get("segments", []))

            # Step 4: Text cleanup
            for segment in cleaned_segments:
                original_text = segment.get('text', '')
                cleaned_text = self.advanced_text_cleanup(original_text)
                segment['text'] = cleaned_text

            # Step 5: Speaker diarization
            diarization = self.improved_speaker_diarization(str(clean_audio_path))

            # Step 6: Intelligent speaker assignment
            segments_with_speakers = self.assign_speakers_intelligently(cleaned_segments, diarization)

            # Step 7: Merge consecutive segments
            final_segments = self.merge_consecutive_segments(segments_with_speakers)

            # Filter out very short segments
            final_segments = [seg for seg in final_segments
                            if len(seg.get('text', '').strip()) > 5]

            # Prepare output data
            output_data = {
                'metadata': {
                    'audio_file': str(audio_file_path.name),
                    'detected_language': whisper_result.get('detected_language', 'unknown'),
                    'used_translation': whisper_result.get('used_translation', False),
                    'total_duration': whisper_result.get('duration', 0),
                    'total_speakers': len(set(seg.get('speaker', 'Unknown') for seg in final_segments)),
                    'total_segments': len(final_segments),
                    'processing_successful': True,
                    'quality_score': self._calculate_quality_score(whisper_result)
                },
                'dialogue': final_segments,
                'raw_whisper_result': whisper_result
            }

            # Save results
            with open(json_output_path, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, indent=2, ensure_ascii=False)

            logger.info(f"✅ Successfully processed: {json_output_path.name}")
            logger.info(f"📊 Final segments: {len(final_segments)}")

            return output_data

        except Exception as e:
            logger.error(f"❌ Processing failed for {audio_file_path.name}: {e}")
            return {}

In [7]:
def main():
    """Process every .wav file in training_data/ and write a JSONL manifest.

    Each file goes through ``ImprovedTranscriptionSystem.process_audio_file``.
    For every result that has dialogue, one JSON line is written to
    ``training_manifest.jsonl`` in the system's output directory, holding the
    cleaned audio path, the speaker-prefixed conversation text, the detected
    language and the task ("translate" if the translation was used, otherwise
    "transcribe").
    """
    # Initialize the system
    system = ImprovedTranscriptionSystem()

    # Collect the input files
    input_dir = Path("training_data")
    audio_files = list(input_dir.glob("*.wav"))
    if not audio_files:
        logger.error("❌ No .wav files found in 'training_data/' folder.")
        return

    logger.info(f"🚀 Found {len(audio_files)} files to process...")

    # Process each file, keeping only non-empty results
    all_results = []
    for audio_file in audio_files:
        outcome = system.process_audio_file(audio_file)
        if not outcome:
            continue
        all_results.append(outcome)

    # Generate training manifest
    manifest_path = system.output_dir / "training_manifest.jsonl"
    with open(manifest_path, 'w', encoding='utf-8') as manifest_file:
        for outcome in all_results:
            dialogue = outcome.get('dialogue')
            if not dialogue:
                continue

            metadata = outcome['metadata']

            # One "Speaker: text" line per dialogue segment
            conversation_lines = []
            for seg in dialogue:
                conversation_lines.append(f"{seg.get('speaker', 'Unknown')}: {seg.get('text', '')}")

            clean_audio_name = f"{Path(metadata['audio_file']).stem}_clean.wav"
            if metadata.get('used_translation', False):
                task = "translate"
            else:
                task = "transcribe"

            manifest_entry = {
                "audio_filepath": str(system.output_dir / clean_audio_name),
                "text": "\n".join(conversation_lines),
                "language": metadata.get('detected_language', 'en'),
                "task": task
            }
            manifest_file.write(json.dumps(manifest_entry, ensure_ascii=False) + "\n")

    logger.info(f"\n📄 Training manifest saved to: {manifest_path}")
    logger.info(f"✅ Successfully processed {len(all_results)} out of {len(audio_files)} files")

if __name__ == "__main__":
    main()

100%|█████████████████████████████████████| 2.88G/2.88G [01:02<00:00, 49.5MiB/s]
ERROR:__main__:❌ Advanced preprocessing failed: Command '['ffmpeg', '-i', 'training_data/call2.wav', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16000', '-af', 'highpass=f=300,lowpass=f=3400,loudnorm=I=-16:TP=-1.5:LRA=11,afftdn=nr=20:nf=-25,compand=0.3,1:6:-70/-60,-20/-20,0/-6:0.5:0.1', '-y', 'processed_output/call2_clean.wav']' returned non-zero exit status 1.
100%|██████████| 19076/19076 [03:01<00:00, 105.14frames/s]
100%|██████████| 19076/19076 [01:13<00:00, 260.71frames/s]


config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)
ERROR:__main__:❌ Advanced preprocessing failed: Command '['ffmpeg', '-i', 'training_data/call3.wav', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16000', '-af', 'highpass=f=300,lowpass=f=3400,loudnorm=I=-16:TP=-1.5:LRA=11,afftdn=nr=20:nf=-25,compand=0.3,1:6:-70/-60,-20/-20,0/-6:0.5:0.1', '-y', 'processed_output/call3_clean.wav']' returned non-zero exit status 1.
100%|██████████| 14072/14072 [00:58<00:00, 238.77frames/s]
ERROR:__main__:❌ Advanced preprocessing failed: Command '['ffmpeg', '-i', 'training_data/call5.wav', '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16000', '-af', 'highpass=f=300,lowpass=f=3400,loudnorm=I=-16:TP=-1.5:LRA=11,afftdn=nr=20:nf=-25,compand=0.3,1:6:-70/-60,-20/-20,0/-6:0.5:0.1', '-y', 'processed_

In [8]:
import shutil

# Zip the processed_output folder into processed_output-claude.zip
shutil.make_archive(base_name='processed_output-claude', format='zip', root_dir='processed_output')

'/content/processed_output-claude.zip'

In [9]:
from google.colab import files

# Download processed_output-claude.zip (the archive of the processed_output
# folder built in the previous cell) through the google.colab files helper.
files.download('processed_output-claude.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>