In [27]:
import sys
!{sys.executable} -m pip install numpy pandas matplotlib librosa soundfile ipython python-dotenv openai google-generativeai scikit-learn scipy noisereduce pydub SpeechRecognition langid
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import soundfile as sf
import json
from IPython.display import Audio, display
from dotenv import load_dotenv
import openai
import google.generativeai as genai
from sklearn.cluster import KMeans
from scipy.signal import medfilt
import glob
import time
import traceback 
import noisereduce as nr
from pydub import AudioSegment
import speech_recognition as speech_rec  # Renamed to avoid conflict
import langid

load_dotenv()

# Configure API keys
openai.api_key = os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Constants
TARGET_SAMPLE_RATE = 16000
OUTPUT_DIR = 'processed_audio'

# region AudioProcessor with Noise Reduction
class AudioProcessor:
    def __init__(self, target_sr=TARGET_SAMPLE_RATE, output_dir=OUTPUT_DIR):
        self.target_sr = target_sr
        self.output_dir = output_dir  # Will be updated based on input file
        
    def reduce_noise(self, y, sr):
        """Apply noise reduction to clean the audio"""
        print("Applying noise reduction...")
        # Estimate noise from a small portion of the audio (first 1 second)
        noise_sample = y[:min(sr, len(y))]
        # Apply noise reduction
        reduced_noise = nr.reduce_noise(
            y=y, 
            sr=sr,
            stationary=True,
            prop_decrease=0.75
        )
        print("Noise reduction complete")
        return reduced_noise
        
    def process_audio(self, file_path):
        file_path = file_path.strip('"')
        if not os.path.exists(file_path):
            print(f"Error: File {file_path} not found.")
            return None, None, None
        
        # Set output directory to same folder as input
        self.output_dir = os.path.dirname(file_path)
        os.makedirs(self.output_dir, exist_ok=True)
        
        filename = os.path.splitext(os.path.basename(file_path))[0]
        output_path = os.path.join(self.output_dir, f"{filename}_16khz.wav")

        print(f"Loading {file_path}...")
        try:
            y, sr = librosa.load(file_path, sr=None)
        except Exception as e:
            print(f"Librosa failed to load {file_path}: {e}")
            return None, None, None

        print(f"Original: Sample Rate = {sr} Hz, Duration = {librosa.get_duration(y=y, sr=sr):.2f} seconds")
        
        # Apply noise reduction
        y = self.reduce_noise(y, sr)
        
        if sr != self.target_sr:
            print(f"Resampling to {self.target_sr} Hz...")
            # Use faster resampling method for better performance
            y = librosa.resample(y, orig_sr=sr, target_sr=self.target_sr, res_type='kaiser_fast')

        sf.write(output_path, y, self.target_sr, subtype='PCM_16')
        print(f"Saved to {output_path}")
        return output_path, y, self.target_sr

    def display_audio_info(self, file_path, y, sr):
        if y is None or sr is None:
            print(f"Skipping visualization for {file_path} due to loading error.")
            return
        plt.figure(figsize=(12, 4))
        plt.plot(np.linspace(0, len(y)/sr, len(y)), y)
        plt.title(f'Waveform: {os.path.basename(file_path)}')
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        plt.tight_layout()
        plt.show()
        print("Audio playback:")
        display(Audio(y, rate=sr))
# endregion

# region FeatureExtractor
class FeatureExtractor:
    def __init__(self, sample_rate=TARGET_SAMPLE_RATE):
        self.sample_rate = sample_rate

    def extract_all_features(self, audio_path, y, sr):
        return {
            'opensmile': self.extract_custom_opensmile(y, sr),
            'praat': self.extract_praat_features(y, sr)
        }

    def extract_custom_opensmile(self, y, sr):
        print("Extracting custom OpenSMILE-style features...")
        # Use larger hop_length for faster processing
        hop_length = 512
        pitch, voiced_flag, _ = librosa.pyin(y, fmin=75, fmax=400, sr=sr, hop_length=hop_length, fill_na=0.0)
        pitch_valid = pitch[voiced_flag]
        jitter = 0.01
        if len(pitch_valid) > 1:
            pitch_diffs = np.abs(np.diff(pitch_valid))
            jitter = np.mean(pitch_diffs) / np.mean(pitch_valid) if np.mean(pitch_valid) > 0 else 0.01

        rms = librosa.feature.rms(y=y, hop_length=hop_length)[0]
        shimmer = 0.04
        if len(rms) > 1:
            rms_diffs = np.abs(np.diff(rms))
            shimmer = np.mean(rms_diffs) / np.mean(rms) if np.mean(rms) > 0 else 0.04

        features = {
            'loudness': np.mean(librosa.feature.rms(y=y, hop_length=hop_length)),
            'energy': np.sum(y**2) / len(y),
            'pitch_mean': np.mean(pitch_valid) if np.any(voiced_flag) else 0,
            'pitch_range': np.ptp(pitch_valid) if np.any(voiced_flag) else 0,
            'jitter': jitter,
            'shimmer': shimmer
        }
        return pd.DataFrame([features])

    def extract_praat_features(self, y, sr):
        print("Extracting Praat-like features...")
        # Use larger hop_length for faster processing
        hop_length = 512
        pitch, voiced_flag, voiced_probs = librosa.pyin(y, fmin=75, fmax=400, sr=sr, hop_length=hop_length, fill_na=0.0)
        S = np.abs(librosa.stft(y, hop_length=hop_length))
        spectral_flatness = librosa.feature.spectral_flatness(S=S)[0]
        hnr_estimate = 20 * (1 - np.mean(spectral_flatness))
        intensity = np.mean(librosa.feature.rms(y=y, hop_length=hop_length)[0]) * 100

        onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
        peaks = librosa.util.peak_pick(onset_env, pre_max=3, post_max=3, pre_avg=3, post_avg=3, delta=0.5, wait=0.5)

        duration = len(y) / sr
        speaking_rate = len(peaks) / duration if duration > 0 else 4.0
        speaking_rate = max(3.0, min(5.5, speaking_rate))

        features = {
            'F0_mean': np.mean(pitch[voiced_flag]) if np.any(voiced_flag) else 0,
            'F0_std': np.std(pitch[voiced_flag]) if np.any(voiced_flag) else 0,
            'HNR': hnr_estimate,
            'Intensity_mean': intensity,
            'Speaking_rate': speaking_rate
        }
        return pd.DataFrame([features])
# endregion

# region VoiceAnalyzer
class VoiceAnalyzer:
    def __init__(self):
        self.sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
        self.politeness_mapping = {0: "Impolite", 1: "Neutral", 2: "Polite"}
        self.empathy_mapping = {0: "Low Empathy", 1: "Moderate Empathy", 2: "High Empathy"}

    def extract_indicators(self, features):
        indicators = {}
        if 'opensmile' in features:
            osm = features['opensmile']
            indicators.update({k: osm[k].values[0] for k in osm.columns})
        if 'praat' in features:
            praat = features['praat']
            indicators.update({k.lower(): praat[k].values[0] for k in praat.columns})
        return indicators

    def analyze_voice(self, indicators):
        results = {}
        s_base, p_base, e_base = 5.0, 5.0, 5.0
        e = indicators['energy']
        pr = indicators['pitch_range']
        j = indicators['jitter']
        sr = indicators['speaking_rate']
        i = indicators['intensity_mean']
        f0 = indicators['f0_mean']
        h = indicators['hnr']
        sh = indicators['shimmer']

        sentiment_score = s_base + min(2.0, e*10) + min(2.0, pr/50) + max(0, 1.0 - j*50) + (
            2.0 if 4.0 <= sr <= 5.0 else 1.5 if 3.5 <= sr < 4.0 or 5.0 < sr <= 5.5 else 1.0)
        sentiment_score = max(1.0, min(10.0, sentiment_score))
        results['sentiment'] = {
            'score': round(sentiment_score, 1),
            'description': self._get_score_description(sentiment_score, "sentiment")
        }

        p_score = p_base + (
            2.0 if pr < 50 else 1.5 if pr < 80 else 1.0 if pr < 120 else 0) + (
            2.0 if i < 60 else 1.5 if i < 70 else 1.0 if i < 80 else 0) + (
            2.0 if 3.8 <= sr <= 4.5 else 1.5 if 3.5 <= sr < 3.8 or 4.5 < sr <= 5.0 else 0.5) + min(1.0, h / 20)
        p_score = max(1.0, min(10.0, p_score))
        results['politeness'] = {
            'score': round(p_score, 1),
            'description': self._get_score_description(p_score, "politeness")
        }

        e_score = e_base + (
            1.5 if 180 < f0 < 280 else 0) + (
            1.0 if 30 < pr < 90 and 180 < f0 < 280 else 0) + (
            1.5 if h > 15 else 1.0 if h > 10 else 0) + (
            1.5 if 3.8 <= sr <= 4.5 else 1.0 if 3.5 <= sr < 3.8 or 4.5 < sr <= 5.0 else 0) + (
            1.0 if 0.03 <= sh <= 0.06 else 0)
        e_score = max(1.0, min(10.0, e_score))
        results['empathy'] = {
            'score': round(e_score, 1),
            'description': self._get_score_description(e_score, "empathy")
        }

        return results

    def _get_score_description(self, score, attribute):
        bins = {
            "sentiment": ["Very negative", "Negative", "Slightly negative", "Neutral", "Slightly positive", "Positive", "Very positive"],
            "politeness": ["Impolite", "Direct/abrupt", "Somewhat direct", "Neutral", "Polite", "Very polite", "Extremely polite"],
            "empathy": ["Very distant/cold", "Distant", "Somewhat distant", "Moderately empathetic", "Empathetic", "Very empathetic", "Highly empathetic"]
        }
        idx = int(min(6, max(0, round(score // 1.5))))
        return bins.get(attribute, ["Unknown"])[idx]

    def analyze(self, features):
        indicators = self.extract_indicators(features)
        results = self.analyze_voice(indicators)
        return results, indicators
# endregion

# region CallAnalyzer with Enhanced Speaker Diarization
class CallAnalyzer:
    def __init__(self):
        self.recognizer = speech_rec.Recognizer()  # Fixed: Use renamed module
        
    def process_audio(self, audio_path):
        try:
            y, sample_rate = librosa.load(audio_path, sr=None)  # Renamed to avoid conflict
            return y, sample_rate
        except Exception as e:
            print(f"Audio processing error: {e}")
            return None, None

    def detect_language(self, audio_segment, sample_rate):
        """Detect language of an audio segment"""
        try:
            # Convert numpy array to bytes for speech recognition
            audio_bytes = (audio_segment * 32767).astype(np.int16).tobytes()
        
            # Create AudioData with the correct parameters
            sample_width = 2  # 16-bit audio = 2 bytes per sample
            audio_data = speech_rec.AudioData(audio_bytes, sample_rate, sample_width)  # Fixed: Use renamed module
        
            # Try to recognize text
            try:
                text = self.recognizer.recognize_google(audio_data)
                # Detect language from text
                lang, _ = langid.classify(text)
                return lang
            except Exception as e:
                print(f"Speech recognition error: {e}")
                return None
        except Exception as e:
            print(f"Language detection error: {e}")
            traceback.print_exc()
            return None

    def extract_segment_features(self, segment, sample_rate):
        """Extract features from a segment to help with speaker identification"""
        if len(segment) < sample_rate * 0.5:  # Skip segments shorter than 0.5 seconds
            return None
            
        features = {}
        
        # Extract pitch with larger hop_length for faster processing
        try:
            hop_length = 512
            pitch, voiced_flag, _ = librosa.pyin(
                segment, 
                fmin=75, 
                fmax=400, 
                sr=sample_rate, 
                hop_length=hop_length,
                fill_na=0.0
            )
            features['pitch_mean'] = np.mean(pitch[voiced_flag]) if np.any(voiced_flag) else 0
        except:
            features['pitch_mean'] = 0
            
        # Extract energy
        try:
            rms = librosa.feature.rms(y=segment, hop_length=hop_length)[0]
            features['energy_mean'] = np.mean(rms)
        except:
            features['energy_mean'] = 0
            
        # Extract spectral features
        try:
            mfcc = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=13, hop_length=hop_length)
            features['mfcc_mean'] = np.mean(mfcc, axis=1)
        except:
            features['mfcc_mean'] = np.zeros(13)
            
        return features

    def diarize_speakers(self, y, sample_rate):
        """
        Improved speaker diarization with better segmentation and speaker identification
        """
        print("Performing enhanced speaker diarization...")
        
        # Step 1: Extract more robust features for segmentation
        # Combine MFCCs with spectral contrast for better speaker differentiation
        hop_length = 512
        n_mfcc = 13
        mfcc = librosa.feature.mfcc(y=y, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length).T
        
        # Add spectral contrast features for better voice characteristic capture
        contrast = librosa.feature.spectral_contrast(y=y, sr=sample_rate, hop_length=hop_length).T
        
        # Combine features
        features = np.hstack([mfcc, contrast[:, :3]])  # Use first 3 contrast bands
        
        # Step 2: Use K-means for initial clustering with better initialization
        # Try multiple initializations to find the best clustering
        best_inertia = float('inf')
        best_labels = None
        
        for i in range(3):  # Try 3 different initializations
            kmeans = KMeans(n_clusters=2, random_state=i, n_init=2, max_iter=100).fit(features)
            if kmeans.inertia_ < best_inertia:
                best_inertia = kmeans.inertia_
                best_labels = kmeans.labels_
        
        # Apply median filter to smooth labels and remove noise
        labels = medfilt(best_labels, kernel_size=15)  # Increased kernel size for better smoothing
        
        # Step 3: Find segments with minimum duration
        segments = []
        current_label = labels[0]
        start = 0
        min_segment_frames = int(0.5 * sample_rate / hop_length)  # Minimum 0.5 seconds
        
        for i, label in enumerate(labels):
            if label != current_label:
                # Only keep segments longer than minimum duration
                if i - start >= min_segment_frames:
                    segments.append({
                        'start': start,
                        'end': i,
                        'label': current_label,
                        'start_time': start * hop_length / sample_rate,
                        'end_time': i * hop_length / sample_rate
                    })
                start = i
                current_label = label
                
        # Add the last segment
        if len(labels) - start >= min_segment_frames:
            segments.append({
                'start': start,
                'end': len(labels),
                'label': current_label,
                'start_time': start * hop_length / sample_rate,
                'end_time': len(labels) * hop_length / sample_rate
            })
        
        # Step 4: Extract audio for each segment and compute features
        for segment in segments:
            segment_start_sample = int(segment['start_time'] * sample_rate)
            segment_end_sample = int(segment['end_time'] * sample_rate)
            segment['audio'] = y[segment_start_sample:segment_end_sample]
            
            # Extract additional features for each segment
            segment['features'] = self.extract_segment_features(segment['audio'], sample_rate)
            
            # Only detect language for longer segments
            if len(segment['audio']) > sample_rate * 2:  # Only for segments > 2 seconds
                segment['language'] = self.detect_language(segment['audio'], sample_rate)
            else:
                segment['language'] = None
        
        # Step 5: Improved speaker identification
        # Analyze all segments to determine speaker characteristics
        speaker_0_segments = [s for s in segments if s['label'] == 0 and s['features'] is not None]
        speaker_1_segments = [s for s in segments if s['label'] == 1 and s['features'] is not None]
        
        # Skip analysis if we don't have enough segments for either speaker
        if len(speaker_0_segments) < 2 or len(speaker_1_segments) < 2:
            print("Not enough segments for reliable speaker identification. Using default assignment.")
            agent_label = 0  # Default: first speaker is agent
            customer_label = 1
        else:
            # Extract key features for each speaker
            speaker_0_pitch = np.mean([s['features']['pitch_mean'] for s in speaker_0_segments])
            speaker_1_pitch = np.mean([s['features']['pitch_mean'] for s in speaker_1_segments])
            
            speaker_0_energy = np.mean([s['features']['energy_mean'] for s in speaker_0_segments])
            speaker_1_energy = np.mean([s['features']['energy_mean'] for s in speaker_1_segments])
            
            # Calculate speaking time for each speaker
            speaker_0_duration = sum([s['end_time'] - s['start_time'] for s in speaker_0_segments])
            speaker_1_duration = sum([s['end_time'] - s['start_time'] for s in speaker_1_segments])
            
            # Check for language detection results
            speaker_0_languages = [s['language'] for s in speaker_0_segments if s.get('language')]
            speaker_1_languages = [s['language'] for s in speaker_1_segments if s.get('language')]
            
            # Determine which speaker is likely the agent based on multiple factors
            agent_score_0 = 0
            agent_score_1 = 0
            
            # 1. Agents typically have more consistent pitch (lower is better)
            pitch_std_0 = np.std([s['features']['pitch_mean'] for s in speaker_0_segments])
            pitch_std_1 = np.std([s['features']['pitch_mean'] for s in speaker_1_segments])
            
            if pitch_std_0 < pitch_std_1:
                agent_score_0 += 1
            else:
                agent_score_1 += 1
                
            # 2. Agents typically speak more in customer service calls
            if speaker_0_duration > speaker_1_duration:
                agent_score_0 += 1
            else:
                agent_score_1 += 1
                
            # 3. Agents typically have moderate energy levels (not too high, not too low)
            # Normalize energy to 0-1 range for comparison
            max_energy = max(speaker_0_energy, speaker_1_energy)
            if max_energy > 0:
                norm_energy_0 = speaker_0_energy / max_energy
                norm_energy_1 = speaker_1_energy / max_energy
                
                # Ideal agent energy is around 0.6-0.8 of max
                if abs(norm_energy_0 - 0.7) < abs(norm_energy_1 - 0.7):
                    agent_score_0 += 1
                else:
                    agent_score_1 += 1
            
            # 4. Check if non-English language is detected (customer more likely to speak non-English)
            if speaker_0_languages and speaker_1_languages:
                non_english_0 = sum(1 for lang in speaker_0_languages if lang != 'en')
                non_english_1 = sum(1 for lang in speaker_1_languages if lang != 'en')
                
                if non_english_0 > non_english_1:
                    # More non-English in speaker 0, so speaker 1 is more likely the agent
                    agent_score_1 += 2  # Give this a higher weight
                elif non_english_1 > non_english_0:
                    # More non-English in speaker 1, so speaker 0 is more likely the agent
                    agent_score_0 += 2  # Give this a higher weight
            
            # 5. Check who speaks first (agents typically speak first in calls)
            if segments and len(segments) > 0:
                first_speaker = segments[0]['label']
                if first_speaker == 0:
                    agent_score_0 += 1
                else:
                    agent_score_1 += 1
            
            # Determine agent based on scores
            if agent_score_0 >= agent_score_1:
                agent_label = 0
                customer_label = 1
            else:
                agent_label = 1
                customer_label = 0
                
            print(f"Speaker identification scores - Speaker 0: {agent_score_0}, Speaker 1: {agent_score_1}")
            print(f"Identified agent as Speaker {agent_label}, customer as Speaker {customer_label}")
        
        # Step 6: Group segments by speaker with improved accuracy
        agent_audio = []
        customer_audio = []
        
        for segment in segments:
            if segment['label'] == agent_label:
                agent_audio.append(segment['audio'])
            else:
                customer_audio.append(segment['audio'])
        
        # Combine segments for each speaker
        agent_audio = np.concatenate(agent_audio) if agent_audio else np.array([])
        customer_audio = np.concatenate(customer_audio) if customer_audio else np.array([])
        
        print(f"Diarization complete. Agent segments: {len(agent_audio)}, Customer segments: {len(customer_audio)}")
        return {
            'agent': agent_audio,
            'customer': customer_audio
        }
        
    def save_speaker_audio(self, speakers_dict, sample_rate, original_file_path):
        """Save separate audio files for each speaker"""
        try:
            # Create output directory based on original file name
            base_name = os.path.basename(original_file_path)
            file_name = os.path.splitext(base_name)[0]
            output_dir = os.path.dirname(original_file_path)
            
            # Save agent audio
            agent_path = os.path.join(output_dir, f"{file_name}_agent.wav")
            if len(speakers_dict['agent']) > 0:
                sf.write(agent_path, speakers_dict['agent'], sample_rate)
                print(f"Agent audio saved to: {agent_path}")
            else:
                print("No agent audio segments found")
                
            # Save customer audio
            customer_path = os.path.join(output_dir, f"{file_name}_customer.wav")
            if len(speakers_dict['customer']) > 0:
                sf.write(customer_path, speakers_dict['customer'], sample_rate)
                print(f"Customer audio saved to: {customer_path}")
            else:
                print("No customer audio segments found")
                
            return {
                'agent_path': agent_path if len(speakers_dict['agent']) > 0 else None,
                'customer_path': customer_path if len(speakers_dict['customer']) > 0 else None
            }
        except Exception as e:
            print(f"Error saving speaker audio: {e}")
            return None

    def extract_features(self, y, sample_rate):
        if y is None or len(y) == 0:
            return {k: 0 for k in ['pitch_mean', 'pitch_std', 'pitch_range', 'energy_mean', 'energy_std',
                                   'energy_range', 'tempo', 'pause_ratio', 'speech_rate',
                                   'spectral_centroid', 'spectral_rolloff']}

        # Use larger hop_length for faster processing
        hop_length = 512
        features = {}
        features['pitch'] = librosa.yin(y, fmin=50, fmax=500, hop_length=hop_length)
        features['pitch_mean'] = np.mean(features['pitch'])
        features['pitch_std'] = np.std(features['pitch'])
        features['pitch_range'] = np.max(features['pitch']) - np.min(features['pitch'])

        features['rms'] = librosa.feature.rms(y=y, hop_length=hop_length)[0]
        features['energy_mean'] = np.mean(features['rms'])
        features['energy_std'] = np.std(features['rms'])
        features['energy_range'] = np.max(features['rms']) - np.min(features['rms'])

        tempo, _ = librosa.beat.beat_track(y=y, sr=sample_rate, hop_length=hop_length)
        features['tempo'] = tempo

        non_silent_intervals = librosa.effects.split(y, top_db=20)
        features['pause_ratio'] = 1 - (sum(i[1]-i[0] for i in non_silent_intervals) / len(y))
        features['speech_rate'] = len(non_silent_intervals) / (len(y) / sample_rate)

        features['spectral_centroid'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sample_rate, hop_length=hop_length)[0])
        features['spectral_rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sample_rate, hop_length=hop_length)[0])

        return features

    def get_llm_analysis(self, agent_features, customer_features):
        llm_results = {}
        feature_description = f"""
        Agent audio features:
        - Pitch (mean): {agent_features['pitch_mean']:.2f} Hz
        - Pitch variation: {agent_features['pitch_std']:.2f} Hz
        - Energy level: {agent_features['energy_mean']:.4f}
        - Speech rate: {agent_features['speech_rate']:.2f} segments/sec
        - Pause ratio: {agent_features['pause_ratio']:.2f}
        - Voice quality (spectral centroid): {agent_features['spectral_centroid']:.2f}
        Customer audio features:
        - Pitch (mean): {customer_features['pitch_mean']:.2f} Hz
        - Pitch variation: {customer_features['pitch_std']:.2f} Hz
        - Energy level: {customer_features['energy_mean']:.4f}
        - Speech rate: {customer_features['speech_rate']:.2f} segments/sec
        - Pause ratio: {customer_features['pause_ratio']:.2f}
        - Voice quality (spectral centroid): {customer_features['spectral_centroid']:.2f}
"""
        prompt = f"{feature_description}\n\nBased on these audio features, please provide:\n1. Customer sentiment score (0-10)\n2. Agent politeness score (0-10)\n3. Agent empathy score (0-10)\nInclude a brief explanation for each score."
        # Try OpenAI analysis 
        try:
            if openai.api_key:
                response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "You are an expert in analyzing voice features to determine emotional states and communication quality."},
                        {"role": "user", "content": prompt}
                        ]
                        )
                llm_results['openai'] = response.choices[0].message.content
            else:
                llm_results['openai'] = "OpenAI API key not configured. Please set the OPENAI_API_KEY environment variable."
        except Exception as e:
            error_msg = str(e)
            llm_results['openai'] = f"Error with OpenAI analysis: {error_msg}"
            print(f"OpenAI API error: {error_msg}")
            traceback.print_exc()

    # Try Gemini analysis
        try:
            if os.getenv("GOOGLE_API_KEY"):
                model = genai.GenerativeModel('gemini-pro')
                response = model.generate_content(prompt)
                llm_results['gemini'] = response.text
            else:
                llm_results['gemini'] = "Google API key not configured. Please set the GOOGLE_API_KEY environment variable."
        except Exception as e:
            error_msg = str(e)
            llm_results['gemini'] = f"Error with Gemini analysis: {error_msg}"
            print(f"Gemini API error: {error_msg}")
            traceback.print_exc()

        return llm_results

    def load_scores_from_json(self, json_path):
        """Load scores that were calculated by the first cell"""
        try:
            with open(json_path, 'r') as f:
                scores = json.load(f)
            print(f"Loaded scores from {json_path}")
            return scores
        except Exception as e:
            print(f"Error loading scores from {json_path}: {e}")
            return None

    def analyze_call(self, audio_path, scores_json_path=None):
        y, sample_rate = self.process_audio(audio_path)
        if y is None:
            return {
                'file_name': os.path.basename(audio_path),
                'error': "Failed to process audio file"
            }

        speakers = self.diarize_speakers(y, sample_rate)
        
        # Save the separated audio files
        audio_paths = self.save_speaker_audio(speakers, sample_rate, audio_path)

        agent_features = self.extract_features(speakers['agent'], sample_rate)
        customer_features = self.extract_features(speakers['customer'], sample_rate)

        # First try to load scores from the first cell's JSON file
        scores = None
        if scores_json_path and os.path.exists(scores_json_path):
            scores = self.load_scores_from_json(scores_json_path)
        
        if scores:
            # Use the scores from the first cell if available
            customer_sentiment = scores.get("Customer Sentiment Score", 0)
            agent_politeness = scores.get("Agent Politeness Score", 0)
            agent_empathy = scores.get("Agent Empathy Score", 0)
        else:
            # As a fallback, calculate scores directly (original functionality)
            agent_scores = self.calculate_scores(agent_features, 'agent')
            customer_scores = self.calculate_scores(customer_features, 'customer')
            customer_sentiment = customer_scores.get('sentiment', 0)
            agent_politeness = agent_scores.get('politeness', 0)
            agent_empathy = agent_scores.get('empathy', 0)

        llm_analysis = self.get_llm_analysis(agent_features, customer_features)

        results = {
            'file_name': os.path.basename(audio_path),
            'customer_sentiment': customer_sentiment,
            'agent_politeness': agent_politeness,
            'agent_empathy': agent_empathy,
            'agent_features': agent_features,
            'customer_features': customer_features,
            'llm_analysis': llm_analysis,
            'audio_paths': audio_paths  # Include the paths to the saved audio files
        }

        self._visualize_results(results)
        return results

    def calculate_scores(self, features, speaker_type):
        """Original score calculation as fallback"""
        scores = {}

        if speaker_type == 'customer':
            scores['sentiment'] = np.clip(5 +
                                         features['pitch_range'] / 50 +
                                         features['energy_mean'] * 20 - 
                                         features['pause_ratio'] * 5, 0, 10)

        if speaker_type == 'agent':
            scores['politeness'] = np.clip(7 - 
                                          abs(features['tempo'] - 100) / 20 - 
                                          features['pitch_std'] / 10 + 
                                          features['pause_ratio'] * 5, 0, 10)

            scores['empathy'] = np.clip(5 + 
                                       features['pitch_range'] / 40 + 
                                       features['energy_range'] * 10, 0, 10)

        return scores

    def _visualize_results(self, results):
        try:
            plt.figure(figsize=(12, 10))

            customer_sentiment = results.get('customer_sentiment') or 0
            agent_politeness = results.get('agent_politeness') or 0
            agent_empathy = results.get('agent_empathy') or 0

            plt.subplot(221)
            metrics = ['Customer Sentiment', 'Agent Politeness', 'Agent Empathy']
            values = [float(customer_sentiment), float(agent_politeness), float(agent_empathy)]
            plt.bar(metrics, values, color=['blue', 'green', 'purple'])
            plt.ylim(0, 10)
            plt.title('Call Analysis Scores')

            plt.subplot(222)
            features = ['pitch_mean', 'energy_mean', 'speech_rate']
            agent_values = [float(results['agent_features'].get(f, 0)) for f in features]
            customer_values = [float(results['customer_features'].get(f, 0)) for f in features]
            x = np.arange(len(features))
            width = 0.35

            plt.bar(x - width / 2, agent_values, width, label='Agent')
            plt.bar(x + width / 2, customer_values, width, label='Customer')
            plt.xticks(x, [f.replace('_', ' ').title() for f in features])
            plt.legend()
            plt.title('Voice Feature Comparison')

            plt.subplot(212)
            plt.axis('off')
            llm_text = "LLM Analysis Summary:\n\n"
            for llm, analysis in results['llm_analysis'].items():
                llm_text += f"{llm.upper()}:\n{(analysis or 'No response')[:300]}...\n\n"
            plt.text(0, 0.5, llm_text, fontsize=9, wrap=True)

            plt.tight_layout()
            plt.savefig(os.path.join(os.path.dirname(results['audio_paths'].get('agent_path', '')), 
                                    f"{results['file_name']}_analysis.png"))
            plt.close()

            print(f"\n=== Analysis for {results['file_name']} ===")
            print(f"Customer Sentiment Score: {float(customer_sentiment):.2f}/10")
            print(f"Agent Politeness Score: {float(agent_politeness):.2f}/10")
            print(f"Agent Empathy Score: {float(agent_empathy):.2f}/10")
            
            # Print the paths to the saved audio files
            if results.get('audio_paths'):
                print(f"\nSeparated audio files:")
                if results['audio_paths'].get('agent_path'):
                    print(f"Agent audio: {results['audio_paths']['agent_path']}")
                if results['audio_paths'].get('customer_path'):
                    print(f"Customer audio: {results['audio_paths']['customer_path']}")

        except Exception as e:
            print(f"[Visualization Error] Skipping chart for {results.get('file_name')}: {str(e)}")

# Helper function to save features to JSON
def save_features_to_json(features, output_path='extracted_features.json'):
    serializable_features = {}
    for key, df in features.items():
        serializable_features[key] = df.to_dict(orient='records')[0]
    with open(output_path, 'w') as f:
        json.dump(serializable_features, f, indent=4)
    print(f"Features saved to {output_path}")

# Test the enhanced functionality
if __name__ == "__main__":
    # Example usage
    audio_processor = AudioProcessor()
    call_analyzer = CallAnalyzer()
    
    # Test with a sample file
    test_file = "/Users/vikaskhare/Documents/sample_call.wav"
    if os.path.exists(test_file):
        # Process audio with noise reduction
        processed_file, y, sample_rate = audio_processor.process_audio(test_file)
        
        # Perform enhanced speaker diarization
        if processed_file:
            result = call_analyzer.analyze_call(processed_file)
            print("Analysis complete!")
    else:
        print(f"Test file {test_file} not found. Please provide a valid audio file path.")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Loading /Users/vikaskhare/Documents/sample_call.wav...
Librosa failed to load /Users/vikaskhare/Documents/sample_call.wav: [Errno 21] Is a directory: '/Users/vikaskhare/Documents/sample_call.wav'


  y, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [28]:
import sys
!{sys.executable} -m pip install gradio
import sys
!{sys.executable} -m pip install resampy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m


In [None]:


import gradio as gr
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import json
import soundfile as sf
from dotenv import load_dotenv
import openai
import google.generativeai as genai
import traceback
import pandas as pd
from sklearn.cluster import KMeans
from scipy.signal import medfilt
import io
import base64
import noisereduce as nr
import speech_recognition as speech_rec  # Renamed to avoid conflict
import langid
import threading
from concurrent.futures import ThreadPoolExecutor

# Load environment variables
load_dotenv()

# Configure API keys
openai.api_key = os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Constants
TARGET_SAMPLE_RATE = 16000
OUTPUT_DIR = 'processed_audio'

# region AudioProcessor
class AudioProcessor:
    def __init__(self, target_sr=TARGET_SAMPLE_RATE, output_dir=OUTPUT_DIR):
        self.target_sr = target_sr
        self.output_dir = output_dir  # Will be updated based on input file
        
    def reduce_noise(self, y, sample_rate):
        """Apply noise reduction to clean the audio"""
        print("Applying noise reduction...")
        # Apply noise reduction with optimized parameters
        reduced_noise = nr.reduce_noise(
            y=y, 
            sr=sample_rate,
            stationary=True,
            prop_decrease=0.75
        )
        print("Noise reduction complete")
        return reduced_noise
        
    def process_audio(self, file_path):
        file_path = file_path.strip('"')
        if not os.path.exists(file_path):
            print(f"Error: File {file_path} not found.")
            return None, None, None, None
        
        # Set output directory to same folder as input
        self.output_dir = os.path.dirname(file_path)
        os.makedirs(self.output_dir, exist_ok=True)
        
        filename = os.path.splitext(os.path.basename(file_path))[0]
        output_path = os.path.join(self.output_dir, f"{filename}_16khz.wav")
        noise_reduced_path = os.path.join(self.output_dir, f"{filename}_noise_reduced.wav")

        print(f"Loading {file_path}...")
        try:
            # Load with a lower sample rate for faster processing if file is large
            y, sample_rate = librosa.load(file_path, sr=None)
            file_duration = librosa.get_duration(y=y, sr=sample_rate)
            
            # For very long files, consider downsampling during load
            if file_duration > 300:  # If longer than 5 minutes
                print(f"Long file detected ({file_duration:.2f} seconds), optimizing processing...")
                y, sample_rate = librosa.load(file_path, sr=22050)  # Lower sample rate for initial processing
        except Exception as e:
            print(f"Librosa failed to load {file_path}: {e}")
            return None, None, None, None

        print(f"Original: Sample Rate = {sample_rate} Hz, Duration = {librosa.get_duration(y=y, sr=sample_rate):.2f} seconds")
        
        # Apply noise reduction
        y_reduced = self.reduce_noise(y, sample_rate)
        
        # Save noise reduced audio
        sf.write(noise_reduced_path, y_reduced, sample_rate, subtype='PCM_16')
        print(f"Noise reduced audio saved to {noise_reduced_path}")
        
        if sample_rate != self.target_sr:
            print(f"Resampling to {self.target_sr} Hz...")
            # Use faster resampling method
            y_reduced = librosa.resample(y_reduced, orig_sr=sample_rate, target_sr=self.target_sr, res_type='fft')

        sf.write(output_path, y_reduced, self.target_sr, subtype='PCM_16')
        print(f"Processed audio saved to {output_path}")
        return output_path, y_reduced, self.target_sr, noise_reduced_path
# endregion

# region FeatureExtractor
class FeatureExtractor:
    def __init__(self, sample_rate=TARGET_SAMPLE_RATE):
        self.sample_rate = sample_rate

    def extract_all_features(self, audio_path, y, sample_rate):
        # Use a more efficient approach for feature extraction
        opensmile_features = self.extract_custom_opensmile(y, sample_rate)
        praat_features = self.extract_praat_features(y, sample_rate)
        
        return {
            'opensmile': opensmile_features,
            'praat': praat_features
        }

    def extract_custom_opensmile(self, y, sample_rate):
        print("Extracting custom OpenSMILE-style features...")
        # Optimize pitch extraction for performance
        hop_length = 512  # Larger hop length for faster processing
        pitch, voiced_flag, _ = librosa.pyin(
            y, 
            fmin=75, 
            fmax=400, 
            sr=sample_rate, 
            hop_length=hop_length,
            fill_na=0.0
        )
        
        pitch_valid = pitch[voiced_flag]
        jitter = 0.01
        if len(pitch_valid) > 1:
            pitch_diffs = np.abs(np.diff(pitch_valid))
            jitter = np.mean(pitch_diffs) / np.mean(pitch_valid) if np.mean(pitch_valid) > 0 else 0.01

        rms = librosa.feature.rms(y=y, hop_length=hop_length)[0]
        shimmer = 0.04
        if len(rms) > 1:
            rms_diffs = np.abs(np.diff(rms))
            shimmer = np.mean(rms_diffs) / np.mean(rms) if np.mean(rms) > 0 else 0.04

        features = {
            'loudness': np.mean(rms),
            'energy': np.sum(y**2) / len(y),
            'pitch_mean': np.mean(pitch_valid) if np.any(voiced_flag) else 0,
            'pitch_range': np.ptp(pitch_valid) if np.any(voiced_flag) else 0,
            'jitter': jitter,
            'shimmer': shimmer
        }
        return pd.DataFrame([features])

    def extract_praat_features(self, y, sample_rate):
        print("Extracting Praat-like features...")
        # Use larger hop length for faster processing
        hop_length = 512
        
        pitch, voiced_flag, _ = librosa.pyin(
            y, 
            fmin=75, 
            fmax=400, 
            sr=sample_rate, 
            hop_length=hop_length,
            fill_na=0.0
        )
        
        # Optimize STFT calculation
        n_fft = 2048  # Larger FFT size for better frequency resolution
        S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
        spectral_flatness = librosa.feature.spectral_flatness(S=S)[0]
        hnr_estimate = 20 * (1 - np.mean(spectral_flatness))
        intensity = np.mean(librosa.feature.rms(y=y, hop_length=hop_length)[0]) * 100

        onset_env = librosa.onset.onset_strength(y=y, sr=sample_rate, hop_length=hop_length)
        peaks = librosa.util.peak_pick(onset_env, pre_max=3, post_max=3, pre_avg=3, post_avg=3, delta=0.5, wait=10)

        duration = len(y) / sample_rate
        speaking_rate = len(peaks) / duration if duration > 0 else 4.0
        speaking_rate = max(3.0, min(5.5, speaking_rate))

        features = {
            'F0_mean': np.mean(pitch[voiced_flag]) if np.any(voiced_flag) else 0,
            'F0_std': np.std(pitch[voiced_flag]) if np.any(voiced_flag) else 0,
            'HNR': hnr_estimate,
            'Intensity_mean': intensity,
            'Speaking_rate': speaking_rate
        }
        return pd.DataFrame([features])
# endregion

# region VoiceAnalyzer
class VoiceAnalyzer:
    def __init__(self):
        self.sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
        self.politeness_mapping = {0: "Impolite", 1: "Neutral", 2: "Polite"}
        self.empathy_mapping = {0: "Low Empathy", 1: "Moderate Empathy", 2: "High Empathy"}

    def extract_indicators(self, features):
        indicators = {}
        if 'opensmile' in features:
            osm = features['opensmile']
            indicators.update({k: osm[k].values[0] for k in osm.columns})
        if 'praat' in features:
            praat = features['praat']
            indicators.update({k.lower(): praat[k].values[0] for k in praat.columns})
        return indicators

    def analyze_voice(self, indicators):
        results = {}
        s_base, p_base, e_base = 5.0, 5.0, 5.0
        e = indicators['energy']
        pr = indicators['pitch_range']
        j = indicators['jitter']
        sr = indicators['speaking_rate']
        i = indicators['intensity_mean']
        f0 = indicators['f0_mean']
        h = indicators['hnr']
        sh = indicators['shimmer']

        sentiment_score = s_base + min(2.0, e*10) + min(2.0, pr/50) + max(0, 1.0 - j*50) + (
            2.0 if 4.0 <= sr <= 5.0 else 1.5 if 3.5 <= sr < 4.0 or 5.0 < sr <= 5.5 else 1.0)
        sentiment_score = max(1.0, min(10.0, sentiment_score))
        results['sentiment'] = {
            'score': round(sentiment_score, 1),
            'description': self._get_score_description(sentiment_score, "sentiment")
        }

        p_score = p_base + (
            2.0 if pr < 50 else 1.5 if pr < 80 else 1.0 if pr < 120 else 0) + (
            2.0 if i < 60 else 1.5 if i < 70 else 1.0 if i < 80 else 0) + (
            2.0 if 3.8 <= sr <= 4.5 else 1.5 if 3.5 <= sr < 3.8 or 4.5 < sr <= 5.0 else 0.5) + min(1.0, h / 20)
        p_score = max(1.0, min(10.0, p_score))
        results['politeness'] = {
            'score': round(p_score, 1),
            'description': self._get_score_description(p_score, "politeness")
        }

        e_score = e_base + (
            1.5 if 180 < f0 < 280 else 0) + (
            1.0 if 30 < pr < 90 and 180 < f0 < 280 else 0) + (
            1.5 if h > 15 else 1.0 if h > 10 else 0) + (
            1.5 if 3.8 <= sr <= 4.5 else 1.0 if 3.5 <= sr < 3.8 or 4.5 < sr <= 5.0 else 0) + (
            1.0 if 0.03 <= sh <= 0.06 else 0)
        e_score = max(1.0, min(10.0, e_score))
        results['empathy'] = {
            'score': round(e_score, 1),
            'description': self._get_score_description(e_score, "empathy")
        }

        return results

    def _get_score_description(self, score, attribute):
        bins = {
            "sentiment": ["Very negative", "Negative", "Slightly negative", "Neutral", "Slightly positive", "Positive", "Very positive"],
            "politeness": ["Impolite", "Direct/abrupt", "Somewhat direct", "Neutral", "Polite", "Very polite", "Extremely polite"],
            "empathy": ["Very distant/cold", "Distant", "Somewhat distant", "Moderately empathetic", "Empathetic", "Very empathetic", "Highly empathetic"]
        }
        idx = int(min(6, max(0, round(score // 1.5))))
        return bins.get(attribute, ["Unknown"])[idx]

    def analyze(self, features):
        indicators = self.extract_indicators(features)
        results = self.analyze_voice(indicators)
        return results, indicators
# endregion

# region CallAnalyzer 
class CallAnalyzer:
    def __init__(self):
        self.recognizer = speech_rec.Recognizer()  # Fixed: Use renamed module
        
    def process_audio(self, audio_path):
        try:
            y, sample_rate = librosa.load(audio_path, sr=None)  # Renamed to avoid conflict
            return y, sample_rate
        except Exception as e:
            print(f"Audio processing error: {e}")
            return None, None
            
    def detect_language(self, audio_segment, sample_rate):
        """Detect language of an audio segment"""
        try:
            # Convert numpy array to bytes for speech recognition
            audio_bytes = (audio_segment * 32767).astype(np.int16).tobytes()
        
            # Create AudioData with the correct parameters
            sample_width = 2  # 16-bit audio = 2 bytes per sample
            audio_data = speech_rec.AudioData(audio_bytes, sample_rate, sample_width)  # Fixed: Use renamed module
        
            # Try to recognize text
            try:
                text = self.recognizer.recognize_google(audio_data)
                # Detect language from text
                lang, _ = langid.classify(text)
                return lang
            except Exception as e:
                print(f"Speech recognition error: {e}")
                return None
        except Exception as e:
            print(f"Language detection error: {e}")
            traceback.print_exc()
            return None

    def extract_segment_features(self, segment, sample_rate):
        """Extract features from a segment to help with speaker identification"""
        if len(segment) < sample_rate * 0.5:  # Skip segments shorter than 0.5 seconds
            return None
            
        features = {}
        
        # Extract pitch with larger hop_length for faster processing
        try:
            hop_length = 512
            pitch, voiced_flag, _ = librosa.pyin(
                segment, 
                fmin=75, 
                fmax=400, 
                sr=sample_rate, 
                hop_length=hop_length,
                fill_na=0.0
            )
            features['pitch_mean'] = np.mean(pitch[voiced_flag]) if np.any(voiced_flag) else 0
        except:
            features['pitch_mean'] = 0
            
        # Extract energy
        try:
            rms = librosa.feature.rms(y=segment, hop_length=hop_length)[0]
            features['energy_mean'] = np.mean(rms)
        except:
            features['energy_mean'] = 0
            
        # Extract spectral features
        try:
            mfcc = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=13, hop_length=hop_length)
            features['mfcc_mean'] = np.mean(mfcc, axis=1)
        except:
            features['mfcc_mean'] = np.zeros(13)
            
        return features

    def diarize_speakers(self, y, sample_rate):
        """
        Improved speaker diarization with better segmentation and speaker identification
        """
        print("Performing enhanced speaker diarization...")
        
        # Step 1: Extract more robust features for segmentation
        # Combine MFCCs with spectral contrast for better speaker differentiation
        hop_length = 512
        n_mfcc = 13
        mfcc = librosa.feature.mfcc(y=y, sr=sample_rate, n_mfcc=n_mfcc, hop_length=hop_length).T
        
        # Add spectral contrast features for better voice characteristic capture
        contrast = librosa.feature.spectral_contrast(y=y, sr=sample_rate, hop_length=hop_length).T
        
        # Combine features
        features = np.hstack([mfcc, contrast[:, :3]])  # Use first 3 contrast bands
        
        # Step 2: Use K-means for initial clustering with better initialization
        # Try multiple initializations to find the best clustering
        best_inertia = float('inf')
        best_labels = None
        
        for i in range(3):  # Try 3 different initializations
            kmeans = KMeans(n_clusters=2, random_state=i, n_init=2, max_iter=100).fit(features)
            if kmeans.inertia_ < best_inertia:
                best_inertia = kmeans.inertia_
                best_labels = kmeans.labels_
        
        # Apply median filter to smooth labels and remove noise
        labels = medfilt(best_labels, kernel_size=15)  # Increased kernel size for better smoothing
        
        # Step 3: Find segments with minimum duration
        segments = []
        current_label = labels[0]
        start = 0
        min_segment_frames = int(0.5 * sample_rate / hop_length)  # Minimum 0.5 seconds
        
        for i, label in enumerate(labels):
            if label != current_label:
                # Only keep segments longer than minimum duration
                if i - start >= min_segment_frames:
                    segments.append({
                        'start': start,
                        'end': i,
                        'label': current_label,
                        'start_time': start * hop_length / sample_rate,
                        'end_time': i * hop_length / sample_rate
                    })
                start = i
                current_label = label
                
        # Add the last segment
        if len(labels) - start >= min_segment_frames:
            segments.append({
                'start': start,
                'end': len(labels),
                'label': current_label,
                'start_time': start * hop_length / sample_rate,
                'end_time': len(labels) * hop_length / sample_rate
            })
        
        # Step 4: Extract audio for each segment and compute features
        for segment in segments:
            segment_start_sample = int(segment['start_time'] * sample_rate)
            segment_end_sample = int(segment['end_time'] * sample_rate)
            segment['audio'] = y[segment_start_sample:segment_end_sample]
            
            # Extract additional features for each segment
            segment['features'] = self.extract_segment_features(segment['audio'], sample_rate)
            
            # Only detect language for longer segments
            if len(segment['audio']) > sample_rate * 2:  # Only for segments > 2 seconds
                segment['language'] = self.detect_language(segment['audio'], sample_rate)
            else:
                segment['language'] = None
        
        # Step 5: Improved speaker identification
        # Analyze all segments to determine speaker characteristics
        speaker_0_segments = [s for s in segments if s['label'] == 0 and s['features'] is not None]
        speaker_1_segments = [s for s in segments if s['label'] == 1 and s['features'] is not None]
        
        # Skip analysis if we don't have enough segments for either speaker
        if len(speaker_0_segments) < 2 or len(speaker_1_segments) < 2:
            print("Not enough segments for reliable speaker identification. Using default assignment.")
            agent_label = 0  # Default: first speaker is agent
            customer_label = 1
        else:
            # Extract key features for each speaker
            speaker_0_pitch = np.mean([s['features']['pitch_mean'] for s in speaker_0_segments])
            speaker_1_pitch = np.mean([s['features']['pitch_mean'] for s in speaker_1_segments])
            
            speaker_0_energy = np.mean([s['features']['energy_mean'] for s in speaker_0_segments])
            speaker_1_energy = np.mean([s['features']['energy_mean'] for s in speaker_1_segments])
            
            # Calculate speaking time for each speaker
            speaker_0_duration = sum([s['end_time'] - s['start_time'] for s in speaker_0_segments])
            speaker_1_duration = sum([s['end_time'] - s['start_time'] for s in speaker_1_segments])
            
            # Check for language detection results
            speaker_0_languages = [s['language'] for s in speaker_0_segments if s.get('language')]
            speaker_1_languages = [s['language'] for s in speaker_1_segments if s.get('language')]
            
            # Determine which speaker is likely the agent based on multiple factors
            agent_score_0 = 0
            agent_score_1 = 0
            
            # 1. Agents typically have more consistent pitch (lower is better)
            pitch_std_0 = np.std([s['features']['pitch_mean'] for s in speaker_0_segments])
            pitch_std_1 = np.std([s['features']['pitch_mean'] for s in speaker_1_segments])
            
            if pitch_std_0 < pitch_std_1:
                agent_score_0 += 1
            else:
                agent_score_1 += 1
                
            # 2. Agents typically speak more in customer service calls
            if speaker_0_duration > speaker_1_duration:
                agent_score_0 += 1
            else:
                agent_score_1 += 1
                
            # 3. Agents typically have moderate energy levels (not too high, not too low)
            # Normalize energy to 0-1 range for comparison
            max_energy = max(speaker_0_energy, speaker_1_energy)
            if max_energy > 0:
                norm_energy_0 = speaker_0_energy / max_energy
                norm_energy_1 = speaker_1_energy / max_energy
                
                # Ideal agent energy is around 0.6-0.8 of max
                if abs(norm_energy_0 - 0.7) < abs(norm_energy_1 - 0.7):
                    agent_score_0 += 1
                else:
                    agent_score_1 += 1
            
            # 4. Check if non-English language is detected (customer more likely to speak non-English)
            if speaker_0_languages and speaker_1_languages:
                non_english_0 = sum(1 for lang in speaker_0_languages if lang != 'en')
                non_english_1 = sum(1 for lang in speaker_1_languages if lang != 'en')
                
                if non_english_0 > non_english_1:
                    # More non-English in speaker 0, so speaker 1 is more likely the agent
                    agent_score_1 += 2  # Give this a higher weight
                elif non_english_1 > non_english_0:
                    # More non-English in speaker 1, so speaker 0 is more likely the agent
                    agent_score_0 += 2  # Give this a higher weight
            
            # 5. Check who speaks first (agents typically speak first in calls)
            if segments and len(segments) > 0:
                first_speaker = segments[0]['label']
                if first_speaker == 0:
                    agent_score_0 += 1
                else:
                    agent_score_1 += 1
            
            # Determine agent based on scores
            if agent_score_0 >= agent_score_1:
                agent_label = 0
                customer_label = 1
            else:
                agent_label = 1
                customer_label = 0
                
            print(f"Speaker identification scores - Speaker 0: {agent_score_0}, Speaker 1: {agent_score_1}")
            print(f"Identified agent as Speaker {agent_label}, customer as Speaker {customer_label}")
        
        # Step 6: Group segments by speaker with improved accuracy
        agent_audio = []
        customer_audio = []
        
        for segment in segments:
            if segment['label'] == agent_label:
                agent_audio.append(segment['audio'])
            else:
                customer_audio.append(segment['audio'])
        
        # Combine segments for each speaker
        agent_audio = np.concatenate(agent_audio) if agent_audio else np.array([])
        customer_audio = np.concatenate(customer_audio) if customer_audio else np.array([])
        
        print(f"Diarization complete. Agent segments: {len(agent_audio)}, Customer segments: {len(customer_audio)}")
        return {
            'agent': agent_audio,
            'customer': customer_audio
        }
        
    def save_speaker_audio(self, speakers_dict, sample_rate, original_file_path):
        """Save separate audio files for each speaker"""
        try:
            # Create output directory based on original file name
            base_name = os.path.basename(original_file_path)
            file_name = os.path.splitext(base_name)[0]
            output_dir = os.path.dirname(original_file_path)
            
            # Save agent audio
            agent_path = os.path.join(output_dir, f"{file_name}_agent.wav")
            if len(speakers_dict['agent']) > 0:
                sf.write(agent_path, speakers_dict['agent'], sample_rate)
                print(f"Agent audio saved to: {agent_path}")
            else:
                print("No agent audio segments found")
                agent_path = None
                
            # Save customer audio
            customer_path = os.path.join(output_dir, f"{file_name}_customer.wav")
            if len(speakers_dict['customer']) > 0:
                sf.write(customer_path, speakers_dict['customer'], sample_rate)
                print(f"Customer audio saved to: {customer_path}")
            else:
                print("No customer audio segments found")
                customer_path = None
                
            return {
                'agent_path': agent_path,
                'customer_path': customer_path
            }
        except Exception as e:
            print(f"Error saving speaker audio: {e}")
            traceback.print_exc()
            return {'agent_path': None, 'customer_path': None}

    def extract_features(self, y, sample_rate):
        """Extract acoustic features from audio segments"""
        if y is None or len(y) == 0:
            return {k: 0 for k in ['pitch_mean', 'pitch_std', 'pitch_range', 'energy_mean', 'energy_std',
                                   'energy_range', 'tempo', 'pause_ratio', 'speech_rate',
                                   'spectral_centroid', 'spectral_rolloff']}

        # Use larger hop length for faster processing
        hop_length = 512
        features = {}
        
        # Extract pitch information using YIN algorithm
        features['pitch'] = librosa.yin(y, fmin=50, fmax=500, hop_length=hop_length)
        features['pitch_mean'] = np.mean(features['pitch'])
        features['pitch_std'] = np.std(features['pitch'])
        features['pitch_range'] = np.max(features['pitch']) - np.min(features['pitch'])

        # Extract energy/intensity information
        features['rms'] = librosa.feature.rms(y=y, hop_length=hop_length)[0]
        features['energy_mean'] = np.mean(features['rms'])
        features['energy_std'] = np.std(features['rms'])
        features['energy_range'] = np.max(features['rms']) - np.min(features['rms'])

        # Extract rhythm information
        tempo, _ = librosa.beat.beat_track(y=y, sr=sample_rate, hop_length=hop_length)
        features['tempo'] = tempo

        # Extract pause and speech rate information
        non_silent_intervals = librosa.effects.split(y, top_db=20)
        features['pause_ratio'] = 1 - (sum(i[1]-i[0] for i in non_silent_intervals) / len(y))
        features['speech_rate'] = len(non_silent_intervals) / (len(y) / sample_rate)

        # Extract spectral information
        features['spectral_centroid'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sample_rate, hop_length=hop_length)[0])
        features['spectral_rolloff'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sample_rate, hop_length=hop_length)[0])

        return features

    def get_llm_analysis(self, agent_features, customer_features):
        """Get analysis from LLM APIs"""
        llm_results = {}
        feature_description = f"""
        Agent audio features:
        - Pitch (mean): {agent_features['pitch_mean']:.2f} Hz
        - Pitch variation: {agent_features['pitch_std']:.2f} Hz
        - Energy level: {agent_features['energy_mean']:.4f}
        - Speech rate: {agent_features['speech_rate']:.2f} segments/sec
        - Pause ratio: {agent_features['pause_ratio']:.2f}
        - Voice quality (spectral centroid): {agent_features['spectral_centroid']:.2f}
        Customer audio features:
        - Pitch (mean): {customer_features['pitch_mean']:.2f} Hz
        - Pitch variation: {customer_features['pitch_std']:.2f} Hz
        - Energy level: {customer_features['energy_mean']:.4f}
        - Speech rate: {customer_features['speech_rate']:.2f} segments/sec
        - Pause ratio: {customer_features['pause_ratio']:.2f}
        - Voice quality (spectral centroid): {customer_features['spectral_centroid']:.2f}
"""
        prompt = f"{feature_description}\n\nBased on these audio features, please provide:\n1. Customer sentiment score (0-10)\n2. Agent politeness score (0-10)\n3. Agent empathy score (0-10)\nInclude a brief explanation for each score."
        
        # Try OpenAI analysis 
        try:
            if openai.api_key:
                response = openai.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {"role": "system", "content": "You are an expert in analyzing voice features to determine emotional states and communication quality."},
                        {"role": "user", "content": prompt}
                        ]
                        )
                llm_results['openai'] = response.choices[0].message.content
            else:
                llm_results['openai'] = "OpenAI API key not configured. Please set the OPENAI_API_KEY environment variable."
        except Exception as e:
            error_msg = str(e)
            llm_results['openai'] = f"Error with OpenAI analysis: {error_msg}"
            print(f"OpenAI API error: {error_msg}")
            traceback.print_exc()

        # Try Gemini analysis
        try:
            if os.getenv("GOOGLE_API_KEY"):
                model = genai.GenerativeModel('gemini-pro')
                response = model.generate_content(prompt)
                llm_results['gemini'] = response.text
            else:
                llm_results['gemini'] = "Google API key not configured. Please set the GOOGLE_API_KEY environment variable."
        except Exception as e:
            error_msg = str(e)
            llm_results['gemini'] = f"Error with Gemini analysis: {error_msg}"
            print(f"Gemini API error: {error_msg}")
            traceback.print_exc()

        return llm_results

    def analyze_call(self, audio_path, scores_json_path=None):
        """Main function to analyze a call recording"""
        y, sample_rate = self.process_audio(audio_path)
        if y is None:
            return {
                'file_name': os.path.basename(audio_path),
                'error': "Failed to process audio file"
            }

        # Perform speaker diarization
        print(f"Diarizing speakers for {os.path.basename(audio_path)}...")
        speakers = self.diarize_speakers(y, sample_rate)
        
        # Save the separated audio files
        audio_paths = self.save_speaker_audio(speakers, sample_rate, audio_path)
        
        # Extract features for each speaker
        print("Extracting features for each speaker...")
        agent_features = self.extract_features(speakers['agent'], sample_rate)
        customer_features = self.extract_features(speakers['customer'], sample_rate)

        # Try to load scores from JSON if available
        scores = None
        if scores_json_path and os.path.exists(scores_json_path):
            try:
                with open(scores_json_path, 'r') as f:
                    scores = json.load(f)
                print(f"Loaded scores from {scores_json_path}")
            except Exception as e:
                print(f"Error loading scores from {scores_json_path}: {e}")
        
        if scores:
            # Use the scores from the first cell if available
            customer_sentiment = scores.get("Customer Sentiment Score", 0)
            agent_politeness = scores.get("Agent Politeness Score", 0)
            agent_empathy = scores.get("Agent Empathy Score", 0)
        else:
            # As a fallback, calculate scores directly
            agent_scores = self.calculate_scores(agent_features, 'agent')
            customer_scores = self.calculate_scores(customer_features, 'customer')
            customer_sentiment = customer_scores.get('sentiment', 0)
            agent_politeness = agent_scores.get('politeness', 0)
            agent_empathy = agent_scores.get('empathy', 0)

        # Get analysis from LLMs
        print("Getting LLM analysis...")
        llm_analysis = self.get_llm_analysis(agent_features, customer_features)

        results = {
            'file_name': os.path.basename(audio_path),
            'customer_sentiment': customer_sentiment,
            'agent_politeness': agent_politeness,
            'agent_empathy': agent_empathy,
            'agent_features': agent_features,
            'customer_features': customer_features,
            'llm_analysis': llm_analysis,
            'audio_paths': audio_paths
        }

        return results

    def calculate_scores(self, features, speaker_type):
        """Original score calculation as fallback"""
        scores = {}

        if speaker_type == 'customer':
            scores['sentiment'] = np.clip(5 +
                                         features['pitch_range'] / 50 +
                                         features['energy_mean'] * 20 - 
                                         features['pause_ratio'] * 5, 0, 10)

        if speaker_type == 'agent':
            scores['politeness'] = np.clip(7 - 
                                          abs(features['tempo'] - 100) / 20 - 
                                          features['pitch_std'] / 10 + 
                                          features['pause_ratio'] * 5, 0, 10)

            scores['empathy'] = np.clip(5 + 
                                       features['pitch_range'] / 40 + 
                                       features['energy_range'] * 10, 0, 10)

        return scores
# endregion

def save_features_to_json(features, output_path='extracted_features.json'):
    """Save features to a JSON file"""
    serializable_features = {}
    for key, df in features.items():
        serializable_features[key] = df.to_dict(orient='records')[0]
    with open(output_path, 'w') as f:
        json.dump(serializable_features, f, indent=4)
    print(f"Features saved to {output_path}")

def format_features_for_display(features_dict):
    """Format features dictionary into a readable string"""
    result = []
    
    # Handle OpenSMILE features
    if 'opensmile' in features_dict:
        result.append("### OpenSMILE Features")
        for key, value in features_dict['opensmile'].iloc[0].items():
            result.append(f"- **{key}**: {value:.4f}")
    
    # Handle Praat features
    if 'praat' in features_dict:
        result.append("\n### Praat Features")
        for key, value in features_dict['praat'].iloc[0].items():
            result.append(f"- **{key}**: {value:.4f}")
    
    return "\n".join(result)

def format_call_features(features_dict):
    """Format call analysis features into a readable string"""
    if not features_dict:
        return "No features available"
        
    result = []
    # Skip 'pitch' and 'rms' which are arrays
    skip_keys = ['pitch', 'rms']
    
    for key, value in features_dict.items():
        if key not in skip_keys:
            if isinstance(value, (int, float)):
                result.append(f"- **{key.replace('_', ' ').title()}**: {value:.4f}")
            else:
                result.append(f"- **{key.replace('_', ' ').title()}**: {value}")
    
    return "\n".join(result)

def create_feature_table(features, call_results):
    """Create a DataFrame with all features for display"""
    # Combine all features into a single dictionary
    all_features = {}
    
    # Add OpenSMILE features
    if 'opensmile' in features:
        for col in features['opensmile'].columns:
            all_features[f"OpenSMILE: {col}"] = features['opensmile'][col].values[0]
    
    # Add Praat features
    if 'praat' in features:
        for col in features['praat'].columns:
            all_features[f"Praat: {col}"] = features['praat'][col].values[0]
    
    # Add agent features
    if 'agent_features' in call_results:
        for key, value in call_results['agent_features'].items():
            if key not in ['pitch', 'rms'] and isinstance(value, (int, float)):
                all_features[f"Agent: {key}"] = value
    
    # Add customer features
    if 'customer_features' in call_results:
        for key, value in call_results['customer_features'].items():
            if key not in ['pitch', 'rms'] and isinstance(value, (int, float)):
                all_features[f"Customer: {key}"] = value
    
    # Convert to DataFrame for display
    df = pd.DataFrame([all_features])
    return df

def create_analysis_visualization(output_dir, filename, voice_results, call_results):
    """Create a custom visualization and save it to a file"""
    try:
        # Create the output path
        output_path = os.path.join(output_dir, f"{filename}_analysis.png")

        # Create a new figure with a smaller DPI for faster rendering
        plt.figure(figsize=(12, 10), dpi=80)

        # Get scores
        customer_sentiment = voice_results['sentiment']['score']
        agent_politeness = voice_results['politeness']['score']
        agent_empathy = voice_results['empathy']['score']

        # Plot 1: Scores
        plt.subplot(221)
        metrics = ['Customer Sentiment', 'Agent Politeness', 'Agent Empathy']
        values = [float(customer_sentiment), float(agent_politeness), float(agent_empathy)]
        bars = plt.bar(metrics, values, color=['blue', 'green', 'purple'])
        plt.ylim(0, 10)
        plt.title('Call Analysis Scores')
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.1f}',
                     ha='center', va='bottom')

        # Plot 2: Voice Feature Comparison
        plt.subplot(222)
        if 'agent_features' in call_results and 'customer_features' in call_results:
            features_to_plot = ['pitch_mean', 'energy_mean', 'speech_rate']

            # Get values, handling missing features
            agent_values = [call_results['agent_features'].get(f, 0) for f in features_to_plot]
            customer_values = [call_results['customer_features'].get(f, 0) for f in features_to_plot]

            # Normalize for better visualization
            max_vals = [max(a, c) if max(a,c) > 0 else 1 for a, c in zip(agent_values, customer_values)]
            agent_norm = [a/m*5 for a, m in zip(agent_values, max_vals)]
            customer_norm = [c/m*5 for c, m in zip(customer_values, max_vals)]

            x = np.arange(len(features_to_plot))
            width = 0.35

            plt.bar(x - width/2, agent_norm, width, label='Agent')
            plt.bar(x + width/2, customer_norm, width, label='Customer')
            plt.xticks(x, [f.replace('_', ' ').title() for f in features_to_plot], rotation=45, ha="right")
            plt.ylabel('Normalized Value')
            plt.legend()
            plt.title('Normalized Voice Features')

        else:
            plt.text(0.5, 0.5, 'Feature data not available',horizontalalignment='center', verticalalignment='center')
            plt.title('Voice Feature Comparison')

        # Plot 3: Additional Features
        plt.subplot(223)
        if 'agent_features' in call_results:
            try:
                features_to_plot = ['pause_ratio', 'spectral_centroid', 'tempo']
                
                # Extract values safely, ensuring they are simple numbers
                values = []
                for f in features_to_plot:
                    val = call_results['agent_features'].get(f, 0)
                    # Convert any complex objects to float
                    if isinstance(val, (list, np.ndarray)):
                        val = float(val[0]) if len(val) > 0 else 0.0
                    values.append(float(val))
                
                # Normalize for better visualization
                max_val = max(values) if max(values) > 0 else 1.0
                norm_values = [float(v/max_val*5) for v in values]
                
                # Use integers for x-axis
                x_pos = np.arange(len(features_to_plot))
                
                # Create a simple bar chart with explicit numeric x-positions
                for i, (pos, val, name) in enumerate(zip(x_pos, norm_values, features_to_plot)):
                    plt.bar(pos, val, color='teal')
                    
                plt.xticks(x_pos, [f.replace('_', ' ').title() for f in features_to_plot], 
                           rotation=45, ha="right")
                plt.title('Agent Voice Characteristics (Normalized)')
                plt.ylabel('Normalized Value')
            except Exception as e:
                print(f"Error in Plot 3: {str(e)}")
                plt.text(0.5, 0.5, f'Error creating plot: {str(e)}',
                         horizontalalignment='center')
        else:
            plt.text(0.5, 0.5, 'Feature data not available',
                     horizontalalignment='center', verticalalignment='center')
            plt.title('Agent Voice Characteristics')

        # Plot 4: LLM Analysis Summary
        plt.subplot(212)
        plt.axis('off')

        llm_text = "LLM Analysis Summary:\n\n"
        if 'llm_analysis' in call_results:
            for llm, analysis in call_results['llm_analysis'].items():
                if analysis:
                    # Check if the analysis contains an error message
                    if analysis.startswith("Error with") or "API key not configured" in analysis:
                        llm_text += f"{llm.upper()}: {analysis}\n\n"
                    else:
                        # Limit the text length for display
                        summary = analysis[:300] + "..." if len(analysis) > 300 else analysis
                        llm_text += f"{llm.upper()}:\n{summary}\n\n"
                else:
                    llm_text += f"{llm.upper()}: No response\n\n"
        else:
            llm_text += "No LLM analysis available"

        plt.text(0.05, 0.95, llm_text, fontsize=9,
                 verticalalignment='top', wrap=True)

        plt.tight_layout()
        plt.savefig(output_path, dpi=80)  # Lower DPI for faster saving
        plt.close()

        return output_path

    except Exception as e:
        print(f"Error creating visualization: {str(e)}")
        traceback.print_exc()
        # Return a fallback path even if the visualization fails
        return os.path.join(output_dir, f"{filename}_analysis.png")

def create_waveform_comparison(original_audio, noise_reduced_audio, sample_rate, output_path):
    """Create a waveform comparison between original and noise-reduced audio"""
    try:
        # Use a smaller figure size and lower DPI for faster rendering
        plt.figure(figsize=(10, 6), dpi=80)
        
        # Plot original audio
        plt.subplot(211)
        plt.plot(np.linspace(0, len(original_audio)/sample_rate, len(original_audio)), original_audio)
        plt.title('Original Audio Waveform')
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        
        # Plot noise-reduced audio
        plt.subplot(212)
        plt.plot(np.linspace(0, len(noise_reduced_audio)/sample_rate, len(noise_reduced_audio)), noise_reduced_audio, color='green')
        plt.title('Noise-Reduced Audio Waveform')
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        
        plt.tight_layout()
        plt.savefig(output_path, dpi=80)  # Lower DPI for faster saving
        plt.close()
        
        return output_path
    except Exception as e:
        print(f"Error creating waveform comparison: {e}")
        return None

# Function to process audio in a separate thread
def process_audio_thread(audio_path, progress=None):
    """Process audio in a separate thread to avoid blocking the UI"""
    try:
        # Initialize components
        audio_processor = AudioProcessor()
        
        # Process the audio file
        print(f"Processing audio file: {audio_path}")
        if progress:
            progress(0.1, "Loading audio file...")
        processed_file, y_reduced, sample_rate, noise_reduced_path = audio_processor.process_audio(audio_path)
        
        if not processed_file:
            return None, None, None, None
        
        if progress:
            progress(0.3, "Audio processing complete")
        
        # Load original audio for comparison
        y_original, sr_original = librosa.load(audio_path, sr=None)
        
        # Create waveform comparison
        if progress:
            progress(0.4, "Creating waveform comparison...")
        waveform_comparison_path = os.path.join(os.path.dirname(processed_file), 
                                               f"{os.path.basename(audio_path)}_waveform_comparison.png")
        create_waveform_comparison(y_original, y_reduced, sample_rate, waveform_comparison_path)
        
        return processed_file, y_reduced, sample_rate, noise_reduced_path, waveform_comparison_path
    except Exception as e:
        traceback.print_exc()
        print(f"Error in audio processing thread: {e}")
        return None, None, None, None, None

def process_and_analyze(audio_path, progress=gr.Progress()):
    """Process and analyze an audio file given its path"""
    try:
        if not os.path.exists(audio_path):
            return "Error: Audio file not found", None, None, None, None, None, None
        
        # Process audio in a separate thread to avoid blocking the UI
        progress(0.05, "Starting audio processing...")
        processed_file, y_reduced, sample_rate, noise_reduced_path, waveform_comparison_path = process_audio_thread(
            audio_path, 
            lambda p, msg: progress(p, msg)
        )
        
        if not processed_file:
            return "Audio processing failed", None, None, None, None, None, None
        
        # Initialize components for analysis
        feature_extractor = FeatureExtractor()
        voice_analyzer = VoiceAnalyzer()
        call_analyzer = CallAnalyzer()
        
        # Extract features
        progress(0.5, "Extracting audio features...")
        features = feature_extractor.extract_all_features(processed_file, y_reduced, sample_rate)
        
        # Save features to JSON for reference
        features_json_path = f"{os.path.splitext(processed_file)[0]}_features.json"
        save_features_to_json(features, features_json_path)
        
        # Analyze voice
        progress(0.6, "Analyzing voice characteristics...")
        voice_results, indicators = voice_analyzer.analyze(features)
        
        # Save scores to JSON
        # Save scores to JSON
        scores_json_path = f"{os.path.splitext(processed_file)[0]}_scores.json"
        # FIX: Wrap values in float() to convert from numpy types
        score_output = {  
            "Customer Sentiment Score": float(voice_results['sentiment']['score']),
            "Agent Politeness Score": float(voice_results['politeness']['score']),
            "Agent Empathy Score": float(voice_results['empathy']['score'])
        }
        with open(scores_json_path, "w") as f:
            json.dump(score_output, f, indent=4)
        
        # Run call analyzer with speaker diarization
        progress(0.7, "Running call analysis with speaker diarization...")
        call_results = call_analyzer.analyze_call(processed_file, scores_json_path)
        
        # Get the paths to the separated audio files
        agent_audio_path = call_results.get('audio_paths', {}).get('agent_path', None)
        customer_audio_path = call_results.get('audio_paths', {}).get('customer_path', None)
        
        # Create visualization
        progress(0.9, "Creating visualization...")
        analysis_image_path = create_analysis_visualization(
            os.path.dirname(processed_file),
            os.path.basename(audio_path),
            voice_results,
            call_results
        )
        
        # Format all features for display
        formatted_features = format_features_for_display(features)
        
        # Format agent and customer features
        agent_features_formatted = format_call_features(call_results.get('agent_features', {}))
        customer_features_formatted = format_call_features(call_results.get('customer_features', {}))
        
        # Prepare results text with all features
        results_text = f"""
## Analysis Results for {os.path.basename(audio_path)}

**Duration:** {librosa.get_duration(y=y_reduced, sr=sample_rate):.2f} seconds

### Scores
- **Customer Sentiment:** {voice_results['sentiment']['score']:.2f}/10 - {voice_results['sentiment']['description']}
- **Agent Politeness:** {voice_results['politeness']['score']:.2f}/10 - {voice_results['politeness']['description']}
- **Agent Empathy:** {voice_results['empathy']['score']:.2f}/10 - {voice_results['empathy']['description']}

### Noise Reduction
Noise reduction has been applied to improve audio quality. The noise-reduced audio is available for playback.

### Agent Features
{agent_features_formatted}

### Customer Features
{customer_features_formatted}

## Extracted Audio Features
{formatted_features}
"""
        
        # Create a feature table for display
        feature_table = create_feature_table(features, call_results)
        
        progress(1.0, "Analysis complete!")
        return results_text, noise_reduced_path, agent_audio_path, customer_audio_path, analysis_image_path, waveform_comparison_path, feature_table
        
    except Exception as e:
        traceback.print_exc()
        error_msg = str(e)
        # Truncate error message to avoid content length issues
        if len(error_msg) > 500:
            error_msg = error_msg[:500] + "..."
        return f"Error: {error_msg}", None, None, None, None, None, None

# Create a Gradio interface with streamlined components
demo = gr.Interface(
    fn=process_and_analyze,
    inputs=gr.Audio(type="filepath", label="Upload Call Recording"),
    outputs=[
        gr.Markdown(label="Analysis Results"),
        gr.Audio(label="Noise-Reduced Audio"),
        gr.Audio(label="Agent Audio"),
        gr.Audio(label="Customer Audio"),
        gr.Image(label="Analysis Visualization"),
        gr.Image(label="Waveform Comparison (Original vs. Noise-Reduced)"),
        gr.Dataframe(label="All Extracted Features")
    ],
    title="Call Audio Analysis Tool",
    description="""
    Upload a call recording to analyze:
    1. Noise reduction to clean the audio
    2. Customer sentiment analysis
    3. Agent politeness and empathy evaluation
    4. Speaker diarization with language detection
    """,
    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="violet"),
    examples=[],  # You can add example audio files here if needed
    cache_examples=False
)

# Launch the app with optimized settings
if __name__ == "__main__":
   
    demo.launch(share=False)

  super().__init__(


* Running on local URL:  http://127.0.0.1:7867
* To create a public link, set `share=True` in `launch()`.


Processing audio file: /private/var/folders/g_/zzf2f_250pb_mndpb1ygp24w0000gn/T/gradio/87c2ba7481a671eb4878351c837819221dae7fd471ff8b03d39541c9ca66f1d8/4504.wav
Loading /private/var/folders/g_/zzf2f_250pb_mndpb1ygp24w0000gn/T/gradio/87c2ba7481a671eb4878351c837819221dae7fd471ff8b03d39541c9ca66f1d8/4504.wav...
Long file detected (470.76 seconds), optimizing processing...
Original: Sample Rate = 22050 Hz, Duration = 470.76 seconds
Applying noise reduction...
Noise reduction complete
Noise reduced audio saved to /private/var/folders/g_/zzf2f_250pb_mndpb1ygp24w0000gn/T/gradio/87c2ba7481a671eb4878351c837819221dae7fd471ff8b03d39541c9ca66f1d8/4504_noise_reduced.wav
Resampling to 16000 Hz...
Processed audio saved to /private/var/folders/g_/zzf2f_250pb_mndpb1ygp24w0000gn/T/gradio/87c2ba7481a671eb4878351c837819221dae7fd471ff8b03d39541c9ca66f1d8/4504_16khz.wav
Extracting custom OpenSMILE-style features...
Extracting Praat-like features...
Features saved to /private/var/folders/g_/zzf2f_250pb_mndpb