In [None]:
import librosa
import numpy as np
import scipy.stats as stats
import json


In [None]:

class VoiceAuthenticator:
    def __init__(self):
        # Thresholds derived from the research paper's observations
        # The paper notes AI has "narrower intra group variability" [cite: 130]
        # and "standardized volume".
        self.THRESHOLDS = {
            'pitch_variability_min': 20.0,    # Humans usually have > 20Hz std dev
            'silence_ratio_max': 0.15,        # AI often has cleaner, unnatural pacing
            'intensity_dynamic_range': 0.06   # Minimum RMS std dev for humans
        }

    def extract_features(self, audio_path):
        """
        Extracts the "Trinity" of features defined in the paper:
        Pitch (F0), Intensity (RMS), and Frequency.
        """
        try:
            # 1. Load Audio
            # y = audio time series, sr = sample rate
            y, sr = librosa.load(audio_path, sr=None)

            # 2. Extract PITCH (Fundamental Frequency - F0)
            # The paper states pitch is the "primary trait" for distinction [cite: 18]
            # using pYIN (Probabilistic YIN) which is robust for voice
            f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
            
            # Filter out NaNs (unvoiced parts)
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0:
                return None, "Audio too silent or no voice detected"

            # 3. Extract INTENSITY (Loudness)
            # Paper: AI voices "produce more louder outputs due to standardized volume" 
            rms = librosa.feature.rms(y=y)[0]
            
            # 4. Extract FREQUENCY characteristics
            # Paper: AI male voices show "consistent negative deltas" (compression) [cite: 206]
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]

            features = {
                'pitch_std': np.std(valid_f0),       # Measure of jitter/variation
                'pitch_range': np.ptp(valid_f0),     # Max - Min pitch
                'intensity_mean': np.mean(rms),      # Average loudness
                'intensity_std': np.std(rms),        # Dynamic range (Emotion)
                'freq_skew': stats.skew(spectral_centroid) # Spectral shape
            }
            
            return features, None

        except Exception as e:
            return None, str(e)

    def calculate_ai_score(self, features):
        """
        Generates a probability score (0.0 - 1.0) using continuous linear mapping.
        Instead of hard thresholds, it scales the score based on how 'perfect' or 'flat' the features are.
        """
        
        # Helper function for linear mapping (clamped between 0 and 1)
        # Low values = High AI Score (Inverse relationship)
        def get_linear_score(val, min_val, max_val):
            if val <= min_val: return 1.0  # Extremely low value -> Definitely AI
            if val >= max_val: return 0.0  # High value -> Definitely Human
            # Linear interpolation formula
            return 1.0 - ((val - min_val) / (max_val - min_val))

        # --- WEIGHTS (Importance of each feature) ---
        # Pitch is the strongest indicator (45%), followed by Intensity (35%), then Frequency Skew (20%)
        W_PITCH = 0.45
        W_INTENSITY = 0.35
        W_SKEW = 0.20

        explanations = []

        # 1. PITCH SCORE (Linear Scale: 10Hz to 45Hz)
        # If pitch std is < 10Hz, it's 100% AI. If > 45Hz, it's 0% AI.
        p_score = get_linear_score(features['pitch_std'], 10.0, 45.0)
        
        # 2. INTENSITY SCORE (Linear Scale: 0.02 to 0.12)
        # If intensity std is < 0.02, it's 100% AI. If > 0.12, it's 0% AI.
        # This catches the "0.072" hackathon case which falls roughly in the middle (~48% AI score).
        i_score = get_linear_score(features['intensity_std'], 0.02, 0.12)

        # 3. SKEW SCORE (Linear Scale: 0.1 to 1.5)
        # Closer to 0 means perfectly balanced (AI). 
        # We take absolute value because skew can be negative.
        s_score = get_linear_score(abs(features['freq_skew']), 0.1, 1.5)

        # --- FINAL CALCULATION ---
        final_score = (p_score * W_PITCH) + (i_score * W_INTENSITY) + (s_score * W_SKEW)

        # --- DYNAMIC EXPLANATIONS ---
        # Only add explanations for features that contributed significantly to the AI score
        if p_score > 0.5:
            explanations.append(f"Pitch is strictly constrained ({features['pitch_std']:.2f}Hz)")
        else:
            explanations.append(f"Pitch shows human variability ({features['pitch_std']:.2f}Hz)")

        if i_score > 0.5:
            explanations.append(f"Intensity is standardized (std: {features['intensity_std']:.3f})")
        
        if s_score > 0.6: # Skew is a weaker signal, so we only mention it if it's very strong
            explanations.append("Frequency distribution is statistically smoothed")

        return round(final_score, 3), "; ".join(explanations)

    def analyze(self, file_path, language="Unknown"):
        features, error = self.extract_features(file_path)
        
        if error:
            return {"status": "error", "message": error}

        ai_probability, explanation_text = self.calculate_ai_score(features)

        # Decision Threshold (0.5 is neutral, >0.6 is confident AI)
        classification = "AI_GENERATED" if ai_probability > 0.6 else "HUMAN"
        
        return {
            "status": "success",
            "language": language,
            "classification": classification,
            "confidenceScore": round(ai_probability, 3),
            "explanation": explanation_text,
            "debug_features": features  # Helpful for your own tuning
        }



In [None]:
import librosa
import numpy as np
import scipy.stats as stats
import json

class ResearchBackedAuthenticator:
    def __init__(self):
        # --- RESEARCH-DRIVEN THRESHOLDS ---
        
        # 1. PITCH (F0) VARIABILITY
        # Source:
        # Logic: "Reduced F0 variation" is the primary marker of synthesis.
        # - < 15Hz: Highly unnatural (Robotic/Standard TTS)
        # - > 45Hz: Highly dynamic (Natural/Expressive Human)
        self.PITCH_MIN_HZ = 15.0  
        self.PITCH_MAX_HZ = 45.0

        # 2. INTENSITY (AMPLITUDE) DYNAMICS
        # Source:
        # Logic: AI models use normalization (standardized volume), reducing dynamic range.
        # - < 0.04: Extremely consistent (Broadcast/AI)
        # - > 0.12: Natural variation (Breaths, trailing off)
        self.INTENSITY_MIN = 0.04
        self.INTENSITY_MAX = 0.12

        # 3. SPECTRAL SKEWNESS
        # Source:
        # Logic: AI audio has "lower spectral complexity" and is statistically smoother.
        # - 0.0: Perfectly symmetrical (Mathematical generation)
        # - > 1.0: Complex distribution (Organic/Physical source)
        self.SKEW_MIN = 0.1
        self.SKEW_MAX = 1.5

    def get_linear_score(self, val, min_val, max_val):
        """
        Calculates a 'Synthetic Probability' (0.0 to 1.0) for a single feature.
        - Value <= min_val: 1.0 (Definitely AI)
        - Value >= max_val: 0.0 (Definitely Human)
        - In-between: Linearly interpolated
        """
        if val <= min_val: return 1.0
        if val >= max_val: return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def extract_features(self, audio_path):
        try:
            y, sr = librosa.load(audio_path, sr=None)
            
            # Extract Pitch (F0) using pYIN (Robust against noise)
            f0, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return None, "No voice detected"

            # Extract Intensity (RMS)
            rms = librosa.feature.rms(y=y)[0]
            
            # Extract Frequency Skewness
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            return {
                'pitch_std': np.std(valid_f0),
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(spectral_centroid)
            }, None
        except Exception as e:
            return None, str(e)

    def calculate_score(self, features):
        """
        Computes the final 'AI Confidence Score' using weighted research parameters.
        """
        # Weights based on feature reliability
        # Pitch is the strongest biological marker.
        W_PITCH = 0.45      
        W_INTENSITY = 0.35  
        W_SKEW = 0.20       

        # Calculate individual feature scores
        p_score = self.get_linear_score(features['pitch_std'], self.PITCH_MIN_HZ, self.PITCH_MAX_HZ)
        i_score = self.get_linear_score(features['intensity_std'], self.INTENSITY_MIN, self.INTENSITY_MAX)
        s_score = self.get_linear_score(abs(features['freq_skew']), self.SKEW_MIN, self.SKEW_MAX)

        # Weighted Sum
        final_score = (p_score * W_PITCH) + (i_score * W_INTENSITY) + (s_score * W_SKEW)
        
        # Generate Explanations
        reasons = []
        if p_score > 0.6:
            reasons.append(f"Low pitch variation ({features['pitch_std']:.1f}Hz) indicates synthetic constraint")
        elif p_score < 0.4:
            reasons.append(f"High pitch dynamics ({features['pitch_std']:.1f}Hz) typical of human physiology")
            
        if i_score > 0.6:
            reasons.append("Unnaturally consistent volume (Standardized Intensity)")
            
        if s_score > 0.7:
             reasons.append("Frequency distribution is statistically too smooth")

        return round(final_score, 3), "; ".join(reasons)

    def analyze(self, file_path):
        feats, err = self.extract_features(file_path)
        if err: return {"status": "error", "message": err}
        
        score, explanation = self.calculate_score(feats)
        return {
            "status": "success",
            "classification": "AI_GENERATED" if score > 0.5 else "HUMAN",
            "confidenceScore": score,
            "explanation": explanation
        }

In [None]:
# --- USAGE EXAMPLE ---
# Create the analyzer
detector = ResearchBackedAuthenticator()

# Replace with your actual MP3 file path
# Note: In a real API, you would first decode the Base64 string to a temp file
sample_file = r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\narration_20251210_232337.mp3" 
sample_file = r"narration_20251210_232729.mp3"
sample_file=r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\sample voice 1.mp3" 
sample_file=r"voice_preview_faiq - standard, clear and neutral.mp3"
sample_file=r"voice_preview_tarini - soft, cheerful and expressive.mp3"

# Run analysis
result = detector.analyze(sample_file)

# Print nicely
print(result)

In [None]:
import librosa
import numpy as np
import scipy.stats as stats
import json

class GenderAdaptiveAuthenticator:
    def __init__(self):
        # --- RESEARCH-DRIVEN THRESHOLDS ---
        
        # GENDER SPLIT
        # Source:
        # Male F0 Range: 85-180 Hz | Female F0 Range: 165-255 Hz
        self.GENDER_FREQ_SPLIT = 165.0 

        # 1. PITCH VARIABILITY (Standard Deviation)
        # Source: - Female speech has naturally higher variability (SD ~28Hz) than Male (~21Hz)
        
        # MALE THRESHOLDS
        self.MALE_AI_MAX_STD = 15.0    # Below this is definitely AI (Robotic)
        self.MALE_HUMAN_MIN_STD = 35.0 # Above this is definitely Human
        
        # FEMALE THRESHOLDS (Shifted UP)
        # Your error happened because 36.8Hz is "Human" for a male, but "Robotic" for a female.
        self.FEMALE_AI_MAX_STD = 40.0   # Expanded range for Female AI
        self.FEMALE_HUMAN_MIN_STD = 60.0 # Females need much more variance to be "Human"

        # 2. INTENSITY & SKEW (Gender neutral)
        self.INTENSITY_MIN = 0.04
        self.INTENSITY_MAX = 0.12
        self.SKEW_MIN = 0.1
        self.SKEW_MAX = 1.5

    def get_linear_score(self, val, min_val, max_val):
        """
        Calculates Synthetic Probability (0.0 to 1.0)
        """
        if val <= min_val: return 1.0
        if val >= max_val: return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def extract_features(self, audio_path):
        try:
            y, sr = librosa.load(audio_path, sr=None)
            
            # Extract Pitch (F0)
            f0, _, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return None, "No voice detected"

            rms = librosa.feature.rms(y=y)[0]
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            return {
                'mean_f0': np.mean(valid_f0), # Used for Gender Detection
                'pitch_std': np.std(valid_f0),
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(spectral_centroid)
            }, None
        except Exception as e:
            return None, str(e)

    def calculate_score(self, features):
        explanations = []
        
        # --- STEP 1: DETECT GENDER ---
        if features['mean_f0'] > self.GENDER_FREQ_SPLIT:
            gender = "Female"
            # Use stricter Female Thresholds
            p_min = self.FEMALE_AI_MAX_STD
            p_max = self.FEMALE_HUMAN_MIN_STD
            explanations.append(f"Voice detected as Female (Mean F0: {features['mean_f0']:.0f}Hz)")
        else:
            gender = "Male"
            # Use Standard Male Thresholds
            p_min = self.MALE_AI_MAX_STD
            p_max = self.MALE_HUMAN_MIN_STD
            explanations.append(f"Voice detected as Male (Mean F0: {features['mean_f0']:.0f}Hz)")

        # --- STEP 2: CALCULATE SCORES ---
        
        # Pitch Score (Dynamic based on Gender)
        p_score = self.get_linear_score(features['pitch_std'], p_min, p_max)
        
        # Intensity & Skew (Global)
        i_score = self.get_linear_score(features['intensity_std'], self.INTENSITY_MIN, self.INTENSITY_MAX)
        s_score = self.get_linear_score(abs(features['freq_skew']), self.SKEW_MIN, self.SKEW_MAX)

        # Weighted Sum
        W_PITCH = 0.45
        W_INTENSITY = 0.35
        W_SKEW = 0.20
        
        final_score = (p_score * W_PITCH) + (i_score * W_INTENSITY) + (s_score * W_SKEW)

        # --- DYNAMIC EXPLANATION ---
        if p_score > 0.5:
            explanations.append(f"Pitch variance ({features['pitch_std']:.1f}Hz) is too low for a natural {gender} voice")
        else:
            explanations.append(f"Pitch variance ({features['pitch_std']:.1f}Hz) aligns with natural {gender} physiology")

        if i_score > 0.6:
            explanations.append("Intensity is standardized (AI artifact)")

        return round(final_score, 3), "; ".join(explanations)

    def analyze(self, file_path):
        feats, err = self.extract_features(file_path)
        if err: return {"status": "error", "message": err}
        
        score, explanation = self.calculate_score(feats)
        return {
            "status": "success",
            "classification": "AI_GENERATED" if score > 0.55 else "HUMAN",
            "confidenceScore": score,
            "explanation": explanation
        }

In [None]:
# --- USAGE EXAMPLE ---
# Create the analyzer
detector = GenderAdaptiveAuthenticator()

# Replace with your actual MP3 file path
# Note: In a real API, you would first decode the Base64 string to a temp file
sample_file = r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\narration_20251210_232337.mp3" 
# sample_file = r"narration_20251210_232729.mp3"
sample_file=r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\sample voice 1.mp3" 
# sample_file=r"voice_preview_faiq - standard, clear and neutral.mp3"
# sample_file=r"voice_preview_tarini - soft, cheerful and expressive.mp3"
sample_file=r"voice_preview_kanika - soft, smooth and muffled.mp3"

# Run analysis
result = detector.analyze(sample_file)

# Print nicely
print(result)

In [None]:
import librosa
import numpy as np
import scipy.stats as stats
import json

class RobustVoiceAuthenticator:
    def __init__(self):
        # --- RESEARCH-DRIVEN THRESHOLDS ---
        
        # 1. COEFFICIENT OF VARIATION (CV) - The "Gender-Agnostic" Metric
        # CV = Standard Deviation / Mean Pitch
        # Logic: Normalizes variation. 
        # - AI (Smoothed): CV < 0.18 (Variation is small relative to pitch)
        # - Human (Dynamic): CV > 0.25 (Variation is distinct relative to pitch)
        # This works for both 110Hz (Male) and 220Hz (Female) equally.
        self.CV_AI_THRESHOLD = 0.18
        self.CV_HUMAN_THRESHOLD = 0.25

        # 2. INTENSITY (AMPLITUDE) - The "Lie Detector" 
        # Paper: AI Intensity has ~0 correlation with humans. It is the strongest tell.
        self.INTENSITY_MIN_STD = 0.04  # Very Flat (AI)
        self.INTENSITY_MAX_STD = 0.11  # Very Dynamic (Human)

        # 3. SPECTRAL SKEW
        self.SKEW_AI_THRESHOLD = 0.5  # Too symmetrical

    def get_linear_score(self, val, min_val, max_val):
        """ Returns 1.0 for AI-like values (<= min), 0.0 for Human-like (>= max) """
        if val <= min_val: return 1.0
        if val >= max_val: return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def extract_features(self, audio_path):
        try:
            # Load audio (mono)
            y, sr = librosa.load(audio_path, sr=None)
            
            # Use 'sr' in pyin to help it guess range better, but keep bounds wide
            f0, _, _ = librosa.pyin(y, fmin=50, fmax=400, sr=sr)
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return None, "No voice detected"

            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            return {
                'pitch_cv': std_pitch / mean_pitch,  # The new metric
                'pitch_std': std_pitch,
                'mean_pitch': mean_pitch,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid)
            }, None
        except Exception as e:
            return None, str(e)

    def calculate_score(self, features):
        explanations = []
        
        # --- 1. INTENSITY SCORE (Primary Discriminator) ---
        # Weight increased because Paper  says Pitch can be faked, Intensity cannot.
        i_score = self.get_linear_score(features['intensity_std'], self.INTENSITY_MIN_STD, self.INTENSITY_MAX_STD)
        
        # --- 2. PITCH CV SCORE (Normalized Variability) ---
        # Uses relative variation (CV) to solve the 110Hz vs 220Hz issue.
        p_score = self.get_linear_score(features['pitch_cv'], self.CV_AI_THRESHOLD, self.CV_HUMAN_THRESHOLD)
        
        # --- 3. SKEW SCORE ---
        s_score = self.get_linear_score(abs(features['freq_skew']), 0.1, 1.0)

        # --- SCORING LOGIC ---
        # Standard Weighted Average
        # We give Intensity higher weight now.
        W_INTENSITY = 0.50  #  "Intensity... weak or even negative correlation"
        W_PITCH = 0.30      #  "High positive correlation" (Can be faked)
        W_SKEW = 0.20
        
        base_score = (i_score * W_INTENSITY) + (p_score * W_PITCH) + (s_score * W_SKEW)

        # --- THE "VETO" RULE ---
        # If Intensity is STRONGLY AI (Standardized), we force the score up.
        # Even if pitch variation is perfect (because of ElevenLabs), standard volume reveals it.
        if i_score > 0.8: 
            final_score = max(base_score, 0.75) # Force classification to AI
            explanations.append(f"‚ö†Ô∏è Intensity is highly standardized (Score {i_score:.2f}), overriding pitch metrics.")
        else:
            final_score = base_score

        # Explanations
        if i_score > 0.6:
            explanations.append(f"Volume is unnaturally consistent (std: {features['intensity_std']:.3f})")
        if p_score > 0.6:
            explanations.append(f"Pitch modulation is constrained (CV: {features['pitch_cv']:.2f})")
        elif p_score < 0.4:
            explanations.append(f"Pitch is dynamic (CV: {features['pitch_cv']:.2f})")

        return round(final_score, 3), "; ".join(explanations)

    def analyze(self, file_path):
        feats, err = self.extract_features(file_path)
        if err: return {"status": "error", "message": err}
        
        score, explanation = self.calculate_score(feats)
        
        # Threshold 0.55 allows a small margin of error
        return {
            "status": "success",
            "classification": "AI_GENERATED" if score > 0.55 else "HUMAN",
            "confidenceScore": score,
            "explanation": explanation,
            "debug": feats
        }

In [None]:
# --- USAGE EXAMPLE ---
# Create the analyzer
detector = RobustVoiceAuthenticator()

# Replace with your actual MP3 file path
# Note: In a real API, you would first decode the Base64 string to a temp file
sample_file = r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\narration_20251210_232337.mp3" 
sample_file = r"narration_20251210_232729.mp3"
sample_file=r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\sample voice 1.mp3" 
# sample_file=r"voice_preview_faiq - standard, clear and neutral.mp3"
# sample_file=r"voice_preview_tarini - soft, cheerful and expressive.mp3"
# sample_file=r"voice_preview_kanika - soft, smooth and muffled.mp3"

# Run analysis
result = detector.analyze(sample_file)

# Print nicely
print(result)

In [None]:
import librosa
import numpy as np
import scipy.stats as stats
import json

class FinalVoiceAuthenticator:
    def __init__(self):
        # --- TUNED PARAMETERS ---
        
        # 1. PITCH CV (Coefficient of Variation)
        # We raised the 'Human' bar slightly. 
        # Before: 0.25 was guaranteed Human. Now: 0.32 is guaranteed Human.
        # This makes 0.19 (your sample) look more "AI-like".
        self.CV_AI_THRESHOLD = 0.15      # Strictly Robotic
        self.CV_HUMAN_THRESHOLD = 0.32   # Natural Expressiveness

        # 2. INTENSITY (Loudness)
        # We shifted the window to catch the "0.072" case.
        # Anything below 0.05 is definite AI.
        # Anything above 0.15 is definite Human.
        self.INTENSITY_MIN_STD = 0.03
        self.INTENSITY_MAX_STD = 0.15 

        # 3. SPECTRAL SKEW
        self.SKEW_AI_THRESHOLD = 0.5

    def get_linear_score(self, val, min_val, max_val):
        """ Returns 1.0 for AI (<= min), 0.0 for Human (>= max) """
        if val <= min_val: return 1.0
        if val >= max_val: return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def extract_features(self, audio_path):
        try:
            y, sr = librosa.load(audio_path, sr=None)
            
            # Robust Pitch Tracking
            f0, _, _ = librosa.pyin(y, fmin=50, fmax=400, sr=sr)
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return None, "No voice detected"

            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            return {
                'pitch_cv': std_pitch / mean_pitch,
                'pitch_std': std_pitch,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid)
            }, None
        except Exception as e:
            return None, str(e)

    def calculate_score(self, features):
        explanations = []
        
        # --- CALCULATE RAW SCORES ---
        i_score = self.get_linear_score(features['intensity_std'], self.INTENSITY_MIN_STD, self.INTENSITY_MAX_STD)
        p_score = self.get_linear_score(features['pitch_cv'], self.CV_AI_THRESHOLD, self.CV_HUMAN_THRESHOLD)
        s_score = self.get_linear_score(abs(features['freq_skew']), 0.1, 1.0)

        # --- WEIGHTS ---
        # Pitch is slightly more reliable for high-quality clones, 
        # Intensity is the fallback for "perfect" clones.
        W_INTENSITY = 0.40
        W_PITCH = 0.40
        W_SKEW = 0.20
        
        base_score = (i_score * W_INTENSITY) + (p_score * W_PITCH) + (s_score * W_SKEW)

        # --- THE SYNERGY BONUS (The Fix for 0.499) ---
        # If BOTH Pitch and Intensity are "Suspicious" (> 0.4),
        # it is highly unlikely to be a Human. Humans usually trade off (Monotone but loud, or Quiet but expressive).
        # We add a 0.15 boost if both metrics are flagging.
        if i_score > 0.4 and p_score > 0.4:
            final_score = min(base_score + 0.15, 1.0)
            explanations.append("Combined lack of Pitch and Intensity dynamics suggests synthesis")
        else:
            final_score = base_score

        # --- EXPLANATIONS ---
        if final_score > 0.5:
            if i_score > 0.5: explanations.append(f"Intensity is standardized (std: {features['intensity_std']:.3f})")
            if p_score > 0.5: explanations.append(f"Pitch is constrained (CV: {features['pitch_cv']:.2f})")
        else:
             explanations.append("Voice exhibits natural variability in pitch and loudness")

        return round(final_score, 3), "; ".join(explanations)

    def analyze(self, file_path):
        feats, err = self.extract_features(file_path)
        if err: return {"status": "error", "message": err}
        
        score, explanation = self.calculate_score(feats)
        
        return {
            "status": "success",
            "classification": "AI_GENERATED" if score > 0.55 else "HUMAN",
            "confidenceScore": score,
            "explanation": explanation,
            "debug": feats
        }

In [None]:
# --- USAGE EXAMPLE ---
# Create the analyzer
detector = FinalVoiceAuthenticator()

# Replace with your actual MP3 file path
# Note: In a real API, you would first decode the Base64 string to a temp file
sample_file = r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\narration_20251210_232337.mp3" 
# sample_file = r"narration_20251210_232729.mp3"
sample_file=r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\sample voice 1.mp3" 
# sample_file=r"voice_preview_faiq - standard, clear and neutral.mp3"
# sample_file=r"voice_preview_tarini - soft, cheerful and expressive.mp3"
# sample_file=r"voice_preview_kanika - soft, smooth and muffled.mp3"
sample_file=r"medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3"
sample_file=r"medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3"


# Run analysis
result = detector.analyze(sample_file)

# Print nicely
print(result)

Training Models

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
import scipy.stats as stats
from glob import glob

In [None]:


# CONFIGURATION
CHUNK_DURATION = 10.0  # Seconds per sample
# We need to skip the first/last few seconds as they might be silence
OFFSET = 10.0         

# --- UPDATED FEATURE EXTRACTOR ---
def extract_features_from_chunk(y, sr):
    try:
        # 1. Standard Features (Keep these for Explanation)
        f0 = librosa.yin(y, fmin=50, fmax=400, sr=sr)
        valid_f0 = f0[~np.isnan(f0)]
        if len(valid_f0) == 0: return None
        
        rms = librosa.feature.rms(y=y)[0]
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        
        # 2. MFCCs (The Accuracy Boosters)
        # We take the mean of 13 MFCC coefficients
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_means = np.mean(mfccs, axis=1) # Results in an array of 13 numbers
        
        # Build the dictionary
        features = {
            'pitch_cv': np.std(valid_f0) / np.mean(valid_f0),
            'intensity_std': np.std(rms),
            'freq_skew': stats.skew(centroid),
        }
        
        # Add MFCCs as separate columns (mfcc_0 to mfcc_12)
        for i, val in enumerate(mfcc_means):
            features[f'mfcc_{i}'] = val
            
        return features
    except:
        return None


def process_dataset(base_folder):
    data_records = []
    
    # Define classes based on folder names
    classes = {'ai_voices': 1, 'human_voices': 0} # 1 = AI, 0 = Human
    
    for label_name, label_code in classes.items():
        folder_path = os.path.join(base_folder, label_name)
        audio_files = glob(folder_path + "/*.wav") # Finds all .wav files
        
        print(f"Processing {label_name}: Found {len(audio_files)} files...")
        
        for file in audio_files:
            try:
                # Load the FULL file
                y_full, sr = librosa.load(file, sr=None)
                total_duration = librosa.get_duration(y=y_full, sr=sr)
                
                # Slicing Logic
                # We start at OFFSET and take 5s chunks until the end
                num_chunks = int((total_duration - (OFFSET*2)) // CHUNK_DURATION)
                
                # Limit chunks per file to avoid dataset imbalance 
                # (e.g., max 50 chunks per file)
                num_chunks = min(num_chunks, 50) 
                
                for i in range(num_chunks):
                    start_sample = int((OFFSET + i * CHUNK_DURATION) * sr)
                    end_sample = int(start_sample + (CHUNK_DURATION * sr))
                    
                    y_chunk = y_full[start_sample:end_sample]
                    
                    # Extract features
                    feats = extract_features_from_chunk(y_chunk, sr)
                    if feats:
                        feats['label'] = label_code # Add the answer key
                        feats['source_file'] = os.path.basename(file)
                        data_records.append(feats)
                        
            except Exception as e:
                print(f"Error processing {file}: {str(e)}")
                
    return pd.DataFrame(data_records)


In [None]:
# --- EXECUTE ---
# 1. Update this path to where you extracted your dataset
dataset_path = "AI_voice_dataset/training" 

print("Starting Feature Extraction... This may take a few minutes.")
df = process_dataset(dataset_path)

# 2. Save to CSV so we don't have to wait again
df.to_csv("trained_voice_features.csv", index=False)
print(f"Done! Saved {len(df)} training samples to 'trained_voice_features.csv'.")
print(df.head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib # To save the model

# 1. Load Data
df = pd.read_csv("trained_voice_features.csv")

# Define Predictors: 3 Physics Features + 13 MFCCs
feature_cols = ['pitch_cv', 'intensity_std', 'freq_skew'] + [f'mfcc_{i}' for i in range(13)]

X = df[feature_cols] # Now X has 16 columns instead of 3
y = df['label']

# 3. Train/Test Split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Initialize and Train Random Forest
# n_estimators=100 means it builds 100 decision trees
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 5. Evaluate
y_pred = clf.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nDetailed Report:")
print(classification_report(y_test, y_pred, target_names=['HUMAN', 'AI']))

# 6. Save the trained brain to a file
joblib.dump(clf, "voice_auth_model.pkl")
print("Model saved as 'voice_auth_model.pkl'")

In [None]:
import librosa
import numpy as np
import scipy.stats as stats
import joblib

class VoiceAuthModel:
    def __init__(self):
        self.model = joblib.load("voice_auth_model.pkl")

    def analyze_with_ml(self, file_path):
        feats, err = self.extract_features(file_path)
        if err: return {"status": "error", "message": err}
        
        # 1. PREPARE INPUT VECTOR (16 Features)
        # Order: [pitch_cv, intensity_std, skew, mfcc_0 ... mfcc_12]
        input_vector = [
            feats['pitch_cv'], 
            feats['intensity_std'], 
            feats['freq_skew']
        ] + feats['mfccs'].tolist() # Add the list of 13 MFCCs
        
        # 2. PREDICT
        # Reshape to 2D array for sklearn: [[col1, col2...]]
        prediction = self.model.predict([input_vector])[0] 
        probability = self.model.predict_proba([input_vector])[0][1]

        # 3. GENERATE EXPLANATION (Using only the readable features)
        # We don't explain MFCCs to humans; we explain Pitch/Intensity.
        explanation_parts = []
        
        # Logic: If model says AI, find the "AI-like" physical traits to blame
        if prediction == 1:
            if feats['intensity_std'] < 0.05:
                explanation_parts.append(f"Standardized intensity (AI)({feats['intensity_std']:.3f})")
            if feats['pitch_cv'] < 0.20:
                explanation_parts.append(f"Robotic pitch variation (CV: {feats['pitch_cv']:.2f})")
            if not explanation_parts:
                explanation_parts.append("Synthetic timbre artifacts detected in MFCC analysis")
        
        # Logic: If model says Human, highlight the dynamic traits
        else:
            if feats['intensity_std'] > 0.10:
                explanation_parts.append("Natural dynamic intensity range")
            if feats['pitch_cv'] > 0.25:
                explanation_parts.append("Human-like pitch modulation")
            if not explanation_parts:
                explanation_parts.append("Acoustic profile matches natural human speech")

        return {
            "status": "success",
            "classification": "AI_GENERATED" if prediction == 1 else "HUMAN",
            "confidenceScore": round(probability, 3),
            "explanation": "; ".join(explanation_parts)
        }

    def extract_features(self, audio_path):
        try:
            y, sr = librosa.load(audio_path, sr=None)
            
            # Physics Features
            f0 = librosa.yin(y, fmin=50, fmax=400, sr=sr)
            valid_f0 = f0[~np.isnan(f0)]
            if len(valid_f0) == 0: return None, "No voice"
            
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            # MFCC Features (Mean of each band)
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            mfcc_means = np.mean(mfccs, axis=1)

            return {
                'pitch_cv': np.std(valid_f0) / np.mean(valid_f0),
                'pitch_std': np.std(valid_f0),
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid),
                'mfccs': mfcc_means # Store raw MFCCs for the model
            }, None
        except Exception as e:
            return None, str(e)

In [None]:
import librosa
import numpy as np
import scipy.stats as stats
import joblib  # Required for loading the model

class VoiceAuthModel:

    def __init__(self):
        # Load the trained Random Forest model
        self.model = joblib.load("voice_auth_model.pkl")

    def analyze_with_ml(self, file_path):
        # 1. Extract features
        feats, err = self.extract_features(file_path)
        if err:
            return {"status": "error", "message": err}
        
        # 2. Prepare data (Ensure exact column order as training!)
        input_data = [[feats['pitch_cv'], feats['intensity_std'], feats['freq_skew']]]
        
        # 3. Predict using ML
        prediction = self.model.predict(input_data)[0]  # 0 (Human) or 1 (AI)
        probability = self.model.predict_proba(input_data)[0][1]  # Probability of class 1 (AI)
        
        # 4. Generate Dynamic Explanation (The "Why")
        # Even though the ML decides the score, we explain it using the feature values.
        explanation_parts = []
        
        # Intensity Analysis
        if feats['intensity_std'] < 0.05:
            explanation_parts.append(f"Standardized intensity ({feats['intensity_std']:.3f}) typical of AI")
        elif feats['intensity_std'] > 0.10:
             explanation_parts.append("Dynamic intensity indicates human emotion")
             
        # Pitch Analysis
        if feats['pitch_cv'] < 0.18:
            explanation_parts.append(f"Pitch variation is robotic (CV: {feats['pitch_cv']:.2f})")
        elif feats['pitch_cv'] > 0.30:
            explanation_parts.append("Natural pitch modulation detected")
            
        # Fallback if no specific feature triggered
        if not explanation_parts:
            explanation_parts.append("Acoustic profile matches learned AI patterns" if prediction == 1 else "Acoustic profile consistent with human speech")

        return {
            "status": "success",
            "classification": "AI_GENERATED" if prediction == 1 else "HUMAN",
            "confidenceScore": round(probability, 3),
            "explanation": "; ".join(explanation_parts)
        }

    def extract_features(self, audio_path):
        try:
            # Load Audio
            y, sr = librosa.load(audio_path, sr=None)
            
            # Extract Pitch (Robust pYIN is better for single-file inference than yin)
            f0, _, _ = librosa.pyin(y, fmin=50, fmax=400, sr=sr)
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return None, "No voice detected"

            # Extract other features
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            return {
                'pitch_cv': std_pitch / mean_pitch,
                'pitch_std': std_pitch,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid)
            }, None
        except Exception as e:
            return None, str(e)

In [None]:
# --- USAGE EXAMPLE ---
# Create the analyzer
detector = VoiceAuthModel()

# Replace with your actual MP3 file path
# Note: In a real API, you would first decode the Base64 string to a temp file
sample_file = r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\narration_20251210_232337.mp3" 
# sample_file = r"narration_20251210_232729.mp3"
# sample_file=r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\sample voice 1.mp3" 
# sample_file=r"voice_preview_faiq - standard, clear and neutral.mp3"
# sample_file=r"voice_preview_tarini - soft, cheerful and expressive.mp3"
# sample_file=r"voice_preview_kanika - soft, smooth and muffled.mp3"

# Run analysis
result = detector.analyze_with_ml(sample_file)

# Print nicely
print(result)

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
import scipy.stats as stats
import joblib
from glob import glob
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- CONFIGURATION ---
TEST_FOLDER = "AI_voice_dataset/testing"  # Folder containing 'ai_voices' and 'human_voices'
MODEL_FILE = "voice_auth_model.pkl"
CHUNK_DURATION = 5.0
OFFSET = 2.0  # Skip start to avoid silence

# --- 1. FEATURE EXTRACTION (MUST MATCH TRAINING EXACTLY) ---
def extract_features_from_chunk(y, sr):
    try:
        # Physics Features
        f0 = librosa.yin(y, fmin=50, fmax=400, sr=sr)
        valid_f0 = f0[~np.isnan(f0)]
        if len(valid_f0) == 0: return None
        
        rms = librosa.feature.rms(y=y)[0]
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        
        # MFCC Features (13 coeffs)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_means = np.mean(mfccs, axis=1) # 13 values
        
        # Build Vector: [pitch_cv, intensity_std, freq_skew, mfcc_0 ... mfcc_12]
        # This ORDER is critical. It must match X_train columns.
        features = [
            np.std(valid_f0) / np.mean(valid_f0), # pitch_cv
            np.std(rms),                          # intensity_std
            stats.skew(centroid)                  # freq_skew
        ] + mfcc_means.tolist()                   # Add 13 MFCCs
        
        return features
    except:
        return None

# --- 2. BATCH TESTING FUNCTION ---
def evaluate_test_set(model_path, data_path):
    print(f"Loading model from {model_path}...")
    clf = joblib.load(model_path)
    
    X_test = []
    y_test = []
    file_names = []
    
    classes = {'ai_voices': 1, 'human_voices': 0}
    
    print("Processing Test Files...")
    
    for label_name, label_code in classes.items():
        folder = os.path.join(data_path, label_name)
        files = glob(folder + "/*.wav")
        
        for file in files:
            try:
                # Load file
                y_full, sr = librosa.load(file, sr=None)
                total_duration = librosa.get_duration(y=y_full, sr=sr)
                
                # We test on chunks just like we trained
                num_chunks = int((total_duration - (OFFSET*2)) // CHUNK_DURATION)
                num_chunks = min(num_chunks, 10) # Test max 10 chunks per file to save time
                
                for i in range(num_chunks):
                    start = int((OFFSET + i * CHUNK_DURATION) * sr)
                    end = int(start + (CHUNK_DURATION * sr))
                    y_chunk = y_full[start:end]
                    
                    feats = extract_features_from_chunk(y_chunk, sr)
                    
                    if feats:
                        X_test.append(feats)
                        y_test.append(label_code)
                        file_names.append(os.path.basename(file))
                        
            except Exception as e:
                print(f"Error reading {file}: {e}")

    # Convert to Numpy for Sklearn
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    
    print(f"\n--- RESULTS ({len(y_test)} samples) ---")
    
    # Predict
    y_pred = clf.predict(X_test)
    
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    print(f"FINAL ACCURACY: {acc * 100:.2f}%")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['HUMAN', 'AI']))

# --- 3. SINGLE FILE CHECKER (For your Hackathon Sample) ---
def test_single_file(file_path):
    clf = joblib.load(MODEL_FILE)
    y, sr = librosa.load(file_path, sr=None)
    
    # Grab a 5s chunk from the middle
    mid = len(y) // 2
    chunk_len = int(5.0 * sr)
    y_chunk = y[mid : mid + chunk_len]
    
    feats = extract_features_from_chunk(y_chunk, sr)
    if not feats:
        print("Could not extract features.")
        return

    # Reshape for model [[col1, col2...]]
    pred = clf.predict([feats])[0]
    prob = clf.predict_proba([feats])[0][1]
    
    print(f"\n--- SINGLE FILE ANALYSIS: {file_path} ---")
    print(f"Prediction: {'AI_GENERATED' if pred == 1 else 'HUMAN'}")
    print(f"Confidence: {prob:.4f}")

# --- EXECUTE ---
if __name__ == "__main__":
    # 1. Run the full evaluation on the folder
    evaluate_test_set(MODEL_FILE, TEST_FOLDER)
    
    # 2. (Optional) Test your specific Hackathon file


In [None]:
test_single_file("medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3")
test_single_file(r"D:\hackathons\GUVI_HCL\AI_Voice_Detector\voice_preview_tarini - soft, cheerful and expressive.mp3")
test_single_file(r"sample voice 1.mp3")

In [None]:
import torchaudio
import torch
import librosa
import numpy as np
import scipy.stats as stats
import torch.nn.functional as F
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

class HybridEnsembleDetector:
    def __init__(self, model_path="wav2vec2_finetuned_model"):
        # --- 1. SETUP WAV2VEC2 (The Deep Learning Brain) ---
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading Wav2Vec2 model from '{model_path}' on {self.device}...")
        
        try:
            self.dl_model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path, local_files_only=True)
            self.processor = Wav2Vec2Processor.from_pretrained(model_path, local_files_only=True)
            self.dl_model.to(self.device)
            self.dl_model.eval()
            self.dl_ready = True
            print("‚úÖ Wav2Vec2 Model Loaded.")
        except Exception as e:
            print(f"‚ö†Ô∏è Wav2Vec2 Load Failed: {e}. Running in Physics-Only mode.")
            self.dl_ready = False

        self.id2label = {
            0: "diffwave", 1: "melgan", 2: "parallel_wave_gan", 
            3: "Real", 4: "wavegrad", 5: "wavnet", 6: "wavernn"
        }

        # --- 2. SETUP PHYSICS PARAMETERS (The Logic Brain) ---
        # (Your Tuned Parameters)
        self.CV_AI_THRESHOLD = 0.15
        self.CV_HUMAN_THRESHOLD = 0.32
        self.INTENSITY_MIN_STD = 0.03
        self.INTENSITY_MAX_STD = 0.15 

    # ==========================================================
    # PART A: PHYSICS ENGINE (Your Code)
    # ==========================================================
    def get_linear_score(self, val, min_val, max_val):
        if val <= min_val: return 1.0
        if val >= max_val: return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def get_physics_score(self, audio_path):
        try:
            # Load Audio (Native SR)
            y, sr = librosa.load(audio_path, sr=None)
            
            # Robust Pitch Tracking
            f0, _, _ = librosa.pyin(y, fmin=50, fmax=400, sr=sr)
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return 0.0, "No voice detected", {}

            # Feature Extraction
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            feats = {
                'pitch_cv': std_pitch / mean_pitch,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid)
            }

            # Scoring Logic
            i_score = self.get_linear_score(feats['intensity_std'], self.INTENSITY_MIN_STD, self.INTENSITY_MAX_STD)
            p_score = self.get_linear_score(feats['pitch_cv'], self.CV_AI_THRESHOLD, self.CV_HUMAN_THRESHOLD)
            s_score = self.get_linear_score(abs(feats['freq_skew']), 0.1, 1.0)

            # Weights
            W_INTENSITY = 0.40
            W_PITCH = 0.40
            W_SKEW = 0.20
            
            base_score = (i_score * W_INTENSITY) + (p_score * W_PITCH) + (s_score * W_SKEW)

            # Synergy Bonus (The "ElevenLabs Trap")
            if i_score > 0.4 and p_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            return 0.0, f"Physics Error: {str(e)}", {}

    # ==========================================================
    # PART B: WAV2VEC2 ENGINE (Deep Learning)
    # ==========================================================
    # ==========================================================
    # PART B: WAV2VEC2 ENGINE (Updated to use Librosa)
    # ==========================================================
    def get_dl_score(self, audio_path):
        if not self.dl_ready: return 0.0, "Model not loaded"

        target_sr = 16000
        max_len = target_sr * 10 # 10 seconds

        try:
            # 1. Load with Librosa (Bypassing Torchaudio errors)
            # Librosa loads as (n_samples,), floating point -1 to 1
            waveform_np, sr = librosa.load(audio_path, sr=target_sr) # Force resample here

            # 2. Convert to Torch Tensor
            waveform = torch.tensor(waveform_np).unsqueeze(0) # Shape: (1, n_samples)

            # 3. Pad/Truncate
            if waveform.size(1) > max_len: 
                waveform = waveform[:, :max_len]
            elif waveform.size(1) < max_len:
                waveform = F.pad(waveform, (0, max_len - waveform.size(1)))

            # 4. Predict
            inputs = self.processor(waveform.squeeze().numpy(), sampling_rate=target_sr, return_tensors="pt", padding=True)
            input_values = inputs.input_values.to(self.device)

            with torch.no_grad():
                logits = self.dl_model(input_values).logits
                probs = F.softmax(logits, dim=-1)
                
            pred_idx = torch.argmax(probs, dim=-1).item()
            conf = probs[0, pred_idx].item()
            label_name = self.id2label.get(pred_idx, "Unknown")

            # 5. Calculate "AI Probability"
            if pred_idx == 3: # "Real" label
                ai_prob = 1.0 - conf
            else: # Any AI label
                ai_prob = conf

            return round(ai_prob, 3), label_name

        except Exception as e:
            return 0.0, f"DL Error: {str(e)}"

    # ==========================================================
    # PART C: THE ENSEMBLE ORCHESTRATOR
    # ==========================================================
    def analyze(self, file_path):
        # 1. Run Physics Brain
        phys_score, _, phys_feats = self.get_physics_score(file_path)
        
        # 2. Run Deep Learning Brain
        dl_score, dl_label = self.get_dl_score(file_path)

        # 3. The VETO Logic (Max Suspicion)
        # We trust whichever model is MORE suspicious of AI.
        final_score = max(phys_score, dl_score)
        
        # 4. Generate Dynamic Explanation
        explanations = []
        
        if final_score > 0.55:
            classification = "AI_GENERATED"
            
            # Did DL catch it?
            if dl_score > 0.55:
                if dl_label != "Real":
                    explanations.append(f"Deep Learning detected artifacts consistent with '{dl_label}' generator")
                else:
                    explanations.append("Deep Learning detected synthetic anomalies")

            # Did Physics catch it? (e.g., ElevenLabs)
            if phys_score > 0.55:
                p_cv = phys_feats.get('pitch_cv', 0)
                i_std = phys_feats.get('intensity_std', 0)
                
                if i_std < 0.05:
                    explanations.append(f"Intensity is unnaturally standardized (std: {i_std:.3f})")
                if p_cv < 0.20:
                    explanations.append(f"Pitch modulation is robotic (CV: {p_cv:.2f})")
                if not explanations: # Fallback
                     explanations.append("Acoustic parameters lack natural human variability")
        else:
            classification = "HUMAN"
            explanations.append("Voice exhibits natural acoustic variability and lacks synthetic artifacts")

        return {
            "status": "success",
            "classification": classification,
            "confidenceScore": final_score,
            "explanation": "; ".join(explanations),
            "debug": {
                "Physics_Score": phys_score,
                "DL_Score": dl_score,
                "DL_Label": dl_label
            }
        }



In [None]:
import torch
import librosa
import numpy as np
import scipy.stats as stats
import torch.nn.functional as F
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

class HybridEnsembleDetector:
    def __init__(self, model_path="wav2vec2_finetuned_model"):
        # --- 1. SETUP WAV2VEC2 (The Deep Learning Brain) ---
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Loading Wav2Vec2 model from '{model_path}' on {self.device}...")
        
        try:
            self.dl_model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path, local_files_only=True)
            self.processor = Wav2Vec2Processor.from_pretrained(model_path, local_files_only=True)
            self.dl_model.to(self.device)
            self.dl_model.eval()
            self.dl_ready = True
            print("‚úÖ Wav2Vec2 Model Loaded.")
        except Exception as e:
            print(f"‚ö†Ô∏è Wav2Vec2 Load Failed: {e}. Running in Physics-Only mode.")
            self.dl_ready = False

        self.id2label = {
            0: "diffwave", 1: "melgan", 2: "parallel_wave_gan", 
            3: "Real", 4: "wavegrad", 5: "wavnet", 6: "wavernn"
        }

        # --- 2. SETUP PHYSICS PARAMETERS (Tuned for Hackathon Sample) ---
        # Adjusted thresholds to catch the 0.485 edge case
        self.CV_AI_THRESHOLD = 0.20      # Raised from 0.15 to catch more "semi-robotic" voices
        self.CV_HUMAN_THRESHOLD = 0.32
        self.INTENSITY_MIN_STD = 0.05    # Raised from 0.03 to catch slightly better volume fakes
        self.INTENSITY_MAX_STD = 0.15 

    # ==========================================================
    # PART A: PHYSICS ENGINE
    # ==========================================================
    def get_linear_score(self, val, min_val, max_val):
        if val <= min_val: return 1.0
        if val >= max_val: return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def get_physics_score(self, audio_path):
        try:
            # Load Audio (Native SR)
            y, sr = librosa.load(audio_path, sr=None)
            
            # Robust Pitch Tracking
            f0, _, _ = librosa.pyin(y, fmin=50, fmax=400, sr=sr)
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return 0.0, "No voice detected", {}

            # Feature Extraction
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            feats = {
                'pitch_cv': std_pitch / mean_pitch,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid)
            }

            # Scoring Logic
            i_score = self.get_linear_score(feats['intensity_std'], self.INTENSITY_MIN_STD, self.INTENSITY_MAX_STD)
            p_score = self.get_linear_score(feats['pitch_cv'], self.CV_AI_THRESHOLD, self.CV_HUMAN_THRESHOLD)
            s_score = self.get_linear_score(abs(feats['freq_skew']), 0.1, 1.0)

            # Weights
            W_INTENSITY = 0.40
            W_PITCH = 0.40
            W_SKEW = 0.20
            
            base_score = (i_score * W_INTENSITY) + (p_score * W_PITCH) + (s_score * W_SKEW)

            # Synergy Bonus
            if i_score > 0.4 and p_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            return 0.0, f"Physics Error: {str(e)}", {}

    # ==========================================================
    # PART B: WAV2VEC2 ENGINE (Librosa Port of Repo Logic)
    # ==========================================================
    def get_dl_score(self, audio_path):
        if not self.dl_ready: return 0.0, "Model not loaded"

        target_sr = 16000
        max_len = target_sr * 10 # 10 seconds

        try:
            # 1. Load with Librosa (Bypassing Torchaudio errors)
            # IMPORTANT: mono=False so we can select the first channel like the Repo does
            waveform_np, sr = librosa.load(audio_path, sr=target_sr, mono=False) 

            # 2. Handle Channels (Repo Logic: take waveform[0] if stereo)
            # Librosa returns (n_channels, n_samples) if mono=False and stereo
            # Librosa returns (n_samples,) if mono file
            if waveform_np.ndim > 1:
                waveform_np = waveform_np[0] # Take first channel (preserves phase better than averaging)
            
            # 3. Convert to Tensor
            waveform = torch.tensor(waveform_np).unsqueeze(0) # Shape: (1, n_samples)

            # 4. Pad/Truncate
            if waveform.size(1) > max_len: 
                waveform = waveform[:, :max_len]
            elif waveform.size(1) < max_len:
                waveform = F.pad(waveform, (0, max_len - waveform.size(1)))

            # 5. Predict
            inputs = self.processor(
                waveform.squeeze().numpy(), 
                sampling_rate=target_sr, 
                return_tensors="pt", 
                padding=True
            )
            input_values = inputs.input_values.to(self.device)

            with torch.no_grad():
                logits = self.dl_model(input_values).logits
                probs = F.softmax(logits, dim=-1)
                
            pred_idx = torch.argmax(probs, dim=-1).item()
            conf = probs[0, pred_idx].item()
            label_name = self.id2label.get(pred_idx, "Unknown")

            # 6. Calculate "AI Probability"
            # Label 3 is "Real". If model says Real, AI Score is (1 - confidence).
            if pred_idx == 3: 
                ai_prob = 1.0 - conf
            else: # Any AI label
                ai_prob = conf

            return round(ai_prob, 3), label_name

        except Exception as e:
            return 0.0, f"DL Error: {str(e)}"

    # ==========================================================
    # PART C: THE ENSEMBLE ORCHESTRATOR
    # ==========================================================
    def analyze(self, file_path):
        # 1. Run Physics Brain
        phys_score, _, phys_feats = self.get_physics_score(file_path)
        
        # 2. Run Deep Learning Brain
        dl_score, dl_label = self.get_dl_score(file_path)

        # 3. The VETO Logic
        final_score = max(phys_score, dl_score)
        
        # 4. Generate Explanation
        explanations = []
        
        if final_score > 0.55:
            classification = "AI_GENERATED"
            
            # Did DL catch it?
            if dl_score > 0.55:
                if dl_label != "Real":
                    explanations.append(f"Deep Learning detected artifacts consistent with '{dl_label}' generator")
                else:
                    explanations.append("Deep Learning detected synthetic anomalies")

            # Did Physics catch it?
            if phys_score > 0.55:
                p_cv = phys_feats.get('pitch_cv', 0)
                i_std = phys_feats.get('intensity_std', 0)
                
                if i_std < 0.06: # Updated explanation logic
                    explanations.append(f"Intensity is unnaturally standardized (std: {i_std:.3f})")
                if p_cv < 0.22:
                    explanations.append(f"Pitch modulation is robotic (CV: {p_cv:.2f})")
                if not explanations:
                     explanations.append("Acoustic parameters lack natural human variability")
        else:
            classification = "HUMAN"
            explanations.append("Voice exhibits natural acoustic variability and lacks synthetic artifacts")

        return {
            "status": "success",
            "classification": classification,
            "confidenceScore": final_score,
            "explanation": "; ".join(explanations),
            "debug": {
                "Physics_Score": phys_score,
                "DL_Score": dl_score,
                "DL_Label": dl_label
            }
        }

In [None]:
# --- USAGE EXAMPLE ---
# 1. Ensure 'voiceguard_model' folder exists
detector = HybridEnsembleDetector()

# 2. Test
result = detector.analyze(r"medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3")
print(result)

In [None]:
# Install required packages (run once)
# !pip install transformers torch torchaudio pydub

import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from pydub import AudioSegment
from io import BytesIO
import numpy as np

# Configuration
model_path = "wav2vec2_finetuned_model"  # Update this path
target_sample_rate = 16000
max_length = target_sample_rate * 10  # 10 seconds

# Label mapping
id2label = {
    0: "diffwave",
    1: "melgan",
    2: "parallel_wave_gan",
    3: "Real",
    4: "wavegrad",
    5: "wavnet",
    6: "wavernn"
}

# Load model and processor
print("Loading model...")
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path, local_files_only=True)
processor = Wav2Vec2Processor.from_pretrained(model_path, local_files_only=True)
model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

def convert_mp3_to_wav(mp3_path):
    """Convert MP3 to WAV format in memory"""
    audio = AudioSegment.from_mp3(mp3_path)
    # Export to WAV in memory
    wav_io = BytesIO()
    audio.export(wav_io, format="wav")
    wav_io.seek(0)
    return wav_io

def predict_audio(audio_path):
    """Predict if audio is real or AI-generated"""
    try:
        # Convert MP3 to WAV if needed
        if audio_path.lower().endswith('.mp3'):
            print(f"Converting MP3 to WAV...")
            wav_io = convert_mp3_to_wav(audio_path)
            waveform, sample_rate = torchaudio.load(wav_io)
        else:
            waveform, sample_rate = torchaudio.load(audio_path)
        
        print(f"Original sample rate: {sample_rate} Hz")
        
        # Resample if needed
        if sample_rate != target_sample_rate:
            print(f"Resampling to {target_sample_rate} Hz...")
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate, 
                new_freq=target_sample_rate
            )
            waveform = resampler(waveform)
        
        # Truncate or pad to fixed length
        if waveform.size(1) > max_length:
            waveform = waveform[:, :max_length]
            print(f"Truncated to {max_length/target_sample_rate} seconds")
        elif waveform.size(1) < max_length:
            waveform = torch.nn.functional.pad(
                waveform, 
                (0, max_length - waveform.size(1))
            )
            print(f"Padded to {max_length/target_sample_rate} seconds")
        
        # Convert to mono if stereo
        if waveform.ndim > 1:
            waveform = waveform[0]
        
        # Process audio
        inputs = processor(
            waveform.squeeze().numpy(),
            sampling_rate=target_sample_rate,
            return_tensors="pt",
            padding=True
        )
        input_values = inputs["input_values"].to(device)
        
        # Inference
        with torch.no_grad():
            logits = model(input_values).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_label = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_label].item()
        
        # Get class name
        class_name = id2label.get(predicted_label, "Unknown Class")
        
        # Display results
        print("\n" + "="*50)
        print(f"Prediction: {class_name}")
        print(f"Confidence: {confidence*100:.2f}%")
        print(f"Label ID: {predicted_label}")
        print("="*50)
        
        # Show all probabilities
        print("\nAll class probabilities:")
        for label_id, prob in enumerate(probabilities[0].cpu().numpy()):
            print(f"  {id2label[label_id]}: {prob*100:.2f}%")
        
        return {
            "class_name": class_name,
            "confidence": confidence,
            "label": predicted_label,
            "all_probabilities": {id2label[i]: float(p) for i, p in enumerate(probabilities[0].cpu().numpy())}
        }
        
    except Exception as e:
        print(f"Error: {str(e)}")
        return None



In [4]:
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import librosa
import soundfile as sf
import numpy as np

# Configuration
model_path = r"wav2vec2_finetuned_model"  # Update this path
target_sample_rate = 16000
max_length = target_sample_rate * 10  # 10 seconds

# Label mapping
id2label = {
    0: "diffwave",
    1: "melgan",
    2: "parallel_wave_gan",
    3: "Real",
    4: "wavegrad",
    5: "wavnet",
    6: "wavernn"
}

# Load model and processor
print("Loading model...")
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path, local_files_only=True)
processor = Wav2Vec2Processor.from_pretrained(model_path, local_files_only=True)
model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

def predict_audio(audio_path):
    """Predict if audio is real or AI-generated"""
    try:
        print(f"\nLoading audio file: {audio_path}")
        
        # Load audio with librosa - directly at target sample rate
        # Set sr=target_sample_rate to resample during loading
        waveform_np, sample_rate = librosa.load(
            audio_path, 
            sr=target_sample_rate,  # Resample to 16kHz during load
            mono=True  # Convert to mono
        )
        
        print(f"Loaded at sample rate: {sample_rate} Hz")
        print(f"Audio duration: {len(waveform_np)/sample_rate:.2f} seconds")
        print(f"Waveform shape: {waveform_np.shape}")
        
        # Truncate or pad to fixed length (10 seconds)
        if len(waveform_np) > max_length:
            waveform_np = waveform_np[:max_length]
            print(f"Truncated to {max_length/target_sample_rate} seconds")
        elif len(waveform_np) < max_length:
            # Pad with zeros
            padding = max_length - len(waveform_np)
            waveform_np = np.pad(waveform_np, (0, padding), mode='constant')
            print(f"Padded to {max_length/target_sample_rate} seconds")
        
        print(f"Final waveform shape: {waveform_np.shape}")
        
        # Process audio with the processor
        print("Processing audio with Wav2Vec2Processor...")
        inputs = processor(
            waveform_np,
            sampling_rate=target_sample_rate,
            return_tensors="pt",
            padding=True
        )
        
        input_values = inputs["input_values"].to(device)
        print(f"Input values shape: {input_values.shape}")
        
        # Inference
        print("Running inference...")
        with torch.no_grad():
            logits = model(input_values).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_label = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_label].item()
        
        print(f"Logits shape: {logits.shape}")
        print(f"Logits values: {logits}")
        
        # Get class name
        class_name = id2label.get(predicted_label, "Unknown Class")
        
        # Display results
        print("\n" + "="*50)
        print(f"üéØ Prediction: {class_name}")
        print(f"üìä Confidence: {confidence*100:.2f}%")
        print(f"üè∑Ô∏è  Label ID: {predicted_label}")
        print("="*50)
        
        # Show all probabilities
        print("\nüìà All class probabilities:")
        for label_id, prob in enumerate(probabilities[0].cpu().numpy()):
            bar = "‚ñà" * int(prob * 50)
            print(f"  {id2label[label_id]:20s}: {prob*100:5.2f}% {bar}")
        
        return {
            "class_name": class_name,
            "confidence": confidence,
            "label": predicted_label,
            "all_probabilities": {id2label[i]: float(p) for i, p in enumerate(probabilities[0].cpu().numpy())}
        }
        
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None



Loading model...
Model loaded on cpu


In [10]:
import torch
import torchaudio
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import librosa
import soundfile as sf
import numpy as np
import os

# Configuration
model_path = r"wav2vec2_finetuned_model"  # Update this path
target_sample_rate = 16000
max_length = target_sample_rate * 10  # 10 seconds

# Label mapping
id2label = {
    0: "diffwave",
    1: "melgan",
    2: "parallel_wave_gan",
    3: "Real",
    4: "wavegrad",
    5: "wavnet",
    6: "wavernn"
}

# Load model and processor
print("Loading model...")
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path, local_files_only=True)
processor = Wav2Vec2Processor.from_pretrained(model_path, local_files_only=True)
model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

# Check model configuration
print("\nModel Configuration:")
print(f"Number of labels: {model.config.num_labels}")
print(f"Model type: {model.config.model_type}")

def predict_audio(audio_path, normalize_audio=True):
    """Predict if audio is real or AI-generated"""
    try:
        print(f"\n{'='*60}")
        print(f"Processing: {os.path.basename(audio_path)}")
        print('='*60)
        
        # Load audio with librosa
        waveform_np, sample_rate = librosa.load(
            audio_path, 
            sr=target_sample_rate,
            mono=True
        )
        
        print(f"Loaded at sample rate: {sample_rate} Hz")
        print(f"Audio duration: {len(waveform_np)/sample_rate:.2f} seconds")
        print(f"Audio min/max: {waveform_np.min():.4f} / {waveform_np.max():.4f}")
        print(f"Audio mean/std: {waveform_np.mean():.4f} / {waveform_np.std():.4f}")
        
        # Truncate or pad to fixed length (10 seconds)
        if len(waveform_np) > max_length:
            waveform_np = waveform_np[:max_length]
            print(f"‚úÇÔ∏è  Truncated to {max_length/target_sample_rate} seconds")
        elif len(waveform_np) < max_length:
            padding = max_length - len(waveform_np)
            waveform_np = np.pad(waveform_np, (0, padding), mode='constant')
            print(f"‚ûï Padded to {max_length/target_sample_rate} seconds")
        
        # Optional normalization (sometimes helps)
        if normalize_audio:
            waveform_np = waveform_np / (np.abs(waveform_np).max() + 1e-8)
            print(f"üîß Normalized audio")
        
        # Process audio with the processor
        inputs = processor(
            waveform_np,
            sampling_rate=target_sample_rate,
            return_tensors="pt",
            padding=True
        )
        
        input_values = inputs["input_values"].to(device)
        
        # Inference
        with torch.no_grad():
            logits = model(input_values).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_label = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_label].item()
        
        # Get class name
        class_name = id2label.get(predicted_label, "Unknown Class")
        
        # Display results
        print("\n" + "="*50)
        print(f"üéØ Prediction: {class_name}")
        print(f"üìä Confidence: {confidence*100:.2f}%")
        print(f"üè∑Ô∏è  Label ID: {predicted_label}")
        print("="*50)
        
        # Show all probabilities with better visualization
        print("\nüìà All class probabilities:")
        probs_list = []
        for label_id, prob in enumerate(probabilities[0].cpu().numpy()):
            bar_length = int(prob * 50)
            bar = "‚ñà" * bar_length
            label = id2label[label_id]
            print(f"  {label:20s}: {prob*100:6.2f}% {bar}")
            probs_list.append((label, prob))
        
        # Show raw logits for debugging
        print("\nüîç Raw logits (before softmax):")
        for label_id, logit in enumerate(logits[0].cpu().numpy()):
            print(f"  {id2label[label_id]:20s}: {logit:8.4f}")
        
        return {
            "filename": os.path.basename(audio_path),
            "class_name": class_name,
            "confidence": confidence,
            "label": predicted_label,
            "all_probabilities": {id2label[i]: float(p) for i, p in enumerate(probabilities[0].cpu().numpy())},
            "logits": {id2label[i]: float(l) for i, l in enumerate(logits[0].cpu().numpy())}
        }
        
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# Test single file
audio_file = r"clova.mp3"
result = predict_audio(audio_file)

# Test multiple files to see if model gives different predictions
print("\n\n" + "="*60)
print("BATCH TESTING - Testing multiple files")
print(result)

Loading model...
Model loaded on cpu

Model Configuration:
Number of labels: 7
Model type: wav2vec2

Processing: clova.mp3
Loaded at sample rate: 16000 Hz
Audio duration: 14.08 seconds
Audio min/max: -0.9114 / 0.6248
Audio mean/std: -0.0000 / 0.1290
‚úÇÔ∏è  Truncated to 10.0 seconds
üîß Normalized audio

üéØ Prediction: parallel_wave_gan
üìä Confidence: 99.99%
üè∑Ô∏è  Label ID: 2

üìà All class probabilities:
  diffwave            :   0.00% 
  melgan              :   0.00% 
  parallel_wave_gan   :  99.99% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  Real                :   0.00% 
  wavegrad            :   0.00% 
  wavnet              :   0.00% 
  wavernn             :   0.00% 

üîç Raw logits (before softmax):
  diffwave            :  -0.7177
  melgan              :  -2.9187
  parallel_wave_gan   :   9.7021
  Real                :  -0.3731
  wavegrad            :  -3.0424
  wa

In [12]:
import torch
import torchaudio
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import numpy as np
import os

# Configuration
model_path = r"Deepfake-audio-detection-V2"  # Update to your cloned path
target_sample_rate = 16000  # Most audio models use 16kHz

# Load model and feature extractor
print("Loading MelodyMachine Deepfake Detection model...")
try:
    model = AutoModelForAudioClassification.from_pretrained(model_path, local_files_only=True)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path, local_files_only=True)
    print("‚úÖ Model loaded successfully!")
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    print("\nTrying alternative loading methods...")
    # Alternative: try loading without local_files_only
    model = AutoModelForAudioClassification.from_pretrained(model_path)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)

model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

# Check model configuration
print("\nModel Configuration:")
print(f"Model type: {model.config.model_type if hasattr(model.config, 'model_type') else 'Unknown'}")
print(f"Number of labels: {model.config.num_labels if hasattr(model.config, 'num_labels') else 'Unknown'}")
if hasattr(model.config, 'id2label'):
    print(f"Labels: {model.config.id2label}")

def predict_deepfake(audio_path):
    """Predict if audio is deepfake/AI-generated or real"""
    try:
        print(f"\n{'='*60}")
        print(f"Processing: {os.path.basename(audio_path)}")
        print('='*60)
        
        # Load audio
        print("Loading audio file...")
        waveform, sample_rate = librosa.load(audio_path, sr=None, mono=True)
        
        print(f"Original sample rate: {sample_rate} Hz")
        print(f"Audio duration: {len(waveform)/sample_rate:.2f} seconds")
        
        # Resample if needed
        if sample_rate != target_sample_rate:
            print(f"Resampling to {target_sample_rate} Hz...")
            waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)
            sample_rate = target_sample_rate
        
        # Process audio with feature extractor
        print("Extracting features...")
        inputs = feature_extractor(
            waveform,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True
        )
        
        # Move inputs to device
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        print(f"Input shape: {inputs['input_values'].shape if 'input_values' in inputs else 'N/A'}")
        
        # Inference
        print("Running inference...")
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_class].item()
        
        # Get label names
        if hasattr(model.config, 'id2label'):
            id2label = model.config.id2label
            class_name = id2label.get(predicted_class, f"Class {predicted_class}")
        else:
            # Default labels if not in config
            id2label = {0: "Real/Bonafide", 1: "Fake/Deepfake"}
            class_name = id2label.get(predicted_class, f"Class {predicted_class}")
        
        # Display results
        print("\n" + "="*50)
        print(f"üéØ Prediction: {class_name}")
        print(f"üìä Confidence: {confidence*100:.2f}%")
        print(f"üè∑Ô∏è  Class ID: {predicted_class}")
        print("="*50)
        
        # Show all probabilities
        print("\nüìà All class probabilities:")
        for class_id in range(probabilities.shape[1]):
            prob = probabilities[0, class_id].item()
            label = id2label.get(class_id, f"Class {class_id}")
            bar = "‚ñà" * int(prob * 50)
            print(f"  {label:20s}: {prob*100:6.2f}% {bar}")
        
        # Show raw logits
        print("\nüîç Raw logits (before softmax):")
        for class_id, logit in enumerate(logits[0].cpu().numpy()):
            label = id2label.get(class_id, f"Class {class_id}")
            print(f"  {label:20s}: {logit:8.4f}")
        
        return {
            "filename": os.path.basename(audio_path),
            "prediction": class_name,
            "confidence": confidence,
            "class_id": predicted_class,
            "all_probabilities": {
                id2label.get(i, f"Class {i}"): float(probabilities[0, i].cpu())
                for i in range(probabilities.shape[1])
            },
            "logits": {
                id2label.get(i, f"Class {i}"): float(logits[0, i].cpu())
                for i in range(logits.shape[1])
            }
        }
        
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


# Test multiple files
print("\n\n" + "="*60)
print("BATCH TESTING - Multiple Files")
print("="*60)

audio_files = [
    r"medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3",
    r"medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3",
    r"narration_20251210_232729.mp3",
    r"voice_preview_kanika - soft, smooth and muffled.mp3",
    r"voice_preview_faiq - standard, clear and neutral.mp3",
    r"clova.mp3"
    # Add more files here
]

results = []
for audio_file in audio_files:
    if os.path.exists(audio_file):
        result = predict_deepfake(audio_file)
        if result:
            results.append(result)
    else:
        print(f"‚ö†Ô∏è  File not found: {audio_file}")

# Summary Report
if results:
    print("\n\n" + "="*60)
    print("üìä SUMMARY OF ALL PREDICTIONS")
    print("="*60)
    print(f"{'Filename':<35} {'Prediction':<20} {'Confidence':<10}")
    print("-"*60)
    for r in results:
        print(f"{r['filename']:<35} {r['prediction']:<20} {r['confidence']*100:>6.2f}%")
    
    # Statistics
    print("\n" + "="*60)
    print("üìà STATISTICS")
    print("="*60)
    
    # Count predictions
    from collections import Counter
    prediction_counts = Counter([r['prediction'] for r in results])
    for pred, count in prediction_counts.items():
        print(f"{pred}: {count} files ({count/len(results)*100:.1f}%)")
    
    # Average confidence
    avg_confidence = np.mean([r['confidence'] for r in results])
    print(f"\nAverage confidence: {avg_confidence*100:.2f}%")

Loading MelodyMachine Deepfake Detection model...
‚úÖ Model loaded successfully!
Model loaded on cpu

Model Configuration:
Model type: wav2vec2
Number of labels: 2
Labels: {0: 'fake', 1: 'real'}


BATCH TESTING - Multiple Files

Processing: medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3
Loading audio file...
Original sample rate: 48000 Hz
Audio duration: 5.76 seconds
Resampling to 16000 Hz...
Extracting features...
Input shape: torch.Size([1, 92160])
Running inference...

üéØ Prediction: real
üìä Confidence: 100.00%
üè∑Ô∏è  Class ID: 1

üìà All class probabilities:
  fake                :   0.00% 
  real                : 100.00% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

üîç Raw logits (before softmax):
  fake                :  -5.7409
  real                :   5.2522

Processing: medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228

In [15]:
import torch
import librosa
import numpy as np
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
from scipy import signal
from scipy.stats import kurtosis, skew
import os

class HybridDeepfakeDetector:
    """
    Combines model predictions with acoustic analysis to detect modern AI voices
    """
    
    def __init__(self, model_path):
        print("Loading model...")
        self.model = AutoModelForAudioClassification.from_pretrained(model_path, local_files_only=True)
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_path, local_files_only=True)
        self.model.eval()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        print(f"‚úÖ Model loaded on {self.device}")
    
    def extract_advanced_features(self, waveform, sr=16000):
        """Extract acoustic features that reveal AI generation"""
        features = {}
        
        # 1. Spectral Features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
        features['spectral_centroid_std'] = np.std(spectral_centroids)
        features['spectral_centroid_var'] = np.var(spectral_centroids)
        
        # 2. Zero Crossing Rate (AI has more regular patterns)
        zcr = librosa.feature.zero_crossing_rate(waveform)[0]
        features['zcr_mean'] = np.mean(zcr)
        features['zcr_std'] = np.std(zcr)
        features['zcr_var'] = np.var(zcr)
        
        # 3. MFCC Statistics (AI has less natural variation)
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=20)
        features['mfcc_mean'] = np.mean(mfccs)
        features['mfcc_std'] = np.std(mfccs)
        features['mfcc_var'] = np.var(mfccs)
        features['mfcc_kurtosis'] = np.mean([kurtosis(mfcc) for mfcc in mfccs])
        features['mfcc_skewness'] = np.mean([skew(mfcc) for mfcc in mfccs])
        
        # 4. Spectral Rolloff
        rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['rolloff_mean'] = np.mean(rolloff)
        features['rolloff_std'] = np.std(rolloff)
        
        # 5. Spectral Bandwidth
        bandwidth = librosa.feature.spectral_bandwidth(y=waveform, sr=sr)[0]
        features['bandwidth_mean'] = np.mean(bandwidth)
        features['bandwidth_std'] = np.std(bandwidth)
        
        # 6. RMS Energy
        rms = librosa.feature.rms(y=waveform)[0]
        features['rms_mean'] = np.mean(rms)
        features['rms_std'] = np.std(rms)
        features['rms_var'] = np.var(rms)
        
        # 7. Harmonic-Percussive Source Separation
        harmonic, percussive = librosa.effects.hpss(waveform)
        features['harmonic_ratio'] = np.sum(np.abs(harmonic)) / (np.sum(np.abs(waveform)) + 1e-8)
        features['percussive_ratio'] = np.sum(np.abs(percussive)) / (np.sum(np.abs(waveform)) + 1e-8)
        
        # 8. Spectral Contrast (AI often has smoother contrast)
        contrast = librosa.feature.spectral_contrast(y=waveform, sr=sr)
        features['contrast_mean'] = np.mean(contrast)
        features['contrast_std'] = np.std(contrast)
        
        # 9. Chroma Features
        chroma = librosa.feature.chroma_stft(y=waveform, sr=sr)
        features['chroma_mean'] = np.mean(chroma)
        features['chroma_std'] = np.std(chroma)
        
        # 10. Temporal Features
        features['duration'] = len(waveform) / sr
        
        return features
    
    def calculate_ai_score(self, features):
        """
        Calculate AI likelihood based on acoustic signatures
        Modern TTS (ElevenLabs, OpenAI, etc.) characteristics:
        - Very consistent spectral features (low variance)
        - Regular zero-crossing patterns
        - Unnaturally smooth harmonic structure
        - Less MFCC variation
        - High harmonic-to-noise ratio
        """
        ai_indicators = 0
        total_weight = 0
        
        # 1. Spectral Centroid Consistency (weight: 3)
        # AI voices have very stable spectral centroids
        if features['spectral_centroid_std'] < 300:
            ai_indicators += 3 * (1 - features['spectral_centroid_std'] / 300)
        total_weight += 3
        
        # 2. Zero Crossing Rate Regularity (weight: 2)
        # AI has very regular ZCR patterns
        if features['zcr_std'] < 0.06:
            ai_indicators += 2 * (1 - features['zcr_std'] / 0.06)
        total_weight += 2
        
        # 3. MFCC Variation (weight: 4)
        # Human voices have more MFCC variation
        if features['mfcc_std'] < 20:
            ai_indicators += 4 * (1 - features['mfcc_std'] / 20)
        total_weight += 4
        
        # 4. RMS Energy Consistency (weight: 2)
        # AI has very consistent energy levels
        if features['rms_var'] < 0.01:
            ai_indicators += 2 * (1 - features['rms_var'] / 0.01)
        total_weight += 2
        
        # 5. Harmonic Ratio (weight: 3)
        # AI voices are "too clean" - very high harmonic ratio
        if features['harmonic_ratio'] > 0.80:
            ai_indicators += 3 * ((features['harmonic_ratio'] - 0.80) / 0.20)
        total_weight += 3
        
        # 6. Spectral Bandwidth Consistency (weight: 2)
        # AI has more stable bandwidth
        if features['bandwidth_std'] < 400:
            ai_indicators += 2 * (1 - features['bandwidth_std'] / 400)
        total_weight += 2
        
        # 7. Spectral Contrast Smoothness (weight: 2)
        # AI has smoother spectral contrast
        if features['contrast_std'] < 5:
            ai_indicators += 2 * (1 - features['contrast_std'] / 5)
        total_weight += 2
        
        # 8. MFCC Kurtosis (weight: 2)
        # AI often has different distribution shapes
        if abs(features['mfcc_kurtosis']) < 1.5:
            ai_indicators += 2 * (1 - abs(features['mfcc_kurtosis']) / 1.5)
        total_weight += 2
        
        # Normalize to 0-1 scale
        ai_score = ai_indicators / total_weight
        
        return ai_score
    
    def predict(self, audio_path, show_details=True):
        """Combined prediction using model + acoustic analysis"""
        try:
            if show_details:
                print(f"\n{'='*60}")
                print(f"Processing: {os.path.basename(audio_path)}")
                print('='*60)
            
            # Load audio
            waveform, sr = librosa.load(audio_path, sr=16000, mono=True)
            
            if show_details:
                print(f"Duration: {len(waveform)/sr:.2f}s | Sample rate: {sr} Hz")
            
            # 1. Model Prediction
            inputs = self.feature_extractor(
                waveform,
                sampling_rate=sr,
                return_tensors="pt",
                padding=True
            )
            inputs = {key: val.to(self.device) for key, val in inputs.items()}
            
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1)
                model_pred_class = torch.argmax(probabilities, dim=-1).item()
                model_confidence = probabilities[0, model_pred_class].item()
            
            id2label = self.model.config.id2label
            model_prediction = id2label[model_pred_class]
            
            # 2. Acoustic Analysis
            features = self.extract_advanced_features(waveform, sr)
            acoustic_ai_score = self.calculate_ai_score(features)
            
            # 3. Combined Decision
            # If model says real BUT acoustic score is high, override to AI
            if model_prediction == "real" and acoustic_ai_score > 0.55:
                final_prediction = "AI Generated (Modern TTS)"
                final_confidence = acoustic_ai_score
                detection_method = "Acoustic Analysis"
            elif model_prediction == "fake":
                final_prediction = "AI Generated (Classic TTS)"
                final_confidence = model_confidence
                detection_method = "Model Detection"
            else:
                final_prediction = "Real Human Voice"
                final_confidence = 1 - acoustic_ai_score
                detection_method = "Combined Analysis"
            
            # Display results
            if show_details:
                print("\n" + "="*50)
                print(f"üéØ FINAL VERDICT: {final_prediction}")
                print(f"üìä Confidence: {final_confidence*100:.2f}%")
                print(f"üîç Detection Method: {detection_method}")
                print("="*50)
                
                print(f"\nüì± Model Prediction: {model_prediction} ({model_confidence*100:.2f}%)")
                print(f"üî¨ Acoustic AI Score: {acoustic_ai_score*100:.2f}%")
                
                print("\nüéº Key Acoustic Indicators:")
                print(f"  Spectral Consistency: {features['spectral_centroid_std']:.2f} (AI if < 300)")
                print(f"  ZCR Regularity: {features['zcr_std']:.4f} (AI if < 0.06)")
                print(f"  MFCC Variation: {features['mfcc_std']:.2f} (AI if < 20)")
                print(f"  Harmonic Ratio: {features['harmonic_ratio']:.3f} (AI if > 0.80)")
                print(f"  RMS Variance: {features['rms_var']:.4f} (AI if < 0.01)")
            
            return {
                "filename": os.path.basename(audio_path),
                "final_prediction": final_prediction,
                "final_confidence": final_confidence,
                "model_prediction": model_prediction,
                "model_confidence": model_confidence,
                "acoustic_ai_score": acoustic_ai_score,
                "detection_method": detection_method,
                "features": features
            }
            
        except Exception as e:
            print(f"‚ùå Error: {str(e)}")
            import traceback
            traceback.print_exc()
            return None

# Initialize detector
model_path = r"Deepfake-audio-detection-V2"
detector = HybridDeepfakeDetector(model_path)

# Test files
audio_files = [
    r"medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3",
    r"medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3",
    r"narration_20251210_232729.mp3",
    r"voice_preview_kanika - soft, smooth and muffled.mp3",
    r"voice_preview_faiq - standard, clear and neutral.mp3",
    r"clova.mp3"
    # Add more files here
]

# Batch testing
print("\n" + "="*60)
print("HYBRID DEEPFAKE DETECTION - BATCH TESTING")
print("="*60)

results = []
for audio_file in audio_files:
    if os.path.exists(audio_file):
        result = detector.predict(audio_file, show_details=True)
        if result:
            results.append(result)
    else:
        print(f"‚ö†Ô∏è  File not found: {audio_file}")

# Summary
if results:
    print("\n\n" + "="*60)
    print("üìä FINAL SUMMARY")
    print("="*60)
    print(f"{'Filename':<50} {'Prediction':<25} {'Confidence':<12} {'Method'}")
    print("-"*120)
    for r in results:
        filename = r['filename'][:47] + "..." if len(r['filename']) > 50 else r['filename']
        pred = r['final_prediction'][:22] + "..." if len(r['final_prediction']) > 25 else r['final_prediction']
        print(f"{filename:<50} {pred:<25} {r['final_confidence']*100:>6.2f}%      {r['detection_method']}")
    
    print("\n" + "="*60)
    print("üìà DETECTION STATISTICS")
    print("="*60)
    
    from collections import Counter
    prediction_counts = Counter([r['final_prediction'] for r in results])
    for pred, count in prediction_counts.items():
        print(f"{pred}: {count} files ({count/len(results)*100:.1f}%)")
    
    # Show which were detected by acoustic vs model
    acoustic_detections = sum(1 for r in results if "Acoustic" in r['detection_method'])
    model_detections = sum(1 for r in results if "Model" in r['detection_method'])
    print(f"\nDetected by Acoustic Analysis: {acoustic_detections}")
    print(f"Detected by Model: {model_detections}")

Loading model...
‚úÖ Model loaded on cpu

HYBRID DEEPFAKE DETECTION - BATCH TESTING

Processing: medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3
Duration: 5.76s | Sample rate: 16000 Hz

üéØ FINAL VERDICT: Real Human Voice
üìä Confidence: 84.38%
üîç Detection Method: Combined Analysis

üì± Model Prediction: real (100.00%)
üî¨ Acoustic AI Score: 15.62%

üéº Key Acoustic Indicators:
  Spectral Consistency: 1845.78 (AI if < 300)
  ZCR Regularity: 0.2322 (AI if < 0.06)
  MFCC Variation: 69.00 (AI if < 20)
  Harmonic Ratio: 0.316 (AI if > 0.80)
  RMS Variance: 0.0029 (AI if < 0.01)

Processing: medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3
Duration: 3.26s | Sample rate: 16000 Hz

üéØ FINAL VERDICT: Real Human Voice
üìä Confidence: 83.55%
üîç Detection Method: Combined Analysis

üì± Model Prediction: real (100.00%)
üî¨ Acoustic AI Score: 16.45%

üéº Key Acoustic Indicators:
  Spectral Consistency: 1125.55 (AI if < 300)
  ZC

In [3]:
import torch
import librosa
import numpy as np
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, Wav2Vec2Processor
import os

# Configuration
model_path = r"wav2vec2-large-xlsr-deepfake-audio-classification"  # Update path
target_sample_rate = 16000

# Load model and processor
print("Loading Gustking Deepfake Detection model...")
try:
    model = AutoModelForAudioClassification.from_pretrained(model_path, local_files_only=True)
    
    # Try different processor types
    try:
        processor = AutoFeatureExtractor.from_pretrained(model_path, local_files_only=True)
        print("‚úÖ Loaded with AutoFeatureExtractor")
    except:
        processor = Wav2Vec2Processor.from_pretrained(model_path, local_files_only=True)
        print("‚úÖ Loaded with Wav2Vec2Processor")
    
    print("‚úÖ Model loaded successfully!")
    
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    exit()

model.eval()

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model loaded on {device}")

# Check model configuration
print("\nModel Configuration:")
print(f"Model type: {model.config.model_type if hasattr(model.config, 'model_type') else 'Unknown'}")
print(f"Number of labels: {model.config.num_labels if hasattr(model.config, 'num_labels') else 'Unknown'}")
if hasattr(model.config, 'id2label'):
    print(f"Labels: {model.config.id2label}")
else:
    print("Labels: Not found in config (will use default)")

def predict_deepfake(audio_path):
    """Predict if audio is deepfake/AI-generated or real"""
    try:
        print(f"\n{'='*60}")
        print(f"Processing: {os.path.basename(audio_path)}")
        print('='*60)
        
        # Load audio
        print("Loading audio file...")
        waveform, sample_rate = librosa.load(audio_path, sr=None, mono=True)
        
        print(f"Original sample rate: {sample_rate} Hz")
        print(f"Audio duration: {len(waveform)/sample_rate:.2f} seconds")
        
        # Resample if needed
        if sample_rate != target_sample_rate:
            print(f"Resampling to {target_sample_rate} Hz...")
            waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)
            sample_rate = target_sample_rate
        
        # Process audio with processor
        print("Extracting features...")
        inputs = processor(
            waveform,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding=True
        )
        
        # Convert BatchFeature to dict and move to device
        if hasattr(inputs, 'data'):
            # It's a BatchFeature object
            input_dict = dict(inputs.data)
        elif isinstance(inputs, dict):
            input_dict = inputs
        else:
            # Try to convert to dict
            input_dict = {k: v for k, v in inputs.items()}
        
        # Move to device
        input_dict = {key: val.to(device) for key, val in input_dict.items()}
        
        # Print shape info
        for key, val in input_dict.items():
            print(f"Input '{key}' shape: {val.shape}")
        
        # Inference
        print("Running inference...")
        with torch.no_grad():
            outputs = model(**input_dict)
            logits = outputs.logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            predicted_class = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0, predicted_class].item()
        
        # Get label names
        if hasattr(model.config, 'id2label'):
            id2label = model.config.id2label
        else:
            # Default labels
            num_labels = probabilities.shape[1]
            if num_labels == 2:
                id2label = {0: "Bonafide/Real", 1: "Spoof/Fake"}
            else:
                id2label = {i: f"Class {i}" for i in range(num_labels)}
        
        class_name = id2label.get(predicted_class, f"Class {predicted_class}")
        
        # Display results
        print("\n" + "="*50)
        print(f"üéØ Prediction: {class_name}")
        print(f"üìä Confidence: {confidence*100:.2f}%")
        print(f"üè∑Ô∏è  Class ID: {predicted_class}")
        print("="*50)
        
        # Show all probabilities
        print("\nüìà All class probabilities:")
        for class_id in range(probabilities.shape[1]):
            prob = probabilities[0, class_id].item()
            label = id2label.get(class_id, f"Class {class_id}")
            bar = "‚ñà" * int(prob * 50)
            print(f"  {label:20s}: {prob*100:6.2f}% {bar}")
        
        # Show raw logits
        print("\nüîç Raw logits (before softmax):")
        for class_id, logit in enumerate(logits[0].cpu().numpy()):
            label = id2label.get(class_id, f"Class {class_id}")
            print(f"  {label:20s}: {logit:8.4f}")
        
        return {
            "filename": os.path.basename(audio_path),
            "prediction": class_name,
            "confidence": confidence,
            "class_id": predicted_class,
            "all_probabilities": {
                id2label.get(i, f"Class {i}"): float(probabilities[0, i].cpu())
                for i in range(probabilities.shape[1])
            },
            "logits": {
                id2label.get(i, f"Class {i}"): float(logits[0, i].cpu())
                for i in range(logits.shape[1])
            }
        }
        
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None



# Test multiple files
print("\n\n" + "="*60)
print("BATCH TESTING - Multiple Files")
print("="*60)

audio_files = [
    r"narration_20251210_232729.mp3",
    r"voice_preview_kanika - soft, smooth and muffled.mp3",
    r"voice_preview_faiq - standard, clear and neutral.mp3",
    r"voice_preview_tarini - soft, cheerful and expressive.mp3",
    r"clova.mp3",
    r"sample voice 1.mp3"
    # Add more files here
]
results = []
for audio_file in audio_files:
    if os.path.exists(audio_file):
        result = predict_deepfake(audio_file)
        if result:
            results.append(result)
    else:
        print(f"‚ö†Ô∏è  File not found: {audio_file}")

# Summary Report
if results:
    print("\n\n" + "="*60)
    print("üìä SUMMARY OF ALL PREDICTIONS")
    print("="*60)
    print(f"{'Filename':<50} {'Prediction':<20} {'Confidence':<10}")
    print("-"*80)
    for r in results:
        filename = r['filename'][:47] + "..." if len(r['filename']) > 50 else r['filename']
        print(f"{filename:<50} {r['prediction']:<20} {r['confidence']*100:>6.2f}%")
    
    # Statistics
    print("\n" + "="*60)
    print("üìà STATISTICS")
    print("="*60)
    
    from collections import Counter
    prediction_counts = Counter([r['prediction'] for r in results])
    for pred, count in prediction_counts.items():
        print(f"{pred}: {count} files ({count/len(results)*100:.1f}%)")
    
    avg_confidence = np.mean([r['confidence'] for r in results])
    print(f"\nAverage confidence: {avg_confidence*100:.2f}%")
    
    # Group by prediction type
    real_files = [r['filename'] for r in results if 'real' in r['prediction'].lower() or 'bonafide' in r['prediction'].lower()]
    fake_files = [r['filename'] for r in results if 'fake' in r['prediction'].lower() or 'spoof' in r['prediction'].lower()]
    
    if real_files:
        print(f"\n‚úÖ Detected as REAL ({len(real_files)}):")
        for f in real_files:
            print(f"  - {f}")
    
    if fake_files:
        print(f"\n‚ùå Detected as FAKE/AI ({len(fake_files)}):")
        for f in fake_files:
            print(f"  - {f}")

Loading Gustking Deepfake Detection model...
‚úÖ Loaded with AutoFeatureExtractor
‚úÖ Model loaded successfully!
Model loaded on cpu

Model Configuration:
Model type: wav2vec2
Number of labels: 2
Labels: {0: 'real', 1: 'fake'}


BATCH TESTING - Multiple Files

Processing: narration_20251210_232729.mp3
Loading audio file...
Original sample rate: 44100 Hz
Audio duration: 32.91 seconds
Resampling to 16000 Hz...
Extracting features...
Input 'input_values' shape: torch.Size([1, 526629])
Input 'attention_mask' shape: torch.Size([1, 526629])
Running inference...

üéØ Prediction: real
üìä Confidence: 87.40%
üè∑Ô∏è  Class ID: 0

üìà All class probabilities:
  real                :  87.40% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  fake                :  12.60% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

üîç Raw logits (before softmax):
  real                :   0.9566
  fake                :  -0.9800

Processing: voice_previe

In [6]:
import torch
import librosa
import numpy as np
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import os

# Configuration
model_path = r"wav2vec2-deepfake-voice-detector"  # Update to local path if cloned
target_sample_rate = 16000

# Load model and feature extractor
print("Loading garystafford/wav2vec2-deepfake-voice-detector model...")
try:
    # Try loading from local path first
    if os.path.exists(model_path):
        model = AutoModelForAudioClassification.from_pretrained(model_path, local_files_only=True)
        feature_extractor = AutoFeatureExtractor.from_pretrained(model_path, local_files_only=True)
        print(f"‚úÖ Model loaded from local path: {model_path}")
    else:
        # Load from HuggingFace
        model_name = "garystafford/wav2vec2-deepfake-voice-detector"
        model = AutoModelForAudioClassification.from_pretrained(model_name)
        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        print("‚úÖ Model loaded from HuggingFace")
except Exception as e:
    print(f"Error loading from local, trying HuggingFace: {e}")
    model_name = "garystafford/wav2vec2-deepfake-voice-detector"
    model = AutoModelForAudioClassification.from_pretrained(model_name)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    print("‚úÖ Model loaded from HuggingFace")

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
print(f"Model on {device}\n")

# Check model configuration
print("Model Configuration:")
if hasattr(model.config, 'id2label'):
    print(f"Labels: {model.config.id2label}")
else:
    print("Labels: {0: 'real', 1: 'fake'} (default)")
print(f"Number of labels: {model.config.num_labels}\n")

def predict_audio(audio_path, show_details=True):
    """Predict if audio is real or fake/deepfake"""
    try:
        if show_details:
            print(f"{'='*70}")
            print(f"Processing: {os.path.basename(audio_path)}")
            print('='*70)
        
        # Load and preprocess audio (automatically resamples to 16kHz)
        audio, sr = librosa.load(audio_path, sr=target_sample_rate, mono=True)
        
        if show_details:
            print(f"Audio loaded: {len(audio)/sr:.2f}s duration @ {sr} Hz")
            print(f"Audio shape: {audio.shape}")
        
        # Process with feature extractor
        inputs = feature_extractor(
            audio, 
            sampling_rate=target_sample_rate, 
            return_tensors="pt", 
            padding=True
        )
        
        # Move to device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        if show_details:
            for key, val in inputs.items():
                print(f"Input '{key}' shape: {val.shape}")
        
        # Run inference
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)
        
        # Get predictions
        # Class 0: Real, Class 1: Fake
        prob_real = probs[0][0].item()
        prob_fake = probs[0][1].item()
        
        # Determine prediction
        if prob_fake > 0.5:
            prediction = "FAKE/DEEPFAKE"
            confidence = prob_fake
        else:
            prediction = "REAL/HUMAN"
            confidence = prob_real
        
        # Display results
        if show_details:
            print(f"\n{'='*70}")
            print(f"üéØ Prediction: {prediction}")
            print(f"üìä Confidence: {confidence*100:.2f}%")
            print('='*70)
            
            print(f"\nüìà Probabilities:")
            print(f"  Real/Human:    {prob_real*100:6.2f}% {'‚ñà' * int(prob_real * 50)}")
            print(f"  Fake/Deepfake: {prob_fake*100:6.2f}% {'‚ñà' * int(prob_fake * 50)}")
            
            print(f"\nüîç Raw Logits:")
            print(f"  Real:  {logits[0][0].item():8.4f}")
            print(f"  Fake:  {logits[0][1].item():8.4f}")
        
        return {
            "filename": os.path.basename(audio_path),
            "prediction": prediction,
            "confidence": confidence,
            "prob_real": prob_real,
            "prob_fake": prob_fake,
            "logits": logits.cpu().numpy()[0]
        }
        
    except Exception as e:
        print(f"‚ùå Error processing {os.path.basename(audio_path)}: {e}")
        import traceback
        traceback.print_exc()
        return None


# Test multiple files
print("\n\n" + "="*70)
print("BATCH TESTING - Multiple Files")
print("="*70)

test_files = [
    r"medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3",
    r"medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3",
    r"narration_20251210_232729.mp3",
    r"voice_preview_kanika - soft, smooth and muffled.mp3",
    r"voice_preview_faiq - standard, clear and neutral.mp3",
    r"clova.mp3",
    r"sample voice 1.mp3",
    r"voice_preview_mukundan - formal and clear.mp3",
    r"voice_preview_martin li - raspy, serious and deep.mp3"
    # Add more files here
]

results = []
for audio_file in test_files:
    if os.path.exists(audio_file):
        result = predict_audio(audio_file, show_details=True)
        if result:
            results.append(result)
    else:
        print(f"‚ö†Ô∏è  File not found: {audio_file}")


# Summary Report
if results:
    print("\n\n" + "="*70)
    print("üìä SUMMARY OF ALL PREDICTIONS")
    print("="*70)
    print(f"{'Filename':<50} {'Prediction':<15} {'Confidence':<12} {'Real%':<10} {'Fake%'}")
    print("-"*70)
    
    for r in results:
        filename = r['filename'][:47] + "..." if len(r['filename']) > 50 else r['filename']
        pred_short = r['prediction'][:12] + "..." if len(r['prediction']) > 15 else r['prediction']
        print(f"{filename:<50} {pred_short:<15} {r['confidence']*100:>6.2f}%      {r['prob_real']*100:>5.1f}%     {r['prob_fake']*100:>5.1f}%")
    
    # Statistics
    print("\n" + "="*70)
    print("üìà DETECTION STATISTICS")
    print("="*70)
    
    real_count = sum(1 for r in results if "REAL" in r['prediction'])
    fake_count = sum(1 for r in results if "FAKE" in r['prediction'])
    
    print(f"Total files tested: {len(results)}")
    print(f"Detected as REAL: {real_count} ({real_count/len(results)*100:.1f}%)")
    print(f"Detected as FAKE: {fake_count} ({fake_count/len(results)*100:.1f}%)")
    
    avg_confidence = np.mean([r['confidence'] for r in results])
    print(f"\nAverage confidence: {avg_confidence*100:.2f}%")
    
    # List by category
    print(f"\n‚úÖ Files detected as REAL/HUMAN:")
    for r in results:
        if "REAL" in r['prediction']:
            print(f"  - {r['filename']} ({r['confidence']*100:.1f}% confidence)")
    
    print(f"\n‚ùå Files detected as FAKE/DEEPFAKE:")
    for r in results:
        if "FAKE" in r['prediction']:
            print(f"  - {r['filename']} ({r['confidence']*100:.1f}% confidence)")
    
    # Show detailed breakdown
    print(f"\nüìã DETAILED BREAKDOWN:")
    print(f"{'Filename':<45} {'Real%':<10} {'Fake%':<10} {'Verdict'}")
    print("-"*70)
    for r in results:
        filename = r['filename'][:42] + "..." if len(r['filename']) > 45 else r['filename']
        verdict = "‚úì REAL" if "REAL" in r['prediction'] else "‚úó FAKE"
        print(f"{filename:<45} {r['prob_real']*100:>6.2f}%    {r['prob_fake']*100:>6.2f}%    {verdict}")

Loading garystafford/wav2vec2-deepfake-voice-detector model...
‚úÖ Model loaded from local path: wav2vec2-deepfake-voice-detector
Model on cpu

Model Configuration:
Labels: {0: 'real', 1: 'fake'}
Number of labels: 2



BATCH TESTING - Multiple Files
Processing: medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3
Audio loaded: 5.76s duration @ 16000 Hz
Audio shape: (92160,)
Input 'input_values' shape: torch.Size([1, 92160])
Input 'attention_mask' shape: torch.Size([1, 92160])

üéØ Prediction: REAL/HUMAN
üìä Confidence: 86.10%

üìà Probabilities:
  Real/Human:     86.10% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  Fake/Deepfake:  13.90% ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

üîç Raw Logits:
  Real:    0.8975
  Fake:   -0.9257
Processing: medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3
Audio loaded: 3.26s duration @ 16000 Hz
Audio shape: (52224,)
Input 'input_values

In [7]:
import torch
import librosa
import numpy as np
import scipy.stats as stats
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import base64
import io
import tempfile
import os

class HybridEnsembleDetector:
    """
    Hybrid AI Voice Detection System combining:
    1. Physics-based acoustic analysis
    2. Deep Learning model (garystafford/wav2vec2-deepfake-voice-detector)
    
    Configurable ensemble weights for flexibility
    """
    
    def __init__(
        self, 
        model_path="wav2vec2-deepfake-voice-detector",
        physics_weight=0.5,
        dl_weight=0.5,
        use_local_model=False
    ):
        """
        Initialize the hybrid detector
        
        Args:
            model_path: Path to model (local or HuggingFace)
            physics_weight: Weight for physics score (0-1)
            dl_weight: Weight for DL score (0-1)
            use_local_model: Whether to load from local path
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Normalize weights
        total_weight = physics_weight + dl_weight
        self.physics_weight = physics_weight / total_weight
        self.dl_weight = dl_weight / total_weight
        
        print(f"üîß Initializing Hybrid Detector")
        print(f"   Device: {self.device}")
        print(f"   Physics Weight: {self.physics_weight*100:.0f}%")
        print(f"   DL Weight: {self.dl_weight*100:.0f}%")
        
        # --- LOAD DEEP LEARNING MODEL ---
        try:
            print(f"üì• Loading model from '{model_path}'...")
            
            if use_local_model:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(
                    model_path, 
                    local_files_only=True
                )
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    model_path, 
                    local_files_only=True
                )
            else:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(model_path)
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
            
            self.dl_model.to(self.device)
            self.dl_model.eval()
            self.dl_ready = True
            print("‚úÖ Deep Learning Model Loaded Successfully")
            
        except Exception as e:
            print(f"‚ö†Ô∏è  DL Model Load Failed: {e}")
            print("   Running in Physics-Only mode")
            self.dl_ready = False
            self.dl_weight = 0
            self.physics_weight = 1.0

        # --- PHYSICS ENGINE PARAMETERS ---
        # Tuned thresholds for modern TTS detection
        self.CV_AI_THRESHOLD = 0.20      # Coefficient of variation threshold for AI
        self.CV_HUMAN_THRESHOLD = 0.32   # CV threshold for human
        self.INTENSITY_MIN_STD = 0.05    # Minimum intensity std for human
        self.INTENSITY_MAX_STD = 0.15    # Maximum intensity std
        
        print("‚úÖ Hybrid Detector Ready\n")

    # ==========================================================
    # HELPER: Base64 Decoding
    # ==========================================================
    def decode_base64_audio(self, base64_string):
        """
        Decode base64 audio and save to temporary file
        
        Args:
            base64_string: Base64 encoded audio data
            
        Returns:
            str: Path to temporary audio file
        """
        try:
            # Decode base64
            audio_data = base64.b64decode(base64_string)
            
            # Create temporary file
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
            temp_file.write(audio_data)
            temp_file.close()
            
            return temp_file.name
            
        except Exception as e:
            raise ValueError(f"Failed to decode base64 audio: {str(e)}")

    # ==========================================================
    # PART A: PHYSICS ENGINE
    # ==========================================================
    def get_linear_score(self, val, min_val, max_val):
        """Linear interpolation for scoring"""
        if val <= min_val:
            return 1.0
        if val >= max_val:
            return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def get_physics_score(self, audio_path):
        """
        Analyze audio using physics-based acoustic features
        
        Returns:
            tuple: (ai_score, method, features_dict)
        """
        try:
            # Load audio at native sample rate
            y, sr = librosa.load(audio_path, sr=None)
            
            # Robust pitch tracking using PYIN
            f0, voiced_flag, voiced_probs = librosa.pyin(
                y, 
                fmin=50, 
                fmax=400, 
                sr=sr
            )
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0:
                return 0.0, "No voice detected", {}

            # Extract acoustic features
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            # Calculate feature metrics
            feats = {
                'pitch_cv': std_pitch / mean_pitch if mean_pitch > 0 else 0,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid),
                'mean_pitch': mean_pitch,
                'std_pitch': std_pitch
            }

            # Individual feature scores (higher = more AI-like)
            intensity_score = self.get_linear_score(
                feats['intensity_std'], 
                self.INTENSITY_MIN_STD, 
                self.INTENSITY_MAX_STD
            )
            
            pitch_score = self.get_linear_score(
                feats['pitch_cv'], 
                self.CV_AI_THRESHOLD, 
                self.CV_HUMAN_THRESHOLD
            )
            
            skew_score = self.get_linear_score(
                abs(feats['freq_skew']), 
                0.1, 
                1.0
            )

            # Weighted combination
            W_INTENSITY = 0.40
            W_PITCH = 0.40
            W_SKEW = 0.20
            
            base_score = (
                intensity_score * W_INTENSITY + 
                pitch_score * W_PITCH + 
                skew_score * W_SKEW
            )

            # Synergy bonus: if both intensity and pitch are suspicious
            if intensity_score > 0.4 and pitch_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            return 0.0, f"Physics Error: {str(e)}", {}

    # ==========================================================
    # PART B: DEEP LEARNING ENGINE
    # ==========================================================
    def get_dl_score(self, audio_path):
        """
        Analyze audio using deep learning model
        
        Returns:
            tuple: (ai_score, label)
        """
        if not self.dl_ready:
            return 0.0, "Model not loaded"

        target_sr = 16000

        try:
            # Load audio with librosa
            waveform_np, sr = librosa.load(audio_path, sr=target_sr, mono=True)

            # Process with feature extractor
            inputs = self.feature_extractor(
                waveform_np,
                sampling_rate=target_sr,
                return_tensors="pt",
                padding=True
            )
            
            # Move to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                outputs = self.dl_model(**inputs)
                logits = outputs.logits
                probs = F.softmax(logits, dim=-1)
                
            # Get predictions
            # Class 0: Real, Class 1: Fake
            prob_real = probs[0][0].item()
            prob_fake = probs[0][1].item()
            
            # AI score is the fake probability
            ai_score = prob_fake
            
            label = "Fake/Deepfake" if prob_fake > 0.5 else "Real/Human"

            return round(ai_score, 3), label

        except Exception as e:
            return 0.0, f"DL Error: {str(e)}"

    # ==========================================================
    # PART C: EXPLANATION GENERATOR
    # ==========================================================
    def generate_explanation(self, final_score, phys_score, dl_score, dl_label, phys_feats):
        """
        Generate human-readable explanation for the classification
        
        Returns:
            str: Explanation text
        """
        explanations = []
        
        if final_score > 0.55:
            # AI GENERATED
            
            # Deep Learning contributions
            if dl_score > 0.55 and self.dl_ready:
                if "Fake" in dl_label or "Deepfake" in dl_label:
                    explanations.append(
                        f"Deep learning model detected synthetic voice patterns "
                        f"(confidence: {dl_score*100:.1f}%)"
                    )
            
            # Physics contributions
            if phys_score > 0.55:
                p_cv = phys_feats.get('pitch_cv', 0)
                i_std = phys_feats.get('intensity_std', 0)
                
                if i_std < 0.06:
                    explanations.append(
                        f"Unnaturally consistent energy levels detected "
                        f"(std: {i_std:.3f}, expected: >0.06)"
                    )
                
                if p_cv < 0.22:
                    explanations.append(
                        f"Robotic pitch modulation patterns "
                        f"(CV: {p_cv:.2f}, expected: >0.22)"
                    )
                
                if not explanations or (i_std >= 0.06 and p_cv >= 0.22):
                    explanations.append(
                        "Acoustic parameters lack natural human variability"
                    )
            
            if not explanations:
                explanations.append(
                    "Voice exhibits characteristics consistent with AI generation"
                )
                
        else:
            # HUMAN
            explanations.append(
                "Voice exhibits natural acoustic variability and human speech characteristics"
            )
        
        return "; ".join(explanations)

    # ==========================================================
    # PART D: MAIN ANALYSIS FUNCTION
    # ==========================================================
    def analyze(self, audio_input, input_type="file"):
        """
        Main analysis function with configurable input types
        
        Args:
            audio_input: Either file path or base64 string
            input_type: "file" or "base64"
            
        Returns:
            dict: Analysis results following API response format
        """
        temp_file = None
        
        try:
            # Handle input type
            if input_type == "base64":
                temp_file = self.decode_base64_audio(audio_input)
                audio_path = temp_file
            elif input_type == "file":
                audio_path = audio_input
                if not os.path.exists(audio_path):
                    return {
                        "status": "error",
                        "error": f"Audio file not found: {audio_path}"
                    }
            else:
                return {
                    "status": "error",
                    "error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
                }

            # 1. Run Physics Analysis
            phys_score, phys_method, phys_feats = self.get_physics_score(audio_path)
            
            # 2. Run Deep Learning Analysis
            dl_score, dl_label = self.get_dl_score(audio_path)

            # 3. Calculate weighted ensemble score
            final_score = (
                self.physics_weight * phys_score + 
                self.dl_weight * dl_score
            )
            
            # Round to 2 decimal places
            final_score = round(final_score, 2)
            
            # 4. Determine classification
            classification = "AI_GENERATED" if final_score > 0.55 else "HUMAN"
            
            # 5. Generate explanation
            explanation = self.generate_explanation(
                final_score, 
                phys_score, 
                dl_score, 
                dl_label, 
                phys_feats
            )

            # 6. Return API-compliant response
            return {
                "status": "success",
                "classification": classification,
                "confidenceScore": final_score,
                "explanation": explanation,
                "debug": {
                    "physics_score": phys_score,
                    "dl_score": dl_score,
                    "dl_label": dl_label,
                    "physics_weight": f"{self.physics_weight*100:.0f}%",
                    "dl_weight": f"{self.dl_weight*100:.0f}%",
                    "physics_features": phys_feats
                }
            }
            
        except Exception as e:
            return {
                "status": "error",
                "error": str(e)
            }
            
        finally:
            # Clean up temporary file
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except:
                    pass

    # ==========================================================
    # UTILITY: Update Weights
    # ==========================================================
    def update_weights(self, physics_weight, dl_weight):
        """
        Update ensemble weights dynamically
        
        Args:
            physics_weight: New physics weight (0-1)
            dl_weight: New DL weight (0-1)
        """
        total = physics_weight + dl_weight
        self.physics_weight = physics_weight / total
        self.dl_weight = dl_weight / total
        
        print(f"‚öôÔ∏è  Weights updated:")
        print(f"   Physics: {self.physics_weight*100:.0f}%")
        print(f"   DL: {self.dl_weight*100:.0f}%")


# ==========================================================
# USAGE EXAMPLES
# ==========================================================
if __name__ == "__main__":
    
    # Example 1: Initialize with 60-40 split (Physics-DL)
    print("="*70)
    print("EXAMPLE 1: 40% Physics, 60% Deep Learning")
    print("="*70)
    
    detector = HybridEnsembleDetector(
        model_path="wav2vec2-deepfake-voice-detector",
        physics_weight=0.4,
        dl_weight=0.6,
        use_local_model=True  # Set True if using local model
    )
    
    # Test with file path
    audio_file = r"sample voice 1.mp3"
    result = detector.analyze(audio_file, input_type="file")
    
    print(f"\nüìä Result:")
    print(f"   Status: {result['status']}")
    print(f"   Classification: {result['classification']}")
    print(f"   Confidence: {result['confidenceScore']}")
    print(f"   Explanation: {result['explanation']}")
    print(f"\nüîç Debug Info:")
    for key, val in result.get('debug', {}).items():
        print(f"   {key}: {val}")
    
    
    # Example 2: Test with base64 input
    print("\n\n" + "="*70)
    print("EXAMPLE 2: Base64 Input")
    print("="*70)
    
    # Read file and convert to base64
    with open(audio_file, 'rb') as f:
        audio_bytes = f.read()
        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
    
    result_b64 = detector.analyze(audio_base64, input_type="base64")
    
    print(f"\nüìä Result:")
    print(f"   Classification: {result_b64['classification']}")
    print(f"   Confidence: {result_b64['confidenceScore']}")
    
    
    # Example 3: Change weights dynamically
    print("\n\n" + "="*70)
    print("EXAMPLE 3: Dynamic Weight Adjustment (80-20)")
    print("="*70)
    
    detector.update_weights(physics_weight=0.2, dl_weight=0.8)
    
    result_new = detector.analyze(audio_file, input_type="file")
    
    print(f"\nüìä Result with new weights:")
    print(f"   Classification: {result_new['classification']}")
    print(f"   Confidence: {result_new['confidenceScore']}")
    
    
    # Example 4: Batch processing
    print("\n\n" + "="*70)
    print("EXAMPLE 4: Batch Processing Multiple Files")
    print("="*70)
    
    test_files = [
        r"medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3",
        r"medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3",
        r"narration_20251210_232729.mp3",
        r"voice_preview_kanika - soft, smooth and muffled.mp3",
        r"voice_preview_faiq - standard, clear and neutral.mp3",
        r"clova.mp3",
        r"sample voice 1.mp3",
        r"voice_preview_mukundan - formal and clear.mp3",
        r"voice_preview_martin li - raspy, serious and deep.mp3"
        # Add more files here
    ]
    
    print(f"\n{'Filename':<40} {'Classification':<15} {'Confidence':<12} {'Method'}")
    print("-"*70)
    
    for file_path in test_files:
        if os.path.exists(file_path):
            result = detector.analyze(file_path, input_type="file")
            
            if result['status'] == 'success':
                filename = os.path.basename(file_path)[:37] + "..."
                print(f"{filename:<40} {result['classification']:<15} {result['confidenceScore']:<12.2f}")

EXAMPLE 1: 40% Physics, 60% Deep Learning
üîß Initializing Hybrid Detector
   Device: cpu
   Physics Weight: 40%
   DL Weight: 60%
üì• Loading model from 'wav2vec2-deepfake-voice-detector'...
‚úÖ Deep Learning Model Loaded Successfully
‚úÖ Hybrid Detector Ready


üìä Result:
   Status: success
   Classification: AI_GENERATED
   Confidence: 0.9399999976158142
   Explanation: Deep learning model detected synthetic voice patterns (confidence: 98.5%); Robotic pitch modulation patterns (CV: 0.20, expected: >0.22)

üîç Debug Info:
   physics_score: 0.8610000014305115
   dl_score: 0.985
   dl_label: Fake/Deepfake
   physics_weight: 40%
   dl_weight: 60%
   physics_features: {'pitch_cv': np.float64(0.19643575289740356), 'intensity_std': np.float32(0.07224538), 'freq_skew': np.float64(1.2587673386321117), 'mean_pitch': np.float64(121.40441626562153), 'std_pitch': np.float64(23.848167914207153)}


EXAMPLE 2: Base64 Input

üìä Result:
   Classification: AI_GENERATED
   Confidence: 0.93999999

In [None]:
import torch
import librosa
import numpy as np
import scipy.stats as stats
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import base64
import tempfile
import os

class HybridEnsembleDetector:
    """
    Hybrid AI Voice Detection System (Production Version)
    1. Physics: Signal Processing (Librosa)
    2. Deep Learning: Wav2Vec2
    3. Language ID: Whisper-Tiny (Logit-Based, No Translation)
    """
    
    def __init__(
        self, 
        deepfake_model_path="wav2vec2-deepfake-voice-detector",
        use_local_model=False
    ):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"üîß Initializing Hybrid Detector on {self.device}")

        # --- 1. LOAD DEEPFAKE DETECTION MODEL ---
        print(f"üì• Loading Deepfake Model...")
        try:
            self.df_model = AutoModelForAudioClassification.from_pretrained(
                deepfake_model_path, local_files_only=use_local_model
            ).to(self.device)
            self.df_extractor = AutoFeatureExtractor.from_pretrained(
                deepfake_model_path, local_files_only=use_local_model
            )
            self.df_model.eval()
            self.df_ready = True
            print("‚úÖ Deepfake Model Loaded.")
        except Exception as e:
            print(f"‚ö†Ô∏è Deepfake Model Failed: {e}")
            self.df_ready = False

        # --- 2. LOAD WHISPER LANGUAGE ID (Logit Mode) ---
        print("üì• Loading Whisper-Tiny for LID...")
        try:
            self.lid_processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
            self.lid_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").to(self.device)
            self.lid_model.eval()
            self.lid_ready = True
            
            # Prepare Language Token Map (id -> language name)
            # Whisper stores languages as special tokens (e.g., <|hi|>, <|ta|>)
            # We extract just the language tokens to check their probabilities
            self.lang_code_to_id = {
                code: id for code, id in self.lid_processor.tokenizer.get_vocab().items() 
                if code.startswith("<|") and code.endswith("|>") and len(code) == 4
            }
            # Custom map for your requirements
            self.code_to_name = {
                "<|en|>": "English", "<|hi|>": "Hindi", "<|ta|>": "Tamil",
                "<|te|>": "Telugu", "<|ml|>": "Malayalam", "<|kn|>": "Kannada",
                "<|mr|>": "Marathi", "<|bn|>": "Bengali", "<|ur|>": "Urdu"
            }
            print("‚úÖ Language Model Loaded.")
        except Exception as e:
            print(f"‚ö†Ô∏è Language Model Failed: {e}")
            self.lid_ready = False

        # --- 3. PHYSICS PARAMETERS ---
        self.CV_AI_THRESHOLD = 0.20
        self.CV_HUMAN_THRESHOLD = 0.32
        self.INTENSITY_MIN_STD = 0.05
        self.INTENSITY_MAX_STD = 0.15 
        
        print("‚úÖ System Ready\n")

    # ==========================================================
    # PART A: LANGUAGE DETECTION (Logit-Based / No Translation)
    # ==========================================================
    def detect_language(self, audio_path):
        if not self.lid_ready: return "Unknown"

        try:
            # 1. Load Audio (16kHz)
            audio, _ = librosa.load(audio_path, sr=16000)
            
            # 2. Prepare Inputs
            input_features = self.lid_processor(
                audio, sampling_rate=16000, return_tensors="pt"
            ).input_features.to(self.device)

            # 3. Create Decoder Start Token
            # We only want to predict the FIRST token after start
            decoder_input_ids = torch.tensor([[self.lid_model.config.decoder_start_token_id]]).to(self.device)

            # 4. Forward Pass (No Generation Loop)
            with torch.no_grad():
                logits = self.lid_model(input_features, decoder_input_ids=decoder_input_ids).logits
            
            # 5. Extract Logits for First Token: (Batch, Seq, Vocab) -> (Vocab)
            first_token_logits = logits[0, 0, :]

            # 6. Find the Language Token with Max Probability
            best_lang_code = "Unknown"
            max_score = -float('inf')

            for code, token_id in self.lang_code_to_id.items():
                score = first_token_logits[token_id].item()
                if score > max_score:
                    max_score = score
                    best_lang_code = code

            # 7. Map to Readable Name
            return self.code_to_name.get(best_lang_code, best_lang_code.replace("<|", "").replace("|>", ""))

        except Exception as e:
            return f"Lang Error: {str(e)}"

    # ==========================================================
    # PART B: PHYSICS ENGINE
    # ==========================================================
    def get_linear_score(self, val, min_val, max_val):
        if val <= min_val: return 1.0
        if val >= max_val: return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def get_physics_score(self, audio_path):
        try:
            y, sr = librosa.load(audio_path, sr=None)
            f0, _, _ = librosa.pyin(y, fmin=50, fmax=400, sr=sr)
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0: return 0.0, "No voice detected", {}

            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            feats = {
                'pitch_cv': std_pitch / mean_pitch if mean_pitch > 0 else 0,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid)
            }

            i_score = self.get_linear_score(feats['intensity_std'], self.INTENSITY_MIN_STD, self.INTENSITY_MAX_STD)
            p_score = self.get_linear_score(feats['pitch_cv'], self.CV_AI_THRESHOLD, self.CV_HUMAN_THRESHOLD)
            s_score = self.get_linear_score(abs(feats['freq_skew']), 0.1, 1.0)

            # Weighted Physics Score
            base_score = (i_score * 0.40) + (p_score * 0.40) + (s_score * 0.20)

            # Synergy Bonus
            if i_score > 0.4 and p_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            return 0.0, f"Physics Error: {str(e)}", {}

    # ==========================================================
    # PART C: DEEP LEARNING ENGINE
    # ==========================================================
    def get_dl_score(self, audio_path):
        if not self.df_ready: return 0.0, "Model not loaded"

        try:
            waveform, sr = librosa.load(audio_path, sr=16000, mono=True)
            inputs = self.df_extractor(
                waveform, sampling_rate=16000, return_tensors="pt", padding=True
            ).to(self.device)

            with torch.no_grad():
                logits = self.df_model(**inputs).logits
                probs = F.softmax(logits, dim=-1)
                
            prob_fake = probs[0][1].item() 
            label = "Fake" if prob_fake > 0.5 else "Real"

            return round(prob_fake, 3), label

        except Exception as e:
            return 0.0, f"DL Error: {str(e)}"

    # ==========================================================
    # PART D: MAIN ANALYZE FUNCTION (VETO LOGIC APPLIED)
    # ==========================================================
    def analyze(self, audio_input, input_type="file"):
        temp_file = None
        try:
            if input_type == "base64":
                audio_data = base64.b64decode(audio_input)
                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
                temp_file.write(audio_data)
                temp_file.close()
                audio_path = temp_file.name
            else:
                audio_path = audio_input

            if not os.path.exists(audio_path):
                return {"status": "error", "error": "File not found"}

            # 1. Detect Language
            detected_lang = self.detect_language(audio_path)

            # 2. Run Analysis Engines
            phys_score, _, phys_feats = self.get_physics_score(audio_path)
            dl_score, dl_label = self.get_dl_score(audio_path)

            # 3. VETO LOGIC (The Fix for 0.94 vs 0.05)
            # If Physics says "DEFINITELY AI" (>0.85), trust it even if DL misses.
            # If DL says "DEFINITELY AI" (>0.85), trust it even if Physics misses.
            if phys_score > 0.85 or dl_score > 0.85:
                final_score = max(phys_score, dl_score)
            else:
                # Otherwise, take a weighted average favoring the higher signal
                final_score = (0.6 * max(phys_score, dl_score)) + (0.4 * min(phys_score, dl_score))

            final_score = round(final_score, 2)
            
            # 4. Classification
            classification = "AI_GENERATED" if final_score > 0.55 else "HUMAN"

            # 5. Explanations
            explanations = []
            if classification == "AI_GENERATED":
                if dl_score > 0.55:
                    explanations.append(f"Deep learning model detected synthetic artifacts (conf: {dl_score})")
                if phys_score > 0.55:
                    p_cv = phys_feats.get('pitch_cv', 0)
                    i_std = phys_feats.get('intensity_std', 0)
                    if i_std < 0.06: explanations.append(f"Unnaturally consistent energy detected (std: {i_std:.3f})")
                    if p_cv < 0.22: explanations.append(f"Robotic pitch modulation detected (CV: {p_cv:.2f})")
                if not explanations: explanations.append("Acoustic fingerprint matches AI characteristics")
            else:
                explanations.append("Voice exhibits natural acoustic variability")

            return {
                "status": "success",
                "language": detected_lang,
                "classification": classification,
                "confidenceScore": final_score,
                "explanation": "; ".join(explanations),
                "debug": {
                    "phys_score": phys_score,
                    "dl_score": dl_score,
                    "raw_lang": detected_lang
                }
            }
            
        except Exception as e:
            return {"status": "error", "error": str(e)}
        finally:
            if temp_file and os.path.exists(temp_file): os.unlink(temp_file)

# --- USAGE ---
if __name__ == "__main__":
    detector = HybridEnsembleDetector(
        deepfake_model_path="wav2vec2-deepfake-voice-detector",
        use_local_model=True
    )
    
    # Test
    result = detector.analyze(r"")
    print(result)

üîß Initializing Hybrid Detector on cpu
üì• Loading Deepfake Model...
‚úÖ Deepfake Model Loaded.
üì• Loading Whisper-Tiny for LID...
‚úÖ Language Model Loaded.
‚úÖ System Ready

{'status': 'success', 'language': 'Unknown', 'classification': 'AI_GENERATED', 'confidenceScore': 0.95, 'explanation': 'Unnaturally consistent energy detected (std: 0.010); Robotic pitch modulation detected (CV: 0.08)', 'debug': {'phys_score': 0.95, 'dl_score': 0.0, 'raw_lang': 'Unknown'}}


In [24]:
#  If audio > 30 sec break it into first 30 sec then analyze. Add language detection using whisperAI

In [3]:
import torch
import librosa
import numpy as np
import scipy.stats as stats
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration
import base64
import io
import tempfile
import os
import soundfile as sf

class HybridEnsembleDetector:
    """
    Hybrid AI Voice Detection System with Language Detection
    
    Features:
    1. Physics-based acoustic analysis
    2. Deep Learning deepfake detection
    3. Language identification using Whisper (focus on Indian languages)
    4. Auto-truncation to 30 seconds for faster processing
    """
    
    def __init__(
        self, 
        deepfake_model_path="garystafford/wav2vec2-deepfake-voice-detector",
        whisper_model_path="openai/whisper-base",
        physics_weight=0.4,
        dl_weight=0.6,
        use_local_deepfake_model=False,
        use_local_whisper_model=False,
        max_audio_duration=30  # seconds
    ):
        """
        Initialize the hybrid detector
        
        Args:
            deepfake_model_path: Path to deepfake detection model
            whisper_model_path: Path to Whisper model for language detection
            physics_weight: Weight for physics score (0-1)
            dl_weight: Weight for DL score (0-1)
            use_local_deepfake_model: Whether to load deepfake model from local path
            use_local_whisper_model: Whether to load Whisper from local path
            max_audio_duration: Maximum audio duration to process (seconds)
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_duration = max_audio_duration
        
        # Normalize weights
        total_weight = physics_weight + dl_weight
        self.physics_weight = physics_weight / total_weight
        self.dl_weight = dl_weight / total_weight
        
        print(f"üîß Initializing Hybrid Detector with Language Detection")
        print(f"   Device: {self.device}")
        print(f"   Physics Weight: {self.physics_weight*100:.0f}%")
        print(f"   DL Weight: {self.dl_weight*100:.0f}%")
        print(f"   Max Audio Duration: {self.max_duration}s")
        
        # --- LOAD DEEPFAKE DETECTION MODEL ---
        try:
            print(f"üì• Loading deepfake detection model from '{deepfake_model_path}'...")
            
            if use_local_deepfake_model:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(
                    deepfake_model_path, 
                    local_files_only=True
                )
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    deepfake_model_path, 
                    local_files_only=True
                )
            else:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(deepfake_model_path)
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(deepfake_model_path)
            
            self.dl_model.to(self.device)
            self.dl_model.eval()
            self.dl_ready = True
            print("‚úÖ Deepfake Detection Model Loaded")
            
        except Exception as e:
            print(f"‚ö†Ô∏è  DL Model Load Failed: {e}")
            print("   Running in Physics-Only mode")
            self.dl_ready = False
            self.dl_weight = 0
            self.physics_weight = 1.0

        # --- LOAD WHISPER FOR LANGUAGE DETECTION ---
        try:
            print(f"üì• Loading Whisper model for language detection from '{whisper_model_path}'...")
            
            if use_local_whisper_model:
                self.whisper_processor = WhisperProcessor.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
            else:
                self.whisper_processor = WhisperProcessor.from_pretrained(whisper_model_path)
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_path)
            
            self.whisper_model.to(self.device)
            self.whisper_model.eval()
            self.lang_ready = True
            print("‚úÖ Whisper Language Detection Model Loaded")
            
            # Language code mapping for Indian languages and common languages
            self.language_map = {
                'hi': 'Hindi',
                'bn': 'Bengali', 
                'te': 'Telugu',
                'mr': 'Marathi',
                'ta': 'Tamil',
                'gu': 'Gujarati',
                'kn': 'Kannada',
                'ml': 'Malayalam',
                'or': 'Odia',
                'pa': 'Punjabi',
                'as': 'Assamese',
                'ur': 'Urdu',
                'en': 'English',
                'ne': 'Nepali',
                'si': 'Sinhala',
                'sa': 'Sanskrit',
                'sd': 'Sindhi',
                'ks': 'Kashmiri'
            }
            
        except Exception as e:
            print(f"‚ö†Ô∏è  Whisper Model Load Failed: {e}")
            print("   Running without language detection")
            self.lang_ready = False

        # --- PHYSICS ENGINE PARAMETERS ---
        self.CV_AI_THRESHOLD = 0.20
        self.CV_HUMAN_THRESHOLD = 0.32
        self.INTENSITY_MIN_STD = 0.05
        self.INTENSITY_MAX_STD = 0.15
        
        print("‚úÖ Hybrid Detector Ready\n")

    # ==========================================================
    # HELPER: Audio Preprocessing
    # ==========================================================
    def preprocess_audio(self, audio_path, target_sr=16000):
        """
        Load and preprocess audio:
        1. Load audio
        2. Convert to mono
        3. Truncate to max_duration if needed
        4. Resample to target_sr
        
        Args:
            audio_path: Path to audio file
            target_sr: Target sample rate
            
        Returns:
            tuple: (waveform_array, sample_rate, duration, was_truncated)
        """
        try:
            # Load audio
            y, sr = librosa.load(audio_path, sr=None, mono=True)
            
            # Calculate duration
            duration = len(y) / sr
            was_truncated = False
            
            # Truncate if longer than max_duration
            if duration > self.max_duration:
                print(f"   ‚ö†Ô∏è  Audio is {duration:.1f}s, truncating to {self.max_duration}s")
                max_samples = int(self.max_duration * sr)
                y = y[:max_samples]
                duration = self.max_duration
                was_truncated = True
            
            # Resample if needed
            if sr != target_sr:
                y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
                sr = target_sr
            
            return y, sr, duration, was_truncated
            
        except Exception as e:
            raise ValueError(f"Failed to preprocess audio: {str(e)}")

    # ==========================================================
    # HELPER: Base64 Decoding
    # ==========================================================
    def decode_base64_audio(self, base64_string):
        """
        Decode base64 audio and save to temporary file
        
        Args:
            base64_string: Base64 encoded audio data
            
        Returns:
            str: Path to temporary audio file
        """
        try:
            # Decode base64
            audio_data = base64.b64decode(base64_string)
            
            # Create temporary file
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
            temp_file.write(audio_data)
            temp_file.close()
            
            return temp_file.name
            
        except Exception as e:
            raise ValueError(f"Failed to decode base64 audio: {str(e)}")

    # ==========================================================
    # LANGUAGE DETECTION (FIXED)
    # ==========================================================
    def detect_language(self, audio_path):
        """
        Detect language using Whisper model - FIXED VERSION
        
        Args:
            audio_path: Path to audio file
            
        Returns:
            str: Detected language name
        """
        if not self.lang_ready:
            return "Unknown"
        
        try:
            # Load and preprocess audio for Whisper (uses 16kHz)
            # Use first 30 seconds for language detection
            audio, sr = librosa.load(audio_path, sr=16000, mono=True, duration=30)
            
            # Process audio with Whisper processor
            input_features = self.whisper_processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt"
            ).input_features
            
            input_features = input_features.to(self.device)
            
            # Whisper language detection using forced_decoder_ids
            with torch.no_grad():
                # Generate with language detection enabled
                # Set task to "transcribe" and let Whisper detect language
                generated_ids = self.whisper_model.generate(
                    input_features,
                    task="transcribe",
                    return_dict_in_generate=True
                )
                
                # Decode the output
                transcription = self.whisper_processor.batch_decode(
                    generated_ids.sequences,
                    skip_special_tokens=True
                )[0]
                
                # Extract language from generated tokens
                # Whisper embeds language token at the beginning
                full_output = self.whisper_processor.batch_decode(
                    generated_ids.sequences,
                    skip_special_tokens=False
                )[0]
                
                # Parse language from special tokens
                # Format: <|startoftranscript|><|en|><|transcribe|>...
                detected_lang = None
                
                # Look for language tokens in the format <|xx|>
                import re
                lang_pattern = r'<\|([a-z]{2})\|>'
                matches = re.findall(lang_pattern, full_output)
                
                if matches:
                    # First match after startoftranscript is usually the language
                    for match in matches:
                        if match in self.language_map:
                            detected_lang = match
                            break
                
                if detected_lang:
                    lang_name = self.language_map.get(detected_lang, detected_lang.upper())
                    print(f"   üåê Detected Language: {lang_name} ({detected_lang})")
                    return lang_name
                else:
                    # Fallback: Try alternate method using model's internal language detection
                    # This method uses the log probabilities
                    print(f"   üåê Language: Analyzing transcription...")
                    
                    # Simple heuristic: if transcription has content, likely English or detected language
                    if len(transcription.strip()) > 0:
                        # Default to English if we can transcribe but can't detect language
                        print(f"   üåê Detected Language: English (default)")
                        return "English"
                    else:
                        return "Unknown"
                    
        except Exception as e:
            print(f"   ‚ö†Ô∏è  Language detection error: {str(e)}")
            # Fallback: Try simple language detection based on transcription
            try:
                # Simplified approach
                audio, sr = librosa.load(audio_path, sr=16000, mono=True, duration=30)
                input_features = self.whisper_processor(
                    audio,
                    sampling_rate=16000,
                    return_tensors="pt"
                ).input_features.to(self.device)
                
                # Just generate transcription
                with torch.no_grad():
                    predicted_ids = self.whisper_model.generate(input_features)
                    transcription = self.whisper_processor.batch_decode(
                        predicted_ids, 
                        skip_special_tokens=True
                    )[0]
                
                # If we got transcription, assume English
                if len(transcription.strip()) > 0:
                    print(f"   üåê Detected Language: English (from transcription)")
                    return "English"
                    
            except:
                pass
            
            return "Unknown"

    # ==========================================================
    # PART A: PHYSICS ENGINE
    # ==========================================================
    def get_linear_score(self, val, min_val, max_val):
        """Linear interpolation for scoring"""
        if val <= min_val:
            return 1.0
        if val >= max_val:
            return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def get_physics_score(self, audio_path):
        """
        Analyze audio using physics-based acoustic features
        
        Returns:
            tuple: (ai_score, method, features_dict)
        """
        try:
            # Load and preprocess audio
            y, sr, duration, was_truncated = self.preprocess_audio(audio_path, target_sr=None)
            
            # Robust pitch tracking using PYIN
            f0, voiced_flag, voiced_probs = librosa.pyin(
                y, 
                fmin=50, 
                fmax=400, 
                sr=sr
            )
            valid_f0 = f0[~np.isnan(f0)]
            
            if len(valid_f0) == 0:
                return 0.0, "No voice detected", {'duration': duration, 'was_truncated': was_truncated}

            # Extract acoustic features
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            # Calculate feature metrics
            feats = {
                'pitch_cv': std_pitch / mean_pitch if mean_pitch > 0 else 0,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid),
                'mean_pitch': mean_pitch,
                'std_pitch': std_pitch,
                'duration': duration,
                'was_truncated': was_truncated
            }

            # Individual feature scores (higher = more AI-like)
            intensity_score = self.get_linear_score(
                feats['intensity_std'], 
                self.INTENSITY_MIN_STD, 
                self.INTENSITY_MAX_STD
            )
            
            pitch_score = self.get_linear_score(
                feats['pitch_cv'], 
                self.CV_AI_THRESHOLD, 
                self.CV_HUMAN_THRESHOLD
            )
            
            skew_score = self.get_linear_score(
                abs(feats['freq_skew']), 
                0.1, 
                1.0
            )

            # Weighted combination
            W_INTENSITY = 0.40
            W_PITCH = 0.40
            W_SKEW = 0.20
            
            base_score = (
                intensity_score * W_INTENSITY + 
                pitch_score * W_PITCH + 
                skew_score * W_SKEW
            )

            # Synergy bonus: if both intensity and pitch are suspicious
            if intensity_score > 0.4 and pitch_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            return 0.0, f"Physics Error: {str(e)}", {'duration': 0, 'was_truncated': False}

    # ==========================================================
    # PART B: DEEP LEARNING ENGINE
    # ==========================================================
    def get_dl_score(self, audio_path):
        """
        Analyze audio using deep learning model
        
        Returns:
            tuple: (ai_score, label)
        """
        if not self.dl_ready:
            return 0.0, "Model not loaded"

        try:
            # Load and preprocess audio
            waveform_np, sr, duration, was_truncated = self.preprocess_audio(audio_path, target_sr=16000)

            # Process with feature extractor
            inputs = self.feature_extractor(
                waveform_np,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )
            
            # Move to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                outputs = self.dl_model(**inputs)
                logits = outputs.logits
                probs = F.softmax(logits, dim=-1)
                
            # Get predictions
            # Class 0: Real, Class 1: Fake
            prob_real = probs[0][0].item()
            prob_fake = probs[0][1].item()
            
            # AI score is the fake probability
            ai_score = prob_fake
            
            label = "Fake/Deepfake" if prob_fake > 0.5 else "Real/Human"

            return round(ai_score, 3), label

        except Exception as e:
            return 0.0, f"DL Error: {str(e)}"

    # ==========================================================
    # PART C: EXPLANATION GENERATOR
    # ==========================================================
    def generate_explanation(self, final_score, phys_score, dl_score, dl_label, phys_feats):
        """
        Generate human-readable explanation for the classification
        
        Returns:
            str: Explanation text
        """
        explanations = []
        
        if final_score > 0.55:
            # AI GENERATED
            
            # Deep Learning contributions
            if dl_score > 0.55 and self.dl_ready:
                if "Fake" in dl_label or "Deepfake" in dl_label:
                    explanations.append(
                        f"Deep learning model detected synthetic voice patterns "
                        f"(confidence: {dl_score*100:.1f}%)"
                    )
            
            # Physics contributions
            if phys_score > 0.55:
                p_cv = phys_feats.get('pitch_cv', 0)
                i_std = phys_feats.get('intensity_std', 0)
                
                if i_std < 0.06:
                    explanations.append(
                        f"Unnaturally consistent energy levels detected "
                        f"(std: {i_std:.3f}, expected: >0.06)"
                    )
                
                if p_cv < 0.22:
                    explanations.append(
                        f"Robotic pitch modulation patterns "
                        f"(CV: {p_cv:.2f}, expected: >0.22)"
                    )
                
                if not explanations or (i_std >= 0.06 and p_cv >= 0.22):
                    explanations.append(
                        "Acoustic parameters lack natural human variability"
                    )
            
            if not explanations:
                explanations.append(
                    "Voice exhibits characteristics consistent with AI generation"
                )
                
        else:
            # HUMAN
            explanations.append(
                "Voice exhibits natural acoustic variability and human speech characteristics"
            )
        
        return "; ".join(explanations)

    # ==========================================================
    # PART D: MAIN ANALYSIS FUNCTION
    # ==========================================================
    def analyze(self, audio_input, input_type="file"):
        """
        Main analysis function with configurable input types
        
        Args:
            audio_input: Either file path or base64 string
            input_type: "file" or "base64"
            
        Returns:
            dict: Analysis results following API response format
        """
        temp_file = None
        
        try:
            # Handle input type
            if input_type == "base64":
                temp_file = self.decode_base64_audio(audio_input)
                audio_path = temp_file
            elif input_type == "file":
                audio_path = audio_input
                if not os.path.exists(audio_path):
                    return {
                        "status": "error",
                        "error": f"Audio file not found: {audio_path}"
                    }
            else:
                return {
                    "status": "error",
                    "error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
                }

            print(f"üéµ Analyzing: {os.path.basename(audio_path)}")

            # 1. Detect Language
            detected_language = self.detect_language(audio_path)

            # 2. Run Physics Analysis
            phys_score, phys_method, phys_feats = self.get_physics_score(audio_path)
            
            # 3. Run Deep Learning Analysis
            dl_score, dl_label = self.get_dl_score(audio_path)

            # 4. Calculate weighted ensemble score
            final_score = (
                self.physics_weight * phys_score + 
                self.dl_weight * dl_score
            )
            
            # Round to 2 decimal places
            final_score = round(final_score, 2)
            
            # 5. Determine classification
            classification = "AI_GENERATED" if final_score > 0.55 else "HUMAN"
            
            # 6. Generate explanation
            explanation = self.generate_explanation(
                final_score, 
                phys_score, 
                dl_score, 
                dl_label, 
                phys_feats
            )

            # 7. Return API-compliant response
            return {
                "status": "success",
                "language": detected_language,
                "classification": classification,
                "confidenceScore": final_score,
                "explanation": explanation,
                "debug": {
                    "physics_score": phys_score,
                    "dl_score": dl_score,
                    "dl_label": dl_label,
                    "physics_weight": f"{self.physics_weight*100:.0f}%",
                    "dl_weight": f"{self.dl_weight*100:.0f}%",
                    "audio_duration": phys_feats.get('duration', 0),
                    "was_truncated": phys_feats.get('was_truncated', False),
                    "physics_features": {k: v for k, v in phys_feats.items() if k not in ['duration', 'was_truncated']}
                }
            }
            
        except Exception as e:
            import traceback
            return {
                "status": "error",
                "error": str(e),
                "traceback": traceback.format_exc()
            }
            
        finally:
            # Clean up temporary file
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except:
                    pass

    # ==========================================================
    # UTILITY: Update Weights
    # ==========================================================
    def update_weights(self, physics_weight, dl_weight):
        """
        Update ensemble weights dynamically
        
        Args:
            physics_weight: New physics weight (0-1)
            dl_weight: New DL weight (0-1)
        """
        total = physics_weight + dl_weight
        self.physics_weight = physics_weight / total
        self.dl_weight = dl_weight / total
        
        print(f"‚öôÔ∏è  Weights updated:")
        print(f"   Physics: {self.physics_weight*100:.0f}%")
        print(f"   DL: {self.dl_weight*100:.0f}%")


# ==========================================================
# USAGE EXAMPLES
# ==========================================================
if __name__ == "__main__":
    
    # Example 1: Initialize with 40-60 split (Physics-DL)
    print("="*70)
    print("EXAMPLE 1: Initialize Hybrid Detector with Language Detection")
    print("="*70)
    
    detector = HybridEnsembleDetector(
        deepfake_model_path="wav2vec2-deepfake-voice-detector",
        whisper_model_path="openai/whisper-base",
        physics_weight=0.4,
        dl_weight=0.6,
        use_local_deepfake_model=True,
        use_local_whisper_model=False,
        max_audio_duration=30  # Truncate to 30 seconds
    )
    
    # Test with file path
    audio_file = r"sample voice 1.mp3"
    result = detector.analyze(audio_file, input_type="file")
    
    print(f"\nüìä Result:")
    print(f"   Status: {result['status']}")
    if result['status'] == 'success':
        print(f"   Language: {result['language']}")
        print(f"   Classification: {result['classification']}")
        print(f"   Confidence: {result['confidenceScore']}")
        print(f"   Explanation: {result['explanation']}")
        print(f"\nüîç Debug Info:")
        for key, val in result.get('debug', {}).items():
            if key != 'physics_features':
                print(f"   {key}: {val}")
    else:
        print(f"   Error: {result.get('error')}")
    
    
    # Example 2: Batch processing
    print("\n\n" + "="*70)
    print("EXAMPLE 2: Batch Processing with Language Detection")
    print("="*70)
    
    test_files = [
        r"voice_preview_faiq - standard, clear and neutral.mp3",
        r"clova.mp3",
        r"sample voice 1.mp3",
        r"voice_preview_mukundan - formal and clear.mp3",
        r"voice_preview_kanika - soft, smooth and muffled.mp3",
        r"medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3",
        r"voice_preview_tarini - soft, cheerful and expressive.mp3"
    ]
    
    print(f"\n{'Filename':<40} {'Language':<15} {'Classification':<15} {'Confidence'}")
    print("-"*80)
    
    for file_path in test_files:
        if os.path.exists(file_path):
            result = detector.analyze(file_path, input_type="file")
            
            if result['status'] == 'success':
                filename = os.path.basename(file_path)[:37] + "..."
                print(f"{filename:<40} {result['language']:<15} {result['classification']:<15} {result['confidenceScore']:.2f}")
                print(f"\nüîç Debug Info:")
                for key, val in result.get('debug', {}).items():
                    print(f"   {key}: {val}")

EXAMPLE 1: Initialize Hybrid Detector with Language Detection
üîß Initializing Hybrid Detector with Language Detection
   Device: cpu
   Physics Weight: 40%
   DL Weight: 60%
   Max Audio Duration: 30s
üì• Loading deepfake detection model from 'wav2vec2-deepfake-voice-detector'...
‚úÖ Deepfake Detection Model Loaded
üì• Loading Whisper model for language detection from 'openai/whisper-base'...
‚úÖ Whisper Language Detection Model Loaded
‚úÖ Hybrid Detector Ready

üéµ Analyzing: sample voice 1.mp3
   üåê Detected Language: English (en)

üìä Result:
   Status: success
   Language: English
   Classification: AI_GENERATED
   Confidence: 0.59
   Explanation: Deep learning model detected synthetic voice patterns (confidence: 98.5%)

üîç Debug Info:
   physics_score: 0.0
   dl_score: 0.985
   dl_label: Fake/Deepfake
   physics_weight: 40%
   dl_weight: 60%
   audio_duration: 0
   was_truncated: False


EXAMPLE 2: Batch Processing with Language Detection

Filename                       

In [None]:
import torch
import librosa
import numpy as np
import scipy.stats as stats
import torch.nn.functional as F
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration
import base64
import io
import tempfile
import os
import soundfile as sf
import warnings

# Suppress librosa warnings
warnings.filterwarnings('ignore')

class HybridEnsembleDetector:
    """
    Hybrid AI Voice Detection System with Language Detection
    
    Features:
    1. Physics-based acoustic analysis
    2. Deep Learning deepfake detection
    3. Language identification using Whisper (focus on Indian languages)
    4. Auto-truncation to 30 seconds for faster processing
    """
    
    def __init__(
        self, 
        deepfake_model_path="garystafford/wav2vec2-deepfake-voice-detector",
        whisper_model_path="openai/whisper-base",
        physics_weight=0.4,
        dl_weight=0.6,
        use_local_deepfake_model=False,
        use_local_whisper_model=False,
        max_audio_duration=30  # seconds
    ):
        """
        Initialize the hybrid detector
        
        Args:
            deepfake_model_path: Path to deepfake detection model
            whisper_model_path: Path to Whisper model for language detection
            physics_weight: Weight for physics score (0-1)
            dl_weight: Weight for DL score (0-1)
            use_local_deepfake_model: Whether to load deepfake model from local path
            use_local_whisper_model: Whether to load Whisper from local path
            max_audio_duration: Maximum audio duration to process (seconds)
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.max_duration = max_audio_duration
        
        # Normalize weights
        total_weight = physics_weight + dl_weight
        self.physics_weight = physics_weight / total_weight
        self.dl_weight = dl_weight / total_weight
        
        print(f"üîß Initializing Hybrid Detector with Language Detection")
        print(f"   Device: {self.device}")
        print(f"   Physics Weight: {self.physics_weight*100:.0f}%")
        print(f"   DL Weight: {self.dl_weight*100:.0f}%")
        print(f"   Max Audio Duration: {self.max_duration}s")
        
        # --- LOAD DEEPFAKE DETECTION MODEL ---
        try:
            print(f"üì• Loading deepfake detection model from '{deepfake_model_path}'...")
            
            if use_local_deepfake_model:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(
                    deepfake_model_path, 
                    local_files_only=True
                )
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    deepfake_model_path, 
                    local_files_only=True
                )
            else:
                self.dl_model = AutoModelForAudioClassification.from_pretrained(deepfake_model_path)
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(deepfake_model_path)
            
            self.dl_model.to(self.device)
            self.dl_model.eval()
            self.dl_ready = True
            print("‚úÖ Deepfake Detection Model Loaded")
            
        except Exception as e:
            print(f"‚ö†Ô∏è  DL Model Load Failed: {e}")
            print("   Running in Physics-Only mode")
            self.dl_ready = False
            self.dl_weight = 0
            self.physics_weight = 1.0

        # --- LOAD WHISPER FOR LANGUAGE DETECTION ---
        try:
            print(f"üì• Loading Whisper model for language detection from '{whisper_model_path}'...")
            
            if use_local_whisper_model:
                self.whisper_processor = WhisperProcessor.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
                    whisper_model_path,
                    local_files_only=True
                )
            else:
                self.whisper_processor = WhisperProcessor.from_pretrained(whisper_model_path)
                self.whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_path)
            
            self.whisper_model.to(self.device)
            self.whisper_model.eval()
            self.lang_ready = True
            print("‚úÖ Whisper Language Detection Model Loaded")
            
            # Language code mapping for Indian languages and common languages
            self.language_map = {
                'hi': 'Hindi',
                'bn': 'Bengali', 
                'te': 'Telugu',
                'mr': 'Marathi',
                'ta': 'Tamil',
                'gu': 'Gujarati',
                'kn': 'Kannada',
                'ml': 'Malayalam',
                'or': 'Odia',
                'pa': 'Punjabi',
                'as': 'Assamese',
                'ur': 'Urdu',
                'en': 'English',
                'ne': 'Nepali',
                'si': 'Sinhala',
                'sa': 'Sanskrit',
                'sd': 'Sindhi',
                'ks': 'Kashmiri'
            }
            
        except Exception as e:
            print(f"‚ö†Ô∏è  Whisper Model Load Failed: {e}")
            print("   Running without language detection")
            self.lang_ready = False

        # --- PHYSICS ENGINE PARAMETERS ---
        self.CV_AI_THRESHOLD = 0.20
        self.CV_HUMAN_THRESHOLD = 0.32
        self.INTENSITY_MIN_STD = 0.05
        self.INTENSITY_MAX_STD = 0.15
        
        print("‚úÖ Hybrid Detector Ready\n")

    # ==========================================================
    # HELPER: Audio Preprocessing
    # ==========================================================
    def preprocess_audio(self, audio_path, target_sr=16000):
        """
        Load and preprocess audio:
        1. Load audio
        2. Convert to mono
        3. Truncate to max_duration if needed
        4. Resample to target_sr
        
        Args:
            audio_path: Path to audio file
            target_sr: Target sample rate
            
        Returns:
            tuple: (waveform_array, sample_rate, duration, was_truncated)
        """
        try:
            # Load audio
            y, sr = librosa.load(audio_path, sr=None, mono=True)
            
            # Calculate duration
            duration = len(y) / sr
            was_truncated = False
            
            # Truncate if longer than max_duration
            if duration > self.max_duration:
                print(f"   ‚ö†Ô∏è  Audio is {duration:.1f}s, truncating to {self.max_duration}s")
                max_samples = int(self.max_duration * sr)
                y = y[:max_samples]
                duration = self.max_duration
                was_truncated = True
            
            # Resample if needed
            if sr != target_sr:
                y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
                sr = target_sr
            
            return y, sr, duration, was_truncated
            
        except Exception as e:
            raise ValueError(f"Failed to preprocess audio: {str(e)}")

    # ==========================================================
    # HELPER: Base64 Decoding
    # ==========================================================
    def decode_base64_audio(self, base64_string):
        """
        Decode base64 audio and save to temporary file
        
        Args:
            base64_string: Base64 encoded audio data
            
        Returns:
            str: Path to temporary audio file
        """
        try:
            # Decode base64
            audio_data = base64.b64decode(base64_string)
            
            # Create temporary file
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
            temp_file.write(audio_data)
            temp_file.close()
            
            return temp_file.name
            
        except Exception as e:
            raise ValueError(f"Failed to decode base64 audio: {str(e)}")

    # ==========================================================
    # LANGUAGE DETECTION
    # ==========================================================
    def detect_language(self, audio_path):
        """
        Detect language using Whisper model
        
        Args:
            audio_path: Path to audio file
            
        Returns:
            str: Detected language name
        """
        if not self.lang_ready:
            return "Unknown"
        
        try:
            # Load and preprocess audio for Whisper (uses 16kHz)
            # Use first 30 seconds for language detection
            audio, sr = librosa.load(audio_path, sr=16000, mono=True, duration=30)
            
            # Process audio with Whisper processor
            input_features = self.whisper_processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt"
            ).input_features
            
            input_features = input_features.to(self.device)
            
            # Whisper language detection using forced_decoder_ids
            with torch.no_grad():
                # Generate with language detection enabled
                generated_ids = self.whisper_model.generate(
                    input_features,
                    task="transcribe",
                    return_dict_in_generate=True
                )
                
                # Decode the output
                full_output = self.whisper_processor.batch_decode(
                    generated_ids.sequences,
                    skip_special_tokens=False
                )[0]
                
                # Parse language from special tokens
                # Format: <|startoftranscript|><|en|><|transcribe|>...
                detected_lang = None
                
                # Look for language tokens in the format <|xx|>
                import re
                lang_pattern = r'<\|([a-z]{2})\|>'
                matches = re.findall(lang_pattern, full_output)
                
                if matches:
                    # First match after startoftranscript is usually the language
                    for match in matches:
                        if match in self.language_map:
                            detected_lang = match
                            break
                
                if detected_lang:
                    lang_name = self.language_map.get(detected_lang, detected_lang.upper())
                    print(f"   üåê Detected Language: {lang_name} ({detected_lang})")
                    return lang_name
                else:
                    # Fallback: if transcription successful, assume English
                    transcription = self.whisper_processor.batch_decode(
                        generated_ids.sequences,
                        skip_special_tokens=True
                    )[0]
                    
                    if len(transcription.strip()) > 0:
                        print(f"   üåê Detected Language: English (default)")
                        return "English"
                    else:
                        return "Unknown"
                    
        except Exception as e:
            print(f"   ‚ö†Ô∏è  Language detection error: {str(e)}")
            return "Unknown"

    # ==========================================================
    # PART A: PHYSICS ENGINE (FIXED)
    # ==========================================================
    def get_linear_score(self, val, min_val, max_val):
        """Linear interpolation for scoring"""
        if val <= min_val:
            return 1.0
        if val >= max_val:
            return 0.0
        return 1.0 - ((val - min_val) / (max_val - min_val))

    def get_physics_score(self, audio_path):
        """
        Analyze audio using physics-based acoustic features
        
        Returns:
            tuple: (ai_score, method, features_dict)
        """
        try:
            # Load audio at NATIVE sample rate (don't resample for physics analysis)
            y, sr = librosa.load(audio_path, sr=None, mono=True)
            
            # Calculate original duration
            duration = len(y) / sr
            was_truncated = False
            
            # Truncate if needed
            if duration > self.max_duration:
                max_samples = int(self.max_duration * sr)
                y = y[:max_samples]
                duration = self.max_duration
                was_truncated = True
            
            print(f"   üî¨ Running physics analysis on {duration:.1f}s audio at {sr}Hz")
            
            # Robust pitch tracking using PYIN
            try:
                f0, voiced_flag, voiced_probs = librosa.pyin(
                    y, 
                    fmin=librosa.note_to_hz('C2'),  # ~65 Hz
                    fmax=librosa.note_to_hz('C7'),  # ~2093 Hz
                    sr=sr,
                    frame_length=2048
                )
                valid_f0 = f0[~np.isnan(f0)]
            except Exception as pitch_error:
                print(f"   ‚ö†Ô∏è  Pitch detection failed: {pitch_error}, using fallback method")
                # Fallback: use simpler pitch detection
                valid_f0 = np.array([])
            
            if len(valid_f0) < 10:  # Need at least 10 valid pitch points
                print(f"   ‚ö†Ô∏è  Insufficient pitch data ({len(valid_f0)} points), using alternative features")
                # Fall back to non-pitch features
                rms = librosa.feature.rms(y=y)[0]
                centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
                zcr = librosa.feature.zero_crossing_rate(y)[0]
                
                feats = {
                    'pitch_cv': 0.25,  # Neutral value
                    'intensity_std': np.std(rms),
                    'freq_skew': stats.skew(centroid),
                    'zcr_std': np.std(zcr),
                    'mean_pitch': 0,
                    'std_pitch': 0,
                    'duration': duration,
                    'was_truncated': was_truncated
                }
                
                # Score based on available features
                intensity_score = self.get_linear_score(
                    feats['intensity_std'], 
                    self.INTENSITY_MIN_STD, 
                    self.INTENSITY_MAX_STD
                )
                
                zcr_score = self.get_linear_score(
                    feats['zcr_std'],
                    0.01,
                    0.08
                )
                
                skew_score = self.get_linear_score(
                    abs(feats['freq_skew']), 
                    0.1, 
                    1.0
                )
                
                # Weighted combination (no pitch)
                final_score = (intensity_score * 0.5 + zcr_score * 0.2 + skew_score * 0.3)
                
                print(f"   üî¨ Physics score (no pitch): {final_score:.3f}")
                return round(final_score, 3), "Physics Analysis (Limited)", feats

            # Full analysis with pitch
            rms = librosa.feature.rms(y=y)[0]
            centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            
            mean_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            
            # Calculate feature metrics
            feats = {
                'pitch_cv': std_pitch / mean_pitch if mean_pitch > 0 else 0,
                'intensity_std': np.std(rms),
                'freq_skew': stats.skew(centroid),
                'mean_pitch': mean_pitch,
                'std_pitch': std_pitch,
                'duration': duration,
                'was_truncated': was_truncated
            }

            # Individual feature scores (higher = more AI-like)
            intensity_score = self.get_linear_score(
                feats['intensity_std'], 
                self.INTENSITY_MIN_STD, 
                self.INTENSITY_MAX_STD
            )
            
            pitch_score = self.get_linear_score(
                feats['pitch_cv'], 
                self.CV_AI_THRESHOLD, 
                self.CV_HUMAN_THRESHOLD
            )
            
            skew_score = self.get_linear_score(
                abs(feats['freq_skew']), 
                0.1, 
                1.0
            )

            # Weighted combination
            W_INTENSITY = 0.40
            W_PITCH = 0.40
            W_SKEW = 0.20
            
            base_score = (
                intensity_score * W_INTENSITY + 
                pitch_score * W_PITCH + 
                skew_score * W_SKEW
            )

            # Synergy bonus: if both intensity and pitch are suspicious
            if intensity_score > 0.4 and pitch_score > 0.4:
                final_score = min(base_score + 0.15, 1.0)
            else:
                final_score = base_score

            print(f"   üî¨ Physics score: {final_score:.3f} (intensity:{intensity_score:.2f}, pitch:{pitch_score:.2f})")
            return round(final_score, 3), "Physics Analysis", feats

        except Exception as e:
            print(f"   ‚ùå Physics analysis failed: {str(e)}")
            import traceback
            traceback.print_exc()
            return 0.0, f"Physics Error: {str(e)}", {'duration': 0, 'was_truncated': False}

    # ==========================================================
    # PART B: DEEP LEARNING ENGINE
    # ==========================================================
    def get_dl_score(self, audio_path):
        """
        Analyze audio using deep learning model
        
        Returns:
            tuple: (ai_score, label)
        """
        if not self.dl_ready:
            return 0.0, "Model not loaded"

        try:
            # Load and preprocess audio
            waveform_np, sr, duration, was_truncated = self.preprocess_audio(audio_path, target_sr=16000)

            # Process with feature extractor
            inputs = self.feature_extractor(
                waveform_np,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )
            
            # Move to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                outputs = self.dl_model(**inputs)
                logits = outputs.logits
                probs = F.softmax(logits, dim=-1)
                
            # Get predictions
            # Class 0: Real, Class 1: Fake
            prob_real = probs[0][0].item()
            prob_fake = probs[0][1].item()
            
            # AI score is the fake probability
            ai_score = prob_fake
            
            label = "Fake/Deepfake" if prob_fake > 0.5 else "Real/Human"

            return round(ai_score, 3), label

        except Exception as e:
            print(f"   ‚ùå DL analysis failed: {str(e)}")
            return 0.0, f"DL Error: {str(e)}"

    # ==========================================================
    # PART C: EXPLANATION GENERATOR
    # ==========================================================
    def generate_explanation(self, final_score, phys_score, dl_score, dl_label, phys_feats):
        """
        Generate human-readable explanation for the classification
        
        Returns:
            str: Explanation text
        """
        explanations = []
        
        if final_score > 0.55:
            # AI GENERATED
            
            # Deep Learning contributions
            if dl_score > 0.55 and self.dl_ready:
                if "Fake" in dl_label or "Deepfake" in dl_label:
                    explanations.append(
                        f"Deep learning model detected synthetic voice patterns "
                        f"(confidence: {dl_score*100:.1f}%)"
                    )
            
            # Physics contributions
            if phys_score > 0.55:
                p_cv = phys_feats.get('pitch_cv', 0)
                i_std = phys_feats.get('intensity_std', 0)
                
                if i_std < 0.06:
                    explanations.append(
                        f"Unnaturally consistent energy levels detected "
                        f"(std: {i_std:.3f}, expected: >0.06)"
                    )
                
                if p_cv < 0.22 and p_cv > 0:
                    explanations.append(
                        f"Robotic pitch modulation patterns "
                        f"(CV: {p_cv:.2f}, expected: >0.22)"
                    )
                
                if not explanations or (i_std >= 0.06 and p_cv >= 0.22):
                    explanations.append(
                        "Acoustic parameters lack natural human variability"
                    )
            
            if not explanations:
                explanations.append(
                    "Voice exhibits characteristics consistent with AI generation"
                )
                
        else:
            # HUMAN
            explanations.append(
                "Voice exhibits natural acoustic variability and human speech characteristics"
            )
        
        return "; ".join(explanations)

    # ==========================================================
    # PART D: MAIN ANALYSIS FUNCTION
    # ==========================================================
    def analyze(self, audio_input, input_type="file"):
        """
        Main analysis function with configurable input types
        
        Args:
            audio_input: Either file path or base64 string
            input_type: "file" or "base64"
            
        Returns:
            dict: Analysis results following API response format
        """
        temp_file = None
        
        try:
            # Handle input type
            if input_type == "base64":
                temp_file = self.decode_base64_audio(audio_input)
                audio_path = temp_file
            elif input_type == "file":
                audio_path = audio_input
                if not os.path.exists(audio_path):
                    return {
                        "status": "error",
                        "error": f"Audio file not found: {audio_path}"
                    }
            else:
                return {
                    "status": "error",
                    "error": f"Invalid input_type: {input_type}. Use 'file' or 'base64'"
                }

            print(f"üéµ Analyzing: {os.path.basename(audio_path)}")

            # 1. Detect Language
            detected_language = self.detect_language(audio_path)

            # 2. Run Physics Analysis
            phys_score, phys_method, phys_feats = self.get_physics_score(audio_path)
            
            # 3. Run Deep Learning Analysis
            dl_score, dl_label = self.get_dl_score(audio_path)

            # 4. Calculate weighted ensemble score
            final_score = (
                self.physics_weight * phys_score + 
                self.dl_weight * dl_score
            )
            
            # Round to 2 decimal places
            final_score = round(final_score, 2)
            
            # 5. Determine classification
            classification = "AI_GENERATED" if final_score > 0.55 else "HUMAN"
            
            # 6. Generate explanation
            explanation = self.generate_explanation(
                final_score, 
                phys_score, 
                dl_score, 
                dl_label, 
                phys_feats
            )

            # 7. Return API-compliant response
            return {
                "status": "success",
                "language": detected_language,
                "classification": classification,
                "confidenceScore": final_score,
                "explanation": explanation,
                "debug": {
                    "physics_score": phys_score,
                    "dl_score": dl_score,
                    "dl_label": dl_label,
                    "physics_weight": f"{self.physics_weight*100:.0f}%",
                    "dl_weight": f"{self.dl_weight*100:.0f}%",
                    "audio_duration": phys_feats.get('duration', 0),
                    "was_truncated": phys_feats.get('was_truncated', False),
                    "physics_features": {k: v for k, v in phys_feats.items() if k not in ['duration', 'was_truncated']}
                }
            }
            
        except Exception as e:
            import traceback
            return {
                "status": "error",
                "error": str(e),
                "traceback": traceback.format_exc()
            }
            
        finally:
            # Clean up temporary file
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except:
                    pass

    # ==========================================================
    # UTILITY: Update Weights
    # ==========================================================
    def update_weights(self, physics_weight, dl_weight):
        """
        Update ensemble weights dynamically
        
        Args:
            physics_weight: New physics weight (0-1)
            dl_weight: New DL weight (0-1)
        """
        total = physics_weight + dl_weight
        self.physics_weight = physics_weight / total
        self.dl_weight = dl_weight / total
        
        print(f"‚öôÔ∏è  Weights updated:")
        print(f"   Physics: {self.physics_weight*100:.0f}%")
        print(f"   DL: {self.dl_weight*100:.0f}%")


# ==========================================================
# USAGE EXAMPLES
# ==========================================================
if __name__ == "__main__":
    
    # Example 1: Initialize with 40-60 split (Physics-DL)
    print("="*70)
    print("EXAMPLE 1: Initialize Hybrid Detector with Language Detection")
    print("="*70)
    
    detector = HybridEnsembleDetector(
        deepfake_model_path="garystafford/wav2vec2-deepfake-voice-detector",
        whisper_model_path="openai/whisper-base",
        physics_weight=0.4,
        dl_weight=0.6,
        use_local_deepfake_model=False,
        use_local_whisper_model=False,
        max_audio_duration=30  # Truncate to 30 seconds
    )
    
    # Test with file path
    audio_file = r"test_audio\human\medieval-gamer-voice-you-can-view-our-website-at-the-link-below-228410.mp3"
    result = detector.analyze(audio_file, input_type="file")
    
    print(f"\nüìä Result:")
    print(f"   Status: {result['status']}")
    if result['status'] == 'success':
        print(f"   Language: {result['language']}")
        print(f"   Classification: {result['classification']}")
        print(f"   Confidence: {result['confidenceScore']}")
        print(f"   Explanation: {result['explanation']}")
        print(f"\nüîç Debug Info:")
        for key, val in result.get('debug', {}).items():
            if key != 'physics_features':
                print(f"   {key}: {val}")
    else:
        print(f"   Error: {result.get('error')}")
    
    
    # Example 2: Batch processing
    print("\n\n" + "="*70)
    print("EXAMPLE 2: Batch Processing with Language Detection")
    print("="*70)
    
    test_files = [

        r"test_audio\AI\voice_preview_faiq - standard, clear and neutral.mp3",
        r"test_audio\AI\clova.mp3",
        r"test_audio\AI\sample voice 1.mp3",
        r"test_audio\AI\voice_preview_mukundan - formal and clear.mp3",
        r"test_audio\AI\voice_preview_kanika - soft, smooth and muffled.mp3",
        r"test_audio\AI\medieval-gamer-voice-darkness-hunts-us-what-youx27ve-learned-stay-226596.mp3",
        r"test_audio\AI\voice_preview_tarini - soft, cheerful and expressive.mp3"
    ]
    
    print(f"\n{'Filename':<40} {'Language':<15} {'Classification':<15} {'Confidence'}")
    print("-"*80)
    
    for file_path in test_files:
        if os.path.exists(file_path):
            result = detector.analyze(file_path, input_type="file")
            
            if result['status'] == 'success':
                filename = os.path.basename(file_path)[:37] + "..."
                print(f"{filename:<40} {result['language']:<15} {result['classification']:<15} {result['confidenceScore']:.2f}")
                print(f"\nüîç Debug Info:")
                for key, val in result.get('debug', {}).items():
                    print(f"   {key}: {val}")

EXAMPLE 1: Initialize Hybrid Detector with Language Detection
üîß Initializing Hybrid Detector with Language Detection
   Device: cpu
   Physics Weight: 40%
   DL Weight: 60%
   Max Audio Duration: 30s
üì• Loading deepfake detection model from 'wav2vec2-deepfake-voice-detector'...
‚úÖ Deepfake Detection Model Loaded
üì• Loading Whisper model for language detection from 'openai/whisper-base'...
‚úÖ Whisper Language Detection Model Loaded
‚úÖ Hybrid Detector Ready

üéµ Analyzing: sample voice 1.mp3
   üåê Detected Language: English (en)
   üî¨ Running physics analysis on 23.6s audio at 44100Hz
   üî¨ Physics score: 0.861 (intensity:0.78, pitch:1.00)

üìä Result:
   Status: success
   Language: English
   Classification: AI_GENERATED
   Confidence: 0.9399999976158142
   Explanation: Deep learning model detected synthetic voice patterns (confidence: 98.5%); Robotic pitch modulation patterns (CV: 0.20, expected: >0.22)

üîç Debug Info:
   physics_score: 0.8610000014305115
   dl_sco