In [1]:
import os
import numpy as np
from moviepy import VideoFileClip
import whisper # OpenAI's Whisper for STT
import librosa # For audio analysis
import cv2 # OpenCV for video processing
from deepface import DeepFace # For facial emotion analysis
import mediapipe as mp # For face mesh and pose estimation
import time




In [2]:
# --- Configuration ---
OUTPUT_AUDIO_FILENAME = "temp_audio.wav"

def extract_audio(video_path, audio_output_path):
    """Extracts audio from video file and saves as WAV."""
    print(f"Extracting audio from {video_path}...")
    try:
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        if audio_clip is None:
             print(f"Error: No audio track found in {video_path}")
             return False
        audio_clip.write_audiofile(audio_output_path, codec='pcm_s16le') # Standard WAV codec
        audio_clip.close()
        video_clip.close()
        print(f"Audio extracted successfully to {audio_output_path}")
        return True
    except Exception as e:
        print(f"Error extracting audio: {e}")
        # Clean up if partial file exists
        if os.path.exists(audio_output_path):
            os.remove(audio_output_path)
        return False

In [3]:
WHISPER_MODEL = "base" # Options: "tiny", "base", "small", "medium", "large". Larger = more accurate but slower/more resource intensive.

def transcribe_audio(audio_path):
    """
    Transcribes audio using Whisper, attempting to retain filler words
    by requesting word-level timestamps and reconstructing the transcript.
    """
    print(f"Loading Whisper model ({WHISPER_MODEL})...")
    try:
        # Load the model (consider doing this once outside the function if calling repeatedly)
        model = whisper.load_model(WHISPER_MODEL)
        print(f"Transcribing audio file: {audio_path} with word timestamps (this may take a while)...")

        # Key change: Set word_timestamps=True
        start_time = time.time()
        result = model.transcribe(audio_path, word_timestamps=True, fp16=False) # fp16=False might improve stability/accuracy on some systems
        end_time = time.time()
        print(f"Transcription complete in {end_time - start_time:.2f} seconds.")

        # Reconstruct the transcript from word segments
        # This ensures we capture words that might be filtered in the basic 'text' output
        full_transcript = ""
        if 'segments' in result:
            all_words = []
            for segment in result['segments']:
                if 'words' in segment:
                    for word_info in segment['words']:
                        # word_info is a dict like {'word': ' Hello', 'start': 0.0, 'end': 0.5, 'probability': 0.9}
                        # Note: Whisper often includes leading/trailing spaces in word_info['word']
                        all_words.append(word_info['word'])

            # Join the words carefully. Using strip() on each word and joining with a single space
            # handles cases where Whisper includes spaces and avoids double spacing.
            full_transcript = " ".join(word.strip() for word in all_words).strip()

            # Alternative simpler join (might have occasional extra spaces if whisper includes them):
            # full_transcript = "".join(word_info['word'] for segment in result['segments'] if 'words' in segment for word_info in segment['words']).strip()

        else:
            # Fallback if segments/words aren't available (shouldn't happen with word_timestamps=True)
            print("Warning: Word segments not found in Whisper result. Falling back to basic text.")
            full_transcript = result.get('text', "") # Use basic text if structure is unexpected

        if not full_transcript:
             print("Warning: Transcription resulted in empty text.")
             return "" # Return empty string instead of None for consistency downstream

        return full_transcript

    except Exception as e:
        print(f"Error during transcription with word timestamps: {e}")
        import traceback
        traceback.print_exc() # Print full traceback for debugging
        return None # Return None on error


In [4]:
def analyze_audio_features(audio_path, transcript):
    """Analyzes audio features like pace, pauses, pitch, volume."""
    print("Analyzing audio features...")
    try:
        y, sr = librosa.load(audio_path, sr=None) # Load audio with its original sample rate
        duration = librosa.get_duration(y=y, sr=sr)

        analysis_results = {}

        # 1. Pace (Words Per Minute)
        word_count = len(transcript.split())
        if duration > 0:
            wpm = int((word_count / duration) * 60)
            analysis_results['pace_wpm'] = wpm
            print(f"- Pace: {wpm} WPM")
        else:
             analysis_results['pace_wpm'] = 0
             print("- Pace: N/A (duration is zero)")


        # 2. Filler Words (Simple Count)
        # More sophisticated filler word detection is complex and often requires specific acoustic models
        fillers = ["um", "uh", "like", "you know", "so", "actually", "basically"]
        filler_count = sum(transcript.lower().count(f) for f in fillers)
        analysis_results['filler_count'] = filler_count
        print(f"- Filler Words Count (basic): {filler_count}")

        # 3. Volume Analysis (RMS Energy)
        rms = librosa.feature.rms(y=y)[0]
        avg_volume = np.mean(rms)
        std_volume = np.std(rms)
        analysis_results['avg_volume_rms'] = float(avg_volume)
        analysis_results['std_volume_rms'] = float(std_volume)
        print(f"- Average Volume (RMS): {avg_volume:.4f}")
        print(f"- Volume Variation (Std Dev RMS): {std_volume:.4f}")


        # 4. Pitch Analysis (Fundamental Frequency - F0)
        f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
        valid_f0 = f0[voiced_flag] # Consider only voiced segments for pitch stats
        if len(valid_f0) > 0:
            avg_pitch = np.mean(valid_f0)
            std_pitch = np.std(valid_f0)
            analysis_results['avg_pitch_hz'] = float(avg_pitch)
            analysis_results['std_pitch_hz'] = float(std_pitch)
            print(f"- Average Pitch (F0): {avg_pitch:.2f} Hz")
            print(f"- Pitch Variation (Std Dev F0): {std_pitch:.2f} Hz")
        else:
            analysis_results['avg_pitch_hz'] = 0
            analysis_results['std_pitch_hz'] = 0
            print("- Pitch: Could not reliably detect pitch.")


        # 5. Pause Analysis (Simple Silence Detection)
        # Use librosa's split based on RMS energy threshold
        # top_db=40 means consider anything 40dB below the max RMS as silence
        non_silent_intervals = librosa.effects.split(y, top_db=40)
        pauses = []
        last_end = 0
        for start, end in non_silent_intervals:
            pause_duration = (start / sr) - (last_end / sr)
            if pause_duration > 0.2: # Consider pauses longer than 200ms
                pauses.append(pause_duration)
            last_end = end
        # Check pause after last segment until end of audio
        final_pause = duration - (last_end / sr)
        if final_pause > 0.2:
             pauses.append(final_pause)

        analysis_results['num_pauses'] = len(pauses)
        analysis_results['avg_pause_duration_s'] = float(np.mean(pauses)) if pauses else 0
        print(f"- Number of Pauses (>0.2s): {len(pauses)}")
        if pauses:
            print(f"- Average Pause Duration: {np.mean(pauses):.2f} s")

        return analysis_results

    except Exception as e:
        print(f"Error during audio feature analysis: {e}")
        return None

In [32]:
import tensorflow as tf
from tensorflow.keras.models import Sequential # Add this import
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense # Add these layer imports
from tensorflow.keras.optimizers import Adam # Add optimizer import (needed for compile)

# --- Define Model Architecture ---
IMG_SIZE = (48, 48) # Make sure this matches training
NUM_CLASSES = 7     # Make sure this matches training

def create_model_simple():
    model = tf.keras.models.Sequential([
        # NOTE: Define input_shape *without* the batch dimension here
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(NUM_CLASSES, activation='softmax') # Use NUM_CLASSES
    ])
    # Compile is necessary after loading weights for the model to be usable for prediction
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# --- Model Loading and Configuration ---
model_path = 'best_model_full.h5'  # Make sure this path is correct
loaded_emotion_model = None # Initialize as None

try:
    # 1. Create the model architecture
    loaded_emotion_model = create_model_simple()
    # 2. Load only the weights
    loaded_emotion_model.load_weights(model_path)
    print(f"Custom emotion model architecture created and weights loaded successfully from {model_path}")
    # loaded_emotion_model.summary() # Optional: check summary
except Exception as e:
    print(f"Error creating model architecture or loading weights: {e}")
    loaded_emotion_model = None # Ensure it's None if loading fails

Custom emotion model architecture created and weights loaded successfully from best_model_full.h5


In [27]:


# --- Define Class Labels (Ensure this order matches training) ---
# Based on alphabetical sorting typically used by ImageDataGenerator:
class_labels = ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
target_size = (48, 48) # The input size your model expects

# --- Constants and MediaPipe Initialization (Keep as is) ---
VISUALIZE = True
WINDOW_NAME = 'Interview Analysis Visualization'
VIDEO_ANALYSIS_FRAME_SKIP = 2

mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_face_mesh = mp.solutions.face_mesh
mp_pose = mp.solutions.pose
face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.5, refine_landmarks=True, min_tracking_confidence=0.5)
pose_estimator = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.5)

def analyze_video_features(video_path):
    """
    Analyzes video for facial expressions (using custom Keras model), eye contact, posture,
    and provides visual feedback if VISUALIZE is True.
    """
    if loaded_emotion_model is None:
        print("Emotion model not loaded. Skipping emotion analysis.")
        # Handle how you want the function to behave if the model isn't loaded
        # Maybe return partial results or None

    print("Analyzing video features (using custom model)...")
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        return None

    frame_count = 0
    processed_frame_count = 0
    # Initialize emotion counts dictionary using the defined labels
    emotion_counts = {label: 0 for label in class_labels}
    eye_contact_frames = 0
    upright_frames = 0

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    if VISUALIZE:
        cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_AUTOSIZE)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break # End of video

        frame_count += 1
        if frame_count % VIDEO_ANALYSIS_FRAME_SKIP != 0:
            continue # Skip frame

        processed_frame_count += 1
        start_time_frame = time.time()

        annotated_frame = frame.copy()
        rgb_frame_mp = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Use RGB for MediaPipe
        rgb_frame_mp.flags.writeable = False

        # --- MediaPipe Face Mesh and Pose Processing ---
        face_results = face_mesh.process(rgb_frame_mp)
        pose_results = pose_estimator.process(rgb_frame_mp)

        rgb_frame_mp.flags.writeable = True # Re-enable if needed later

        # --- Facial Emotion Analysis (Using Custom Keras Model) ---
        current_emotion = "N/A" # Default value

        if loaded_emotion_model: # Check if the model was loaded successfully
            try:
                # 1. Preprocess the *current frame* for the emotion model
                # Convert frame to RGB (if not already done for MP)
                rgb_frame_emotion = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize to the target size (48x48)
                img_resized = cv2.resize(rgb_frame_emotion, target_size)
                # Convert to float and rescale pixel values to [0, 1]
                img_array = np.array(img_resized, dtype=np.float32) / 255.0
                # Add the batch dimension (shape becomes 1, 48, 48, 3)
                img_batched = np.expand_dims(img_array, axis=0)

                # 2. Predict using the loaded Keras model
                # Use verbose=0 to avoid printing progress for every frame
                predictions = loaded_emotion_model.predict(img_batched, verbose=0)

                # 3. Interpret the prediction
                predicted_index = np.argmax(predictions[0]) # Get index of max probability
                current_emotion = class_labels[predicted_index] # Map index to label

                # Update counts
                emotion_counts[current_emotion] += 1

            except Exception as e:
                # Catch potential errors during preprocessing or prediction
                # print(f"Frame {frame_count}: Custom Emotion Model error: {e}") # Optional for debugging
                current_emotion = "Error" # Indicate an error occurred for this frame
                pass # Continue processing the video
        is_eye_contact = False # Flag for current frame
        if face_results.multi_face_landmarks:
            for face_landmarks in face_results.multi_face_landmarks:
                # Draw face mesh
                if VISUALIZE:
                    mp_drawing.draw_landmarks(
                        image=annotated_frame,
                        landmark_list=face_landmarks,
                        connections=mp_face_mesh.FACEMESH_TESSELATION,
                        landmark_drawing_spec=None,
                        connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style())
                    mp_drawing.draw_landmarks(
                        image=annotated_frame,
                        landmark_list=face_landmarks,
                        connections=mp_face_mesh.FACEMESH_CONTOURS,
                        landmark_drawing_spec=None,
                        connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style())
                    # Iris landmarks drawing (optional, can be busy)
                    # mp_drawing.draw_landmarks(
                    #     image=annotated_frame,
                    #     landmark_list=face_landmarks,
                    #     connections=mp_face_mesh.FACEMESH_IRISES,
                    #     landmark_drawing_spec=None,
                    #     connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_iris_connections_style())

                # Calculate eye contact
                try:
                    left_pupil = face_landmarks.landmark[473] # Corrected index? Check docs if unsure. Typically 473-477 are right iris, 468-472 left. Let's use documented iris centers.
                    right_pupil = face_landmarks.landmark[468] # Corrected index? Let's use documented iris centers.

                    # Get corners for width calculation
                    left_eye_inner = face_landmarks.landmark[133]
                    left_eye_outer = face_landmarks.landmark[33]
                    right_eye_inner = face_landmarks.landmark[362]
                    right_eye_outer = face_landmarks.landmark[263]

                    left_eye_width = abs(left_eye_outer.x - left_eye_inner.x)
                    right_eye_width = abs(right_eye_outer.x - right_eye_inner.x)

                    if left_eye_width > 0.01 and right_eye_width > 0.01: # Avoid division by zero
                        # Use the correct pupil landmark index for relative position calculation
                        left_pupil_rel_pos = (face_landmarks.landmark[468].x - left_eye_inner.x) / left_eye_width # Use left pupil center [468]
                        right_pupil_rel_pos = (face_landmarks.landmark[473].x - right_eye_inner.x) / right_eye_width # Use right pupil center [473]

                        # Thresholds for looking 'forward' (TUNING NEEDED!)
                        if 0.3 < left_pupil_rel_pos < 0.7 and 0.3 < right_pupil_rel_pos < 0.7:
                            is_eye_contact = True
                            eye_contact_frames += 1
                            # print("Eye contact DETECTED") # Debug print

                    # Visualize Eye Contact state (draw pupils differently)
                    if VISUALIZE:
                        pupil_color = (0, 255, 0) if is_eye_contact else (0, 0, 255) # Green if contact, Red if not
                        # Get pixel coordinates
                        l_pupil_px = mp_drawing._normalized_to_pixel_coordinates(face_landmarks.landmark[468].x, face_landmarks.landmark[468].y, frame_width, frame_height)
                        r_pupil_px = mp_drawing._normalized_to_pixel_coordinates(face_landmarks.landmark[473].x, face_landmarks.landmark[473].y, frame_width, frame_height)
                        if l_pupil_px and r_pupil_px:
                            cv2.circle(annotated_frame, l_pupil_px, 3, pupil_color, -1)
                            cv2.circle(annotated_frame, r_pupil_px, 3, pupil_color, -1)

                    break # Process only the first detected face

                except IndexError:
                     print(f"Warning: Iris landmarks (468, 473) not found. Ensure 'refine_landmarks=True' is set.")
                     # Draw basic mesh even if iris fails
                     if VISUALIZE:
                         mp_drawing.draw_landmarks(
                            image=annotated_frame,
                            landmark_list=face_landmarks,
                            connections=mp_face_mesh.FACEMESH_TESSELATION,
                            landmark_drawing_spec=None,
                            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style())
                         mp_drawing.draw_landmarks(
                            image=annotated_frame,
                            landmark_list=face_landmarks,
                            connections=mp_face_mesh.FACEMESH_CONTOURS,
                            landmark_drawing_spec=None,
                            connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style())
                except Exception as e_eye:
                    print(f"Error during eye contact calculation/drawing: {e_eye}")


        # --- Posture Heuristic Calculation & Visualization ---
        is_upright = False # Flag for current frame
        if pose_results.pose_landmarks:
             # Draw the pose skeleton
            if VISUALIZE:
                mp_drawing.draw_landmarks(
                    annotated_frame,
                    pose_results.pose_landmarks,
                    mp_pose.POSE_CONNECTIONS,
                    landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())

            # Calculate posture
            landmarks = pose_results.pose_landmarks.landmark
            left_shoulder = landmarks[mp_pose.PoseLandmark.LEFT_SHOULDER]
            right_shoulder = landmarks[mp_pose.PoseLandmark.RIGHT_SHOULDER]

            if left_shoulder.visibility > 0.6 and right_shoulder.visibility > 0.6:
                y_diff = abs(left_shoulder.y - right_shoulder.y) * frame_height # Pixel diff
                if y_diff < frame_height * 0.1: # Shoulders relatively level
                    is_upright = True
                    upright_frames += 1

            # Visualize Posture state (draw shoulder line differently)
            if VISUALIZE:
                posture_color = (0, 255, 0) if is_upright else (0, 0, 255) # Green if upright, Red if not
                ls_px = mp_drawing._normalized_to_pixel_coordinates(left_shoulder.x, left_shoulder.y, frame_width, frame_height)
                rs_px = mp_drawing._normalized_to_pixel_coordinates(right_shoulder.x, right_shoulder.y, frame_width, frame_height)
                if ls_px and rs_px and left_shoulder.visibility > 0.6 and right_shoulder.visibility > 0.6:
                    cv2.line(annotated_frame, ls_px, rs_px, posture_color, 2)


        # --- Display Text Info on Frame ---
        if VISUALIZE:
            y_pos = 30 # Starting Y position for text
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 0.7
            text_color = (255, 255, 255) # White
            bg_color = (0, 0, 0) # Black background for text
            thickness = 2

            # Emotion Text
            text_emotion = f"Emotion: {current_emotion}"
            (w, h), _ = cv2.getTextSize(text_emotion, font, font_scale, thickness)
            cv2.rectangle(annotated_frame, (10, y_pos - h - 5) , (10 + w + 5, y_pos + 5), bg_color, -1)
            cv2.putText(annotated_frame, text_emotion, (10, y_pos), font, font_scale, text_color, thickness)
            y_pos += h + 10

            # Eye Contact Text
            text_eye = f"Eye Contact: {'YES' if is_eye_contact else 'NO'}"
            (w, h), _ = cv2.getTextSize(text_eye, font, font_scale, thickness)
            cv2.rectangle(annotated_frame, (10, y_pos - h - 5) , (10 + w + 5, y_pos + 5), bg_color, -1)
            cv2.putText(annotated_frame, text_eye, (10, y_pos), font, font_scale, (0, 255, 0) if is_eye_contact else (0, 0, 255), thickness)
            y_pos += h + 10

            # Posture Text
            text_posture = f"Posture: {'Upright' if is_upright else 'Not Upright'}"
            (w, h), _ = cv2.getTextSize(text_posture, font, font_scale, thickness)
            cv2.rectangle(annotated_frame, (10, y_pos - h - 5) , (10 + w + 5, y_pos + 5), bg_color, -1)
            cv2.putText(annotated_frame, text_posture, (10, y_pos), font, font_scale, (0, 255, 0) if is_upright else (0, 0, 255), thickness)

        # --- Display Frame ---
        if VISUALIZE:
            end_time_frame = time.time()
            processing_time = end_time_frame - start_time_frame
            fps = 1.0 / processing_time if processing_time > 0 else 0
            cv2.putText(annotated_frame, f"FPS: {fps:.1f}", (frame_width - 100, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,255,0), 2)

            cv2.imshow(WINDOW_NAME, annotated_frame)
            # Allow window events and check for 'q' key to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                print("Visualization stopped by user ('q' pressed).")
                break

        # Progress print to console
        if processed_frame_count % 20 == 0: # Print progress less often now
             print(f"...processed video frame {frame_count} (Total processed: {processed_frame_count})")

    # --- Cleanup and Final Calculations ---
    cap.release()
    if VISUALIZE:
        cv2.destroyAllWindows()
    print("Video analysis complete.")

    # Compile results (same as before)
    video_analysis_results = {}
    if processed_frame_count > 0:
        dominant_emotion = max(emotion_counts, key=emotion_counts.get) if emotion_counts else "N/A"
        video_analysis_results['dominant_emotion'] = dominant_emotion
        video_analysis_results['emotion_distribution'] = emotion_counts
        # video_analysis_results['eye_contact_percentage'] = round((eye_contact_frames / processed_frame_count) * 100, 2)
        video_analysis_results['upright_posture_percentage'] = round((upright_frames / processed_frame_count) * 100, 2)

        print(f"- Dominant Emotion Detected: {dominant_emotion}")
        print(f"- Emotion Distribution: {emotion_counts}")
        # print(f"- Estimated Eye Contact: {video_analysis_results['eye_contact_percentage']}%")
        print(f"- Estimated Upright Posture: {video_analysis_results['upright_posture_percentage']}%")
    else:
        print("- No frames processed for video analysis.")
        video_analysis_results['error'] = "No frames processed"

    return video_analysis_results

In [29]:
args = {'video_file': 'confident_interview.mp4', 'question': 'Tell me about yourself'}

In [None]:
print("-" * 30)
print("Starting Interview Analysis")
print(f"Video File: {args['question']}")
print(f"Interview Question: {args['video_file']}")
print("-" * 30)
# 1. Extract Audio
if not extract_audio(args['video_file'], OUTPUT_AUDIO_FILENAME):
    print("Analysis aborted due to audio extraction failure.")
    exit(1)

------------------------------
Starting Interview Analysis
Video File: Tell me about yourself
Interview Question: nervous_interview.mp4
------------------------------
Extracting audio from nervous_interview.mp4...
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '0', 'compatible_brands': 'mp42isom'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [474, 850], 'bitrate': 1578, 'fps': 30.1, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 48000, 'bitrate': 156, 'metadata': {'Metadata': '', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 18.13, 'bitrate': 1736, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(

                                                                    

MoviePy - Done.
Audio extracted successfully to temp_audio.wav




In [20]:
# 2. Transcribe Audio
transcript = transcribe_audio(OUTPUT_AUDIO_FILENAME)
if transcript is None:
    print("Analysis aborted due to transcription failure.")
        # Clean up audio file before exiting
    if os.path.exists(OUTPUT_AUDIO_FILENAME):
        os.remove(OUTPUT_AUDIO_FILENAME)
    exit(1)

print("\n--- Transcription ---")
print(transcript)
print("-" * 20)

Loading Whisper model (base)...
Transcribing audio file: temp_audio.wav with word timestamps (this may take a while)...
Transcription complete in 13.39 seconds.

--- Transcription ---
My name is Priyant Chibhargav and I've done my BTEC in computer science and... What do you mean by that? I've worked on several projects.
--------------------


In [21]:
# 3. Analyze Audio Features
print("\n--- Audio Delivery Analysis ---")
audio_metrics = analyze_audio_features(OUTPUT_AUDIO_FILENAME, transcript)
if audio_metrics is None:
    print("Could not perform detailed audio analysis.") 
elif audio_metrics:
    for key, value in audio_metrics.items():
        print(f"- {key.replace('_', ' ').title()}: {value}")
print("-" * 20)


--- Audio Delivery Analysis ---
Analyzing audio features...
- Pace: 82 WPM
- Filler Words Count (basic): 0
- Average Volume (RMS): 0.0482
- Volume Variation (Std Dev RMS): 0.0420
- Average Pitch (F0): 96.61 Hz
- Pitch Variation (Std Dev F0): 14.40 Hz
- Number of Pauses (>0.2s): 0
- Pace Wpm: 82
- Filler Count: 0
- Avg Volume Rms: 0.048159483820199966
- Std Volume Rms: 0.04201605170965195
- Avg Pitch Hz: 96.6124966389549
- Std Pitch Hz: 14.395560301373827
- Num Pauses: 0
- Avg Pause Duration S: 0
--------------------


In [30]:
# 4. Analyze Video Features
print("\n--- Video Delivery Analysis ---")
video_metrics = analyze_video_features(args['video_file'])
if video_metrics is None:
    print("Could not perform video analysis.")
elif video_metrics and 'error' not in video_metrics :
    for key, value in video_metrics.items():
            print(f"- {key.replace('_', ' ').title()}: {value}")
elif video_metrics and 'error' in video_metrics:
        print(f"- Error: {video_metrics['error']}")
else:
    print("N/A")
print("-" * 20)


--- Video Delivery Analysis ---
Analyzing video features (using custom model)...
...processed video frame 40 (Total processed: 20)
...processed video frame 80 (Total processed: 40)
...processed video frame 120 (Total processed: 60)
...processed video frame 160 (Total processed: 80)
...processed video frame 200 (Total processed: 100)
...processed video frame 240 (Total processed: 120)
...processed video frame 280 (Total processed: 140)
...processed video frame 320 (Total processed: 160)
...processed video frame 360 (Total processed: 180)
...processed video frame 400 (Total processed: 200)
...processed video frame 440 (Total processed: 220)
...processed video frame 480 (Total processed: 240)
Video analysis complete.
- Dominant Emotion Detected: happy
- Emotion Distribution: {'angry': 14, 'disgusted': 5, 'fearful': 8, 'happy': 175, 'neutral': 15, 'sad': 0, 'surprised': 35}
- Estimated Upright Posture: 100.0%
- Dominant Emotion: happy
- Emotion Distribution: {'angry': 14, 'disgusted': 5, 

In [22]:
# 5. Clean up temporary audio file
if os.path.exists(OUTPUT_AUDIO_FILENAME):
    try:
        os.remove(OUTPUT_AUDIO_FILENAME)
        print(f"\nTemporary audio file {OUTPUT_AUDIO_FILENAME} deleted.")
    except OSError as e:
        print(f"\nWarning: Could not delete temporary audio file {OUTPUT_AUDIO_FILENAME}: {e}")


Temporary audio file temp_audio.wav deleted.


In [22]:
print("\n--- Analysis Summary ---")
print(f"Question: {args['question']}")
print(f"Video File: {args['video_file']}")
print("\nTranscript:")
print(transcript if transcript else "N/A")
print("\nAudio Metrics:")
if audio_metrics:
    for key, value in audio_metrics.items():
        print(f"- {key.replace('_', ' ').title()}: {value}")
else:
    print("N/A")
print("\nVideo Metrics:")
if video_metrics and 'error' not in video_metrics :
    for key, value in video_metrics.items():
            print(f"- {key.replace('_', ' ').title()}: {value}")
elif video_metrics and 'error' in video_metrics:
        print(f"- Error: {video_metrics['error']}")
else:
    print("N/A")
print("-" * 30)
print("Analysis Complete.")


--- Analysis Summary ---
Question: Tell me about yourself
Video File: confident_interview.mp4

Transcript:
 Good evening, I am Priyad Ji Bharkiv and I am a computer science student at Symbhazis Institute of Technology and I am doing my PTEC and I have worked on many projects including the Moonstack projects and AI as well.

Audio Metrics:
- Pace Wpm: 141
- Filler Count: 0
- Avg Volume Rms: 0.04377371445298195
- Std Volume Rms: 0.027856705710291862
- Avg Pitch Hz: 103.13692304414731
- Std Pitch Hz: 18.258189527073117
- Num Pauses: 0
- Avg Pause Duration S: 0

Video Metrics:
- Dominant Emotion: N/A
- Emotion Distribution: {}
- Eye Contact Percentage: 0.0
- Upright Posture Percentage: 100.0
------------------------------
Analysis Complete.


In [None]:
prompt = """You are an expert AI Interview Coach. Your task is to analyze a candidate's response to an interview question based on the provided transcript and objective delivery metrics (audio/video analysis).

**Interview Context:**
*   **Question Asked:** "{interview_question}"

**Candidate's Response:**
*   **Transcript:** "{transcript}"

**Objective Delivery Analysis Data:**
{audio_metrics_str}
{video_metrics_str}

**Analysis Instructions:**
Evaluate the candidate's performance comprehensively based *only* on the provided data. Consider both the *content* of the answer (using the transcript) and the *delivery* (using the provided audio/video metrics).

*   **Content & Delivery Integration:** Assess the following aspects, assigning a score from 1 (Poor) to 5 (Excellent) for each:
    *   **Relevance:** How directly and effectively does the answer address the specific question asked? (1=Irrelevant, 5=Highly Relevant)
    *   **Clarity:** How clear, concise, and easy to understand is the response, considering both the language used and the delivery (e.g., pace, fillers)? (1=Unclear/Rambling, 5=Very Clear/Concise)
    *   **Tone:** How appropriate and effective is the perceived tone for an interview? Consider confidence, professionalism, and engagement, inferred from vocal cues (pitch/volume variation), facial expressions (dominant emotion), and language. (1=Inappropriate/Disengaged/Unconfident, 5=Confident/Professional/Engaging)
    *   **Vocabulary:** How appropriate, professional, and articulate is the language used? (1=Inappropriate/Informal/Unclear, 5=Highly Professional/Articulate)
    *   **STAR Format Adhesion:** If the question is behavioral, how well does the answer adhere to the STAR method (Situation, Task, Action, Result)? Are all components present and distinct? (1=No Adherence/Not Applicable, 3=Partial Adherence, 5=Excellent Adherence - All parts clear). Assign 1 if not a behavioral question or if format is totally absent.

**Output Format:**
Provide your analysis *strictly* in JSON format. The JSON object should have the following keys ONLY:

*   `relevance_score`: (Integer) Score from 1-5.
*   `clarity_score`: (Integer) Score from 1-5.
*   `tone_score`: (Integer) Score from 1-5 assessing perceived tone's effectiveness.
*   `vocabulary_score`: (Integer) Score from 1-5 assessing language use.
*   `star_format_score`: (Integer) Score from 1-5 assessing STAR method adhesion (1 if N/A or completely missing).
*   `strengths`: (List of strings) 2-3 bullet points highlighting specific positive aspects related to the scored criteria (e.g., "Strong relevance (Score: 5).", "Tone perceived as confident (Score: 4).").
*   `areas_for_improvement`: (List of strings) 2-4 specific, actionable feedback points related to the scored criteria, referencing metrics or transcript parts where possible (e.g., "Improve STAR adhesion (Score: 2) by explicitly stating the Result.", "Reduce filler word count (count: Y) to enhance clarity (Score: 3).", "Work on varying vocal pitch (Std Dev: Z Hz) to improve tone perception (Score: 2).").

**Example JSON Structure:**
```json
{{
  "relevance_score": 4,
  "clarity_score": 3,
  "tone_score": 4,
  "vocabulary_score": 5,
  "star_format_score": 3,
  "strengths": [
    "Excellent vocabulary use, very professional (Score: 5).",
    "Answer was highly relevant to the question asked (Score: 4).",
    "Tone came across as generally confident (Score: 4)."
  ],
  "areas_for_improvement": [
    "Improve clarity (Score: 3) by structuring points more logically and reducing minor rambling.",
    "STAR format adhesion was partial (Score: 3); ensure the 'Result' is clearly articulated.",
    "Consider increasing eye contact (estimated Z%) to further enhance engagement aspect of tone.",
    "Slightly high filler word count (count: Y) impacted clarity."
  ]
}}"""