In [2]:
# Extracting Audio from Video
import subprocess

def extract_audio(video_file, output_audio_file):
    command = f"ffmpeg -i {video_file} -q:a 0 -map a {output_audio_file} -y"
    subprocess.run(command, shell=True, check=True)

# Example usage
video_file = "HR.mp4"
audio_file = "response_audio.wav"
extract_audio(video_file, audio_file)


In [5]:
# Noise Reduction
import noisereduce as nr
import librosa
import soundfile as sf

# Load audio
audio, sr = librosa.load("response_audio.wav", sr=16000)

# Apply noise reduction
reduced_noise = nr.reduce_noise(y=audio, sr=sr)

# Save processed audio
sf.write("processed_audio.wav", reduced_noise, sr)


In [8]:
# Audio to Text
import whisper
import wave

def real_time_transcription(audio_file):
    model = whisper.load_model("base")
    audio = whisper.load_audio(audio_file)
    transcription = model.transcribe(audio)
    print("Transcription:", transcription["text"])
    return transcription["text"]

# Usage
# Save live audio to 'audio.wav' in real time
transcription = real_time_transcription("processed_audio.wav")


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 66.0MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcription:  Have you received? Yes, my name is Rizim. What you could make? My name is Rizim Rekres from it. So, Rizim, today you are going to come take a walk into the hall or an extra right desk. So, 10 month ago, yes, firstly, and then to give you a 30-minute service, to introduce this writer, my name is Rizim Rekres from the time from the World Call of Studies and Data Fundamentals. I completed my data in the stream of computer science and engineering, Drakash Mejiri in the college at Harvuku, Bangladesh. I set up my data with 7.96 EPA. Coming to my family, there are proud members of my family, including me and my technical skills and good-dips, me, fighter, data switches and basics of the school. And my interpersonal skills, I am able to communicate with other people's and good listeners and my hobbies are spending time with family and friends. Cooking, business, music and crafting at three times. My strengths are I am a Holger and a self-motivator. And coming to my booth, my s

In [11]:
# Grammar Analysis
import language_tool_python

def grammar_analysis(text):
    tool = language_tool_python.LanguageTool('en-IN')
    matches = tool.check(text)
    errors = [match.ruleId for match in matches]
    suggestions = [match.message for match in matches]
    return errors, suggestions

# Example Usage
text = "This is an test sentence with error."
errors, suggestions = grammar_analysis(text)
print("Errors:", errors)
print("Suggestions:", suggestions)


Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:03<00:00, 69.9MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp74hsgobm.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /root/.cache/language_tool_python.


Errors: ['EN_A_VS_AN']
Suggestions: ['Use “a” instead of ‘an’ if the following word doesn’t start with a vowel sound, e.g. ‘a sentence’, ‘a university’.']


In [14]:
# Pronunciation Analysis
from nltk.corpus import cmudict

def check_pronunciation(word):
    pronunciation = cmudict.dict()
    return pronunciation.get(word.lower(), "Word not found in dictionary")

# Example Usage
word = "hello"
result = check_pronunciation(word)
print(f"Pronunciation for '{word}': {result}")


Pronunciation for 'hello': [['HH', 'AH0', 'L', 'OW1'], ['HH', 'EH0', 'L', 'OW1']]


In [17]:
# Speaking Rate Calculation
from pydub import AudioSegment, silence

def speaking_rate_from_speaking_segments(transcribed_text, audio_file):
    """
    Calculate speaking rate based on automatically detected speaking segments.

    :param transcribed_text: Transcribed text of the candidate's speech.
    :param audio_file: Path to the audio file.
    :return: Speaking rate (WPM) and total speaking time (seconds).
    """
    audio = AudioSegment.from_wav(audio_file)

    # Detect non-silent segments (candidate speaking)
    non_silent_ranges = silence.detect_nonsilent(audio, min_silence_len=700, silence_thresh=-40)

    # Calculate total speaking time
    total_speaking_duration = sum((end - start) for start, end in non_silent_ranges) / 1000.0  # in seconds

    # Calculate speaking rate (WPM)
    words = len(transcribed_text.split())
    if total_speaking_duration > 0:
        speaking_rate = words / (total_speaking_duration / 60)  # Words per minute
    else:
        speaking_rate = 0  # Handle edge case where no speaking is detected

    return speaking_rate, total_speaking_duration

# Example Usage
transcription = "This is a sample response provided by the candidate."
rate, speaking_time = speaking_rate_from_speaking_segments(transcription, "processed_audio.wav")
print(f"Speaking Rate: {rate:.2f} WPM, Total Speaking Time: {speaking_time:.2f} seconds")


Speaking Rate: 1843.00 WPM, Total Speaking Time: 0.29 seconds


In [18]:
# Pause Analysis
from pydub import AudioSegment, silence

def pause_analysis_during_speech(audio_file, transcription, min_silence_len=500, silence_thresh=-40):
    """
    Analyze pauses during candidate speaking time, excluding question-reading time.

    :param audio_file: Path to the audio file.
    :param transcription: Transcribed text of the entire audio.
    :param min_silence_len: Minimum length of silence to be considered a pause (in ms).
    :param silence_thresh: Silence threshold in dBFS.
    :return: Number of pauses, details of pause durations (start, end).
    """
    audio = AudioSegment.from_wav(audio_file)

    # Detect speaking segments (non-silent parts)
    non_silent_ranges = silence.detect_nonsilent(audio, min_silence_len=700, silence_thresh=silence_thresh)

    # Extract candidate's speaking segments (skip question-reading time)
    # Assuming the transcription helps identify when the candidate starts speaking
    speaking_segments = non_silent_ranges  # Adjust this if you have markers for when speaking starts

    # Analyze pauses within speaking segments
    total_pauses = 0
    pause_details = []

    for start, end in speaking_segments:
        # Extract the candidate's speaking segment
        segment_audio = audio[start:end]

        # Detect pauses within this segment
        pauses = silence.detect_silence(segment_audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

        # Adjust pause times relative to the original audio
        pauses = [(start + pause_start, start + pause_end) for pause_start, pause_end in pauses]

        # Update total pauses and details
        total_pauses += len(pauses)
        pause_details.extend(pauses)

    return total_pauses, pause_details

# Example Usage
audio_file = "processed_audio.wav"
transcription = "The transcription of the candidate's entire speech."
num_pauses, pause_details = pause_analysis_during_speech(audio_file, transcription)
print(f"Number of Pauses During Candidate Speaking: {num_pauses}")
print("Pause Durations:", pause_details)


Number of Pauses During Candidate Speaking: 0
Pause Durations: []


In [19]:
# Filler Word Usage Count
def filler_word_usage(text):
    fillers = ["um", "uh", "like", "you know", "sort of"]
    filler_count = {filler: text.lower().count(filler) for filler in fillers}
    return filler_count

# Example Usage
text = "Um, I think this is, like, a sort of test."
fillers = filler_word_usage(text)
print("Filler Word Usage:", fillers)


Filler Word Usage: {'um': 1, 'uh': 0, 'like': 1, 'you know': 0, 'sort of': 1}


In [20]:
# Voice Clarity Analysis
from pydub import AudioSegment
import numpy as np

def calculate_snr(audio_file):
    audio = AudioSegment.from_file(audio_file)
    samples = np.array(audio.get_array_of_samples())
    signal = np.mean(samples**2)
    noise = np.var(samples)
    snr = 10 * np.log10(signal / noise)
    return snr

# Example Usage
snr = calculate_snr("processed_audio.wav")
print(f"Signal-to-Noise Ratio: {snr} dB")


Signal-to-Noise Ratio: -9.984965081863532 dB


In [23]:
# Feedback Generation
from pydub import AudioSegment, silence

def generate_feedback(transcription, audio_file):
    """
    Generate comprehensive feedback for a candidate's speech.

    :param transcription: Transcribed text of the candidate's speech.
    :param audio_file: Path to the processed audio file.
    :return: Dictionary containing detailed feedback.
    """
    # Grammar Analysis
    errors, suggestions = grammar_analysis(transcription)

    # Speaking Rate and Speaking Time
    speaking_rate, speaking_time = speaking_rate_from_speaking_segments(transcription, audio_file)

    # Pause Patterns (pauses while speaking)
    num_pauses, pauses = pause_analysis_during_speech(audio_file, transcription)

    # Filler Word Usage
    fillers = filler_word_usage(transcription)

    # Voice Clarity
    snr = calculate_snr(audio_file)
    if snr < 10:
        voice_clarity = "Low"
    elif 10 <= snr < 20:
        voice_clarity = "Medium"
    else:
        voice_clarity = "High"

    # Feedback Compilation
    feedback = {
        "Grammar Errors": errors,
        "Grammar Suggestions": suggestions,
        "Speaking Rate (WPM)": speaking_rate,
        "Total Speaking Time (seconds)": speaking_time,
        "Number of Pauses": num_pauses,
        "Pause Details (ms)": pauses,
        "Filler Word Usage": fillers,
        "Voice Clarity": voice_clarity,
    }
    return feedback

# Example Usage
transcription = "This is um a sample uh response with errors."
feedback = generate_feedback(transcription, "processed_audio.wav")
for key, value in feedback.items():
    print(f"{key}: {value}")


Grammar Errors: []
Grammar Suggestions: []
Speaking Rate (WPM): 1843.0034129692833
Total Speaking Time (seconds): 0.293
Number of Pauses: 0
Pause Details (ms): []
Filler Word Usage: {'um': 1, 'uh': 1, 'like': 0, 'you know': 0, 'sort of': 0}
Voice Clarity: Low
