In [None]:
### PHASE 1 : Filtering and Noise Reduction


In [None]:
## Imports and Installation


In [None]:
!pip install pydub noisereduce


In [None]:
from pydub import AudioSegment
import noisereduce as nr
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style
import IPython


In [None]:
# Location of the raw (noisy) recording used throughout Phase 1.
NoiseAudio = "/content/conversation.mp3"
print("Noise Audio file : ")
# Last expression of the cell: renders an inline player for the raw file.
IPython.display.Audio(NoiseAudio)


In [None]:
style.use('ggplot')

# Load audio file (pydub decodes the MP3 through ffmpeg).
audio = AudioSegment.from_file("/content/conversation.mp3")

# Convert audio to a numpy array. For multi-channel audio pydub returns ONE
# interleaved stream (ch0, ch1, ch0, ch1, ...), not one array per channel.
samples = np.array(audio.get_array_of_samples())

# De-interleave into (channels, frames). Feeding the interleaved stream to the
# noise reducer as if it were mono would mix the channels together and
# misrepresent the sample rate of the signal.
if audio.channels > 1:
    channel_samples = samples.reshape((-1, audio.channels)).T
else:
    channel_samples = samples

# Reduce noise (noisereduce accepts (frames,) or (channels, frames) input).
denoised = np.asarray(nr.reduce_noise(channel_samples, sr=audio.frame_rate))

# Make sure the result has the original integer PCM dtype before it is turned
# back into raw bytes; otherwise `tobytes()` would not match `sample_width`.
if np.issubdtype(samples.dtype, np.integer) and denoised.dtype != samples.dtype:
    pcm_limits = np.iinfo(samples.dtype)
    denoised = np.clip(np.rint(denoised), pcm_limits.min, pcm_limits.max)
denoised = denoised.astype(samples.dtype)

# Re-interleave the channels so the byte layout matches what pydub expects.
if audio.channels > 1:
    reduced_noise = np.ascontiguousarray(denoised.T).reshape(-1)
else:
    reduced_noise = denoised

# Plot original and reduced noise signals (one line per channel).
fig, ax = plt.subplots(2, 1, figsize=(15,8))
ax[0].set_title("Original signal")
ax[0].plot(channel_samples.T)
ax[1].set_title("Reduced noise signal")
ax[1].plot(denoised.T)
plt.show()

# Convert reduced noise signal back to audio
reduced_audio = AudioSegment(
    reduced_noise.tobytes(),
    frame_rate=audio.frame_rate,
    sample_width=audio.sample_width,
    channels=audio.channels
)

# Save reduced audio to file; close the handle that pydub's export returns.
reduced_audio.export("output.wav", format="wav").close()


In [None]:
# Location of the noise-reduced recording.
ReducedAudio = "/content/output.wav"
print("Reduced and Clean Noise Audio file : ")
# Last expression of the cell: renders an inline player for the cleaned file.
IPython.display.Audio(ReducedAudio)


In [None]:
### Diarization and Segmentation


In [None]:
### Installation and imports / Hugging Face authentication


In [None]:
!pip install pyannote.audio torch torchvision torchaudio
!pip install huggingface_hub


In [None]:
!pip install git+https://github.com/m-bain/whisperx.git


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: anyone who can read the file
# (or its git history) can use it. Read it from the HF_TOKEN environment
# variable, or prompt for it interactively when the variable is not set.
# Any token that was previously committed here should be revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


In [None]:
import whisperx
from pyannote.audio import Pipeline
from pydub import AudioSegment


In [None]:
## Identifying the number of speakers in the audio sample

from collections import defaultdict


In [None]:
def count_speakers(audio_path, pipeline=None):
    """Detect the number of distinct speakers in an audio file.

    Args:
        audio_path: Path of the audio file handed to the diarization pipeline.
        pipeline: Optional, already-loaded diarization pipeline (any callable
            that takes the path and returns an object exposing
            ``itertracks(yield_label=True)``). When omitted, the pretrained
            "pyannote/speaker-diarization-3.1" pipeline is loaded. Passing one
            in avoids reloading the model for every call.

    Returns:
        int: Number of unique speaker labels found by the diarization.
    """
    if pipeline is None:
        import os

        # Authenticate with the HF_TOKEN environment variable when it is set;
        # otherwise `True` makes the hub reuse the token stored by `login()`.
        # No secret is embedded in the source.
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.environ.get("HF_TOKEN") or True,
        )

    # Apply the pipeline to the audio file
    diarization = pipeline(audio_path)

    # Only the distinct labels matter for the count, so collect them in a set
    # instead of keeping every segment per speaker.
    unique_labels = {
        label
        for _segment, _track, label in diarization.itertracks(yield_label=True)
    }
    return len(unique_labels)

# Path to the audio file: the relative path "output.wav", the same name the
# noise-reduction cell exports to. Later cells reuse `audio_file`.
audio_file = "output.wav"  # Replace with your file path

# Count speakers and report the result
num_speakers = count_speakers(audio_file)
print(f"Number of speakers detected: {num_speakers}")


In [None]:
# Breaking the audio into different segments based on the speaker


In [None]:
def segment_audio_by_speaker(audio_path, pipeline=None, min_duration=0.5):
    """Segment audio by speaker and provide start and end times.

    Turns shorter than or equal to ``min_duration`` are discarded, then
    consecutive turns of the same speaker are merged into one segment.

    Args:
        audio_path: Path of the audio file handed to the diarization pipeline.
        pipeline: Optional, already-loaded diarization pipeline (any callable
            that takes the path and returns an object exposing
            ``itertracks(yield_label=True)``). When omitted, the pretrained
            "pyannote/speaker-diarization-3.1" pipeline is loaded.
        min_duration: Minimum turn length in seconds; turns must be strictly
            longer than this to be kept (default 0.5, the original threshold).

    Returns:
        list[dict]: Segments in pipeline order, each with the keys
        ``"speaker"``, ``"start_sec"`` and ``"end_sec"``.
    """
    if pipeline is None:
        import os

        # Authenticate with the HF_TOKEN environment variable when it is set;
        # otherwise `True` makes the hub reuse the token stored by `login()`.
        # No secret is embedded in the source.
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.environ.get("HF_TOKEN") or True,
        )

    # Apply the pipeline to the audio file
    diarization = pipeline(audio_path)

    merged_segments = []
    for segment, _track, label in diarization.itertracks(yield_label=True):
        # Skip very short turns to avoid noise artifacts.
        if segment.end - segment.start <= min_duration:
            continue

        if merged_segments and merged_segments[-1]["speaker"] == label:
            # Same speaker as the previous kept turn: extend it. `max` keeps
            # the end from moving backwards when this turn finishes before
            # the previous one (overlapping turns).
            previous = merged_segments[-1]
            previous["end_sec"] = max(previous["end_sec"], segment.end)
        else:
            merged_segments.append({
                "speaker": label,
                "start_sec": segment.start,
                "end_sec": segment.end
            })

    return merged_segments


In [None]:
# Run diarization on `audio_file` to get the merged per-speaker segments
segments = segment_audio_by_speaker(audio_file)


In [None]:
# Display the segments (a list of dicts with speaker, start_sec and end_sec)
segments


In [None]:
# Transcribing each speaker segment into text


In [None]:
def transcribe_segments(audio_path, segments, model):
    """Transcribe each audio segment and add text to segments.

    Args:
        audio_path: Path of the WAV file the segments refer to.
        segments: List of dicts with ``"start_sec"`` and ``"end_sec"`` keys.
            Each dict is updated in place with a ``"text"`` key.
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``"segments"`` list whose items carry a ``"text"`` entry.

    Returns:
        The same ``segments`` list, each entry now carrying its transcription.
        A segment in which the model found no speech gets an empty string.
    """
    import os
    import tempfile

    from pydub import AudioSegment

    audio = AudioSegment.from_wav(audio_path)

    # Write the clips into a private scratch directory that is removed when
    # the loop finishes, instead of leaving a fixed-name file behind in the
    # working directory.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_file = os.path.join(scratch_dir, "temp_segment.wav")

        for segment in segments:
            # Extract segment audio (pydub slices are expressed in milliseconds)
            start_ms = segment["start_sec"] * 1000
            end_ms = segment["end_sec"] * 1000
            segment_audio = audio[start_ms:end_ms]

            # export() hands back the open file object; close it right away so
            # the clip is flushed and the handle is not leaked.
            segment_audio.export(temp_file, format="wav").close()

            # Transcribe audio segment
            result = model.transcribe(temp_file)

            # The model may return zero chunks (no speech) or several chunks
            # for one clip. Join them all rather than indexing the first one,
            # which raised IndexError on silence and dropped any later text.
            pieces = [piece["text"].strip() for piece in result["segments"]]
            segment["text"] = " ".join(piece for piece in pieces if piece)

    return segments


In [None]:
# Load WhisperX model for transcription ("large-v1" weights, on CPU, using
# float32 compute)

model = whisperx.load_model("large-v1", device="cpu", compute_type="float32")

# Transcribe each segment; `segments` is passed in and the function's return
# value is bound to `segments_with_text`
segments_with_text = transcribe_segments(audio_file, segments, model)


In [None]:
# Report every transcribed segment: a header line with the speaker and the
# time span, the recognised text, then a blank spacer line.
for entry in segments_with_text:
    who, began, ended = entry['speaker'], entry['start_sec'], entry['end_sec']
    print("Speaker:", who, "Start_sec:", began, "End_sec:", ended)
    print("Text:", entry['text'])
    print("")


In [None]:
1) Reference: audio (common_voice_en_37473806.mp3) paired with its verified text ==> "The band made some recordings with producer Tom Dowd overseeing, but they were scrapped."
2) Hypothesis: model(audio(common_voice_en_37473806.mp3)) ==> generated text (placeholder: sdsdsdsds), which is compared against the verified text from (1).


In [None]:
# from typing import List
# import json
# from pydub import AudioSegment
# import torch

# # Levenshtein Distance Calculation
# def levenshtein_distance(ref: List[str], hyp: List[str]) -> int:
#     """Compute Levenshtein distance between two sequences."""
#     n, m = len(ref), len(hyp)
#     dp = [[0] * (m + 1) for _ in range(n + 1)]

#     for i in range(1, n + 1):
#         dp[i][0] = i
#     for j in range(1, m + 1):
#         dp[0][j] = j

#     for i in range(1, n + 1):
#         for j in range(1, m + 1):
#             if ref[i - 1] == hyp[j - 1]:
#                 dp[i][j] = dp[i - 1][j - 1]
#             else:
#                 dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])

#     return dp[n][m]

# def word_error_rate(reference: str, hypothesis: str) -> float:
#     """Calculate Word Error Rate (WER)."""
#     ref_words = reference.split()
#     hyp_words = hypothesis.split()
#     distance = levenshtein_distance(ref_words, hyp_words)
#     return distance / len(ref_words)

# def character_error_rate(reference: str, hypothesis: str) -> float:
#     """Calculate Character Error Rate (CER)."""
#     ref_chars = list(reference)
#     hyp_chars = list(hypothesis)
#     distance = levenshtein_distance(ref_chars, hyp_chars)
#     return distance / len(ref_chars)

# # WhisperX-based transcription function
# def transcribe_segments(audio_path, segments, model):
#     """Transcribe each audio segment and add text to segments."""
#     audio = AudioSegment.from_wav(audio_path)

#     for segment in segments:
#         # Extract segment audio
#         start_ms = segment["start_sec"] * 1000  # Convert to milliseconds
#         end_ms = segment["end_sec"] * 1000  # Convert to milliseconds
#         segment_audio = audio[start_ms:end_ms]

#         # Save segment to temporary file
#         temp_file = "temp_segment.wav"
#         segment_audio.export(temp_file, format="wav")

#         # Load audio as PyTorch Tensor
#         # Read the audio file as a numpy array
#         audio_data = segment_audio.get_array_of_samples()
#         # Convert the numpy array to a PyTorch tensor and adjust the shape
#         audio_tensor = torch.tensor(audio_data).unsqueeze(0).float()

#         # Transcribe audio using the tensor
#         result = model.transcribe(audio_tensor) # Passing the Tensor to transcribe

#         # Safely add transcription to the segment
#         segment["text"] = result['segments'][0]['text']

#     return segments
# # Load the WhisperX model
# import whisperx
# model = whisperx.load_model("large-v1", device="cpu", compute_type="float32")

# # Integration for WER and CER comparison
# def compare_with_actual(audio_file, segments, actual_text):
#     # Transcribe using WhisperX
#     transcribed_segments = transcribe_segments(audio_file, segments, model)
#     model_generated_text = " ".join([seg["text"] for seg in transcribed_segments])

#     # Calculate WER and CER
#     wer = word_error_rate(actual_text, model_generated_text)
#     cer = character_error_rate(actual_text, model_generated_text)

#     print(f"Actual Transcription: {actual_text}")
#     print(f"Model-Generated Transcription: {model_generated_text}")
#     print(f"Word Error Rate (WER): {wer:.2%}")
#     print(f"Character Error Rate (CER): {cer:.2%}")

# Usage example: inputs for the WER/CER evaluation.
# NOTE: this rebinds `audio_file`, which the diarization cells set to
# "output.wav"; re-running those cells after this one will use this clip.
audio_file = "/content/common_voice_en_37473806.mp3"  # Replace with actual audio file path
# Reference (verified) transcript that the model output is compared against.
actual_transcription = "The band made some recordings with producer Tom Dowd overseeing, but they were scrapped."

# Run comparison (disabled: `compare_with_actual` only exists in the
# commented-out code above, where it takes audio_file, segments, actual_text)
# compare_with_actual(audio_file, actual_transcription)


In [None]:
# Transcribe the whole evaluation clip with the WhisperX model loaded earlier.
result = model.transcribe(audio_file)


In [None]:
# Display the raw transcription result for inspection.
result


In [None]:
# Build the hypothesis from every returned segment. Indexing only the first
# one raised IndexError when no speech was detected and dropped the rest of
# the transcript when the clip was split into several segments.
predicted_text = " ".join(
    segment["text"].strip() for segment in result["segments"]
)
predicted_text


In [None]:
# function to remove  spaces from the front and back from the string in python
def remove_leading_and_trailing_spaces(text):
  """Return ``text`` with the whitespace at both ends trimmed off."""
  trimmed = text.strip()
  return trimmed

# Trim the model output so surrounding whitespace does not count as errors in
# the character-level comparison, then display the cleaned text.
predicted_text = remove_leading_and_trailing_spaces(predicted_text)
predicted_text


In [None]:
from typing import List
# Levenshtein Distance Calculation
def levenshtein_distance(ref: List[str], hyp: List[str]) -> int:
    """Return the edit distance between the sequences ``ref`` and ``hyp``.

    The distance is the minimum number of single-item insertions, deletions
    and substitutions that turn ``ref`` into ``hyp``.
    """
    rows, cols = len(ref) + 1, len(hyp) + 1

    # table[i][j] holds the distance between ref[:i] and hyp[:j].
    # Row 0 is 0..len(hyp) (pure insertions); column 0 is 0..len(ref)
    # (pure deletions).
    table = [list(range(cols))]
    table.extend([i] + [0] * (cols - 1) for i in range(1, rows))

    for i, ref_item in enumerate(ref, start=1):
        above, current = table[i - 1], table[i]
        for j, hyp_item in enumerate(hyp, start=1):
            if ref_item == hyp_item:
                # Matching items cost nothing extra.
                current[j] = above[j - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                current[j] = 1 + min(above[j], current[j - 1], above[j - 1])

    return table[-1][-1]

def word_error_rate(reference: str, hypothesis: str) -> float:
    """Calculate Word Error Rate (WER).

    Both strings are split on whitespace and compared word by word; the
    comparison is exact (case and punctuation are significant).

    Args:
        reference: The verified (ground-truth) transcript.
        hypothesis: The transcript produced by the model.

    Returns:
        Word-level edit distance divided by the number of reference words.
        The value can exceed 1.0 when the hypothesis has many extra words.

    Raises:
        ValueError: If ``reference`` contains no words, because the rate is
            undefined (this used to surface as a bare ZeroDivisionError).
    """
    ref_words = reference.split()
    if not ref_words:
        raise ValueError("reference must contain at least one word to compute WER")
    hyp_words = hypothesis.split()
    distance = levenshtein_distance(ref_words, hyp_words)
    return distance / len(ref_words)

def character_error_rate(reference: str, hypothesis: str) -> float:
    """Calculate Character Error Rate (CER).

    Every character counts, including spaces and punctuation, and the
    comparison is case-sensitive.

    Args:
        reference: The verified (ground-truth) transcript.
        hypothesis: The transcript produced by the model.

    Returns:
        Character-level edit distance divided by the reference length.

    Raises:
        ValueError: If ``reference`` is empty, because the rate is undefined
            (this used to surface as a bare ZeroDivisionError).
    """
    ref_chars = list(reference)
    if not ref_chars:
        raise ValueError("reference must not be empty to compute CER")
    hyp_chars = list(hypothesis)
    distance = levenshtein_distance(ref_chars, hyp_chars)
    return distance / len(ref_chars)


In [None]:
# Show the reference transcript and the model output, each followed by a
# blank line.
print("=== Comparison Results ===")
print(f"Actual Text:\n{actual_transcription}\n")
print(f"Model Transcription:\n{predicted_text}\n")

# Score the model output against the reference at word and character level.
wer = word_error_rate(actual_transcription, predicted_text)
cer = character_error_rate(actual_transcription, predicted_text)

# Report both rates as percentages with three decimal places.
print(f"Word Error Rate (WER): {wer:.3%}")
print(f"Character Error Rate (CER): {cer:.3%}")


In [None]:
# NOTE(review): `files` is not imported in any visible cell (presumably
# google.colab's `files` helper; confirm and import it), and the file name
# passed here is empty. TODO: fill in the path of the file to download.
files.download('')
