In [None]:
# First, install the required packages
!pip install pyannote.audio
!pip install soundfile
!pip install pydub
# The index URL below points at the PyTorch wheel index for CUDA 11.8 ("cu118")
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
from huggingface_hub import notebook_login

# Log this notebook session in to the Hugging Face Hub; the cells below
# download the "pyannote/speaker-diarization" pipeline from the Hub.
notebook_login()

CPU version

In [None]:
import torch
from pyannote.audio import Pipeline
from pyannote.core import Segment
import soundfile as sf
import numpy as np

# Initialize the speaker diarization pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

# pyannote.audio's from_pretrained() may return None instead of raising when
# the pipeline cannot be downloaded. Fail here with a readable message rather
# than with "'NoneType' object is not callable" on the next statement.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Make sure you are "
        "logged in to Hugging Face and have access to the pipeline."
    )

# Apply the pipeline to the audio file
diarization = pipeline("/content/input.wav")

# Load the audio file (samples plus sample rate)
audio, sr = sf.read('/content/input.wav')

# One all-zero (silent) buffer per speaker, same shape and dtype as the input
speaker1_audio = np.zeros_like(audio)
speaker2_audio = np.zeros_like(audio)

# Copy every diarized segment into the buffer of the speaker it belongs to.
# Only the labels 'SPEAKER_00' and 'SPEAKER_01' are kept; segments carrying
# any other label are left out of both outputs.
for segment, _, speaker in diarization.itertracks(yield_label=True):
    # Segment boundaries are multiplied by the sample rate to get sample indices
    start = int(segment.start * sr)
    end = int(segment.end * sr)
    if speaker == 'SPEAKER_00':
        speaker1_audio[start:end] = audio[start:end]
    elif speaker == 'SPEAKER_01':
        speaker2_audio[start:end] = audio[start:end]

# Save the separated audio files to disk (current working directory)
sf.write('speaker1.wav', speaker1_audio, sr)
sf.write('speaker2.wav', speaker2_audio, sr)


GPU version

In [None]:
import torch
from pyannote.audio import Pipeline
from pyannote.core import Segment
import soundfile as sf
import numpy as np

import os

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when that is unset, use_auth_token=True asks
# huggingface_hub to use the locally cached login (e.g. from notebook_login()).
hf_token = os.environ.get("HF_TOKEN") or True

# Initialize the speaker diarization pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)

# pyannote.audio's from_pretrained() may return None instead of raising when
# the pipeline cannot be downloaded; check before calling .to() on it.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Make sure you are "
        "logged in to Hugging Face and have access to the pipeline."
    )
pipeline = pipeline.to(device)

# Apply the pipeline to the audio file
diarization = pipeline("/content/orban.wav")

# Load the audio file (samples plus sample rate)
audio, sr = sf.read('/content/orban.wav')

# One all-zero (silent) buffer per speaker, same shape and dtype as the input
speaker1_audio = np.zeros_like(audio)
speaker2_audio = np.zeros_like(audio)

# Copy every diarized segment into the buffer of the speaker it belongs to.
# Only the labels 'SPEAKER_00' and 'SPEAKER_01' are kept; segments carrying
# any other label are left out of both outputs.
for segment, _, speaker in diarization.itertracks(yield_label=True):
    # Segment boundaries are multiplied by the sample rate to get sample indices
    start = int(segment.start * sr)
    end = int(segment.end * sr)
    if speaker == 'SPEAKER_00':
        speaker1_audio[start:end] = audio[start:end]
    elif speaker == 'SPEAKER_01':
        speaker2_audio[start:end] = audio[start:end]

# Save the separated audio files to disk (current working directory)
sf.write('speaker1.wav', speaker1_audio, sr)
sf.write('speaker2.wav', speaker2_audio, sr)


GPU version: recursive batch processing of a directory

In [None]:
import torch
from pyannote.audio import Pipeline
from pyannote.core import Segment
import soundfile as sf
import numpy as np
import os
import tempfile
from pydub import AudioSegment

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when that is unset, use_auth_token=True asks
# huggingface_hub to use the locally cached login (e.g. from notebook_login()).
hf_token = os.environ.get("HF_TOKEN") or True

# Initialize the speaker diarization pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)

# pyannote.audio's from_pretrained() may return None instead of raising when
# the pipeline cannot be downloaded; check before calling .to() on it.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Make sure you are "
        "logged in to Hugging Face and have access to the pipeline."
    )
pipeline = pipeline.to(device)

def convert_to_wav(mp3_file):
    """Decode an MP3 file and write it out as a temporary WAV file.

    The temporary file is created with ``delete=False``, so it stays on disk
    after this function returns; removing it is left to the caller.

    Args:
        mp3_file: Path of the MP3 file to decode.

    Returns:
        Path of the newly written WAV file.
    """
    decoded = AudioSegment.from_mp3(mp3_file)

    # Reserve a unique ".wav" path. Leaving the with-block closes the handle
    # before the export below writes to that path.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as placeholder:
        wav_path = placeholder.name

    decoded.export(wav_path, format="wav")
    return wav_path

def process_audio_file(file_path):
    """Diarize one audio file and write one WAV file per speaker.

    MP3 input is first converted to a temporary WAV file with
    ``convert_to_wav``. The diarization pipeline is then run, and two tracks
    with the same shape as the input are built: each one holds the samples of
    the segments labelled 'SPEAKER_00' / 'SPEAKER_01' and zeros elsewhere.
    Segments with any other label are left out of both tracks.

    The outputs are written to the current working directory as
    ``<name>_speaker1.wav`` and ``<name>_speaker2.wav``, where ``<name>`` is
    the base name of the file passed in (not of the temporary WAV).

    Args:
        file_path: Path of the audio file to process.
    """
    # Keep the caller's path: it names the outputs, while `working_path` is
    # what actually gets read (the temporary WAV for MP3 input).
    source_path = file_path
    temp_wav = None
    if source_path.lower().endswith(".mp3"):
        temp_wav = convert_to_wav(source_path)
    working_path = temp_wav if temp_wav is not None else source_path

    try:
        # Apply the pipeline to the audio file
        diarization = pipeline(working_path)

        # Load the audio file (samples plus sample rate)
        audio, sr = sf.read(working_path)

        # One all-zero (silent) buffer per speaker
        speaker1_audio = np.zeros_like(audio)
        speaker2_audio = np.zeros_like(audio)

        # Copy every diarized segment into the buffer of its speaker
        for segment, _, speaker in diarization.itertracks(yield_label=True):
            start = int(segment.start * sr)
            end = int(segment.end * sr)
            if speaker == 'SPEAKER_00':
                speaker1_audio[start:end] = audio[start:end]
            elif speaker == 'SPEAKER_01':
                speaker2_audio[start:end] = audio[start:end]

        # Save the separated audio files to disk
        base_filename = os.path.splitext(os.path.basename(source_path))[0]
        sf.write(f'{base_filename}_speaker1.wav', speaker1_audio, sr)
        sf.write(f'{base_filename}_speaker2.wav', speaker2_audio, sr)
    finally:
        # Remove the temporary WAV even when diarization or writing failed
        if temp_wav is not None and os.path.exists(temp_wav):
            os.remove(temp_wav)

def process_directory(directory, extensions=(".wav", ".mp3", ".flac", ".ogg")):
    """Run ``process_audio_file`` on every audio file below ``directory``.

    The tree is walked recursively. Files whose name does not end in one of
    ``extensions`` (compared case-insensitively) are skipped, so stray
    non-audio files do not abort the whole batch.

    Args:
        directory: Root directory to search.
        extensions: File-name suffixes treated as audio files.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    for root, _, files in os.walk(directory):
        # Sorted so the files of each directory are handled in a stable order
        for file in sorted(files):
            if not file.lower().endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            process_audio_file(file_path)

# Directory containing the audio files
directory = '/content/audio_files'

# os.walk() ignores errors by default and yields nothing for a missing path,
# so check up front instead of finishing without having processed anything.
if not os.path.isdir(directory):
    raise FileNotFoundError(f"Audio directory not found: {directory}")

# Process all audio files in the directory
process_directory(directory)

