<a href="https://colab.research.google.com/github/queens-supercluster/SpeechDiarization/blob/main/Resegmentation_Trials_Reference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#RESEGMENTATION

In [None]:
# Refine speaker labels with a variational Bayesian Gaussian mixture
# (scikit-learn's BayesianGaussianMixture with a Dirichlet-process weight prior).

from sklearn.mixture import BayesianGaussianMixture

plda = BayesianGaussianMixture(
    n_components=num_speakers,
    covariance_type='diag',
    weight_concentration_prior_type='dirichlet_process',
).fit(embeddings)

# predict() gives, for every embedding, the index of the most probable mixture
# component.  The previous code took np.argmax() of score_samples(), which is
# a single log-likelihood per sample, so the result was always 0 and every
# segment ended up as "SPEAKER 1".
refined_labels = plda.predict(embeddings)

# Update segment speakers with refined labels (1-based speaker numbers).
# Row i of `embeddings` is used for segments[i], as in the original loop.
for i, segment in enumerate(segments):
    segment["speaker"] = 'SPEAKER ' + str(refined_labels[i] + 1)

# Write transcript with refined segmentation: a header line (speaker + start
# time) each time the speaker changes, followed by the segment texts.
# `time` is a helper defined elsewhere in the notebook (not visible here).
with open("transcript3.txt", "w", encoding="utf-8") as f:
    for (i, segment) in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
        # [1:] drops the first character of the text
        # (presumably a leading space -- TODO confirm).
        f.write(segment["text"][1:] + ' ')

In [None]:
#2nd Instance

from sklearn.mixture import GaussianMixture

def diarization_refinement(embeddings, labels, n_states=None):
    """Re-label every embedding with the cluster whose HMM scores it highest.

    One ``GaussianHMM`` (diagonal covariance, at most 1000 EM iterations) is
    fitted per distinct value found in ``labels``, using only the embeddings
    that carry that label.  Each embedding is then scored on its own against
    every fitted model and receives the label of the best-scoring model.

    Args:
        embeddings: array-like with one row per segment; expected to be 2-D
            (n_segments, n_features) since rows are passed to ``fit``.
        labels: initial cluster label for each row of ``embeddings``.
        n_states: number of hidden states per HMM.  ``None`` (default) keeps
            the previous behaviour of reading the notebook-level
            ``num_speakers`` variable.

    Returns:
        A list holding one refined cluster label per embedding.  For labels
        numbered 0..k-1 these are the same values the original returned.
    """
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    if len(embeddings) == 0:
        return []

    # Imported here because this cell only imports GaussianMixture; the
    # module-level GaussianHMM import lives in a later cell, so relying on it
    # raised NameError when the cells were run in order.
    from hmmlearn.hmm import GaussianHMM

    if n_states is None:
        n_states = num_speakers

    # Use the label values that actually occur instead of assuming 0..k-1,
    # so a gap in the numbering cannot produce an empty training set.
    cluster_ids = np.unique(labels)

    # Fit one model per cluster on that cluster's embeddings only.
    models = []
    for cluster_id in cluster_ids:
        model = GaussianHMM(n_states, covariance_type='diag', n_iter=1000)
        model.fit(embeddings[labels == cluster_id])
        models.append(model)

    # Score each embedding as a length-1 sequence under every model and keep
    # the label of the model with the highest log-likelihood.
    refined = []
    for embedding in embeddings:
        sample = embedding.reshape(1, -1)
        scores = [model.score(sample) for model in models]
        refined.append(cluster_ids[np.argmax(scores)])

    return refined

refined_labels = diarization_refinement(embeddings, labels)

# Stamp every segment with its refined speaker name (speaker numbers are 1-based)
for i, segment in enumerate(segments):
    speaker_number = refined_labels[i] + 1
    segment["speaker"] = 'SPEAKER ' + str(speaker_number)

In [None]:
# 3rd instance

import numpy as np
from hmmlearn.hmm import GaussianHMM

def resegmentation(initial_segments, embeddings):
    """Re-assign segment speakers with one Gaussian HMM decoded by Viterbi.

    The number of hidden states is the number of distinct ``'speaker'``
    values in ``initial_segments``.  A diagonal-covariance ``GaussianHMM``
    (at most 100 EM iterations) is fitted to all embeddings, and the state
    decoded for row ``i`` becomes the speaker of segment ``i``.

    Args:
        initial_segments: list of dict-like segments, each with a
            ``'speaker'`` entry.
        embeddings: array-like with one entry per segment; every entry is
            flattened to a single feature row.

    Returns:
        A new list of segment dicts whose ``'speaker'`` is
        ``'SPEAKER <state + 1>'``.  The input segments are left untouched:
        the previous shallow ``list.copy()`` shared the dicts, so the caller's
        segments were overwritten and a re-run derived the state count from
        its own earlier output.

    Raises:
        ValueError: if there are fewer embeddings than segments.
    """
    if not initial_segments:
        return []

    # Preparing features for HMM training: one flat row per embedding
    features = np.asarray(embeddings)
    features = features.reshape(len(features), -1)
    if len(features) < len(initial_segments):
        raise ValueError(
            "resegmentation needs one embedding per segment: got "
            f"{len(features)} embeddings for {len(initial_segments)} segments"
        )

    # Determining number of components for the HMM from the speakers present,
    # so no separate num_speakers setting is needed
    n_components = len({segment['speaker'] for segment in initial_segments})

    model = GaussianHMM(n_components, covariance_type='diag', n_iter=100)
    model.fit(features)

    # Viterbi decoding: most likely hidden state for every feature row
    state_sequence = model.predict(features)

    # Build relabelled copies so the caller's segment dicts are not modified
    resegmented_segments = []
    for segment, state in zip(initial_segments, state_sequence):
        relabelled = dict(segment)
        relabelled['speaker'] = 'SPEAKER ' + str(state + 1)
        resegmented_segments.append(relabelled)

    return resegmented_segments

# Run the single-HMM re-segmentation on the current segments and embeddings;
# the list it returns is kept in resegmented_segments.
resegmented_segments = resegmentation(segments, embeddings)


In [None]:
#4th Instance (ChatGPT3)

import numpy as np
from sklearn.cluster import KMeans
from hmmlearn import hmm

# Number of hidden states handed to every GaussianHMM created below.
# NOTE(review): the earlier comment listed three labels (Speaker A, Speaker B,
# non-speech) for a value of 2 -- confirm which was intended.
n_states = 2

# One HMM per speaker (two speakers), plus a separate model for non-speech.
models = [
    hmm.GaussianHMM(n_components=n_states),
    hmm.GaussianHMM(n_components=n_states),
]
non_speech_model = hmm.GaussianHMM(n_components=n_states)

def calculate_posterior_probabilities(embeddings, n_states=2):
    """Fit a fresh Gaussian HMM to ``embeddings`` and return its state posteriors.

    Args:
        embeddings: 2-D array of observations, one row per sample.
        n_states: number of hidden HMM states.  Defaults to 2, the value that
            used to be hard-coded inside this function.

    Returns:
        The result of ``predict_proba`` on the same embeddings the model was
        trained on: one row of state posterior probabilities per sample.
    """
    # A new model is trained on every call; nothing is cached between calls.
    model = hmm.GaussianHMM(n_components=n_states)
    model.fit(embeddings)

    return model.predict_proba(embeddings)

# Baum-Welch training: fit each speaker's HMM on the embeddings that currently
# carry that speaker's label.
for i, model in enumerate(models):
    speaker_embeddings = embeddings[labels == i]

    # All of a speaker's embeddings are treated as one observation sequence.
    # `lengths` takes a sequence of per-sequence lengths, so the row count is
    # wrapped in a list.  The count is read directly from the data: fitting a
    # separate throwaway HMM only to obtain it doubled the training work.
    model.fit(speaker_embeddings, lengths=[len(speaker_embeddings)])

# Iterative re-segmentation: alternately (1) give every embedding the label of
# the speaker model that scores it highest and (2) re-fit each speaker model
# on the embeddings now assigned to it, until the labels stop changing.
#
# This replaces the earlier loop, which concatenated the per-model posteriors
# along axis 0 (yielding 2 * len(embeddings) state indices instead of one
# speaker index per embedding) and overwrote both models with the same pooled
# start probabilities and means.
max_iterations = 20
for _ in range(max_iterations):
    # Log-likelihood of each embedding, scored as a length-1 sequence, under
    # each speaker model: one row per embedding, one column per model.
    frame_scores = np.array([
        [model.score(frame.reshape(1, -1)) for model in models]
        for frame in embeddings
    ]).reshape(len(embeddings), len(models))

    # Assign each embedding to the model with the highest log-likelihood
    new_labels = np.argmax(frame_scores, axis=1)

    # Check for convergence
    if np.array_equal(new_labels, labels):
        break

    labels = new_labels

    # Re-estimate every speaker model from its newly assigned embeddings
    for i, model in enumerate(models):
        speaker_embeddings = embeddings[labels == i]
        if len(speaker_embeddings) < n_states:
            # Too few samples to fit n_states states; keep the previous fit.
            continue
        model.fit(speaker_embeddings)

# 4.2.4 Second Pass Refinements

if len(labels) != len(embeddings):
    raise ValueError(
        "second pass needs one label per embedding: got "
        f"{len(labels)} labels for {len(embeddings)} embeddings"
    )

# Split the embeddings by current label: label 0 goes to group A and every
# other label to group B (the same if/else split the copy loops used).
is_A = np.asarray(labels) == 0
embeddings_A = embeddings[is_A]
embeddings_B = embeddings[~is_A]

# Perform K-means clustering (2 clusters) inside each group
kmeans_A = KMeans(n_clusters=2)
labels_A = kmeans_A.fit_predict(embeddings_A)

kmeans_B = KMeans(n_clusters=2)
labels_B = kmeans_B.fit_predict(embeddings_B)

# Write the per-group cluster ids back to the original positions.  Boolean
# masks keep the order, which fixes the IndexError caused by reusing
# index_A / index_B without resetting them to 0.
# NOTE(review): the ids are 0/1 within each group, so after this step a label
# no longer identifies a speaker -- confirm this is the intended result.
new_labels = np.zeros_like(labels)
new_labels[is_A] = labels_A
new_labels[~is_A] = labels_B

# The former "check for convergence ... break" sat outside any loop, which is
# a SyntaxError; this pass runs once, so the labels are simply replaced.
labels = new_labels

# Continue the refinement process until convergence

# Helper function to calculate posterior probabilities
def calculate_posterior_probabilities(embeddings, n_states=2):
    """Fit a Gaussian HMM to ``embeddings`` and return its state posteriors.

    This definition replaces any earlier function of the same name once the
    cell has run.  It used to return unseeded ``np.random.rand`` values, which
    are not probabilities (rows do not sum to 1) and differ on every call, so
    it now computes real posteriors from an HMM fitted to the data.

    Args:
        embeddings: 2-D array of observations, one row per sample.
        n_states: number of hidden HMM states (default 2).

    Returns:
        The result of ``predict_proba`` on ``embeddings``: one row of state
        posterior probabilities per sample.
    """
    model = hmm.GaussianHMM(n_components=n_states)
    model.fit(embeddings)
    return model.predict_proba(embeddings)




IndexError: ignored