In [17]:
# Libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from scipy.stats import multivariate_normal

In [18]:
# Function to read the data
def read_mfcc_data(file_path):
    """Parse a text file of MFCC frames into a list of blocks.

    Each non-blank line is split on whitespace and converted to a list of
    floats (one frame). Consecutive frames form a block; one or more blank
    (whitespace-only) lines end the current block.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    list[list[list[float]]]
        ``blocks[i][j]`` is the j-th frame of the i-th block. Empty blocks
        are never emitted, so leading or repeated blank lines are ignored.

    Raises
    ------
    ValueError
        If a token on a non-blank line cannot be converted to ``float``.
    """
    blocks = []
    frames = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if tokens:
                # Data line: one frame of coefficients
                frames.append([float(token) for token in tokens])
                continue
            # Blank line: close the block in progress, if there is one
            if frames:
                blocks.append(frames)
                frames = []
    # The file may end without a trailing blank line
    if frames:
        blocks.append(frames)
    return blocks

In [19]:
# Function to get the blocks for a desired digit
def get_digit_blocks(mfcc_blocks, digit, blocks_per_digit=660):
    """Return the consecutive run of blocks that belongs to one digit.

    The blocks are taken to be laid out digit after digit, with the same
    number of blocks for every digit, so digit ``d`` occupies the slice
    ``[d * blocks_per_digit, (d + 1) * blocks_per_digit)``.

    Parameters
    ----------
    mfcc_blocks : sequence
        All blocks, ordered by digit.
    digit : int
        Zero-based index of the digit to extract.
    blocks_per_digit : int, optional
        Number of blocks stored for each digit. Defaults to 660, the value
        this helper previously hard-coded.

    Returns
    -------
    sequence
        The slice of ``mfcc_blocks`` for ``digit``. Like any slice, it is
        shorter (possibly empty) when ``mfcc_blocks`` has too few items.
    """
    start = digit * blocks_per_digit
    return mfcc_blocks[start:start + blocks_per_digit]

In [20]:
# Read the training data.
# read_mfcc_data returns one block per blank-line-separated group of lines in
# the file; each block is a list of frames and each frame a list of floats.
# NOTE(review): the path is relative, so it resolves against the kernel's
# current working directory.
file_path = './data/Train_Arabic_Digit.txt'
mfcc_blocks = read_mfcc_data(file_path)

In [21]:
# Initialize lists to store means and covariances for each digit.
# The training loop appends one entry per digit, in order 0..9.
means_all = []
covariances_all = []

# Number of clusters to fit for each digit (key = digit, value = cluster
# count); the training loop passes the value to KMeans as n_clusters.
gmm_components = {
    0: 3, 1: 3, 2: 5, 3: 4, 4: 3, 5: 3, 6: 3, 7: 4, 8: 4, 9: 3
}

In [22]:
# Train KMeans and calculate covariances for each digit.
# The accumulators are (re)created in this cell so that running it again does
# not append a second copy of every digit's parameters, which would break the
# "index == digit" correspondence of means_all / covariances_all.
means_all = []
covariances_all = []

for digit in range(10):
    # Stack the frames of all of this digit's blocks into one 2-D array
    # (one row per frame)
    digit_data = np.vstack(get_digit_blocks(mfcc_blocks, digit))
    n_clusters = gmm_components[digit]

    # Train k-means; random_state is fixed so the clustering is reproducible
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(digit_data)

    # Cluster centres serve as the mean vectors
    means = kmeans.cluster_centers_

    # One covariance matrix per cluster, estimated from the frames assigned
    # to it (rowvar=False: rows are observations, columns are variables)
    covariances = [
        np.cov(digit_data[kmeans.labels_ == cluster], rowvar=False)
        for cluster in range(n_clusters)
    ]

    means_all.append(means)
    covariances_all.append(covariances)

In [None]:
def compute_likelihood(block, means, covariances, eps=1e-9):
    """Log-likelihood of a block of frames under a set of Gaussian components.

    For every frame the densities of all components are added, ``eps`` is
    added to that sum, and the logarithm is taken; the per-frame logs are then
    summed over the block.

    NOTE: the component densities are added with unit weight. No mixing
    weights are applied and the sum is not divided by the number of
    components.

    Parameters
    ----------
    block : array-like
        Sequence of frames, one frame per row.
    means : sequence of array-like
        Mean vector of each component.
    covariances : sequence of array-like
        Covariance matrix of each component, paired with ``means`` by position.
    eps : float, optional
        Constant added to each frame's summed density before the log so that
        a zero density does not produce ``-inf``. Defaults to 1e-9, the value
        previously hard-coded.

    Returns
    -------
    float
        Sum over frames of ``log(sum_k pdf_k(frame) + eps)``; ``0.0`` for an
        empty block.
    """
    frames = np.asarray(block, dtype=float)
    if len(frames) == 0:
        # No frames: nothing to accumulate
        return 0.0
    # One row per frame, so that a flat sequence of scalars is still treated
    # as one frame per element
    frames = frames.reshape(frames.shape[0], -1)

    frame_totals = np.zeros(frames.shape[0])
    for mean, cov in zip(means, covariances):
        try:
            # Build the distribution once per component and evaluate it on
            # all frames at once, instead of rebuilding it for every frame
            densities = multivariate_normal(mean=mean, cov=cov).pdf(frames)
        except np.linalg.LinAlgError:
            # Singular covariance: this component contributes zero density
            continue
        # pdf() returns a scalar when given a single frame
        frame_totals += np.atleast_1d(densities)

    return float(np.sum(np.log(frame_totals + eps)))