In [1]:
import numpy as np

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
%matplotlib inline

import glob
import pandas as pd

In [10]:
# Training recordings, one directory per class. glob.glob() returns matches in
# arbitrary (filesystem-dependent) order, so sort the paths to keep the file
# order -- and therefore the row order of any output built from these lists --
# stable across runs and machines. Note the sort is lexicographic, so
# '10.txt' comes before '2.txt'.
d1 = sorted(glob.glob('a/*.txt'))
d2 = sorted(glob.glob('dA/*.txt'))
d3 = sorted(glob.glob('tA/*.txt'))

# Test recordings.
d_test = sorted(glob.glob('test_data/a-bA-tA/*.txt'))

In [11]:
# Display the list of test file paths that were found.
d_test

['test_data/a-bA-tA/1.txt',
 'test_data/a-bA-tA/2.txt',
 'test_data/a-bA-tA/3.txt']

In [12]:
def load_mfcc(list_paths):
    """Load one feature matrix per text file.

    Each file is parsed as space-separated values without a header row. The
    first line of the file is skipped and the first column of the remaining
    table is dropped. (Presumably the first line is a size header and the
    first column an artefact of a leading separator -- confirm against the
    feature dump format.)

    Parameters
    ----------
    list_paths : iterable of str
        Paths of the files to read.

    Returns
    -------
    numpy.ndarray
        If every file yields a matrix of the same shape (or there are no
        files), the matrices stacked with ``np.array``. Otherwise a 1-D
        ``dtype=object`` array holding one 2-D matrix per file, in the order
        of ``list_paths``.
    """
    matrices = [
        np.array(pd.read_csv(path, sep=" ", skiprows=[0], header=None))[:, 1:]
        for path in list_paths
    ]

    # Same-shaped matrices (or an empty list) stack exactly as before.
    if len({m.shape for m in matrices}) <= 1:
        return np.array(matrices)

    # Files of different length form a ragged list. np.array() on a ragged
    # list raises ValueError on NumPy >= 1.24, so fill an object array
    # element by element instead.
    ragged = np.empty(len(matrices), dtype=object)
    for idx, matrix in enumerate(matrices):
        ragged[idx] = matrix
    return ragged

def combine(X):
    """Flatten a collection of per-file matrices into one array of rows.

    Iterates over every element of ``X`` and over every row of that element,
    collecting the rows in order, and returns them as a single numpy array.
    """
    return np.array([row for sequence in X for row in sequence])

In [13]:
def plot_elbow(X, K=10):
    """Plot the elbow curve of K-means distortion against the cluster count.

    For every k in ``range(1, K)`` (i.e. 1 .. K-1, so the default evaluates
    k = 1..9) a KMeans model is fitted to ``X`` and the distortion -- the mean
    Euclidean distance from each sample to its nearest centroid -- is
    recorded. The curve is then drawn with matplotlib.

    Parameters
    ----------
    X : numpy.ndarray
        Samples to cluster, one row per sample.
    K : int, optional
        Exclusive upper bound on the number of clusters tried.
    """
    ks = range(1, K)
    distortions = []
    for k in ks:
        # KMeans(...).fit(X) already returns the fitted estimator, so one fit
        # per k is enough; a second fit only repeats the clustering work.
        kmeanModel = KMeans(n_clusters=k).fit(X)
        nearest = np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
        distortions.append(nearest.sum() / X.shape[0])

    # Plot the elbow
    plt.plot(ks, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()
    
def get_labels(X, K, random_state=None):
    """Cluster ``X`` into ``K`` groups with K-means.

    Parameters
    ----------
    X : numpy.ndarray
        Samples to cluster, one row per sample.
    K : int
        Number of clusters.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KMeans`` to seed centroid initialisation. The default
        ``None`` keeps the previous unseeded behaviour; pass an int to make
        the resulting labels and centroids reproducible between runs.

    Returns
    -------
    tuple
        ``(labels, centroids)``: the cluster index assigned to each row of
        ``X`` and the final centroid of each cluster.
    """
    clf = KMeans(n_clusters=K, random_state=random_state)
    clf.fit(X)

    # Final centroids of each cluster
    centroids = clf.cluster_centers_
    return (clf.labels_, centroids)

def assign_labels(X, centroids):
    """Label every row of ``X`` with the index of its closest centroid.

    Closeness is the squared Euclidean distance between the row and each
    centroid; the index of the smallest distance is taken per row.

    Returns a numpy array with one centroid index per row of ``X``.
    """
    nearest = [
        ((centroids - sample) ** 2).sum(axis=1).argmin()
        for sample in X
    ]
    return np.array(nearest)

In [14]:
def build_train_data(label_data, before_combine_data):
    """Split a flat label sequence back into one chunk per original matrix.

    Parameters
    ----------
    label_data : sequence
        Labels for all rows, concatenated in the order of
        ``before_combine_data``.
    before_combine_data : iterable of numpy.ndarray
        The per-file matrices before they were flattened; only the row count
        (``shape[0]``) of each is used.

    Returns
    -------
    numpy.ndarray
        If every chunk has the same length (or there are no chunks), the
        chunks stacked with ``np.array``. Otherwise a 1-D ``dtype=object``
        array holding one label chunk per matrix.
    """
    chunks = []
    start = 0
    for matrix in before_combine_data:
        stop = start + matrix.shape[0]
        chunks.append(label_data[start:stop])
        start = stop

    # Equal-length chunks (or none) stack exactly as before.
    if len({len(chunk) for chunk in chunks}) <= 1:
        return np.array(chunks)

    # Chunks of different length form a ragged list, which np.array() rejects
    # with ValueError on NumPy >= 1.24; fill an object array explicitly.
    ragged = np.empty(len(chunks), dtype=object)
    for idx, chunk in enumerate(chunks):
        ragged[idx] = chunk
    return ragged


def generate_out(X1, X2, X3, K, test_flag = 0, centroids = None):
    """Turn three groups of per-file matrices into per-file label sequences.

    The rows of all three groups are flattened and stacked. With
    ``test_flag == 0`` they are clustered into ``K`` groups via
    ``get_labels`` (which also produces the centroids); otherwise each row is
    labelled with its nearest entry of the supplied ``centroids`` via
    ``assign_labels``. The flat labels are then split back per group and per
    file.

    Parameters
    ----------
    X1, X2, X3 : iterable of numpy.ndarray
        Per-file matrices of the three groups.
    K : int
        Number of clusters; only used when ``test_flag == 0``.
    test_flag : int, optional
        0 to fit new centroids, any other value to reuse ``centroids``.
    centroids : numpy.ndarray or None, optional
        Centroids to label against; required when ``test_flag`` is non-zero.

    Returns
    -------
    tuple
        ``((d1_labels, d2_labels, d3_labels), centroids)``.

    Raises
    ------
    ValueError
        If ``test_flag`` is non-zero and no ``centroids`` are given.
    """
    # Fail early with a clear message instead of an opaque TypeError from the
    # arithmetic on None inside assign_labels.
    if test_flag != 0 and centroids is None:
        raise ValueError("centroids must be provided when test_flag is non-zero")

    cX1 = combine(X1)
    cX2 = combine(X2)
    cX3 = combine(X3)

    X = np.vstack((cX1, cX2, cX3))

    if test_flag == 0:
        (c_d, centroids) = get_labels(X, K)
    else:
        c_d = assign_labels(X, centroids)

    # Boundaries of each group inside the stacked label vector.
    first_end = cX1.shape[0]
    second_end = first_end + cX2.shape[0]

    c1 = c_d[:first_end]
    c2 = c_d[first_end:second_end]
    c3 = c_d[second_end:]

    d1_labels = build_train_data(c1, X1)
    d2_labels = build_train_data(c2, X2)
    d3_labels = build_train_data(c3, X3)

    return ((d1_labels, d2_labels, d3_labels), centroids)



def write_outfile(np_array, filename="np_array.out"):
    """Write each row of ``np_array`` as one line of text to ``filename``.

    Every item of a row is converted with ``str`` and followed by a single
    space (so each line ends with a trailing space before the newline). The
    file is overwritten, and a confirmation message is printed afterwards.
    """
    with open(filename, 'w') as out:
        out.writelines(
            ''.join(str(item) + ' ' for item in row) + '\n'
            for row in np_array
        )
    print("Dumped to "+filename)
        

In [15]:
# Read the feature matrices of the three training groups and of the test files
X1_train, X2_train, X3_train = [load_mfcc(paths) for paths in (d1, d2, d3)]

X_test = load_mfcc(d_test)

In [16]:
# plot_elbow(np.vstack((combine(X1_train), combine(X2_train), combine(X3_train))), 15)

In [20]:
K = 30

# Training pass: cluster all training rows into K groups. The per-file
# training labels sit in t1/t2/t3 only until the next call overwrites them;
# what is carried forward is the centroids.
(t1, t2, t3), ctds = generate_out(X1_train, X2_train, X3_train, K)

# Test pass: generate_out takes three groups, so X_test is supplied for each.
# The three resulting label sets are identical and only t1 is written out.
(t1, t2, t3), dummy = generate_out(
    X_test, X_test, X_test, K, test_flag=1, centroids=ctds
)
write_outfile(t1, 'hmm-1.04/d_connected_test.out')

Dumped to hmm-1.04/d_connected_test.out


In [19]:
# List the test files in the order their label sequences were written
for test_path in d_test:
    print(test_path)

test_data/a-bA-tA/1.txt
test_data/a-bA-tA/2.txt
test_data/a-bA-tA/3.txt
