In [734]:
import scipy.io
import pandas as pd
import numpy as np
import os
import tkinter as tk
from tkinter import Grid
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from numpy.core.fromnumeric import argmin
from numpy import mean, std, max
import seaborn as sns
import random
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import math

np.random.seed(42)

In [230]:
# Locations of the reference recordings and of all recordings under test.
# NOTE(review): absolute, machine-specific paths; consider a configurable base directory.
ref_dir = 'H:/Jupyter_Notebooks/DSP Project/records mats/Team2_Ref/'
test_dir = 'H:/Jupyter_Notebooks/DSP Project/records mats/All_Records/'

# Make sure both directories exist. exist_ok=True makes re-running this cell a no-op,
# and unlike the previous bare `except`, a real failure (bad drive, permissions) is
# reported instead of being hidden, and test_dir is still created when ref_dir exists.
for data_dir in (ref_dir, test_dir):
    os.makedirs(data_dir, exist_ok=True)

Created


In [877]:
def collectWords(test_dir, ref_dir):
    """Load every file found in the reference and test directories.

    Each directory entry is read with ``scipy.io.loadmat``. Entries are visited in
    sorted name order so the result does not depend on the arbitrary order returned
    by ``os.listdir``.

    Parameters
    ----------
    test_dir : str
        Directory holding the recordings under test.
    ref_dir : str
        Directory holding the reference recordings.

    Returns
    -------
    tuple
        ``((ref_words, ref_mats), (test_words, test_mats))`` where each ``*_words``
        is the list of file names and each ``*_mats`` the list of loaded dicts, in
        matching order.
    """
    def _load_dir(directory):
        # os.path.join works whether or not `directory` ends with a separator.
        names = sorted(os.listdir(directory))
        mats = [scipy.io.loadmat(os.path.join(directory, name)) for name in names]
        return names, mats

    return _load_dir(ref_dir), _load_dir(test_dir)
#--------------------------------------------------------------------------------------------------------------------------------
def featureScaling(test_mats, ref_mats):
    """Fit a StandardScaler and a MinMaxScaler on all feature rows.

    The ``'featuresMatrix'`` entry of every reference mat, followed by every test
    mat, is stacked row-wise into one frame that both scalers are fitted on.

    Parameters
    ----------
    test_mats, ref_mats : list of dict
        Loaded mat dictionaries, each holding a ``'featuresMatrix'`` entry.

    Returns
    -------
    tuple
        ``(SS, MS, df_total, df_total2)``: the fitted StandardScaler, the fitted
        MinMaxScaler, the standard-scaled frame and the min-max-scaled frame.
    """
    # Build all frames first and concatenate once; concatenating inside the loop
    # copies the accumulated frame on every iteration.
    frames = [pd.DataFrame(mat['featuresMatrix']) for mat in ref_mats]
    frames.extend(pd.DataFrame(mat['featuresMatrix']) for mat in test_mats)
    df_total = pd.concat(frames) if frames else pd.DataFrame()
    df_total2 = df_total.copy()

    SS = StandardScaler()
    MS = MinMaxScaler()
    # fit_transform already fits the scaler, so no separate fit() call is needed.
    df_total[df_total.columns] = SS.fit_transform(df_total[df_total.columns])
    df_total2[df_total2.columns] = MS.fit_transform(df_total2[df_total2.columns])
    return SS, MS, df_total, df_total2
#--------------------------------------------------------------------------------------------------------------------------------
def find_Group_Student(group_num, student_num, words, mat_files, StandardS):
    """Select and scale the recordings of one student in one group.

    A recording is selected when its name contains ``'G<group_num>S<student_num>'``.

    Parameters
    ----------
    group_num : int or str
        Group number, 1..14.
    student_num : int or str
        Student number, 1..5.
    words : list of str
        Recording names, parallel to ``mat_files``.
    mat_files : list of dict
        Loaded mats, each holding a ``'featuresMatrix'`` entry.
    StandardS : object
        Fitted scaler exposing ``transform``.

    Returns
    -------
    tuple
        ``(selected_words, selected_mats)`` where ``selected_mats`` holds the scaled
        feature matrices of the selected recordings.

    Raises
    ------
    AssertionError
        If a number is out of range or the two input lists differ in length.
    """
    assert (int(student_num)>=1 and int(student_num)<=5), "Error in student number"
    assert (int(group_num)>=1 and int(group_num)<=14), "Error in group number"
    assert (len(words) == len(mat_files)), "Not equal in length"

    tag = f'G{group_num}S{student_num}'
    selected_words_GS, selected_mats_GS = [], []
    for word, mat in zip(words, mat_files):
        if tag in word:
            selected_words_GS.append(word)
            # Scale a copy: writing the result back into `mat` would make every
            # later call scale the already-scaled matrix again.
            selected_mats_GS.append(StandardS.transform(mat['featuresMatrix']))
    return (selected_words_GS, selected_mats_GS)
#--------------------------------------------------------------------------------------------------------------------------------    
def find_Pair_Word_Gender(pair_num, word_num, gender, words, mat_files, StandardS):
    """Select and scale the recordings of one word of one pair for one speaker type.

    A recording is selected when its name contains ``'P<pair_num>W<word_num>'`` and
    ``gender`` occurs at character position 4 or 5 of the name.

    Parameters
    ----------
    pair_num : int or str
        Pair number, 0..62 (pair 62 only has word 1).
    word_num : int
        1 or 2.
    gender : str
        ``'F'``, ``'M'`` or ``'C'``.
    words : list of str
        Recording names, parallel to ``mat_files``.
    mat_files : list of dict
        Loaded mats, each holding a ``'featuresMatrix'`` entry.
    StandardS : object
        Fitted scaler exposing ``transform``.

    Returns
    -------
    tuple
        ``(selected_words, selected_mats)`` where ``selected_mats`` holds the scaled
        feature matrices of the selected recordings.

    Raises
    ------
    AssertionError
        If an argument is out of range or the two input lists differ in length.
    """
    assert (word_num==1 or word_num==2), "Error in word number"
    assert (int(pair_num)>=0 and int(pair_num)<=62), "Error in pair number"
    assert (gender=='F' or gender=='M' or gender=='C'), "Error in gender"
    assert (len(words) == len(mat_files)), "Not equal in length"
    assert (word_num == 1 or int(pair_num) != 62), "Last pair only have 1 word"

    tag = f'P{pair_num}W{word_num}'
    selected_word_PWG, selected_mat_PWG = [], []
    for word, mat in zip(words, mat_files):
        if tag in word and word.find(gender, 4, 6) != -1:
            selected_word_PWG.append(word)
            # Scale a copy: writing the result back into `mat` would make every
            # later call (e.g. a re-run of the experiment) scale scaled data again.
            selected_mat_PWG.append(StandardS.transform(mat['featuresMatrix']))
    return (selected_word_PWG, selected_mat_PWG)
#--------------------------------------------------------------------------------------------------------------------------------
def dtw(reference, sequence, dist=np.linalg.norm, reconstruct=False):
    """Constrained dynamic-time-warping cost between ``sequence`` and ``reference``.

    Both inputs are 2-D (rows are frames, columns are features) and must have the
    same number of columns. With ``r`` rows in ``sequence`` and ``c`` rows in
    ``reference``, the alignment is only attempted when ``c//2 <= r <= 2*c``;
    otherwise the cost is ``np.inf``.

    Parameters
    ----------
    reference : array-like, shape (c, k)
    sequence : array-like, shape (r, k)
    dist : callable
        Maps the difference of two frames to a non-negative scalar.
    reconstruct : bool
        When true, also return ``sequence`` warped onto the reference time axis and
        the per-frame distances to the reference.

    Returns
    -------
    float
        Accumulated cost ``D[r, c]`` (``np.inf`` when no admissible path exists),
        when ``reconstruct`` is false.
    tuple
        ``(cost, constructed_sequence, distances)`` when ``reconstruct`` is true;
        ``(np.inf, None, None)`` when the length ratio check fails.
    """
    reference = np.array(reference)
    sequence = np.array(sequence)
    assert np.shape(reference)[1] == np.shape(sequence)[1],"reference and y must have the same number of columns"
    k = np.shape(reference)[1]
    r, c = np.shape(sequence)[0], np.shape(reference)[0]
    # Lengths that differ by more than a factor of two cannot be aligned.
    if not (r>=c//2 and r<=c*2):
        if reconstruct:
            return np.inf, None, None
        return np.inf

    # Accumulated-cost matrix with a one-cell border; only D[0, 0] is a valid start.
    D = np.zeros((r+1,c+1))
    D[0, 1:] = np.inf
    D[1:, 0] = np.inf
    D[0,0] = 0

    # Local distance matrix; cells left at inf below are outside the allowed band.
    d = np.zeros((r,c))

    # Mark the region unreachable from (0, 0) with slopes between 1/2 and 2 ...
    j_limit = 0
    for i in range(r):
        j_limit += 2
        for j in range(j_limit,c):
            d[i,j] = np.inf
    i_limit = 0
    for j in range(c):
        i_limit += 2
        for i in range(i_limit,r):
            d[i,j] = np.inf
    # ... and the region from which (r-1, c-1) cannot be reached with those slopes.
    j_limit = c
    for i in reversed(range(r)):
        j_limit -= 2
        for j in reversed(range(j_limit)):
            d[i,j] = np.inf
    i_limit = r
    for j in reversed(range(c)):
        i_limit -= 2
        for i in reversed(range(i_limit)):
            d[i,j] = np.inf

    # Back-pointers: B[i, j] is the (row, col) of the predecessor of cell (i, j).
    # Built-in int replaces np.int, which was removed in NumPy 1.24.
    B = np.zeros((r,c,2),dtype=int)
    B[0,0] = [0,0]

    # Fill the cost and back-pointer matrices. A neighbour that was itself reached
    # by a vertical (resp. horizontal) step may not be followed by another step in
    # the same direction, so that move is left out of the minimum.
    # NOTE(review): for i == 0 or j == 0 the B[i-1, ...] / B[..., j-1] lookups wrap
    # around to the last row/column (negative indexing); kept as in the original.
    for i in range(r):
        for j in range(c):
            if d[i,j] == np.inf:
                D[i+1,j+1] = np.inf
                continue
            d[i,j] = dist(reference[j]-sequence[i])
            if (B[i-1,j,0] == i-2 and B[i-1,j,1] == j) and (B[i,j-1,0] == i and B[i,j-1,1] == j-2):
                # Both straight moves are blocked: only the diagonal is allowed.
                D[i+1,j+1] = d[i,j]+D[i,j]
                B[i,j] = [i-1,j-1]
            elif B[i-1,j,0] == i-2 and B[i-1,j,1] == j:
                # Vertical move blocked: choose between horizontal and diagonal.
                D[i+1,j+1] = d[i,j]+min(D[i+1,j],D[i,j])
                index = np.argmin([D[i+1,j],D[i,j]])
                B[i,j] = [i-index,j-1]
            elif B[i,j-1,0] == i and B[i,j-1,1] == j-2:
                # Horizontal move blocked: choose between vertical and diagonal.
                D[i+1,j+1] = d[i,j]+min(D[i,j],D[i,j+1])
                index = np.argmin([D[i,j+1],D[i,j]])
                B[i,j] = [i-1, j-index]
            else:
                # index 0 = horizontal, 1 = vertical, 2 = diagonal predecessor.
                D[i+1,j+1] = d[i,j]+min(D[i+1,j],D[i,j+1],D[i,j])
                index = np.argmin([D[i+1,j],D[i,j+1],D[i,j]])
                B[i,j] = [i-int(index>0), j-1+int(index==1)]

    if reconstruct:
        # Walk the back-pointers from the end cell to (0, 0).
        i = r-1
        j = c-1
        path = [(i,j)]
        while i>0 or j>0:
            step = (B[i,j,0],B[i,j,1])
            path.insert(0, (step[0],step[1]))
            i,j = step
        # Warp `sequence` onto the reference axis: when two consecutive path cells
        # share a reference column, that frame is the mean of two sequence frames.
        constructed_sequence = np.zeros((c,k))
        skipNext = False
        k=0  # from here on k is the position in `path`, not the feature count
        for i,j in path:
            if not skipNext:
                if k+1<c and k+1<len(path) and j == path[k+1][1]:
                    constructed_sequence[j] = (sequence[i]+sequence[i+1])/2
                    skipNext = True
                else:
                    constructed_sequence[j] = sequence[i]
            else:
                skipNext = False
            k += 1
        distances = np.zeros((c,1),np.float64)
        for i in range(c):
            distances[i] = dist(reference[i,]-constructed_sequence[i])
        return D[r,c],constructed_sequence, distances
    return D[r, c]
#--------------------------------------------------------------------------------------------------------------------------------
def calc_dtw(pair_num, gender, Scaler, word_num=[1, 2]):
    """Compute the four reference-vs-test DTW cost lists for one word pair.

    Reads the module-level ``ref_words``, ``ref_mats``, ``test_words`` and
    ``test_mats``. For each word listed in ``word_num`` the matching reference and
    test recordings are selected with ``find_Pair_Word_Gender``; the first reference
    recording of each word is then compared with the test recordings of both words.
    The test recordings of word 1 and word 2 are walked in lock-step, so the lists
    are as long as the shorter of the two selections.

    Returns
    -------
    tuple of four lists
        ``(dtw1_1, dtw1_2, dtw2_1, dtw2_2)`` where ``dtwA_B`` holds the cost of
        reference word A against each test recording of word B.
    """
    def _select(num, names, mats):
        # Same lookup for reference and test sets; only the source lists differ.
        return find_Pair_Word_Gender(pair_num=str(pair_num), word_num=num, gender=gender,
                                     words=names, mat_files=mats, StandardS=Scaler)

    for num in word_num:
        if num == 1:
            ref_word1, ref_mat1 = _select(num, ref_words, ref_mats)
            test_words1, test_mats1 = _select(num, test_words, test_mats)
        elif num == 2:
            ref_word2, ref_mat2 = _select(num, ref_words, ref_mats)
            test_words2, test_mats2 = _select(num, test_words, test_mats)

    first_ref_word1, first_ref_word2 = None, None
    dtw1_1, dtw1_2, dtw2_1, dtw2_2 = [], [], [], []
    for mat1, mat2 in zip(test_mats1, test_mats2):
        # The reference is only indexed once there is something to compare against.
        first_ref_word1 = ref_mat1[0]
        dtw1_1.append(dtw(first_ref_word1, mat1, dist=np.linalg.norm))
        dtw1_2.append(dtw(first_ref_word1, mat2, dist=np.linalg.norm))
        first_ref_word2 = ref_mat2[0]
        dtw2_1.append(dtw(first_ref_word2, mat1, dist=np.linalg.norm))
        dtw2_2.append(dtw(first_ref_word2, mat2, dist=np.linalg.norm))

    return dtw1_1, dtw1_2, dtw2_1, dtw2_2
#--------------------------------------------------------------------------------------------------------------------------------
def detect_outliers(data):
    """Return the positions of values more than three standard deviations from the mean.

    Parameters
    ----------
    data : sequence of numbers

    Returns
    -------
    list of int or None
        Ascending positions of the outliers in ``data``, or ``None`` when there are
        none (callers test for ``None``).
    """
    data_mean, data_std = mean(data), std(data)
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    # Collect positions directly: looking each value up with data.index() returns
    # the first occurrence, so repeated outlier values gave duplicate indices.
    indices = [position for position, value in enumerate(data)
               if value < lower or value > upper]
    return indices or None
#--------------------------------------------------------------------------------------------------------------------------------
def remove_elements(data, indices):
    """Remove the given positions from ``data`` in place.

    Parameters
    ----------
    data : list
        List to shrink; modified in place.
    indices : list of int, None, or 'N'
        Positions to remove. ``None``, an empty list and the sentinel ``'N'`` all
        mean "nothing to remove".

    Returns
    -------
    list or None
        ``data`` after removal, or ``None`` when there was nothing to remove.
    """
    if indices is None or indices == [] or indices == 'N':
        return

    # Pop from the highest position down so earlier pops do not shift later ones;
    # de-duplicate so a repeated index cannot remove an extra element.
    for index in sorted(set(indices), reverse=True):
        data.pop(index)

    return data
#--------------------------------------------------------------------------------------------------------------------------------
def detectandremove(data):
    """Remove, in place, the values more than three standard deviations from the mean.

    Parameters
    ----------
    data : list of numbers
        Modified in place.

    Returns
    -------
    tuple or None
        ``(data, indices)`` with the shrunken list and the ascending positions that
        were removed, or ``None`` when no outlier was found.
    """
    data_mean, data_std = mean(data), std(data)
    # Same 3-sigma rule as detect_outliers; this assignment had been commented out,
    # which left `cut_off` undefined.
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    indices = [position for position, value in enumerate(data)
               if value < lower or value > upper]
    if not indices:
        return

    # Pop from the end so the remaining positions stay valid.
    for index in reversed(indices):
        data.pop(index)

    return data, indices
 #--------------------------------------------------------------------------------------------------------------------------------       
def detect_th(data1, data2):
    """Return the decision threshold: 0.7 times the sum of the two means."""
    combined_mean = mean(data1) + mean(data2)
    return combined_mean * 0.7
#--------------------------------------------------------------------------------------------------------------------------------
def Judgement(threshold, dtw1, dtw2, pair, gender):
    """Classify each test recording from its two DTW costs and tally the outcome.

    ``dtw1`` holds the costs against the expected word and ``dtw2`` the costs
    against the other word of the pair; the lists are compared element by element.
    A recording counts as

    * correct     when ``dtw1 < dtw2`` and ``dtw1 < threshold``,
    * other word  when ``dtw1 > dtw2`` and ``dtw1 < threshold``,
    * other       in every remaining case (including ties).

    NOTE(review): the "other word" case tests ``dtw1`` against the threshold, not
    ``dtw2`` - confirm this is intended.

    Returns
    -------
    list
        ``[pair, gender, total, other, other_word, correct, wrong]`` where
        ``wrong = other + other_word``.
    """
    correct = other_word = other = 0
    for one, two in zip(dtw1, dtw2):
        under_threshold = one < threshold
        if under_threshold and one < two:
            correct += 1
        elif under_threshold and one > two:
            other_word += 1
        else:
            other += 1

    wrong = other_word + other
    total = correct + wrong
    return [pair, gender, total, other, other_word, correct, wrong]
#--------------------------------------------------------------------------------------------------------------------------------
def to_numeric(df, features):
    """Convert the ``features`` columns of ``df`` to numeric dtype, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; modified in place.
    features : list of str
        Columns to pass through ``pd.to_numeric``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df[features] = df[features].apply(pd.to_numeric)
    return df
#---------------------------------------------------------------------------------------------------------------------------------
def adjustDataFrame(df):
    """Index ``df`` by its 'pair' column and add a leading 'Word_Number' column.

    'Word_Number' is the incoming index plus one. The frame is modified in place
    and also returned; the 'pair' column no longer exists afterwards, so the
    function cannot be applied twice to the same frame.
    """
    word_numbers = df.index + 1
    # Passing the column name moves 'pair' into the index and drops the column.
    df.set_index('pair', inplace=True)
    df.insert(0, 'Word_Number', word_numbers)
    return df
    
#--------------------------------------------------------------------------------------------------------------------------------
def UsefulFeatures(df, features):
    """Append a 'Summation' row and the correct/wrong percentage columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'correct', 'wrong' and 'total'; modified in place.
    features : list of str
        Columns whose column-wise sums are written to the 'Summation' row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a 'Summation' row plus the
        'Correct_Percentage' and 'Wrong_Percentage' columns (``correct / total``
        and ``wrong / total`` per row, including the 'Summation' row).
    """
    # Leave out a 'Summation' row from an earlier call so re-running does not add
    # the previous totals into the new ones.
    data_rows = df.drop(index='Summation', errors='ignore')
    df.loc['Summation', :] = data_rows[features].sum(axis=0)
    df['Correct_Percentage'] = df['correct'] / df['total']
    df['Wrong_Percentage'] = df['wrong'] / df['total']
    # The row-wise 'Summation' column was never used; just make sure none is left.
    df.drop(columns=['Summation'], errors='ignore', inplace=True)
    return df

In [203]:
# Load every reference/test recording, then fit the scalers on all feature rows.
# ref_words, ref_mats, test_words and test_mats are read as globals by calc_dtw;
# SS is the scaler handed to calc_dtw by the experiment cell.
# NOTE(review): df1 and df2 are rebound to copies of the result tables at the end
# of the experiment cell, so the scaled frames are only available until then.
(ref_words, ref_mats), (test_words, test_mats) = collectWords(test_dir, ref_dir)
SS, MS, df1, df2 = featureScaling(test_mats, ref_mats)

In [None]:
# Experiment driver: for every speaker type and word pair, compute the DTW costs,
# clean them, derive one threshold per word and tally the classification results
# into df_M / df_F / df_C (two rows per pair, one per word).
RESULT_COLUMNS = ['pair', 'type', 'total', 'other', 'other_word', 'correct', 'wrong']

types = ['M', 'F', 'C']
df_M = pd.DataFrame(columns=RESULT_COLUMNS)
df_F = pd.DataFrame(columns=RESULT_COLUMNS)
df_C = pd.DataFrame(columns=RESULT_COLUMNS)
total_ths_M, total_ths_F, total_ths_C = [], [], []
words = pd.read_csv('words.csv')  # NOTE(review): not used anywhere in this cell
# Pair identifiers '01' .. '61', zero-padded to two digits as used in the
# 'P<pair>W<word>' name lookup.
pairs = [f'{number:02d}' for number in range(1, 62)]
total_ths = []

for gender in types:
    print(f'Start Type {gender}')
    for pair in pairs:
        print(f'Start Pair {pair}')
        dtw1_1, dtw1_2, dtw2_1, dtw2_2 = calc_dtw(pair_num=pair, gender=gender, Scaler= SS, word_num=[1, 2])
        distance_lists = [dtw1_1, dtw1_2, dtw2_1, dtw2_2]

        # Drop infinite costs (comparisons dtw() rejected). remove_elements works in
        # place, so its return value is not needed.
        # NOTE(review): each list is cleaned independently here and in the outlier
        # loop below, so after a removal position i of dtw1_1 and position i of
        # dtw1_2 may belong to different recordings, yet Judgement compares them
        # position by position. Confirm whether removals should be applied to all
        # four lists together.
        for data in distance_lists:
            remove_elements(data, list(np.where(np.isinf(data))[0]))

        # Repeatedly strip 3-sigma outliers. indices_total records, per list and
        # pass, either the removed positions or 'N' for "no outlier"; after at least
        # two passes the loop stops once three consecutive 'N' entries are recorded.
        indices_total = []
        counter = 0
        condition = False
        while not condition:
            for data in distance_lists:
                indices = detect_outliers(data)
                if indices is None:
                    indices = 'N'
                indices_total.extend(indices)
                remove_elements(data, indices)
            counter+=1
            if counter > 1:
                condition = any((indices_total[j]=='N' and indices_total[j+1]=='N' and indices_total[j+2]=='N') for j in range(1, len(indices_total)-3))

        # Sample positions valid for all four lists, i.e. below the shortest length.
        lengths = [len(data) for data in distance_lists]
        min_length = min(lengths)
        if min_length == 0:
            # np.random.randint(high=0) would raise; there is nothing to judge.
            print(f'Skip Pair {pair}: no usable distances')
            continue
        random_samples = list(np.random.randint(low=0, high=min_length, size=20))
        thresholds = []
        dtw1_1n, dtw2_2n = [dtw1_1[i] for i in random_samples], [dtw2_2[i] for i in random_samples]
        dtw1_2n, dtw2_1n = [dtw1_2[i] for i in random_samples], [dtw2_1[i] for i in random_samples]
        thresholds.append(detect_th(dtw1_1n, dtw1_2n))
        thresholds.append(detect_th(dtw2_2n, dtw2_1n))

        # One result row per word. np.vstack of the mixed str/int rows produces a
        # string array, which is why the tables are passed through to_numeric below.
        array1 = Judgement(thresholds[0], dtw1_1, dtw1_2, pair=pair, gender=gender)
        array2 = Judgement(thresholds[1], dtw2_2, dtw2_1, pair=pair, gender=gender)
        arr = np.vstack((array1, array2))
        pair_results = pd.DataFrame(arr, columns=RESULT_COLUMNS)
        if gender == 'M':
            total_ths_M.append(thresholds)
            df_M = pd.concat((df_M, pair_results), axis=0)
        elif gender == 'F':
            total_ths_F.append(thresholds)
            df_F = pd.concat((df_F, pair_results), axis=0)
        else:
            total_ths_C.append(thresholds)
            df_C = pd.concat((df_C, pair_results), axis=0)

        print(f'End Pair {pair}')
    print(f'End Type {gender}')

# Snapshots of the raw result tables.
# NOTE(review): this rebinds df1/df2, which earlier held the scaled feature frames.
df1 = df_M.copy()
df2 = df_F.copy()
df3 = df_C.copy()

# Two thresholds per pair, flattened to line up with the two rows per pair.
flat_list_M = [item for sublist in total_ths_M for item in sublist]
flat_list_F = [item for sublist in total_ths_F for item in sublist]
flat_list_C = [item for sublist in total_ths_C for item in sublist]
df_M['Threshold'] = flat_list_M
df_F['Threshold'] = flat_list_F
df_C['Threshold'] = flat_list_C

features = [col for col in df_C.columns if col not in ['type']]
features2 = [col for col in df_C.columns if col not in ['type', 'pair']]
# The three helpers modify the frame in place, so df_M / df_F / df_C are updated.
for df in [df_M, df_F, df_C]:
    df = to_numeric(df, features)
    df = adjustDataFrame(df)
    df = UsefulFeatures(df, features2)

df_M.to_csv('df_M.csv')
df_F.to_csv('df_F.csv')
df_C.to_csv('df_C.csv')

In [883]:
from tkinter import messagebox, ttk
class GUI(tk.Tk):
    """Main window of the speech validation GUI; shows one page (frame) at a time."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.title("Speech Validation System")
        self.geometry("500x500")
        self.resizable(True,True)
        self._frame = None  # page currently displayed, if any
        self.switch_frame(StartPage)

    def switch_frame(self, frame_class, *args, **kwargs):
        """Replace the current page with a new ``frame_class`` instance.

        Extra positional and keyword arguments are forwarded to ``frame_class``
        (they used to be accepted but silently dropped). The new page is built
        before the old one is destroyed.
        """
        new_frame = frame_class(self, *args, **kwargs)
        if self._frame is not None:
            self._frame.destroy()
        self._frame = new_frame
        self._frame.pack()

    def get_accuracy(self, df):
        """Show the last 'Correct_Percentage' value of ``df`` in a message box."""
        messagebox.showinfo("Accuracy", df['Correct_Percentage'].iloc[-1])

    def getUsersAvg(self, df):
        """Show the mean of the first 122 'total' values of ``df`` in a message box."""
        messagebox.showinfo("Average Users", df['total'].iloc[0:122].mean())

class StartPage(tk.Frame):
    """Landing page: a greeting and a button that opens the type selection page."""

    def __init__(self, master):
        super().__init__(master)

        def open_type_page():
            master.switch_frame(Type_Page)

        greeting = tk.Label(self, text="Hello and Welcome to my Humble System")
        greeting.grid(column=0, row=0)
        choose_button = tk.Button(self, text="Select type please", command=open_type_page)
        choose_button.grid(column=0, row=1)

class Type_Page(tk.Frame):
    """Page offering one button per speaker type plus a way back to the start page."""

    def __init__(self, master):
        super().__init__(master)
        tk.Label(self, text="Pick Type").grid(column=0, row=0)

        # (button caption, page opened by the button), in display order.
        destinations = (
            ("Male", Male),
            ("Child", Child),
            ("Female", Female),
            ("Return to Start Page", StartPage),
        )
        for row, (caption, page) in enumerate(destinations, start=1):
            # Bind `page` as a default so each button keeps its own target.
            button = tk.Button(self, text=caption,
                               command=lambda page=page: master.switch_frame(page))
            button.grid(column=0, row=row)

class Male(tk.Frame):
    """Results page for the male recordings, backed by the module-level ``df_M``."""

    def show_option(self):
        """Display the ``df_M`` column chosen in the combobox inside the text box."""
        identifier = self.options.get()
        self.text.delete("1.0", tk.END)
        if identifier not in df_M.columns:
            # Nothing selected, or free text typed into the combobox: looking the
            # column up would raise KeyError.
            self.text.insert(tk.END, "Please select one of the listed options.")
            return
        self.text.insert(tk.END, str(df_M[identifier]))

    def __init__(self, master):
        tk.Frame.__init__(self, master)
        tk.Label(self, text="Male Page").pack()
        tk.Button(self, text="Show Total Accuracy", pady=5, command=lambda: master.get_accuracy(df_M)).pack()
        tk.Button(self, text="Show Average User Number", pady=5, command=lambda: master.getUsersAvg(df_M)).pack()
        tk.Button(self, text="Return To Previous Page", pady=5, command=lambda: master.switch_frame(Type_Page)).pack()
        tk.Button(self, text="Return to Start Page", pady=5, command=lambda: master.switch_frame(StartPage)).pack()
        tk.Label(self, text='Select option:').pack()
        # One combobox entry per column of df_M.
        self.options = ttk.Combobox(self, values=list(df_M.columns))
        self.options.pack()
        tk.Button(self, text='Show option', command=self.show_option).pack()
        self.text = tk.Text(self)
        self.text.pack()



class Female(tk.Frame):
    """Results page for the female recordings, backed by the module-level ``df_F``."""

    def show_option(self):
        """Display the ``df_F`` column chosen in the combobox inside the text box."""
        identifier = self.options.get()
        self.text.delete("1.0", tk.END)
        if identifier not in df_F.columns:
            # Nothing selected, or free text typed into the combobox: looking the
            # column up would raise KeyError.
            self.text.insert(tk.END, "Please select one of the listed options.")
            return
        self.text.insert(tk.END, str(df_F[identifier]))

    def __init__(self, master):
        tk.Frame.__init__(self, master)
        tk.Label(self, text="Female Page").pack(side="top", fill="x", pady=15)
        tk.Button(self, text="Show Total Accuracy", pady=5, command=lambda: master.get_accuracy(df_F)).pack()
        # Uses df_F: this button previously read the child table (df_C).
        tk.Button(self, text="Show Average User Number", pady=5, command=lambda: master.getUsersAvg(df_F)).pack()
        tk.Button(self, text="Return To Previous Page", pady=5, command=lambda: master.switch_frame(Type_Page)).pack()
        tk.Button(self, text="Return to Start Page", pady=5, command=lambda: master.switch_frame(StartPage)).pack()
        tk.Label(self, text='Select option:').pack()
        # One combobox entry per column of df_F.
        self.options = ttk.Combobox(self, values=list(df_F.columns))
        self.options.pack()
        tk.Button(self, text='Show option', command=self.show_option).pack()
        self.text = tk.Text(self)
        self.text.pack()

class Child(tk.Frame):
    """Results page for the child recordings, backed by the module-level ``df_C``."""

    def show_option(self):
        """Display the ``df_C`` column chosen in the combobox inside the text box."""
        identifier = self.options.get()
        self.text.delete("1.0", tk.END)
        if identifier not in df_C.columns:
            # Nothing selected, or free text typed into the combobox: looking the
            # column up would raise KeyError.
            self.text.insert(tk.END, "Please select one of the listed options.")
            return
        self.text.insert(tk.END, str(df_C[identifier]))

    def __init__(self, master):
        tk.Frame.__init__(self, master)
        tk.Label(self, text="Child Page").pack(side="top", fill="x", pady=15)
        tk.Button(self, text="Show Total Accuracy", pady=5, command=lambda: master.get_accuracy(df_C)).pack()
        tk.Button(self, text="Show Average User Number", pady=5, command=lambda: master.getUsersAvg(df_C)).pack()
        tk.Button(self, text="Return To Previous Page", pady=5, command=lambda: master.switch_frame(Type_Page)).pack()
        tk.Button(self, text="Return to Start Page", pady=5, command=lambda: master.switch_frame(StartPage)).pack()
        tk.Label(self, text='Select option:').pack()
        # One combobox entry per column of df_C.
        self.options = ttk.Combobox(self, values=list(df_C.columns))
        self.options.pack()
        tk.Button(self, text='Show option', command=self.show_option).pack()
        self.text = tk.Text(self)
        self.text.pack()

# Build the main window and enter the Tk event loop when run as the main module.
# The pages read df_M / df_F / df_C, so the experiment cell must have run first.
if __name__ == "__main__":
    App = GUI()
    App.mainloop()