<a href="https://colab.research.google.com/github/she-fa/speech-based-AD-diagnosis/blob/main/feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
# Installing giotto-dta
!pip install giotto-tda

Collecting giotto-tda
  Downloading giotto_tda-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting giotto-ph>=0.2.1 (from giotto-tda)
  Downloading giotto_ph-0.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.4/526.4 kB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyflagser>=0.4.3 (from giotto-tda)
  Downloading pyflagser-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting igraph>=0.9.8 (from giotto-tda)
  Downloading igraph-0.10.6-cp39-abi3-manylinux_2_17_x86_64.ma

# IMPORTING MODULES

In [5]:
from pathlib import Path

import chardet
import librosa # package for audio


import pandas as pd
import numpy as np

from scipy.io import wavfile
import scipy.signal as sps

import plotly.graph_objects as go

from gtda.time_series import SingleTakensEmbedding
from gtda.plotting import plot_point_cloud
import gtda.diagrams as diagrams
import gtda.homology as hl

import re

import seaborn as sns
import matplotlib.pyplot as plt

import zipfile
import os

# CODE IMPLEMENTATION

In [3]:
# Uploading file from Computer
from google.colab import files
uploaded = files.upload()

Saving dataset_sampleP.zip to dataset_sampleP.zip


In [6]:
zip_file_name = 'dataset_sampleP.zip'

# Extracting the zip file
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall()

# Verifying the extracted files
extracted_files = os.listdir()
print(extracted_files)

['.config', 'P6.wav', 'P10.uh0', 'P1.uh0', 'P9 .uh0', 'P7.wav', 'P10.wav', 'P3.wav', 'P8.uh0', 'P1.wav', 'metadata.csv', 'P4.wav', 'P4.uh0', 'P5.uh0', 'P2.uh0', 'P7.uh0', 'P3.uh0', 'measurement_score.csv', 'dataset_sampleP.zip', 'P9 .wav', 'P8.wav', 'P6.uh0', 'P5.wav', 'P2.wav', 'sample_data']


In [9]:
# FUNCTION TO CONVERT PATH INTO DATAFRAME
def path_to_dataframe(path, file_format="*.wav"):
  '''
  converting string of path into dataframe consisting of the
  file names with requested file format
  Parameters
    path : string = name of path of orifin
    file_format: string = name of file format in the path
  Return
    DataFrame = DataFrame with path as columns and string of path as the values
  '''
  # Converting string-based path to Path object
  path = Path(path)

  list_file = []

  # Taking any file with requested file format to the list_file
  for file_path in path.glob(file_format):
    list_file.append(str(file_path))
  return pd.DataFrame(list_file, columns=['path'])


# CONVERTING ALL PATH NAME TO DATAFRAME
path_data = '/content' # path to folder
df = path_to_dataframe(path_data)
df.info() # resulting in 10 non-null

display(df.head())
# display(df.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    10 non-null     object
dtypes: object(1)
memory usage: 208.0+ bytes


Unnamed: 0,path
0,/content/P6.wav
1,/content/P7.wav
2,/content/P10.wav
3,/content/P3.wav
4,/content/P1.wav


In [21]:
# FUNCTION FIND ID
# implemented to extract ID data on the path string
def find_ID(cell, path_name='/content/'):
  '''
  extracting string of ID from the path string
  notes: the
  Parameter
    cell : string = string of path
    e.g. 'content/P2.wav'
    path_name : string = name of path of origin
    e.g. '/content/'
  Return
    string = string of subject ID
    e.g. 'P2'
  '''
  # taking the subject ID from the file name
  search_name = path_name + '(.{2})'
  match = re.search(search_name, cell) # need to be changed accordingly
  if match:
    return match.group(1)
  else:
    return None

print(find_ID(df.at[0, 'path'])) # trial on a cell

# TAKING SUBJECT ID TO NEW COLUMN
# Creating column ID
df['ID'] = None

# Taking subject ID from column 'path'
df['ID'] = df['path'].apply(find_ID)
df.head()

P6


Unnamed: 0,path,ID
0,/content/P6.wav,P6
1,/content/P7.wav,P7
2,/content/P10.wav,P1
3,/content/P3.wav,P3
4,/content/P1.wav,P1


In [22]:
# READING MEASUREMENT SCORE TO DATAFRAME
# measurement score is the mean of scores from the pilot study for each subject
path_score = '/content/measurement_score.csv'

df_score = pd.read_csv(path_score, sep=';')
df_score.info()
df_score.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      10 non-null     object 
 1   Score   10 non-null     float64
dtypes: float64(1), object(1)
memory usage: 288.0+ bytes


Unnamed: 0,ID,Score
0,P1,7.4
1,P2,8.2
2,P3,6.6
3,P4,6.4
4,P5,3.2


In [15]:
# MERGING DATAFRAME FOR PATH AND SCORE DATA
df_merge = df.merge(df_score, how='left', on='ID')
df_merge.info()
display(df_merge.head())
# display(df_merge)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   path    10 non-null     object 
 1   ID      10 non-null     object 
 2   Score   10 non-null     float64
dtypes: float64(1), object(2)
memory usage: 320.0+ bytes


Unnamed: 0,path,ID,Score
0,/content/P6.wav,P6,3.6
1,/content/P7.wav,P7,6.2
2,/content/P10.wav,P1,7.4
3,/content/P3.wav,P3,6.6
4,/content/P1.wav,P1,7.4


In [17]:
# Read the file paths from the DataFrame
file_paths = df_merge['path'].tolist()

# Create an empty numpy array to hold the audio data
audio_data = np.empty(len(file_paths), dtype=object)

# Loop through each file path and read the corresponding .wav file
for i, file_path in enumerate(file_paths):
    y, sr = librosa.load(file_path, sr=None)
    audio_data[i] = y

# Save the audio data as a new column in the DataFrame
df_merge['audio_data'] = audio_data
df_merge.head()

Unnamed: 0,path,ID,Score,audio_data
0,/content/P6.wav,P6,3.6,"[-0.01586914, -0.029571533, -0.041900635, -0.0..."
1,/content/P7.wav,P7,6.2,"[0.00015258789, -0.00015258789, 9.1552734e-05,..."
2,/content/P10.wav,P1,7.4,"[0.0, -3.0517578e-05, 0.00015258789, -6.103515..."
3,/content/P3.wav,P3,6.6,"[-0.00039672852, -0.00024414062, -3.0517578e-0..."
4,/content/P1.wav,P1,7.4,"[3.0517578e-05, 0.0, 0.0, 0.0, -3.0517578e-05,..."


In [18]:
# Checking the shape of the audio
print(df_merge.iloc[0]['audio_data'].shape)
print(df_merge.iloc[1]['audio_data'].shape)

(173853,)
(87014,)


In [23]:
# CREATING OBJECT FOR TDA

# 1) EMBEDDER
embedding_dimension = 3
embedding_time_delay = 16
embedding_stride = 20

embedder = SingleTakensEmbedding(
    parameters_type = 'fixed',
    n_jobs = 2,
    time_delay = embedding_time_delay,
    dimension = embedding_dimension,
    stride = embedding_stride,
)

# 2) PERSISTENCE
homology_dimensions = [0, 1]

# defining simplicial complex
persistence = hl.VietorisRipsPersistence(
    metric = 'euclidean',
    homology_dimensions = homology_dimensions
)

# 3) FEATURES
# Amplitude
amplitude_dict = {
    'amplitude_bottleneck':'bottleneck',
    'amplitude_wasserstein': 'wasserstein',
    'amplitude_betti': 'betti',
    'amplitude_landscape': 'landscape',
    'amplitude_silhouette': 'silhouette',
    'amplitude_heat': 'heat',
    'amplitude_persistence_image': 'persistence_image'
}
# Persistence Entropy
PE = diagrams.PersistenceEntropy()

# Number of Points
NoP = diagrams.NumberOfPoints()

# Complex Polynomial
polynomial_type = ['R', 'S', 'T']
complex_polynomial = {
    'complex_polynomial_'+ poly_type : diagrams.ComplexPolynomial(polynomial_type = poly_type)
    for poly_type in polynomial_type
    }

# creating dictionary for features
features_dict = {key: diagrams.Amplitude(metric=value)
                for (key,value) in amplitude_dict.items()}

features_dict.update(complex_polynomial)
features_dict['persistence_entropy'] = PE
features_dict['number_of_points'] = NoP
features_dict

{'amplitude_bottleneck': Amplitude(metric='bottleneck'),
 'amplitude_wasserstein': Amplitude(metric='wasserstein'),
 'amplitude_betti': Amplitude(metric='betti'),
 'amplitude_landscape': Amplitude(),
 'amplitude_silhouette': Amplitude(metric='silhouette'),
 'amplitude_heat': Amplitude(metric='heat'),
 'amplitude_persistence_image': Amplitude(metric='persistence_image'),
 'complex_polynomial_R': ComplexPolynomial(),
 'complex_polynomial_S': ComplexPolynomial(polynomial_type='S'),
 'complex_polynomial_T': ComplexPolynomial(polynomial_type='T'),
 'persistence_entropy': PersistenceEntropy(),
 'number_of_points': NumberOfPoints()}

In [24]:
# FUNCTION OF PIPELINE TO GENERATE FEATURE
# version
# input = path: dataframe path
# output = dataframe

def pipeline(df_path, embedder, persistence, features):
  '''
  generating dataframe of features from giotto.tda
  Parameter
    df_path : DataFrame = DataFrame consisting of path name
    embedder : SingleTakensEmbedding object = time series embedding class
    persistence: VietorisRipsPersistence object = creating simplicial complex
    features: dict = dictionary of features to calculate
  Return
    DataFrame = DataFrame consisting features with their calculated values
  '''

  #n_samples = len(df_path.index) # length of index
  df = pd.DataFrame(columns=
                     ['path'] +
                     [key + "_h0" for key in features.keys()] +
                     [key + "_h1" for key in features.keys()]
                     )
  df.set_index('path', inplace = True)

  for index, value in df_path['path'].items():
    path = value
    fs, wav = wavfile.read(path)
    point_cloud = embedder.fit_transform(wav)
    point_cloud= point_cloud[None, :, :]
    persistence_diagram = persistence.fit_transform(point_cloud)

    for (key,value) in features.items():
      calculate_feature = value.fit_transform(persistence_diagram)
      df.at[path, key + '_h0'] = calculate_feature[0][0]
      df.at[path, key + '_h1'] = calculate_feature[0][1]

  return df

# df_result = pd.DataFrame(df_merge['path'])
# df_result = df_result[:15]
# df_result

df_result = pipeline(df_merge[['path']],
               embedder,
               persistence,
               features_dict)

df_result

Unnamed: 0_level_0,amplitude_bottleneck_h0,amplitude_wasserstein_h0,amplitude_betti_h0,amplitude_landscape_h0,amplitude_silhouette_h0,amplitude_heat_h0,amplitude_persistence_image_h0,complex_polynomial_R_h0,complex_polynomial_S_h0,complex_polynomial_T_h0,...,amplitude_betti_h1,amplitude_landscape_h1,amplitude_silhouette_h1,amplitude_heat_h1,amplitude_persistence_image_h1,complex_polynomial_R_h1,complex_polynomial_S_h1,complex_polynomial_T_h1,persistence_entropy_h1,number_of_points_h1
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/content/P6.wav,1696.084717,15883.266452,94996.139942,57030.020135,3608.162527,107.823436,74.992402,0.0,0.0,-15422.394218,...,7313.848568,15878.771074,227.054517,17.730263,28.621454,-1706788544574.4517,-853394272287.2241,6114051.627881,11.302737,4794
/content/P7.wav,5023.588867,53382.575988,138674.347057,290705.566421,20709.664384,12.743393,9.141643,0.0,0.0,-45351.709438,...,14661.498034,83300.653721,4268.039678,1.724704,1.862951,-18117913497168.047,-9058956748584.031,-1049806118.938871,10.339645,2043
/content/P10.wav,2712.015869,58494.135285,249246.488643,115310.950864,14129.322378,39.178424,23.608693,0.0,0.0,101880.777942,...,27027.825194,55326.367626,2648.083164,4.654323,4.572296,-46346993963133.9,-23173496981566.977,2838164965.030613,11.587789,4939
/content/P3.wav,335.179443,4749.595583,47446.893305,5010.121716,491.776237,226.93937,130.951157,0.0,0.0,-3310.926173,...,4747.035318,4318.421883,72.352531,29.956924,26.804461,-171130680433.6091,-85565340216.80461,6040774.42792,10.766084,2736
/content/P1.wav,1103.3396,16143.268595,108360.317522,29922.372292,2444.436947,88.214243,57.202729,0.0,0.0,-8438.313936,...,12442.729205,8938.119767,375.219904,12.45566,10.893016,-2699694268534.8726,-1349847134267.4329,37948695.804285,11.179076,3447
/content/P4.wav,595.705872,9395.89377,76678.767305,11870.80128,1195.875547,150.098424,89.852649,0.0,0.0,-2537.001927,...,6862.901808,4964.103104,191.770177,18.880457,17.546705,-814439305465.7303,-407219652732.8654,-34847099.527282,10.991651,3277
/content/P9 .wav,9366.868164,59009.541663,181272.467165,740156.95648,19912.501073,11.51663,12.410569,0.0,0.0,-37880.465239,...,21833.586618,94298.690054,5380.367506,2.257968,1.913427,-29503969567190.137,-14751984783595.068,-438573393.380757,10.661382,2485
/content/P8.wav,4146.620117,58027.641809,147920.924419,218008.581537,22935.011538,13.743786,9.029027,0.0,0.0,27844.820551,...,17575.455799,80281.818351,4232.241726,1.839586,1.644614,-22626763075848.824,-11313381537924.42,-1649602655.070787,10.495954,2167
/content/P5.wav,930.918884,13805.172811,129074.755673,23189.958648,1536.165549,174.391607,116.025649,0.0,0.0,6880.985086,...,14363.170504,7496.318488,204.65926,28.273933,24.41622,-2661989374484.9834,-1330994687242.4907,-126634883.003723,11.818985,5532
/content/P2.wav,574.916504,8281.946257,73834.351817,11254.841354,937.331619,159.154285,104.563065,0.0,0.0,-6607.787029,...,8424.425266,4913.565684,143.206573,23.563214,19.559996,-661111905437.8531,-330555952718.9266,6493640.294504,11.107187,3262


In [25]:
# MERGING DATA FRAME OF RESULT WITH THE SCORE
df_result_score = df_result.merge(df_merge[['path', 'Score','ID']],
                                  how='left',
                                  on = 'path'
                                  )
display(df_result_score.head())
print(df_result_score.info())

Unnamed: 0,path,amplitude_bottleneck_h0,amplitude_wasserstein_h0,amplitude_betti_h0,amplitude_landscape_h0,amplitude_silhouette_h0,amplitude_heat_h0,amplitude_persistence_image_h0,complex_polynomial_R_h0,complex_polynomial_S_h0,...,amplitude_silhouette_h1,amplitude_heat_h1,amplitude_persistence_image_h1,complex_polynomial_R_h1,complex_polynomial_S_h1,complex_polynomial_T_h1,persistence_entropy_h1,number_of_points_h1,Score,ID
0,/content/P6.wav,1696.084717,15883.266452,94996.139942,57030.020135,3608.162527,107.823436,74.992402,0.0,0.0,...,227.054517,17.730263,28.621454,-1706788544574.4517,-853394272287.2241,6114051.627881,11.302737,4794,3.6,P6
1,/content/P7.wav,5023.588867,53382.575988,138674.347057,290705.566421,20709.664384,12.743393,9.141643,0.0,0.0,...,4268.039678,1.724704,1.862951,-18117913497168.047,-9058956748584.031,-1049806118.938871,10.339645,2043,6.2,P7
2,/content/P10.wav,2712.015869,58494.135285,249246.488643,115310.950864,14129.322378,39.178424,23.608693,0.0,0.0,...,2648.083164,4.654323,4.572296,-46346993963133.9,-23173496981566.977,2838164965.030613,11.587789,4939,7.4,P1
3,/content/P3.wav,335.179443,4749.595583,47446.893305,5010.121716,491.776237,226.93937,130.951157,0.0,0.0,...,72.352531,29.956924,26.804461,-171130680433.6091,-85565340216.80461,6040774.42792,10.766084,2736,6.6,P3
4,/content/P1.wav,1103.3396,16143.268595,108360.317522,29922.372292,2444.436947,88.214243,57.202729,0.0,0.0,...,375.219904,12.45566,10.893016,-2699694268534.8726,-1349847134267.4329,37948695.804285,11.179076,3447,7.4,P1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   path                            10 non-null     object 
 1   amplitude_bottleneck_h0         10 non-null     object 
 2   amplitude_wasserstein_h0        10 non-null     object 
 3   amplitude_betti_h0              10 non-null     object 
 4   amplitude_landscape_h0          10 non-null     object 
 5   amplitude_silhouette_h0         10 non-null     object 
 6   amplitude_heat_h0               10 non-null     object 
 7   amplitude_persistence_image_h0  10 non-null     object 
 8   complex_polynomial_R_h0         10 non-null     object 
 9   complex_polynomial_S_h0         10 non-null     object 
 10  complex_polynomial_T_h0         10 non-null     object 
 11  persistence_entropy_h0          10 non-null     object 
 12  number_of_points_h0             10 non-

In [26]:
# convert DataFrame to CSV with tab delimiter
df_result_score.to_csv('result_tda.tsv', sep='\t', index=False)

In [27]:
# downloading the result file
files.download('result_tda.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>