In [1]:
#without power transformation 
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from pydub.playback import play
from pydub import AudioSegment

# Folder containing the audio files
audio_folder = "/Volumes/MyBook/ADRC/Segmented ADRC Audio/All"

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD.xlsx"
data_df = pd.read_excel(excel_path)

# Lists to store extracted features and ptau181 values for male patients
all_data = []
ptau181_values = []

# Loop through patient IDs in the DataFrame
for idx, row in data_df.iterrows():
    patient_id = row["PatientID"]
    sex = row["Sex"]
    
    # Filter data for male patients
    if sex != "M" and sex != "F" :
        continue
    
    # Iterate through audio files in the folder
    for filename in os.listdir(audio_folder):
        if filename.startswith(f"{patient_id}_") and filename.endswith(".wav"):
            audio_filepath = os.path.join(audio_folder, filename)
            
            # Load the audio file
            y, sr = librosa.load(audio_filepath)
            
            # Extract Root Mean Square Energy
            rmse = librosa.feature.rms(y=y)
            rmse_mean = np.mean(rmse)
            
            # Extract Speech Rate (words per minute)
            speech_rate = len(librosa.effects.split(y)) / (len(y) / sr) * 60
            
            # Extract Harmonics-to-Noise Ratio (HNR) 
            # HNR measure how clear and smooth their voice sounds 
            hnr = librosa.effects.harmonic(y)
            hnr_mean = np.mean(hnr)
            
            # Extract Formants
            sound = AudioSegment.from_wav(audio_filepath)
            formants = sound.dBFS
            formants_mean = np.mean(formants)
            
            # Extract MFCCs
            mfccs = librosa.feature.mfcc(y=y, sr=sr)
            mfccs_mean = np.mean(mfccs)
            
            # Extract Mel spectrogram
            mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
            mel_spectrogram_mean = np.mean(mel_spectrogram)
            
            # Extract Spectral Centroid
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_centroid_mean = np.mean(spectral_centroids)
            
            # Get the ptau181 value for the male patient
            ptau181_value = row["ptau181"]
            
            all_data.append([rmse_mean, speech_rate, hnr_mean, formants_mean, mfccs_mean, mel_spectrogram_mean, spectral_centroid_mean])
            ptau181_values.append(np.log(ptau181_value))
            
            break  # Break after finding the first matching audio file

# Convert the lists to NumPy arrays
X = np.array(all_data)
y = np.array(ptau181_values)

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Adding a constant to the independent variables matrix
X_train_with_const = sm.add_constant(X_train)

# Fit the model
model = sm.OLS(y_train, X_train_with_const).fit()

# Print the regression results
print(model.summary())



                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.232
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     4.740
Date:                Wed, 14 Feb 2024   Prob (F-statistic):           0.000111
Time:                        10:55:08   Log-Likelihood:                -59.154
No. Observations:                 118   AIC:                             134.3
Df Residuals:                     110   BIC:                             156.5
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9485      0.038     76.876      0.0

In [2]:
data_df

Unnamed: 0,PatientID,VisitNumber,sample_year,sample_dayssincebaseline,experiment_date,ab42,ttau,ptau181,platform,Status,Speech Data,Closest Visit Number,audio label,Sex,Average Label,AD
0,7173,6,2017,2542,2022-07-28,294.1,215.9,20.87,Elecsys,Dementia,Y,11,Dementia,F,Dementia,1
1,7173,8,2019,3320,2022-07-29,308.0,217.2,21.17,Elecsys,Dementia,Y,11,Dementia,F,Dementia,1
2,7210,3,2014,1128,2022-07-22,707.2,128.8,10.77,Elecsys,QCI,Y,11,QCI,M,QCI,1
3,7210,5,2016,2042,2022-11-15,613.3,131.0,11.47,Elecsys,QCI,Y,11,QCI,M,QCI,1
4,7471,5,2018,1955,2022-11-29,323.8,387.5,44.05,Elecsys,Normal,Y,9,Normal,M,QCI,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,10546,0,2023,93,2023-02-13,919.8,177.6,16.07,Elecsys,Normal,Y,0,Normal,M,Normal,N
144,10549,0,2022,87,2023-02-13,1778.0,225.9,19.52,Elecsys,Normal,Y,0,Normal,F,Normal,N
145,10554,0,2022,32,2023-02-13,837.2,120.9,10.37,Elecsys,Normal,,0,Normal,F,Normal,N
146,10555,0,2022,15,2023-02-13,314.6,192.1,18.43,Elecsys,Dementia,,0,Dementia,F,Dementia,1


In [3]:
# Calculate the ratio of ptau181 to ab42 and add it as a new column
data_df["ratio"] = data_df["ptau181"] / data_df["ab42"]


In [4]:
import pandas as pd

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD.xlsx"
data_df = pd.read_excel(excel_path)

# Calculate the ratio of ptau181 to ab42 and add it as a new column
data_df["ratio"] = data_df["ptau181"] / data_df["ab42"]

# Now you can use data_df with the new "ratio" column
print(data_df.head())  # Check the first few rows to verify the new column


   PatientID  VisitNumber  sample_year  sample_dayssincebaseline  \
0       7173            6         2017                      2542   
1       7173            8         2019                      3320   
2       7210            3         2014                      1128   
3       7210            5         2016                      2042   
4       7471            5         2018                      1955   

  experiment_date   ab42   ttau  ptau181 platform    Status Speech Data  \
0      2022-07-28  294.1  215.9    20.87  Elecsys  Dementia           Y   
1      2022-07-29  308.0  217.2    21.17  Elecsys  Dementia           Y   
2      2022-07-22  707.2  128.8    10.77  Elecsys       QCI           Y   
3      2022-11-15  613.3  131.0    11.47  Elecsys       QCI           Y   
4      2022-11-29  323.8  387.5    44.05  Elecsys    Normal           Y   

   Closest Visit Number audio label Sex Average Label AD     ratio  
0                    11    Dementia   F      Dementia  1  0.070962  
1 

In [9]:
import pandas as pd

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD.xlsx"
data_df = pd.read_excel(excel_path)

# Calculate the ratio of ptau181 to ab42 and add it as a new column
data_df["ratio"] = data_df["ptau181"] / data_df["ab42"]

data_df


Unnamed: 0,PatientID,VisitNumber,sample_year,sample_dayssincebaseline,experiment_date,ab42,ttau,ptau181,platform,Status,Speech Data,Closest Visit Number,audio label,Sex,Average Label,AD,ratio
0,7173,6,2017,2542,2022-07-28,294.1,215.9,20.87,Elecsys,Dementia,Y,11,Dementia,F,Dementia,1,0.070962
1,7173,8,2019,3320,2022-07-29,308.0,217.2,21.17,Elecsys,Dementia,Y,11,Dementia,F,Dementia,1,0.068734
2,7210,3,2014,1128,2022-07-22,707.2,128.8,10.77,Elecsys,QCI,Y,11,QCI,M,QCI,1,0.015229
3,7210,5,2016,2042,2022-11-15,613.3,131.0,11.47,Elecsys,QCI,Y,11,QCI,M,QCI,1,0.018702
4,7471,5,2018,1955,2022-11-29,323.8,387.5,44.05,Elecsys,Normal,Y,9,Normal,M,QCI,1,0.136041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,10546,0,2023,93,2023-02-13,919.8,177.6,16.07,Elecsys,Normal,Y,0,Normal,M,Normal,N,0.017471
144,10549,0,2022,87,2023-02-13,1778.0,225.9,19.52,Elecsys,Normal,Y,0,Normal,F,Normal,N,0.010979
145,10554,0,2022,32,2023-02-13,837.2,120.9,10.37,Elecsys,Normal,,0,Normal,F,Normal,N,0.012387
146,10555,0,2022,15,2023-02-13,314.6,192.1,18.43,Elecsys,Dementia,,0,Dementia,F,Dementia,1,0.058582


In [12]:
# Define the path for the new Excel file
output_excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD_with_ratio.xlsx"

# Save the DataFrame with the new "ratio" column to the new Excel file
data_df.to_excel(output_excel_path, index=False)

In [14]:
#without power transformation 
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from pydub.playback import play
from pydub import AudioSegment

# Folder containing the audio files
audio_folder = "/Volumes/MyBook/ADRC/Segmented ADRC Audio/All"

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD_with_ratio.xlsx"
data_df = pd.read_excel(excel_path)

# Lists to store extracted features and ptau181 values for male patients
all_data = []
ptau181_values = []

# Loop through patient IDs in the DataFrame
for idx, row in data_df.iterrows():
    patient_id = row["PatientID"]
    sex = row["Sex"]
    
    # Filter data for male patients
    if sex != "M" and sex != "F" :
        continue
    
    # Iterate through audio files in the folder
    for filename in os.listdir(audio_folder):
        if filename.startswith(f"{patient_id}_") and filename.endswith(".wav"):
            audio_filepath = os.path.join(audio_folder, filename)
            
            # Load the audio file
            y, sr = librosa.load(audio_filepath)
            
            # Extract Root Mean Square Energy
            rmse = librosa.feature.rms(y=y)
            rmse_mean = np.mean(rmse)
            
            # Extract Speech Rate (words per minute)
            speech_rate = len(librosa.effects.split(y)) / (len(y) / sr) * 60
            
            # Extract Harmonics-to-Noise Ratio (HNR) 
            # HNR measure how clear and smooth their voice sounds 
            hnr = librosa.effects.harmonic(y)
            hnr_mean = np.mean(hnr)
            
            # Extract Formants
            sound = AudioSegment.from_wav(audio_filepath)
            formants = sound.dBFS
            formants_mean = np.mean(formants)
            
            # Extract MFCCs
            mfccs = librosa.feature.mfcc(y=y, sr=sr)
            mfccs_mean = np.mean(mfccs)
            
            # Extract Mel spectrogram
            mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
            mel_spectrogram_mean = np.mean(mel_spectrogram)
            
            # Extract Spectral Centroid
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_centroid_mean = np.mean(spectral_centroids)
            
            # Get the ptau181 value for the male patient
            ptau181_value = row["ttau"]
            
            all_data.append([rmse_mean, speech_rate, hnr_mean, formants_mean, mfccs_mean, mel_spectrogram_mean, spectral_centroid_mean])
            ptau181_values.append(np.log(ptau181_value))
            
            break  # Break after finding the first matching audio file

# Convert the lists to NumPy arrays
X = np.array(all_data)
y = np.array(ptau181_values)

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Adding a constant to the independent variables matrix
X_train_with_const = sm.add_constant(X_train)

# Fit the model
model = sm.OLS(y_train, X_train_with_const).fit()

# Print the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.235
Model:                            OLS   Adj. R-squared:                  0.186
Method:                 Least Squares   F-statistic:                     4.831
Date:                Wed, 14 Feb 2024   Prob (F-statistic):           8.97e-05
Time:                        11:32:57   Log-Likelihood:                -49.798
No. Observations:                 118   AIC:                             115.6
Df Residuals:                     110   BIC:                             137.8
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.2878      0.035    149.245      0.0

In [20]:
import pandas as pd

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD.xlsx"
data_df2 = pd.read_excel(excel_path)

# Calculate the ratio of ptau181 to ab42 and add it as a new column
data_df2["ratio"] = data_df2["ptau181"] / data_df2["ab42"]

data_df2

Unnamed: 0,PatientID,VisitNumber,sample_year,sample_dayssincebaseline,experiment_date,ab42,ttau,ptau181,platform,Status,Speech Data,Closest Visit Number,audio label,Sex,Average Label,AD,ratio
0,7173,6,2017,2542,2022-07-28,294.1,215.9,20.87,Elecsys,Dementia,Y,11,Dementia,F,Dementia,1,0.070962
1,7173,8,2019,3320,2022-07-29,308.0,217.2,21.17,Elecsys,Dementia,Y,11,Dementia,F,Dementia,1,0.068734
2,7210,3,2014,1128,2022-07-22,707.2,128.8,10.77,Elecsys,QCI,Y,11,QCI,M,QCI,1,0.015229
3,7210,5,2016,2042,2022-11-15,613.3,131.0,11.47,Elecsys,QCI,Y,11,QCI,M,QCI,1,0.018702
4,7471,5,2018,1955,2022-11-29,323.8,387.5,44.05,Elecsys,Normal,Y,9,Normal,M,QCI,1,0.136041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,10546,0,2023,93,2023-02-13,919.8,177.6,16.07,Elecsys,Normal,Y,0,Normal,M,Normal,N,0.017471
144,10549,0,2022,87,2023-02-13,1778.0,225.9,19.52,Elecsys,Normal,Y,0,Normal,F,Normal,N,0.010979
145,10554,0,2022,32,2023-02-13,837.2,120.9,10.37,Elecsys,Normal,,0,Normal,F,Normal,N,0.012387
146,10555,0,2022,15,2023-02-13,314.6,192.1,18.43,Elecsys,Dementia,,0,Dementia,F,Dementia,1,0.058582


In [21]:
#without power transformation 
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from pydub.playback import play
from pydub import AudioSegment

# Folder containing the audio files
audio_folder = "/Volumes/MyBook/ADRC/Segmented ADRC Audio/All"

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD_with_ratio.xlsx"
data_df = data_df2

# Lists to store extracted features and ptau181 values for male patients
all_data = []
ptau181_values = []

# Loop through patient IDs in the DataFrame
for idx, row in data_df.iterrows():
    patient_id = row["PatientID"]
    sex = row["Sex"]
    
    # Filter data for male patients
    if sex != "M" and sex != "F" :
        continue
    
    # Iterate through audio files in the folder
    for filename in os.listdir(audio_folder):
        if filename.startswith(f"{patient_id}_") and filename.endswith(".wav"):
            audio_filepath = os.path.join(audio_folder, filename)
            
            # Load the audio file
            y, sr = librosa.load(audio_filepath)
            
            # Extract Root Mean Square Energy
            rmse = librosa.feature.rms(y=y)
            rmse_mean = np.mean(rmse)
            
            # Extract Speech Rate (words per minute)
            speech_rate = len(librosa.effects.split(y)) / (len(y) / sr) * 60
            
            # Extract Harmonics-to-Noise Ratio (HNR) 
            # HNR measure how clear and smooth their voice sounds 
            hnr = librosa.effects.harmonic(y)
            hnr_mean = np.mean(hnr)
            
            # Extract Formants
            sound = AudioSegment.from_wav(audio_filepath)
            formants = sound.dBFS
            formants_mean = np.mean(formants)
            
            # Extract MFCCs
            mfccs = librosa.feature.mfcc(y=y, sr=sr)
            mfccs_mean = np.mean(mfccs)
            
            # Extract Mel spectrogram
            mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
            mel_spectrogram_mean = np.mean(mel_spectrogram)
            
            # Extract Spectral Centroid
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
            spectral_centroid_mean = np.mean(spectral_centroids)
            
            # Get the ptau181 value for the male patient
            ptau181_value = row["ratio"]
            
            all_data.append([rmse_mean, speech_rate, hnr_mean, formants_mean, mfccs_mean, mel_spectrogram_mean, spectral_centroid_mean])
            ptau181_values.append(np.log(ptau181_value))
            
            break  # Break after finding the first matching audio file

# Convert the lists to NumPy arrays
X = np.array(all_data)
y = np.array(ptau181_values)

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Adding a constant to the independent variables matrix
X_train_with_const = sm.add_constant(X_train)

# Fit the model
model = sm.OLS(y_train, X_train_with_const).fit()

# Print the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.210
Model:                            OLS   Adj. R-squared:                  0.160
Method:                 Least Squares   F-statistic:                     4.180
Date:                Wed, 14 Feb 2024   Prob (F-statistic):           0.000406
Time:                        14:43:33   Log-Likelihood:                -99.825
No. Observations:                 118   AIC:                             215.6
Df Residuals:                     110   BIC:                             237.8
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.3498      0.054    -61.876      0.0

In [23]:
##new appraoch 

In [24]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from pydub import AudioSegment

# Folder containing the audio files
audio_folder = "/Volumes/MyBook/ADRC/Segmented ADRC Audio/All"

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD.xlsx"
data_df = pd.read_excel(excel_path)

# Define a list of features
features = ["RMSE", "Speech Rate", "HNR", "Formants", "MFCCs", "Mel Spectrogram", "Spectral Centroid"]

# Iterate over each feature
for feature in features:
    # Lists to store extracted features and ptau181 values for male patients
    all_data = []
    ptau181_values = []

    # Loop through patient IDs in the DataFrame
    for idx, row in data_df.iterrows():
        patient_id = row["PatientID"]
        sex = row["Sex"]

        # Filter data for male patients
        if sex != "M" and sex != "F":
            continue

        # Iterate through audio files in the folder
        for filename in os.listdir(audio_folder):
            if filename.startswith(f"{patient_id}_") and filename.endswith(".wav"):
                audio_filepath = os.path.join(audio_folder, filename)

                # Load the audio file
                y, sr = librosa.load(audio_filepath)

                # Extract the feature based on the current iteration
                if feature == "RMSE":
                    feature_value = np.mean(librosa.feature.rms(y=y))
                elif feature == "Speech Rate":
                    feature_value = len(librosa.effects.split(y)) / (len(y) / sr) * 60
                elif feature == "HNR":
                    hnr = librosa.effects.harmonic(y)
                    feature_value = np.mean(hnr)
                elif feature == "Formants":
                    sound = AudioSegment.from_wav(audio_filepath)
                    feature_value = np.mean(sound.dBFS)
                elif feature == "MFCCs":
                    mfccs = librosa.feature.mfcc(y=y, sr=sr)
                    feature_value = np.mean(mfccs)
                elif feature == "Mel Spectrogram":
                    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
                    feature_value = np.mean(mel_spectrogram)
                elif feature == "Spectral Centroid":
                    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
                    feature_value = np.mean(spectral_centroids)

                # Get the ptau181 value for the male patient
                ptau181_value = row["ptau181"]

                all_data.append(feature_value)
                ptau181_values.append(np.log(ptau181_value))

                break  # Break after finding the first matching audio file

    # Convert the lists to NumPy arrays
    X = np.array(all_data)
    y = np.array(ptau181_values)

    # Reshape X to be a 2D array
    X = X.reshape(-1, 1)

    # Normalize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Adding a constant to the independent variables matrix
    X_train_with_const = sm.add_constant(X_train)

    # Fit the model
    model = sm.OLS(y_train, X_train_with_const).fit()

    # Print the regression results for the current feature
    print(f"Regression results for feature: {feature}")
    print(model.summary())


Regression results for feature: RMSE
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     5.963
Date:                Wed, 14 Feb 2024   Prob (F-statistic):             0.0161
Time:                        16:01:12   Log-Likelihood:                -71.751
No. Observations:                 118   AIC:                             147.5
Df Residuals:                     116   BIC:                             153.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          

In [25]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from pydub import AudioSegment

# Folder containing the audio files
audio_folder = "/Volumes/MyBook/ADRC/Segmented ADRC Audio/All"

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD.xlsx"
data_df = pd.read_excel(excel_path)

# Define a list of features
features = ["RMSE", "Speech Rate", "HNR", "Formants", "MFCCs", "Mel Spectrogram", "Spectral Centroid"]

# Iterate over each feature
for feature in features:
    # Lists to store extracted features and ptau181 values for male patients
    all_data = []
    ptau181_values = []

    # Loop through patient IDs in the DataFrame
    for idx, row in data_df.iterrows():
        patient_id = row["PatientID"]
        sex = row["Sex"]

        # Filter data for male patients
        if sex != "M" and sex != "F":
            continue

        # Iterate through audio files in the folder
        for filename in os.listdir(audio_folder):
            if filename.startswith(f"{patient_id}_") and filename.endswith(".wav"):
                audio_filepath = os.path.join(audio_folder, filename)

                # Load the audio file
                y, sr = librosa.load(audio_filepath)

                # Extract the feature based on the current iteration
                if feature == "RMSE":
                    feature_value = np.mean(librosa.feature.rms(y=y))
                elif feature == "Speech Rate":
                    feature_value = len(librosa.effects.split(y)) / (len(y) / sr) * 60
                elif feature == "HNR":
                    hnr = librosa.effects.harmonic(y)
                    feature_value = np.mean(hnr)
                elif feature == "Formants":
                    sound = AudioSegment.from_wav(audio_filepath)
                    feature_value = np.mean(sound.dBFS)
                elif feature == "MFCCs":
                    mfccs = librosa.feature.mfcc(y=y, sr=sr)
                    feature_value = np.mean(mfccs)
                elif feature == "Mel Spectrogram":
                    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
                    feature_value = np.mean(mel_spectrogram)
                elif feature == "Spectral Centroid":
                    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
                    feature_value = np.mean(spectral_centroids)

                # Get the ptau181 value for the male patient
                ptau181_value = row["ttau"]

                all_data.append(feature_value)
                ptau181_values.append(np.log(ptau181_value))

                break  # Break after finding the first matching audio file

    # Convert the lists to NumPy arrays
    X = np.array(all_data)
    y = np.array(ptau181_values)

    # Reshape X to be a 2D array
    X = X.reshape(-1, 1)

    # Normalize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Adding a constant to the independent variables matrix
    X_train_with_const = sm.add_constant(X_train)

    # Fit the model
    model = sm.OLS(y_train, X_train_with_const).fit()

    # Print the regression results for the current feature
    print(f"Regression results for feature: {feature}")
    print(model.summary())


Regression results for feature: RMSE
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                  0.044
Method:                 Least Squares   F-statistic:                     6.434
Date:                Wed, 14 Feb 2024   Prob (F-statistic):             0.0125
Time:                        16:09:14   Log-Likelihood:                -62.428
No. Observations:                 118   AIC:                             128.9
Df Residuals:                     116   BIC:                             134.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          

In [26]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from pydub import AudioSegment

# Folder containing the audio files
audio_folder = "/Volumes/MyBook/ADRC/Segmented ADRC Audio/All"

# Load the data from Excel into a DataFrame
excel_path = "/Volumes/MyBook/ADRC/csf_2022_averaged_labels_AD.xlsx"
data_df = data_df2

# Define a list of features
features = ["RMSE", "Speech Rate", "HNR", "Formants", "MFCCs", "Mel Spectrogram", "Spectral Centroid"]

# Iterate over each feature
for feature in features:
    # Lists to store extracted features and ptau181 values for male patients
    all_data = []
    ptau181_values = []

    # Loop through patient IDs in the DataFrame
    for idx, row in data_df.iterrows():
        patient_id = row["PatientID"]
        sex = row["Sex"]

        # Filter data for male patients
        if sex != "M" and sex != "F":
            continue

        # Iterate through audio files in the folder
        for filename in os.listdir(audio_folder):
            if filename.startswith(f"{patient_id}_") and filename.endswith(".wav"):
                audio_filepath = os.path.join(audio_folder, filename)

                # Load the audio file
                y, sr = librosa.load(audio_filepath)

                # Extract the feature based on the current iteration
                if feature == "RMSE":
                    feature_value = np.mean(librosa.feature.rms(y=y))
                elif feature == "Speech Rate":
                    feature_value = len(librosa.effects.split(y)) / (len(y) / sr) * 60
                elif feature == "HNR":
                    hnr = librosa.effects.harmonic(y)
                    feature_value = np.mean(hnr)
                elif feature == "Formants":
                    sound = AudioSegment.from_wav(audio_filepath)
                    feature_value = np.mean(sound.dBFS)
                elif feature == "MFCCs":
                    mfccs = librosa.feature.mfcc(y=y, sr=sr)
                    feature_value = np.mean(mfccs)
                elif feature == "Mel Spectrogram":
                    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
                    feature_value = np.mean(mel_spectrogram)
                elif feature == "Spectral Centroid":
                    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
                    feature_value = np.mean(spectral_centroids)

                # Get the ptau181 value for the male patient
                ptau181_value = row["ratio"]

                all_data.append(feature_value)
                ptau181_values.append(np.log(ptau181_value))

                break  # Break after finding the first matching audio file

    # Convert the lists to NumPy arrays
    X = np.array(all_data)
    y = np.array(ptau181_values)

    # Reshape X to be a 2D array
    X = X.reshape(-1, 1)

    # Normalize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Adding a constant to the independent variables matrix
    X_train_with_const = sm.add_constant(X_train)

    # Fit the model
    model = sm.OLS(y_train, X_train_with_const).fit()

    # Print the regression results for the current feature
    print(f"Regression results for feature: {feature}")
    print(model.summary())


Regression results for feature: RMSE
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.067
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     8.348
Date:                Wed, 14 Feb 2024   Prob (F-statistic):            0.00461
Time:                        16:20:08   Log-Likelihood:                -109.64
No. Observations:                 118   AIC:                             223.3
Df Residuals:                     116   BIC:                             228.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -

In [27]:
#without test

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Adding a constant to the independent variables matrix
X_with_const = sm.add_constant(X_scaled)

# Fit the model using all the data
model = sm.OLS(y, X_with_const).fit()

# Print the regression results for the current feature
print(f"Regression results for feature: {feature}")
print(model.summary())


Regression results for feature: Spectral Centroid
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     2.185
Date:                Wed, 14 Feb 2024   Prob (F-statistic):              0.142
Time:                        16:38:30   Log-Likelihood:                -146.73
No. Observations:                 148   AIC:                             297.5
Df Residuals:                     146   BIC:                             303.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co