In [21]:
import numpy as np
import pandas as pd
import glob
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [4]:

## Takes the path of an MFCC CSV file (no header row) and returns the mean of each row,
## i.e. one mean per MFCC coefficient (20 values when the file has 20 coefficient rows)

def compute_mean_mfcc_from_csv(file_path):
    """Average every row of a header-less CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It is read with ``header=None``, so the
        first line is treated as data rather than column names.

    Returns
    -------
    pandas.Series
        One mean per row of the file, indexed by row position.
    """
    rows = pd.read_csv(file_path, header=None)
    per_row_mean = rows.mean(axis="columns")
    return per_row_mean



In [17]:
### Gather the per-coefficient means of every MFCC file into feature_file.csv

mfcc_files = glob.glob('*-MFCC.csv')
all_means = [compute_mean_mfcc_from_csv(path) for path in mfcc_files]

# One row per file, one column per coefficient mean
mean_mfcc_df = pd.DataFrame(all_means)
mean_mfcc_df.columns = [f'MFCC_{idx}_mean' for idx in range(1, mean_mfcc_df.shape[1] + 1)]

# Put the file name in front of the mean columns
mean_mfcc_df.insert(0, 'file_name', mfcc_files)
mean_mfcc_df.to_csv('feature_file.csv', index = False)
print("Feature Vector Mean created")

Feature Vector Mean created


In [2]:
def compute_std_from_csv(file_path):
    """Compute the standard deviation of every row of a header-less CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It is read with ``header=None``, so the
        first line is treated as data rather than column names.

    Returns
    -------
    pandas.Series
        One standard deviation per row of the file (pandas' default sample
        standard deviation), indexed by row position.
    """
    rows = pd.read_csv(file_path, header=None)
    per_row_std = rows.std(axis="columns")
    return per_row_std

In [5]:
# Build one feature row per MFCC file: file name, per-coefficient means, per-coefficient std devs.
N_MFCC = 20  # coefficient rows expected in every file; determines the 2 * N_MFCC feature columns

file_paths = glob.glob("*-MFCC.csv")  # Adjust the pattern if necessary
columns = ['file_name'] + [f'mean_mfcc_{i+1}' for i in range(N_MFCC)] + [f'std_mfcc_{i+1}' for i in range(N_MFCC)]
all_features = []

for file_path in file_paths:
    # Load each MFCC CSV file once; both statistics are derived from the same frame
    mfcc_df = pd.read_csv(file_path, header=None)

    # A file with a different row count would not line up with the fixed feature columns
    if mfcc_df.shape[0] != N_MFCC:
        print(f"Warning: File {file_path} does not have {N_MFCC} rows for MFCC coefficients. Skipping.")
        continue

    # Calculate the per-row mean and standard deviation
    means = mfcc_df.mean(axis=1)
    std_devs = mfcc_df.std(axis=1)

    # Combine the file name, means, and standard deviations into one feature vector.
    # A plain list keeps the statistics numeric; concatenating them with the file name
    # into a single NumPy array would coerce every value to a common string dtype.
    features = [file_path, *means, *std_devs]
    all_features.append(features)

master_df = pd.DataFrame(all_features, columns=columns)
master_df.to_csv("feature_file.csv", index=False)

In [16]:
def calculate_mean(df):
    """Return the mean of each row of ``df`` as an array (one value per row)."""
    row_means = df.mean(axis="columns")
    return row_means.values

def calculate_std(df):
    """Return the standard deviation of each row of ``df`` as an array (one value per row)."""
    row_stds = df.std(axis="columns")
    return row_stds.values

def calculate_skew(df):
    """Return the skewness of each row of ``df`` as an array (one value per row).

    Rows for which pandas reports NaN are given a skewness of 0.
    """
    row_skews = df.skew(axis="columns")
    row_skews = row_skews.fillna(0)  # Replace undefined skewness with 0
    return row_skews.values

def calculate_kurtosis(df):
    """Return the kurtosis of each row of ``df`` as an array (one value per row).

    Rows for which pandas reports NaN are given a kurtosis of 0.
    """
    row_kurtoses = df.kurtosis(axis="columns")
    row_kurtoses = row_kurtoses.fillna(0)  # Replace undefined kurtosis with 0
    return row_kurtoses.values

In [19]:
# Build the full feature table: mean, std, skew and kurtosis of each MFCC coefficient, per file.
N_MFCC = 20  # expected number of MFCC coefficient rows in every input file

# Define column names: N_MFCC each for mean, std_dev, skew, and kurtosis (in that order)
columns = ['file_name'] + [
    f'{stat}_mfcc_{i+1}'
    for stat in ('mean', 'std', 'skew', 'kurtosis')
    for i in range(N_MFCC)
]

file_paths = glob.glob("*-MFCC.csv")
all_features = []

for file_path in file_paths:
    # Load each MFCC CSV file
    df = pd.read_csv(file_path, header = None)

    # Ensure the data has N_MFCC rows (each representing an MFCC coefficient)
    if df.shape[0] != N_MFCC:
        print(f"Warning: File {file_path} does not have {N_MFCC} rows for MFCC coefficients. Skipping.")
        continue

    # Calculate each feature; every helper reduces along the rows, so each result
    # has one value per row, i.e. N_MFCC values once the check above has passed
    means = calculate_mean(df)
    std_devs = calculate_std(df)
    skews = calculate_skew(df)
    kurtoses = calculate_kurtosis(df)

    # Combine file name, means, std devs, skews, and kurtoses.
    # A plain list keeps the statistics numeric; concatenating them with the file name
    # into a single NumPy array would coerce every value to a common string dtype.
    features = [file_path, *means, *std_devs, *skews, *kurtoses]

    # Safety net replacing the former per-file and per-table length checks
    assert len(features) == len(columns), (
        f"Feature calculation for {file_path} yielded {len(features)} values, expected {len(columns)}"
    )
    all_features.append(features)

# Create DataFrame with the computed features and save to master CSV file
master_df = pd.DataFrame(all_features, columns=columns)
master_df.to_csv("feature_file.csv", index=False)

In [35]:
# Compress the 40 skew/kurtosis features into principal components and append them to the feature file.
data = pd.read_csv('feature_file.csv')
skew_kurt_columns = [f'skew_mfcc_{i}' for i in range(1, 21)] + [f'kurtosis_mfcc_{i}' for i in range(1, 21)]
skew_kurt_features = data[skew_kurt_columns]

# Standardise the features before PCA
scaler = StandardScaler()
skew_kurt_scaled = scaler.fit_transform(skew_kurt_features)

# Fit PCA without specifying n_components to check explained variance
# (fit() returns the fitted estimator itself)
pca = PCA()
skew_kurt_pca_full = pca.fit(skew_kurt_scaled)

# Calculate cumulative explained variance
explained_variance = pca.explained_variance_ratio_.cumsum()

# Number of components to keep. Currently fixed at 2; to derive it from a cumulative
# explained-variance threshold instead (here 80%), use the line below.
# n_components = (explained_variance >= 0.8).argmax() + 1
n_components = 2

# Apply PCA with the selected number of components
pca = PCA(n_components=n_components)
skew_kurt_pca = pca.fit_transform(skew_kurt_scaled)
for i in range(n_components):
    data[f'skew_kurt_pca_{i+1}'] = skew_kurt_pca[:, i]

# index=False keeps the row index out of the file; otherwise every read/write cycle of
# this cell would add another "Unnamed: 0"-style column to feature_file.csv
data.to_csv('feature_file.csv', index=False)