In [1]:
import unify_data as dp # my file for dealing with temporal data
import pandas as pd
import numpy as np
from pykalman import KalmanFilter
import librosa as lb # dealing with audio files

In [4]:
# support functions
def one_d_kalman_filter(df):
    """Smooth and impute every numeric column with its own 1-D Kalman filter.

    Each numeric column is treated as an independent scalar time series.
    Non-finite entries (NaN/inf) are masked so the filter treats them as
    missing observations, and the column is overwritten with the smoothed
    state estimate. Non-numeric columns are left untouched.

    Parameters:

    - df: pandas DataFrame; modified in place.

    Returns:

    - the same DataFrame object, with its numeric columns replaced.
    """
    n_columns = len(df.columns)
    for i, column in enumerate(df.columns):
        print(f"{i} column out of {n_columns}")
        # `is_numeric_dtype` was never imported in this notebook; reach it
        # through the already-imported pandas namespace instead.
        if not pd.api.types.is_numeric_dtype(df[column]):
            continue

        # identity transition/observation: the state is the observed value itself
        kf = KalmanFilter(transition_matrices=[[1]], observation_matrices=[[1]])
        masked_values = np.ma.masked_invalid(df[column].values.astype(np.float32))

        # EM re-estimates the filter's parameters from the data (3 iterations)
        kf_params = kf.em(masked_values, n_iter=3)
        imputed_data, _ = kf_params.smooth(masked_values)
        # smoothed means come back as (n_samples, 1); store the single state
        # dimension as a flat column, as multivariate_kalman_filter does
        df[column] = imputed_data[:, 0]
    return df

def multivariate_kalman_filter(df):
    """Smooth and impute all numeric columns jointly with one Kalman filter.

    All numeric columns form a single state vector, so the filter can use
    information from the other columns when imputing a missing value.
    Non-finite entries (NaN/inf) are masked and treated as missing
    observations. Non-numeric columns are left untouched.

    Parameters:

    - df: pandas DataFrame; modified in place.

    Returns:

    - the same DataFrame object, with its numeric columns replaced by the
      smoothed estimates. If there are no numeric columns the frame is
      returned unchanged.
    """
    # `is_numeric_dtype` was never imported in this notebook; reach it
    # through the already-imported pandas namespace instead.
    columns_to_impute = [
        column for column in df.columns if pd.api.types.is_numeric_dtype(df[column])
    ]
    if not columns_to_impute:
        # nothing to filter; avoids building a 0-dimensional filter
        return df

    n_dim = len(columns_to_impute)
    kf = KalmanFilter(
        transition_matrices=np.eye(n_dim),  # next state = current state
        observation_matrices=np.eye(n_dim),  # observation = state
        transition_covariance=np.eye(n_dim) * 0.5,  # initial Q: moderate uncertainty about the transition model
        observation_covariance=np.eye(n_dim) * 0.5,  # initial R: moderate uncertainty about the measurements
    )

    masked_df = np.ma.masked_invalid(df[columns_to_impute].values.astype(np.float32))
    # EM re-estimates the filter's parameters from the data (3 iterations)
    kf_params = kf.em(masked_df, n_iter=3)
    imputed_data, _ = kf_params.smooth(masked_df)
    print("imputation of multivariate done")
    for i, column in enumerate(columns_to_impute):
        df[column] = imputed_data[:, i]
    return df

def mixed_kalman_filter(df):
    """Impute position columns jointly and every other column on its own.

    The three position columns ("Latitude (°)", "Longitude (°)",
    "Height (m)") are passed together to multivariate_kalman_filter; all
    remaining columns are passed to one_d_kalman_filter.

    Parameters:

    - df: pandas DataFrame containing the three position columns; it is
      not modified.

    Returns:

    - a new DataFrame with the filtered position columns first, followed
      by the remaining columns.

    Raises:

    - KeyError if any of the three position columns is missing.
    """
    correlated_columns = ["Latitude (°)", "Longitude (°)", "Height (m)"]

    print("now correlated")
    # work on a copy so the helper's in-place column writes do not touch `df`
    correlated_df = multivariate_kalman_filter(df[correlated_columns].copy())

    print("now uncorrelated")
    # drop by column name; the original passed a DataFrame as row labels
    uncorrelated_df = one_d_kalman_filter(df.drop(columns=correlated_columns))

    df_concat = pd.concat([correlated_df, uncorrelated_df], axis=1)
    return df_concat

def audio_to_csv(audio_file, fft_ws, w_ss, file_output):
    """
    Reduce an audio file to its dominant frequency per time frame and save it as CSV.

    A short-time Fourier transform is computed over the signal; for every
    time frame the frequency bin with the largest magnitude is kept, and
    its frequency and magnitude are written out with the frame's time.

    Parameters:

    - audio_file: path of the audio file to load (native sampling rate is kept)
    - fft_ws: FFT window size, passed to the STFT as n_fft;
        higher => better frequency resolution but worse time resolution
    - w_ss: window step size, passed to the STFT as hop_length;
        higher => fewer frames and less overlap between successive windows
    - file_output: output path without extension; ".csv" is appended

    Output columns: "Common time (s)", "amplitude", "frequency"
    (the DataFrame index is written as well).
    """
    # raw samples and the file's own sampling rate
    samples, sample_rate = lb.load(audio_file, sr=None)

    # magnitude spectrogram: frequency bins as rows, time frames as columns
    spectrum = np.abs(lb.stft(samples, n_fft=fft_ws, hop_length=w_ss))
    n_frames = spectrum.shape[1]

    # real frequency belonging to each row of the spectrogram
    bin_frequencies = lb.fft_frequencies(sr=sample_rate, n_fft=fft_ws)

    # strongest bin in every frame, its frequency and its magnitude
    peak_bins = np.argmax(spectrum, axis=0)
    peak_frequencies = bin_frequencies[peak_bins]
    peak_amplitudes = [
        spectrum[bin_idx, frame_idx] for frame_idx, bin_idx in enumerate(peak_bins)
    ]

    # time stamp of every frame
    frame_times = lb.frames_to_time(
        np.arange(n_frames), sr=sample_rate, hop_length=w_ss
    )

    # assemble the table and write it out
    table = pd.DataFrame(
        {
            "Common time (s)": frame_times,
            "amplitude": peak_amplitudes,
            "frequency": peak_frequencies,
        }
    )
    table.to_csv(f"{file_output}.csv")
    print("Audio file saved!")



### Test Data

In [5]:
# transform the test audio recording into a csv file inside the test data folder
# (forward slash: a backslash is only a path separator on Windows, elsewhere it
# would become part of the file name)
audio_to_csv("test_audio.wav", 1024, 512, "test_data/audio")

Audio file saved!


In [None]:
# inputs needed to build the data set
path = "test_data"
activs = [  # activity sequence
    "study",
    "socializing",
    "walk",
    "stairs",
    "phone",
    "walking",
    "rest",
]
rtimes = []

## generate test data set ##
test_data = dp.get_dataset(
    path,
    activs,
    rtimes,
    "Activity",
    impute=True,
    custom_impute=multivariate_kalman_filter,
)
test_data.head()

In [None]:
# number of missing values left in each column of the generated test set
test_data.isna().sum()

#### Add survey data

In [None]:
test_survey = pd.read_csv("testdata_sresp.csv")

# add the time of each response relative to the first response
test_survey["Linear time"] = None
start_time = None

for idx, stamp in enumerate(test_survey.iloc[:, 0]):
    # take the time part of the timestamp and bring it into min:sec,msec form
    # NOTE(review): the [3:] slice presumably strips a leading "HH:" — if so,
    # responses that cross an hour boundary get a wrong offset; confirm
    clock = stamp.split()[1][3:] + ",00"
    seconds = dp.strtime_to_sec(clock)

    # the first response defines time zero
    if start_time is None:
        start_time = seconds

    test_survey.loc[idx, "Linear time"] = seconds - start_time

test_survey.head()

In [None]:
# attach the survey answers to the sensor data as labels
ctime = test_survey["Linear time"]

# every column except the first (timestamp) and the last ("Linear time")
survey_variables = test_survey.columns[1:-1]
for col in survey_variables:
    # return value is not used; presumably add_labels updates test_data in place
    dp.add_labels(test_data, test_survey[col], ctime, col)

In [None]:
# save the labelled test set to disk
test_data.to_csv("test_data.csv")

### Validation Data

In [None]:
# transform the validation audio recording into a csv file inside the validation data folder
# (forward slash: a backslash is only a path separator on Windows, elsewhere it
# would become part of the file name)
audio_to_csv("val_audio.wav", 1024, 512, "val_data/audio")

In [None]:
# inputs needed to build the data set
path = "val_data"
activs = [  # activity sequence
    "study",
    "socializing",
    "walk",
    "stairs",
    "phone",
    "walking",
    "rest",
]
rtimes = []

## generate validation data set ##
val_data = dp.get_dataset(
    path,
    activs,
    rtimes,
    "Activity",
    impute=True,
    custom_impute=multivariate_kalman_filter,
)
val_data.head()

In [None]:
# number of missing values left in each column of the generated validation set
val_data.isna().sum()

#### Add survey values

In [None]:
# NOTE(review): this reads "testdata_sresp.csv", the same file the test split
# uses — confirm whether a separate validation survey file was intended
val_survey = pd.read_csv("testdata_sresp.csv")

# add the time of each response relative to the first response
val_survey["Linear time"] = None
start_time = None

for idx, stamp in enumerate(val_survey.iloc[:, 0]):
    # take the time part of the timestamp and bring it into min:sec,msec form
    # NOTE(review): the [3:] slice presumably strips a leading "HH:" — if so,
    # responses that cross an hour boundary get a wrong offset; confirm
    clock = stamp.split()[1][3:] + ",00"
    seconds = dp.strtime_to_sec(clock)

    # the first response defines time zero
    if start_time is None:
        start_time = seconds

    val_survey.loc[idx, "Linear time"] = seconds - start_time

val_survey.head()

In [None]:
# add labels from survey data
ctime = val_survey["Linear time"]

# loop over each survey variable to add labels: every column except the first
# (timestamp) and the last ("Linear time"). The columns are taken from
# val_survey itself so this cell does not depend on the test-split survey frame.
for col in val_survey.columns[1:-1]:
    dp.add_labels(val_data, val_survey[col], ctime, col)

In [None]:
# save the labelled validation set to disk
val_data.to_csv("val_data.csv")