# Alexa Project Code Contribution

> Author: Rene Gamino Jr (alias, renegamino012)  
> * Create a pandas dataframe with clip path, speaker, sex, and whether the speaker is an adult.
> * Go path by path, and create a pandas dataframe that contains mfcc features from the audio alongside the speaker's identity
> * Note: This should have "X1" ... "XN" features as columns and one "y" column  
> * Create a multinomial logistic regression model on these features and y column.
> * Test and improve the draft?

In [1]:
#imports
import numpy as np
import pandas as pd
import re
from pydub import AudioSegment as audio
from pydub.silence import split_on_silence as sos
from pydub.playback import play
import glob
import time
from sklearn.linear_model import LogisticRegression
import librosa
import librosa.display
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
import matplotlib.pyplot as plt
import os

### Alexa Project Code Samples

> Author: Chet (alias, techds)  

In [2]:
def get_fname_df_csv(path, ext):
    """ Chet's contribution.
    """
    clips = glob.glob(path + ext)
    clips = sorted(clips)
    files = []
    for clip in clips:
        print(clip)
        file = re.sub(path,'', clip)
        files.append(file)
    tag = input('Enter the speaker\'s first name followed by _ and last initial (all lower case, no spaces: ' )
    tag = re.sub('\s+','', tag)
    #sex = input('Is the speaker male (m) or female (f)?: ' )
    #sex = re.sub('\s+','', sex)
    #age = input('Is the speaker at least 16 years old (y/n)?: ' )
    #age = re.sub('\s+','', age)
    print('\nCreating df...')
    #df = pd.DataFrame({'audio':files, 'speaker':tag,'gender':sex, 'adult':age})
    #file_prefix = input('Enter the speaker\'s first and last initial (lower case) w/o spaces: ' )
    #file_prefix = re.sub('\s+','', file_prefix)
    df = pd.DataFrame({'audio':files, 'speaker':tag})
    file_prefix = path
    print('\nExporting df as csv...')
    df.to_csv(tag + '_audio_df.csv')
    print(df)
    return(df)

## Code to return a dataframe, from directories and stated number of features

In [3]:
from os import listdir
from os import listdir
from os.path import isfile, join

directory = '/Users/Owner/Downloads/voice/all/'
n_features = 20

def df_from_audio_folders(directory, n_features, max_iter = None):
        subdirectories = next(os.walk(directory))[1]
        key_list = [directory + sub + "/" for sub in subdirectories]
        value_list = range(len(subdirectories))
        direct = {key: value for (key, value) in zip(key_list, value_list)}
        func = lambda sub: df_from_audio_files(directory + sub + "/",
                                               n_features = n_features,
                                               labels = direct,
                                               max_iter = max_iter)
        tuples = map(func, subdirectories)
        df_og = pd.concat([tp[1] for tp in tuples], ignore_index=True)
        df = df_og.sample(frac=1).reset_index(drop=True) # Shuffle the df
        return df, df_og 

def df_from_audio_files(directory, n_features, labels, max_iter = None):
    columns = ["f"+ str(i+1) for i in np.arange(n_features)]
    columns.append("speaker")
    df_og = pd.DataFrame()
    df = pd.DataFrame()
    n_files = len(os.listdir(directory)) # Total number of audio files present in directory
    audio_mfcc_features = np.zeros([n_files,n_features+1]) # array that holds all features extracted size is # of audio_files x # of features
    if max_iter is not None:
        pass
    else:
        max_iter = len(os.listdir(directory))
    print("Extracting mfcc from {}".format(directory))
    for i,audio_split in enumerate(os.listdir(directory)):  
        if audio_split.endswith('.wav') and i < max_iter:
            y, sr = librosa.load(directory+audio_split, res_type ='kaiser_fast')
            audio_mfcc = librosa.feature.mfcc(y,sr = sr, n_mfcc=n_features).T # Get 20 features from each audio file
            audio_mfcc_features[i,0:n_features] = audio_mfcc.mean(axis = 0) # Take mean of each feature and create an array
            if not bool(labels):
                audio_mfcc_features[i,-1] = int(0)
            else:
                audio_mfcc_features[i,-1] = int(labels[directory])
    
    # Create a dataframe and name each feature column f1-f20
    df_og = pd.DataFrame(data= audio_mfcc_features, columns = columns, index = os.listdir(directory))
    df = df_og.sample(frac=1).reset_index(drop=True) # Shuffle the df
    return df, df_og

## Creates multinomial logistic regression model

In [4]:
def model(features_table, solver = "newton-sg", accuracy = False):
    X_tr, X_te, y_tr, y_te = train_test_split(features_table.drop(['speaker'], axis = 1),
                                                    features_table['speaker'],
                                                    test_size=0.2,
                                                   random_state=42)
    model = LogisticRegression(solver = solver, multi_class = "multinomial", max_iter = 5000)
    model = model.fit(X_tr, y_tr)
    if accuracy:
        X_k7 = features_table.drop(['speaker'], axis=1)
        y_k7 = features_table['speaker']
        cv_results = cross_validate(model, X_k7, y_k7, cv = 7, return_train_score=True)
        print(solver + " model: ")
        print(cv_results)
        train_acc =  accuracy_score(y_tr, model.predict(X_tr))
        test_acc = accuracy_score(y_te, model.predict(X_te))
        prec_tr = precision_score(y_tr, model.predict(X_tr), average=None)
        prec_te = precision_score(y_te, model.predict(X_te), average=None)
        recall_tr = recall_score(y_tr, model.predict(X_tr), average=None)
        recall_te = recall_score(y_te, model.predict(X_te), average=None)
        print("train_acc: " + str(train_acc))
        print("test_acc: " + str(test_acc))
        print("prec_tr: " + str(prec_tr))
        print("prec_te: " + str(prec_te))
        print("recall_tr: " + str(recall_tr))
        print("recall_te: " + str(recall_te))
    return model

## Testing

> Algorithm to use in the optimization problem.
>
>For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
>For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one->versus-rest schemes.
>‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty.
>Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale. You can >preprocess the data with a scaler from sklearn.preprocessing.
>
[sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [5]:
import time
start = time.time()

main_folder = '/Users/Owner/Downloads/voice/all/'
raw_df, raw_df_og = df_from_audio_folders(main_folder, 10)

end = time.time()
print(end - start)

Extracting mfcc from /Users/Owner/Downloads/voice/all/ac_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/cs_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/earthakitt_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/eddiegriffin_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/eddieizzard_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/eddiemcclintock_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/eddiethomas_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/edgarwright_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/eduardonoriega_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/edwardasner_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/edwestwick_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/efrenramirez_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/elainecassidy_audio/
Extracting mfcc from /Users/Owner/Downloads/voice/all/elaine

In [6]:
#start = time.time()

#sub_folder = '/Users/Owner/Downloads/voice/sub/'
#sub_df,sub_dg_og = df_from_audio_files(sub_folder, 20, {})

#end = time.time()
#print(end - start)

In [7]:
#Avoid redoing the above AGAIN.
rw_df = raw_df.copy() 
#sb_df = sub_df.copy()
raw_df_og

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,speaker
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,-318.759248,128.144553,-48.451606,-3.291041,-4.439119,-44.898382,-44.994301,-4.803273,-33.221644,-5.774512,0.0
2,-334.442376,128.725198,-62.451485,-10.505905,2.544294,-42.107184,-47.824702,-6.869523,-32.724969,-6.214163,0.0
3,-294.091121,90.402714,-56.128486,4.393008,7.183726,-31.997761,-52.708531,1.184789,-31.553006,-9.048841,0.0
4,-301.952337,110.361349,-50.917652,-15.185147,-2.373628,-34.954239,-45.630943,3.382940,-31.221670,-10.426952,0.0
5,-292.861850,100.226839,-53.361850,-7.668813,8.346736,-30.654490,-51.783797,4.681058,-33.280498,-8.534793,0.0
6,-304.906029,84.052176,-51.714222,1.556833,13.491340,-26.252297,-53.232791,0.326892,-28.902197,-9.460180,0.0
7,-260.011410,104.388979,-57.680261,0.870310,10.184573,-26.734998,-56.833026,1.913528,-35.206748,-9.246219,0.0
8,-276.618570,108.110832,-32.943063,-12.618713,-1.986332,-35.707011,-48.991157,1.565453,-32.857197,-7.179775,0.0
9,-285.496413,113.276978,-42.990195,-4.025746,5.592159,-41.504476,-51.757594,1.973218,-32.627777,-3.038513,0.0


In [8]:
start = time.time()
model(raw_df, solver = "newton-cg", accuracy = True)
end = time.time()
print(end - start)
print('\n')

start = time.time()
model(raw_df, solver = "saga", accuracy = True)
end = time.time()
print(end - start)
print('\n')

start = time.time()
model(rw_df, solver = "sag", accuracy = True)
end = time.time()
print(end - start)
print('\n')

start = time.time()
model(rw_df, solver = "lbfgs", accuracy = True)
end = time.time()
print(end - start)
print('\n')

KeyboardInterrupt: 

In [None]:
"""start = time.time()
model(sb_df, solver = "newton-cg", accuracy = True)
end = time.time()
print(end - start)
print('\n')

start = time.time()
model(sb_df, solver = "saga", accuracy = True)
end = time.time()
print(end - start)
print('\n')

start = time.time()
model(sb_df, solver = "sag", accuracy = True)
end = time.time()
print(end - start)
print('\n')

start = time.time()
model(sb_df, solver = "lbfgs", accuracy = True)
end = time.time()
print(end - start)
print('\n')

model(sb_df, solver = "newton-sg", accuracy = True)
model(sb_df, solver = "saga", accuracy = True)
model(sb_df, solver = "sag", accuracy = True)
model(sb_df, solver = "lbfgs", accuracy = True)
print('\n')
"""

In [None]:
algths = ["newton-cg", "saga", "sag", "lbfgs"]
for n in [10,20, 30, 40, 50]:
    print("\nNumber of Features: " + str(n))
    import time
    start = time.time()
    folder = '/Users/Owner/Downloads/voice/all/'
    raw_df, raw_df_og = df_from_audio_files(folder, n)
    for alg in algths:
        model(rw_df, solver = alg, accuracy = True)
    end = time.time()
    print(end - start)