In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import scale
from tqdm import tqdm

## Save features

In [6]:
# Merge the per-order LPC/LTP feature tables of the training split
# (orders 1..max_lpc_order) into a single wide table and pickle it.
max_lpc_order = 50
lpc_lengths = np.arange(1, max_lpc_order + 1)

# One renamed frame per LPC order; concatenated once after the loop so the
# growing table is not re-copied on every iteration.
per_order_frames = []

for order in tqdm(lpc_lengths, total=max_lpc_order):
    # NOTE(review): absolute, machine-specific input path (the output path below
    # is relative). pd.read_pickle can execute arbitrary code: trusted files only.
    lpcorder_df = pd.read_pickle('/nas/home/cborrelli/bot_speech/features/lpc/train_LPC_' + str(order) +'.pkl')

    # Alphabetical column order, so every order contributes its columns
    # in the same sequence.
    lpcorder_df = lpcorder_df.reindex(sorted(lpcorder_df.columns), axis=1)

    # Explicit old -> new mapping: only columns starting with 'lpc' or 'ltp'
    # get the '_<order>' suffix, every other column keeps its name. Each column
    # is paired with its own new name, independent of any sort order.
    suffixed_names = {
        name: name + '_' + str(order)
        for name in lpcorder_df.columns
        if name.startswith(('lpc', 'ltp'))
    }
    lpcorder_df = lpcorder_df.rename(columns=suffixed_names)

    # Index by file name so the column-wise concat aligns rows per audio file.
    lpcorder_df.index = lpcorder_df['audio_filename']

    per_order_frames.append(lpcorder_df)

train_features = pd.concat(per_order_frames, axis=1)

# The un-suffixed columns repeat once per order; keep the first occurrence only.
train_features = train_features.loc[:, ~train_features.columns.duplicated()]
train_features = train_features.reset_index(drop=True)

train_features.to_pickle('../features/lpc/dataframe/train.pkl')

100%|██████████| 50/50 [00:15<00:00,  3.16it/s]


In [7]:
# Merge the per-order LPC/LTP feature tables of the development split
# (orders 1..max_lpc_order) into a single wide table and pickle it.
max_lpc_order = 50
lpc_lengths = np.arange(1, max_lpc_order + 1)

# One renamed frame per LPC order; concatenated once after the loop so the
# growing table is not re-copied on every iteration.
per_order_frames = []

for order in tqdm(lpc_lengths, total=max_lpc_order):
    # NOTE(review): absolute, machine-specific input path (the output path below
    # is relative). pd.read_pickle can execute arbitrary code: trusted files only.
    lpcorder_df = pd.read_pickle('/nas/home/cborrelli/bot_speech/features/lpc/dev_LPC_' + str(order) +'.pkl')

    # Alphabetical column order, so every order contributes its columns
    # in the same sequence.
    lpcorder_df = lpcorder_df.reindex(sorted(lpcorder_df.columns), axis=1)

    # Explicit old -> new mapping: only columns starting with 'lpc' or 'ltp'
    # get the '_<order>' suffix, every other column keeps its name. Each column
    # is paired with its own new name, independent of any sort order.
    suffixed_names = {
        name: name + '_' + str(order)
        for name in lpcorder_df.columns
        if name.startswith(('lpc', 'ltp'))
    }
    lpcorder_df = lpcorder_df.rename(columns=suffixed_names)

    # Index by file name so the column-wise concat aligns rows per audio file.
    lpcorder_df.index = lpcorder_df['audio_filename']

    per_order_frames.append(lpcorder_df)

dev_features = pd.concat(per_order_frames, axis=1)

# The un-suffixed columns repeat once per order; keep the first occurrence only.
dev_features = dev_features.loc[:, ~dev_features.columns.duplicated()]
dev_features = dev_features.reset_index(drop=True)

dev_features.to_pickle('../features/lpc/dataframe/dev.pkl')

100%|██████████| 50/50 [00:15<00:00,  3.24it/s]


In [8]:
# Merge the per-order LPC/LTP feature tables of the evaluation split
# (orders 1..max_lpc_order) into a single wide table and pickle it.
max_lpc_order = 50
lpc_lengths = np.arange(1, max_lpc_order + 1)

# One renamed frame per LPC order; concatenated once after the loop so the
# growing table is not re-copied on every iteration.
per_order_frames = []

for order in tqdm(lpc_lengths, total=max_lpc_order):
    # NOTE(review): absolute, machine-specific input path (the output path below
    # is relative). pd.read_pickle can execute arbitrary code: trusted files only.
    lpcorder_df = pd.read_pickle('/nas/home/cborrelli/bot_speech/features/lpc/eval_LPC_' + str(order) +'.pkl')

    # Alphabetical column order, so every order contributes its columns
    # in the same sequence.
    lpcorder_df = lpcorder_df.reindex(sorted(lpcorder_df.columns), axis=1)

    # Explicit old -> new mapping: only columns starting with 'lpc' or 'ltp'
    # get the '_<order>' suffix, every other column keeps its name. Each column
    # is paired with its own new name, independent of any sort order.
    suffixed_names = {
        name: name + '_' + str(order)
        for name in lpcorder_df.columns
        if name.startswith(('lpc', 'ltp'))
    }
    lpcorder_df = lpcorder_df.rename(columns=suffixed_names)

    # Index by file name so the column-wise concat aligns rows per audio file.
    lpcorder_df.index = lpcorder_df['audio_filename']

    per_order_frames.append(lpcorder_df)

eval_features = pd.concat(per_order_frames, axis=1)

# The un-suffixed columns repeat once per order; keep the first occurrence only.
eval_features = eval_features.loc[:, ~eval_features.columns.duplicated()]
eval_features = eval_features.reset_index(drop=True)

eval_features.to_pickle('../features/lpc/dataframe/eval.pkl')

100%|██████████| 50/50 [00:43<00:00,  1.15it/s]


In [5]:
# Preview the first rows of the merged training table instead of rendering
# the whole frame.
# NOTE(review): this cell's execution count (5) precedes the cell that builds
# train_features (6); re-run the notebook top to bottom to refresh the output.
train_features.head(10)

Unnamed: 0_level_0,audio_filename,end_voice,label,lpc_gain_max_1,lpc_gain_mean_1,lpc_gain_min_1,lpc_gain_var_1,lpc_res_max_1,lpc_res_mean_1,lpc_res_min_1,...,lpc_res_min_50,lpc_res_var_50,ltp_gain_max_50,ltp_gain_mean_50,ltp_gain_min_50,ltp_gain_var_50,ltp_res_max_50,ltp_res_mean_50,ltp_res_min_50,ltp_res_var_50
audio_filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LA_T_1138215,LA_T_1138215,54719.0,bonafide,334.889876,17.986281,1.000066,1504.845991,0.077560,0.002197,-0.101673,...,-0.027288,0.000137,4.027678,1.266452,1.068358,0.131375,0.014815,0.000101,-0.016340,0.000101
LA_T_1271820,LA_T_1271820,67679.0,bonafide,182.073410,25.101672,1.000034,1704.149138,0.059192,0.002185,-0.083696,...,-0.019598,0.000070,4.706324,1.220221,1.041708,0.068551,0.011334,0.000058,-0.011847,0.000058
LA_T_1272637,LA_T_1272637,42239.0,bonafide,143.274297,18.543832,1.072179,1062.292829,0.071678,0.002179,-0.092725,...,-0.029850,0.000141,3.434502,1.246960,1.079782,0.130723,0.016157,0.000098,-0.016302,0.000097
LA_T_1276960,LA_T_1276960,44639.0,bonafide,212.028063,27.865050,1.028308,2670.857142,0.106493,0.002571,-0.105728,...,-0.030141,0.000104,3.151442,1.248164,1.062743,0.073647,0.017438,0.000089,-0.019430,0.000088
LA_T_1341447,LA_T_1341447,52799.0,bonafide,172.204711,17.320865,0.999900,1045.427982,0.086018,0.002491,-0.098070,...,-0.032055,0.000168,3.422221,1.186312,1.062075,0.037548,0.019422,0.000146,-0.021293,0.000145
LA_T_1363611,LA_T_1363611,34559.0,bonafide,37.559787,7.899162,1.000123,71.040230,0.111494,0.004663,-0.147110,...,-0.033116,0.000139,2.181834,1.210358,1.055357,0.035590,0.017528,0.000113,-0.021625,0.000112
LA_T_1596451,LA_T_1596451,33119.0,bonafide,159.898543,26.436717,1.000040,1692.502247,0.067170,0.001351,-0.088507,...,-0.022900,0.000058,2.653719,1.249128,1.073408,0.053464,0.011987,0.000044,-0.013023,0.000044
LA_T_1608170,LA_T_1608170,31199.0,bonafide,169.528335,13.915069,1.000079,770.640327,0.065127,0.001359,-0.081191,...,-0.026299,0.000080,6.921853,1.323803,1.061492,0.353750,0.013191,0.000062,-0.014314,0.000061
LA_T_1684951,LA_T_1684951,37919.0,bonafide,153.012184,15.058045,1.000020,949.910559,0.099249,0.004213,-0.140696,...,-0.043968,0.000434,20.308203,1.576451,1.065591,3.477495,0.020564,0.000181,-0.021709,0.000180
LA_T_1699801,LA_T_1699801,47519.0,bonafide,199.839853,14.052543,1.005014,884.745517,0.095078,0.002585,-0.109202,...,-0.029128,0.000082,4.870164,1.272755,1.068648,0.178039,0.014441,0.000062,-0.016255,0.000062
