In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn import decomposition
import matplotlib.pyplot as plt
from sklearn import tree
import librosa.display
import seaborn as sns
import pandas as pd
import numpy as np
import statistics
import json
import os

# Upload Files

In [None]:

# Placeholders for later data splits; nothing in this cell fills them.
training_data = []
validation_data = []
testing_data = []
split_ratio = 0

# Directory that contains the numerically named data folders. Defined here so
# the cell runs on a fresh kernel; recorded paths stay relative to it.
cwd = os.getcwd()

# One single-entry dict per recording: {"<folder>/<name>.wav": <tags from the JSON>}
data_files = []
for folder in sorted(os.listdir(cwd)):
    # Data folders are directories whose name is purely numeric; skip the rest.
    if not folder.isnumeric() or not os.path.isdir(os.path.join(cwd, folder)):
        continue
    for file in sorted(os.listdir(os.path.join(cwd, folder))):
        # Only the JSON metadata files are read; the .wav shares their base name.
        if not file.endswith(".json"):
            continue
        # Kept as a relative "<folder>/<file>" path for the cells that consume it.
        filePath = os.path.join(folder, file)
        # splitext drops only the final extension, so dots elsewhere in the name survive.
        wave_file_path = os.path.splitext(filePath)[0]
        # Read-only access is enough: the JSON is parsed, never written.
        with open(os.path.join(cwd, filePath), 'r') as f:
            data = json.load(f)
        data_files.append({wave_file_path + ".wav": data["tags"]})
                           

# Feature Extraction Process:

# Amplitude Envelope 

In [None]:
def amplitude_envelope(signal, frame_size):
    """Return the largest sample value of each consecutive, non-overlapping frame.

    signal: 1-D sequence of audio samples.
    frame_size: number of samples per frame; the final frame may be shorter.
    Returns a NumPy array holding one maximum per frame.
    """
    return np.array([np.max(signal[start:start + frame_size])
                     for start in range(0, len(signal), frame_size)])


frame_size = 1024
# Only relevant for overlapping frames; the envelope above steps by frame_size,
# so this value is not used.
hop_length = 512

# One value per recording: mean of its amplitude envelope, rounded to 5 decimals.
amp_envelope = []
for items in data_files:
    wav_file = list(items.keys())[0]
    current_wav_file, sr = librosa.load(wav_file)
    amp_env_of_current_wave_file = amplitude_envelope(current_wav_file, frame_size)
    amp_envelope.append(round(statistics.mean(amp_env_of_current_wave_file), 5))
    
    

# Root Mean Square Energy

In [None]:
# Framing used for the RMS energy computation.
frame_size = 1024
hop_len = 512

# One value per recording: mean frame-wise RMS energy, rounded to 5 decimals.
RMSE = []
for entry in data_files:
    wav_file = next(iter(entry))
    current_wav_file = librosa.load(wav_file)
    samples = current_wav_file[0]
    rms_current_wave_file = librosa.feature.rms(
        y=samples, frame_length=frame_size, hop_length=hop_len
    )[0]
    mean_rms = statistics.mean(rms_current_wave_file)
    RMSE.append(round(mean_rms, 5))
    #outputs an aggregated result of the rms feature 
    

# Zero Crossing Rate

In [None]:
# Framing used for the zero-crossing-rate computation.
frame_size = 1024
hop_len = 512

# One value per recording: mean frame-wise zero-crossing rate, rounded to 5 decimals.
zero_crossing_rate = []
for entry in data_files:
    wav_file = next(iter(entry))
    current_wav_file = librosa.load(wav_file)
    samples = current_wav_file[0]
    zero_cross_rate_current_wave_file = librosa.feature.zero_crossing_rate(
        samples, frame_length=frame_size, hop_length=hop_len
    )[0]
    mean_rate = statistics.mean(zero_cross_rate_current_wave_file)
    zero_crossing_rate.append(round(mean_rate, 5))
    
    

# Band Energy Ratio

In [None]:
def calculating_freq_bin(spectrogram, split_freq, sample_rate):
    """Map a frequency in Hz onto a row index of the spectrogram.

    The captured range (sample_rate / 2) is divided evenly over the
    spectrogram's rows and split_freq is floored to the bin it falls in.
    """
    freq_range = sample_rate / 2
    freq_delt_per_bin = freq_range / spectrogram.shape[0]
    return int(np.floor(split_freq / freq_delt_per_bin))


def band_energy_ratio(spectrogram, split_freq, sample_rate):
    """Return, per frame, the power below split_freq divided by the power at or above it.

    spectrogram: complex STFT matrix (rows are frequency bins, columns are frames).
    split_freq: frequency in Hz separating the low and high bands.
    sample_rate: sampling rate of the analysed signal.
    """
    # Uses the function's own arguments instead of names from the enclosing cell.
    split_freq_bin = calculating_freq_bin(spectrogram, split_freq, sample_rate)
    # Transposed so that iteration yields one frame's power values at a time.
    power_spectrogram = (np.abs(spectrogram) ** 2).T
    return np.array([
        np.sum(freqs_in_frame[:split_freq_bin]) / np.sum(freqs_in_frame[split_freq_bin:])
        for freqs_in_frame in power_spectrogram
    ])


# NOTE(review): 2848 is an unusual FFT size; possibly meant to be 2048 — confirm.
frame_size = 2848
hop_len = 512
split_frequency = 2000  # Hz boundary between the low and high bands

# One value per recording: mean band energy ratio, rounded to 3 decimals.
BER = []
for items in data_files:
    wav_file = list(items.keys())[0]
    current_wav_file = librosa.load(wav_file)
    signal, sample_rate = current_wav_file
    wave_file_spectrogram = librosa.stft(signal, n_fft=frame_size, hop_length=hop_len)
    band_energy_ratio_feature = band_energy_ratio(wave_file_spectrogram, split_frequency, sample_rate)
    BER.append(round(statistics.mean(band_energy_ratio_feature), 3))
         

# Spectral Centroid

In [None]:
# Neither value is passed to the librosa call below, which runs with its own defaults.
frame_size = 1024
hop_len = 512

# One value per recording: mean spectral centroid, rounded to 2 decimals.
spec_centroid = []
for entry in data_files:
    wav_file = next(iter(entry))
    current_wav_file = librosa.load(wav_file)
    samples, sample_rate = current_wav_file
    spectral_centroid = librosa.feature.spectral_centroid(y=samples, sr=sample_rate)[0]
    mean_centroid = statistics.mean(spectral_centroid)
    spec_centroid.append(round(mean_centroid, 2))
    


# Bandwidth

In [None]:
# Neither value is passed to the librosa call below, which runs with its own defaults.
frame_size = 1024
hop_len = 512

# One value per recording: mean spectral bandwidth (spread of significant
# frequencies around the spectral centroid), rounded to 2 decimals.
band_width = []
for entry in data_files:
    wav_file = next(iter(entry))
    current_wav_file = librosa.load(wav_file)
    samples, sample_rate = current_wav_file
    band_wid = librosa.feature.spectral_bandwidth(y=samples, sr=sample_rate)[0]
    mean_bandwidth = statistics.mean(band_wid)
    band_width.append(round(mean_bandwidth, 2))
    
   

# Spectrogram

In [None]:
frame_size = 1024
hop_len = 512

# NOTE(review): log_spec is initialised but never filled — the log spectrogram
# computed in the loop is discarded. Confirm whether it was meant to be stored.
log_spec = []
spectrogram = []       # per recording: mean of the power spectrogram along axis 0
spectrogram_std = []   # per recording: std of the power spectrogram along axis 0
for entry in data_files:
    wav_file = next(iter(entry))
    current_wav_file = librosa.load(wav_file)

    # Short-time Fourier transform, then squared magnitude as the power spectrogram.
    st_fourier_transform = librosa.stft(y=current_wav_file[0], n_fft=frame_size, hop_length=hop_len)
    spec_gram = np.abs(st_fourier_transform) ** 2

    spectrogram.append(np.mean(spec_gram, axis=0))
    spectrogram_std.append(np.std(spec_gram, axis=0))

    # Decibel-scaled version of the power spectrogram.
    log_spectro = librosa.power_to_db(spec_gram)

# Collapse each per-recording vector to a single scalar feature.
spec_std = [np.std(values) for values in spectrogram_std]
spec_mean = [np.mean(values) for values in spectrogram]


# Mel Spectrogram

In [None]:
# frame_size is not used by the calls below, which pass n_fft=2048 explicitly.
frame_size = 1024
hop_len = 512

melspectrogram_mean = []   # per recording: mean of the mel spectrogram along axis 0
melspectrogram_std = []    # per recording: std of the mel spectrogram along axis 0
log_mel_spec = []          # per recording: decibel-scaled mel spectrogram
for entry in data_files:
    wav_file = next(iter(entry))
    current_wav_file = librosa.load(wav_file)
    samples, sr = current_wav_file

    # Mel filter bank; built here but not consumed by any later line of this cell.
    filter_banks = librosa.filters.mel(n_fft=2048, sr=sr, n_mels=10)

    mel_spectrogram = librosa.feature.melspectrogram(
        y=samples, sr=sr, n_fft=2048, hop_length=hop_len, n_mels=10
    )
    melspectrogram_mean.append(np.mean(mel_spectrogram, axis=0))
    melspectrogram_std.append(np.std(mel_spectrogram, axis=0))

    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    log_mel_spec.append(log_mel_spectrogram)

# Collapse each per-recording vector to a single scalar feature.
mel_spec_mean = [np.mean(values) for values in melspectrogram_mean]
mel_spec_std = [np.std(values) for values in melspectrogram_std]
   

# Mel-frequency cepstral coefficients

In [None]:
# frame_size is not used by the call below, which passes n_fft=2048 explicitly.
frame_size = 1024
hop_len = 512

MFCC = []       # per recording: mean of the MFCC matrix along axis 0
MFCC_Std = []   # per recording: std of the MFCC matrix along axis 0
for entry in data_files:
    wav_file = next(iter(entry))
    current_wav_file = librosa.load(wav_file)
    samples, sr = current_wav_file

    mel_freq_cepco = librosa.feature.mfcc(
        y=samples, sr=sr, n_fft=2048, hop_length=hop_len, n_mels=10
    )
    MFCC.append(np.mean(mel_freq_cepco, axis=0))
    MFCC_Std.append(np.std(mel_freq_cepco, axis=0))

# Collapse each per-recording vector to a single scalar feature.
mfcc_mean = [np.mean(values) for values in MFCC]
mfcc_std = [np.std(values) for values in MFCC_Std]
    

# Updating Existence of Most Prevalent Tags

In [None]:
def _tag_flags(files, keyword):
    """Return one 0/1 flag per recording: 1 when any of its tags contains keyword.

    files: list of single-entry dicts mapping a wav path to that recording's tags.
    keyword: substring looked for inside each tag.

    Exactly one flag is produced per recording, so the result lines up
    row-for-row with the per-recording feature lists it is later combined with.
    """
    flags = []
    for entry in files:
        tags = list(entry.values())[0]
        # A bare string is treated as a single tag rather than iterated by character.
        if isinstance(tags, str):
            tags = [tags]
        flags.append(1 if any(keyword in tag for tag in tags) else 0)
    return flags


water = _tag_flags(data_files, "water")
train = _tag_flags(data_files, "train")
voice = _tag_flags(data_files, "voice")
people = _tag_flags(data_files, "people")
nature = _tag_flags(data_files, "nature")
city = _tag_flags(data_files, "city")
bird_song = _tag_flags(data_files, "birdsong")

In [None]:
# Number of positive (== 1) flags per tag, each wrapped in a one-element list.
bird_song_count = [sum(1 for flag in bird_song if flag == 1)]
water_count = [sum(1 for flag in water if flag == 1)]
nature_count = [sum(1 for flag in nature if flag == 1)]
city_count = [sum(1 for flag in city if flag == 1)]
voice_count = [sum(1 for flag in voice if flag == 1)]
train_count = [sum(1 for flag in train if flag == 1)]
people_count = [sum(1 for flag in people if flag == 1)]

# Tag Distribution

In [None]:
# Wide table of 0/1 flags, one column per tag.
data = {'Water': water,
        'Train': train,
        'Nature': nature,
        'Voice': voice,
        'People': people,
        'Birdsong': bird_song,
        'City': city}

df = pd.DataFrame(data)
# Long format: one row per (tag, flag) pair so seaborn can count flag values per tag.
df1 = df.melt(var_name='Audio Tags', value_name='Count')
plt.figure(figsize=(13.7, 8.27))
ax = sns.countplot(x='Audio Tags', hue='Count', data=df1)
# Label every bar group that was actually drawn; indexing containers 0 and 1
# directly fails when the flags take only a single value.
for container in ax.containers:
    ax.bar_label(container)
plt.show()

# Audio File ID List

In [None]:
# File name (without its folder) of every recording, in data_files order.
# basename respects the platform's path separator, which the stored paths use
# because they were built with os.path.join.
wave_file_list = [os.path.basename(list(items.keys())[0]) for items in data_files]

# Audio File Dataframe

In [None]:
# Column name -> per-recording values, in the column order of the final table.
_audio_columns = {
    'WaveFiles': wave_file_list,
    "Spec_mean": spec_mean,
    "Spec_std": spec_std,
    "Mel_Spec_mean": mel_spec_mean,
    "Mel_Spec_std": mel_spec_std,
    "Mfcc_mean": mfcc_mean,
    "Mfcc_std": mfcc_std,
    'Amp_Env': amp_envelope,
    "RMSE": RMSE,
    "Zero_Cross": zero_crossing_rate,
    "BER": BER,
    "Spec_Cen": spec_centroid,
    "Band_Width": band_width,
    "Water_Tag": water,
    "Train_Tag": train,
    "Nature_Tag": nature,
    "Voice_Tag": voice,
    "People_Tag": people,
    "Birdsong_Tag": bird_song,
    "City_Tag": city,
}

# Rows are formed by zipping the lists, so the table stops at the shortest one.
audio_df = pd.DataFrame(list(zip(*_audio_columns.values())),
                        columns=list(_audio_columns))

In [None]:
# Display the assembled per-recording feature and tag table.
audio_df

In [None]:
# Numeric feature columns used for the correlation analysis ("Spec_Cen" is not included).
features = [
    "Spec_mean",
    "Spec_std",
    "Mel_Spec_mean",
    "Mel_Spec_std",
    "Mfcc_mean",
    "Mfcc_std",
    'Amp_Env',
    "RMSE",
    "Zero_Cross",
    "BER",
    "Band_Width",
]
audio_features = audio_df.loc[:, features]

# Correlation Matrix

In [None]:
sns.set_theme(style="white")

# Pairwise correlations of the selected features.
corr = audio_features.corr()
# Boolean mask over the upper triangle (diagonal included) so each pair shows once.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

heatmap_style = dict(cmap=cmap, vmax=.3, center=0, square=True,
                     linewidths=.5, cbar_kws={"shrink": .5})
sns.heatmap(corr, mask=mask, **heatmap_style)

# Feature Sets

In [None]:
# Base groups of feature columns.
feature_set_one = ["Spec_mean", "Spec_std"]
feature_set_two = ["Mel_Spec_mean", "Mel_Spec_std"]
feature_set_three = ["Mfcc_mean", "Mfcc_std"]

# Combined groups; `+` builds new lists, so the base groups are not aliased.
feature_set_four = feature_set_one + feature_set_two
feature_set_five = feature_set_four + feature_set_three
feature_set_six = feature_set_four + ["Mfcc_mean", 'Amp_Env', "RMSE"]

# Train - Validation- Test Split

In [None]:
# Score collectors for the Birdsong_Tag summary table.
predict_data = []
train_data = []

X = audio_df[feature_set_six]
y = audio_df["Birdsong_Tag"]

# Fixed seed so the 60/20/20 split (and every score derived from it) is the
# same on each run of the notebook.
SPLIT_SEED = 42

# 60% train, then the remaining 40% is halved into validation and test.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=SPLIT_SEED)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=SPLIT_SEED)


# Binary Classification of Birdsong_Tag - Grid Search

# Grid Search For Decision Trees

In [None]:
# Pipeline: standardise -> PCA -> decision tree.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('dec_tree', dec_tree)])

# Search space: every possible number of principal components plus tree settings.
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 20, 50]
parameters = {
    'pca__n_components': n_components,
    'dec_tree__criterion': criterion,
    'dec_tree__max_depth': max_depth,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
DT_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
DT_GS.fit(X_train, y_train)
scores = DT_GS.cv_results_["mean_test_score"]
scores_std = DT_GS.cv_results_["std_test_score"]

best_settings = DT_GS.best_estimator_.get_params()
print('Best Criterion:', best_settings['dec_tree__criterion'])
print('Best max_depth:', best_settings['dec_tree__max_depth'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['dec_tree'])

GS_TREE__SCORES = scores
GS_TREE__Std = scores_std
# "-" is the placeholder for the table's header row; then the best mean CV score.
train_data.append("-")
train_data.append(round(max(GS_TREE__SCORES), 3))

#  Grid Search for SVM 

In [None]:
from sklearn.svm import SVC

# Pipeline: standardise -> PCA -> support vector classifier.
std_slc = StandardScaler()
pca = decomposition.PCA()
svm = SVC()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('svm', svm)])

# Search space: every possible number of principal components plus SVM settings.
n_components = list(range(1, X.shape[1] + 1))
gamma = [10, 100, 10000, 20000]
kernel = ['sigmoid']
C = [0.001, 0.01, 0.1, 10, 100]
parameters = {
    'pca__n_components': n_components,
    'svm__gamma': gamma,
    'svm__C': C,
    'svm__kernel': kernel,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
svm_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
svm_GS.fit(X_train, y_train)
scores = svm_GS.cv_results_["mean_test_score"]
scores_std = svm_GS.cv_results_["std_test_score"]

best_settings = svm_GS.best_estimator_.get_params()
print('Best gamma:', best_settings['svm__gamma'])
print('Best C:', best_settings['svm__C'])
print('Best kernel:', best_settings['svm__kernel'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['svm'])

GS_SVM__SCORES = scores
GS_SVM__Std = scores_std
# Best mean cross-validated ROC AUC.
train_data.append(round(max(GS_SVM__SCORES), 3))

# Grid Search SGDC

In [None]:
from sklearn.linear_model import SGDClassifier

# Pipeline: standardise -> PCA -> linear classifier trained with SGD.
std_slc = StandardScaler()
pca = decomposition.PCA()
sgdc = SGDClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('sgdc', sgdc)])

n_components = list(range(1, X.shape[1] + 1, 1))
# NOTE(review): newer scikit-learn releases name the logistic loss "log_loss";
# if "log" is rejected by the installed version its fits score NaN — confirm.
loss = ["hinge", "log", "modified_huber", "perceptron"]
epsilon = [.01, .001, .1, .0001, .05]
penalty = ["l1", "l2", "elasticnet"]
alpha = [.00001, .0001, .001, .01, .1]

parameters = dict(pca__n_components=n_components,
                  sgdc__loss=loss,
                  sgdc__epsilon=epsilon,
                  sgdc__penalty=penalty,
                  sgdc__alpha=alpha)

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

# Labels name the parameter that is actually printed (loss / epsilon).
best_settings = clf_GS.best_estimator_.get_params()
print('Best loss:', best_settings['sgdc__loss'])
print('Best epsilon:', best_settings['sgdc__epsilon'])
print('Best penalty:', best_settings['sgdc__penalty'])
print('Best alpha:', best_settings['sgdc__alpha'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print(); print(best_settings['sgdc'])

GS_SGDC_scores = scores
# nanmax ignores NaN scores, so the result does not depend on where they fall.
train_data.append(round(np.nanmax(GS_SGDC_scores), 3))

#  Grid Search For Logistic Regression

In [None]:
# Pipeline: standardise -> PCA -> logistic regression.
std_slc = StandardScaler()
pca = decomposition.PCA()
log = LogisticRegression()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('log', log)])

# Search space: every possible number of principal components plus solver settings.
n_components = list(range(1, X.shape[1] + 1))
solver = ["newton-cg", "sag", "saga"]
penalty = ["l1", "l2", "elasticnet", "none"]
C = [100, 10, 1.0, 50, 20, .001, .0001]
max_iter = [10000, 30000, 50000]
multi_class = ["auto", "ovr", "multinomial"]
parameters = {
    'pca__n_components': n_components,
    'log__solver': solver,
    'log__penalty': penalty,
    'log__C': C,
    'log__max_iter': max_iter,
    'log__multi_class': multi_class,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

best_settings = clf_GS.best_estimator_.get_params()
print('Best solver:', best_settings['log__solver'])
print('Best penalty:', best_settings['log__penalty'])
print('Best Iter:', best_settings['log__max_iter'])
print('Best multi class:', best_settings['log__multi_class'])
print('Best C:', best_settings['log__C'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['log'])

GS_LOG__Std = scores_std
# Drop NaN scores before taking the best mean cross-validated ROC AUC.
GS_LOG__SCORES = [score for score in scores if not np.isnan(score)]
train_data.append(round(max(GS_LOG__SCORES), 3))

# Binary Classification of Birdsong_Tag - Prediction

In [None]:
def _positive_class_scores(model, features):
    """Return continuous positive-class scores from a fitted classifier.

    Uses decision_function when the model has one, otherwise the last column
    of predict_proba. ROC AUC needs a ranking score; hard 0/1 predictions
    reduce the curve to a single operating point.
    """
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return model.predict_proba(features)[:, -1]


# NOTE(review): hyperparameters are hard-coded rather than read from the grid
# searches, the models are fitted on the validation split, and no scaler/PCA
# step is applied here although the searches tuned them inside such a
# pipeline — confirm this is intended.
dec_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
svm = SVC(C=0.1, gamma=10, kernel='sigmoid')
sgdc = SGDClassifier(alpha=0.001, epsilon=0.01, loss='perceptron', penalty='l1')
log = LogisticRegression(C=100, max_iter=10000, penalty='none', solver='newton-cg')

# "-" is the placeholder for the table's header row.
predict_data.append("-")
for model in (dec_tree, svm, sgdc, log):
    model.fit(X_valid, y_valid)
    test_score = model.score(X_test, y_test)   # plain accuracy on the test split
    y_pred = model.predict(X_test)
    test_auc = roc_auc_score(y_test, _positive_class_scores(model, X_test))
    predict_data.append(round(test_auc, 2))

In [None]:
# Pair each cross-validation score with the matching test score, one row per model.
_bird_rows = list(zip(train_data, predict_data))
_bird_row_names = ["Bird_Tag", "Decision Tree", "SVM", "SGDC", "Log"]
bird_train_prediction_df = pd.DataFrame(
    _bird_rows,
    index=_bird_row_names,
    columns=["Training", "Prediction"],
)

In [None]:
# Display the Birdsong_Tag score table (cross-validation vs. test).
bird_train_prediction_df 

# Binary Classification of Water_Tag - Grid Search

#  Train - Validation- Test Split

In [None]:
# Score collectors for the Water_Tag summary table.
predict_data = []
train_data = []

X = audio_df[feature_set_six]
y = np.array(audio_df["Water_Tag"])

# Fixed seed so the 60/20/20 split (and every score derived from it) is the
# same on each run of the notebook.
SPLIT_SEED = 42

# 60% train, then the remaining 40% is halved into validation and test.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=SPLIT_SEED)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=SPLIT_SEED)

# Grid Search For Decision Trees

In [None]:
# Pipeline: standardise -> PCA -> decision tree.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('dec_tree', dec_tree)])

# Search space: every possible number of principal components plus tree settings.
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 20, 50]
parameters = {
    'pca__n_components': n_components,
    'dec_tree__criterion': criterion,
    'dec_tree__max_depth': max_depth,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
DT_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
DT_GS.fit(X_train, y_train)
scores = DT_GS.cv_results_["mean_test_score"]
scores_std = DT_GS.cv_results_["std_test_score"]

best_settings = DT_GS.best_estimator_.get_params()
print('Best Criterion:', best_settings['dec_tree__criterion'])
print('Best max_depth:', best_settings['dec_tree__max_depth'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['dec_tree'])

# Tree step of the winning pipeline, kept for the prediction cell.
model_one = best_settings['dec_tree']
GS_TREE__SCORES = scores
GS_TREE__Std = scores_std
# "-" is the placeholder for the table's header row; then the best mean CV score.
train_data.append("-")
train_data.append(round(max(GS_TREE__SCORES), 3))

# Grid Search For SVM

In [None]:

# Pipeline: standardise -> PCA -> support vector classifier.
std_slc = StandardScaler()
pca = decomposition.PCA()
svm = SVC()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('svm', svm)])

# Search space: every possible number of principal components plus SVM settings.
n_components = list(range(1, X.shape[1] + 1))
gamma = [10, 100, 10000, 20000]
kernel = ['sigmoid']
C = [0.001, 0.01, 0.1, 10, 100]
parameters = {
    'pca__n_components': n_components,
    'svm__gamma': gamma,
    'svm__C': C,
    'svm__kernel': kernel,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
svm_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
svm_GS.fit(X_train, y_train)
scores = svm_GS.cv_results_["mean_test_score"]
scores_std = svm_GS.cv_results_["std_test_score"]

best_settings = svm_GS.best_estimator_.get_params()
print('Best gamma:', best_settings['svm__gamma'])
print('Best C:', best_settings['svm__C'])
print('Best kernel:', best_settings['svm__kernel'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['svm'])

# SVM step of the winning pipeline, kept for the prediction cell.
model_two = best_settings['svm']
GS_SVM__SCORES = scores
GS_SVM__Std = scores_std
# Best mean cross-validated ROC AUC.
train_data.append(round(max(GS_SVM__SCORES), 3))

# Grid Search For SGDC

In [None]:

# Pipeline: standardise -> PCA -> linear classifier trained with SGD.
std_slc = StandardScaler()
pca = decomposition.PCA()
sgdc = SGDClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('sgdc', sgdc)])

n_components = list(range(1, X.shape[1] + 1, 1))
# NOTE(review): newer scikit-learn releases name the logistic loss "log_loss";
# if "log" is rejected by the installed version its fits score NaN — confirm.
loss = ["hinge", "log", "modified_huber", "perceptron"]
epsilon = [.01, .001, .1, .0001, .05]
penalty = ["l1", "l2", "elasticnet"]
alpha = [.00001, .0001, .001, .01, .1]

parameters = dict(pca__n_components=n_components,
                  sgdc__loss=loss,
                  sgdc__epsilon=epsilon,
                  sgdc__penalty=penalty,
                  sgdc__alpha=alpha)

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

# Labels name the parameter that is actually printed (loss / epsilon).
best_settings = clf_GS.best_estimator_.get_params()
print('Best loss:', best_settings['sgdc__loss'])
print('Best epsilon:', best_settings['sgdc__epsilon'])
print('Best penalty:', best_settings['sgdc__penalty'])
print('Best alpha:', best_settings['sgdc__alpha'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print(); print(best_settings['sgdc'])

# SGD step of the winning pipeline, kept for the prediction cell.
model_three = best_settings['sgdc']
GS_SGDC_scores = scores
# nanmax ignores NaN scores, so the result does not depend on where they fall.
train_data.append(round(np.nanmax(GS_SGDC_scores), 3))

# Grid Search For Logistic Regression

In [None]:
# Pipeline: standardise -> PCA -> logistic regression.
std_slc = StandardScaler()
pca = decomposition.PCA()
log = LogisticRegression()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('log', log)])

# Search space: every possible number of principal components plus solver settings.
n_components = list(range(1, X.shape[1] + 1))
solver = ["newton-cg", "sag", "saga"]
penalty = ["l1", "l2", "elasticnet", "none"]
C = [100, 10, 1.0, 50, 20, .001, .0001]
max_iter = [10000, 30000, 50000]
multi_class = ["auto", "ovr", "multinomial"]
parameters = {
    'pca__n_components': n_components,
    'log__solver': solver,
    'log__penalty': penalty,
    'log__C': C,
    'log__max_iter': max_iter,
    'log__multi_class': multi_class,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

best_settings = clf_GS.best_estimator_.get_params()
print('Best solver:', best_settings['log__solver'])
print('Best penalty:', best_settings['log__penalty'])
print('Best Iter:', best_settings['log__max_iter'])
print('Best multi class:', best_settings['log__multi_class'])
print('Best C:', best_settings['log__C'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['log'])

# Logistic-regression step of the winning pipeline, kept for the prediction cell.
model_four = best_settings['log']
GS_LOG__Std = scores_std
# Drop NaN scores before taking the best mean cross-validated ROC AUC.
GS_LOG__SCORES = [score for score in scores if not np.isnan(score)]
train_data.append(round(max(GS_LOG__SCORES), 3))

# Binary Classification of Water_Tag - Prediction

In [None]:
def _positive_class_scores(model, features):
    """Return continuous positive-class scores from a fitted classifier.

    Uses decision_function when the model has one, otherwise the last column
    of predict_proba. ROC AUC needs a ranking score; hard 0/1 predictions
    reduce the curve to a single operating point.
    """
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return model.predict_proba(features)[:, -1]


# Final-step estimators taken from the winning grid-search pipelines.
# NOTE(review): they are re-fitted on the validation split using the raw
# features, without the scaler/PCA steps they were tuned with — confirm
# this is intended.
dec_tree = model_one
svm = model_two
sgdc = model_three
log = model_four

# "-" is the placeholder for the table's header row.
predict_data.append("-")
for model in (dec_tree, svm, sgdc, log):
    model.fit(X_valid, y_valid)
    test_score = model.score(X_test, y_test)   # plain accuracy on the test split
    y_pred = model.predict(X_test)
    test_auc = roc_auc_score(y_test, _positive_class_scores(model, X_test))
    predict_data.append(round(test_auc, 2))

In [None]:
# Pair each cross-validation score with the matching test score, one row per model.
_water_rows = list(zip(train_data, predict_data))
_water_row_names = ["Water_Tag", "Decision Tree", "SVM", "SGDC", "Log"]
water_train_prediction_df = pd.DataFrame(
    _water_rows,
    index=_water_row_names,
    columns=["Training", "Prediction"],
)

In [None]:
# Stack the Birdsong and Water score tables. pd.concat is used because
# DataFrame.append is no longer available in pandas 2.x.
comb_model_two = pd.concat([bird_train_prediction_df, water_train_prediction_df])
water_train_prediction_df

# Binary Classification of Train_Tag - Grid Search

# Train - Validation- Test Split

In [None]:
# Score collectors for the Train_Tag summary table.
predict_data = []
train_data = []

X = audio_df[feature_set_six]
y = np.array(audio_df["Train_Tag"])

# Fixed seed so the 60/20/20 split (and every score derived from it) is the
# same on each run of the notebook.
SPLIT_SEED = 42

# 60% train, then the remaining 40% is halved into validation and test.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=SPLIT_SEED)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=SPLIT_SEED)

# Grid Search For Decision Trees

In [None]:
# Pipeline: standardise -> PCA -> decision tree.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('dec_tree', dec_tree)])

# Search space: every possible number of principal components plus tree settings.
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 20, 50]
parameters = {
    'pca__n_components': n_components,
    'dec_tree__criterion': criterion,
    'dec_tree__max_depth': max_depth,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
DT_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
DT_GS.fit(X_train, y_train)
scores = DT_GS.cv_results_["mean_test_score"]
scores_std = DT_GS.cv_results_["std_test_score"]

best_settings = DT_GS.best_estimator_.get_params()
print('Best Criterion:', best_settings['dec_tree__criterion'])
print('Best max_depth:', best_settings['dec_tree__max_depth'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['dec_tree'])

# Tree step of the winning pipeline, kept under model_one for later use.
model_one = best_settings['dec_tree']
GS_TREE__SCORES = scores
GS_TREE__Std = scores_std
# "-" is the placeholder for the table's header row; then the best mean CV score.
train_data.append("-")
train_data.append(round(max(GS_TREE__SCORES), 3))

# Grid Search For SVM

In [None]:

# Pipeline: standardise -> PCA -> support vector classifier.
std_slc = StandardScaler()
pca = decomposition.PCA()
svm = SVC()
pipe = Pipeline([('std_slc', std_slc), ('pca', pca), ('svm', svm)])

# Search space: every possible number of principal components plus SVM settings.
n_components = list(range(1, X.shape[1] + 1))
gamma = [10, 100, 10000, 20000]
kernel = ['sigmoid']
C = [0.001, 0.01, 0.1, 10, 100]
parameters = {
    'pca__n_components': n_components,
    'svm__gamma': gamma,
    'svm__C': C,
    'svm__kernel': kernel,
}

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
svm_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
svm_GS.fit(X_train, y_train)
scores = svm_GS.cv_results_["mean_test_score"]
scores_std = svm_GS.cv_results_["std_test_score"]

best_settings = svm_GS.best_estimator_.get_params()
print('Best gamma:', best_settings['svm__gamma'])
print('Best C:', best_settings['svm__C'])
print('Best kernel:', best_settings['svm__kernel'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print()
print(best_settings['svm'])

# SVM step of the winning pipeline, kept under model_two for later use.
model_two = best_settings['svm']
GS_SVM__SCORES = scores
GS_SVM__Std = scores_std
# Best mean cross-validated ROC AUC.
train_data.append(round(max(GS_SVM__SCORES), 3))

# Grid Search For SGDC

In [None]:
# Pipeline: standardise -> PCA -> linear classifier trained with SGD.
std_slc = StandardScaler()
pca = decomposition.PCA()
sgdc = SGDClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('sgdc', sgdc)])

n_components = list(range(1, X.shape[1] + 1, 1))
# NOTE(review): newer scikit-learn releases name the logistic loss "log_loss";
# if "log" is rejected by the installed version its fits score NaN — confirm.
loss = ["hinge", "log", "modified_huber", "perceptron"]
epsilon = [.01, .001, .1, .0001, .05]
penalty = ["l1", "l2", "elasticnet"]
alpha = [.00001, .0001, .001, .01, .1]

parameters = dict(pca__n_components=n_components,
                  sgdc__loss=loss,
                  sgdc__epsilon=epsilon,
                  sgdc__penalty=penalty,
                  sgdc__alpha=alpha)

# 5-fold stratified cross-validation on the training split, scored by ROC AUC.
clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

# Labels name the parameter that is actually printed (loss / epsilon).
best_settings = clf_GS.best_estimator_.get_params()
print('Best loss:', best_settings['sgdc__loss'])
print('Best epsilon:', best_settings['sgdc__epsilon'])
print('Best penalty:', best_settings['sgdc__penalty'])
print('Best alpha:', best_settings['sgdc__alpha'])
print('Best Number Of Components:', best_settings['pca__n_components'])
print(); print(best_settings['sgdc'])

# SGD step of the winning pipeline, kept under model_three for later use.
model_three = best_settings['sgdc']
GS_SGDC_scores = scores
# nanmax ignores NaN scores, so the result does not depend on where they fall.
train_data.append(round(np.nanmax(GS_SGDC_scores), 3))

# Grid Search For Logistic Regression

In [None]:
# Grid-search PCA dimensionality and LogisticRegression hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
log = LogisticRegression()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('log', log)])
n_components = list(range(1, X.shape[1] + 1))

# The grid is a full cross product; candidates that end up with a NaN mean
# score (presumably solver/penalty pairs the estimator rejects) are filtered
# out below before the best score is taken.
solver = ["newton-cg", "sag", "saga"]
penalty = ["l1", "l2", "elasticnet", "none"]
C = [100, 10, 1.0, 50, 20, .001, .0001]
max_iter = [10000, 30000, 50000]
multi_class = ["auto", "ovr", "multinomial"]

parameters = dict(pca__n_components=n_components,
                  log__solver=solver,
                  log__penalty=penalty,
                  log__C=C,
                  log__max_iter=max_iter,
                  log__multi_class=multi_class)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

print('Best solver:', clf_GS.best_params_['log__solver'])
print('Best penalty:', clf_GS.best_params_['log__penalty'])
print('Best Iter:', clf_GS.best_params_['log__max_iter'])
print('Best multi class:', clf_GS.best_params_['log__multi_class'])
print('Best C:', clf_GS.best_params_['log__C'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['log'])

# Keep the whole tuned pipeline (scaler + PCA + LogisticRegression), not only
# its last step, so later evaluation uses the same preprocessing that was tuned.
model_four = clf_GS.best_estimator_
GS_LOG__SCORES = scores
GS_LOG__Std = scores_std
# Drop NaN means from failed candidates before taking the best score.
GS_LOG__SCORES = [x for x in GS_LOG__SCORES if not np.isnan(x)]
train_data.append(round(max(GS_LOG__SCORES), 3))

# Binary Classification of Train_Tag - Prediction

In [None]:
# Refit each tuned model on the validation split and score it on the test split.
# ROC AUC is computed from continuous scores (positive-class probability or
# decision-function margin) so it is comparable with the 'roc_auc' values of the
# grid searches; AUC on hard class predictions collapses the curve to one point.
# NOTE(review): fitting on (X_valid, y_valid) replaces the fit each model carried
# over from its grid search - confirm that training on the validation split
# (rather than X_train) is intended.
dec_tree = model_one
dec_tree.fit(X_valid, y_valid)
test_score = dec_tree.score(X_test, y_test)
y_pred = dec_tree.predict(X_test)
y_score = dec_tree.predict_proba(X_test)[:, 1]
predict_data.append("-")  # placeholder aligned with the tag-name row of the summary table
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

svm = model_two
svm.fit(X_valid, y_valid)
test_score = svm.score(X_test, y_test)
y_pred = svm.predict(X_test)
y_score = svm.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

sgdc = model_three
sgdc.fit(X_valid, y_valid)
test_score = sgdc.score(X_test, y_test)
y_pred = sgdc.predict(X_test)
y_score = sgdc.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

log = model_four
log.fit(X_valid, y_valid)
test_score = log.score(X_test, y_test)
y_pred = log.predict(X_test)
y_score = log.predict_proba(X_test)[:, 1]
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

In [None]:
# Summary table for Train_Tag: one row per model, pairing the best
# cross-validated score with the score obtained on the test split.
train_train_prediction_df = pd.DataFrame(
    data=list(zip(train_data, predict_data)),
    index=["Train_Tag", "Decision Tree", "SVM", "SGDC", "Log"],
    columns=["Training", "Prediction"],
)


In [None]:
# Stack the Train_Tag rows under the previously combined results.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
comb_model_3 = pd.concat([comb_model_two, train_train_prediction_df])
train_train_prediction_df

# Binary Classification of Nature_Tag - Grid Search

# Train - Validation - Test Split

In [None]:
# Start fresh result accumulators for the Nature_Tag models.
predict_data = []
train_data = []
X = audio_df[feature_set_six]
y = np.array(audio_df["Nature_Tag"])

# 60 % train / 20 % validation / 20 % test. A fixed random_state makes the
# partition, and every score derived from it, reproducible across re-runs.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)


# Grid Search For Decision Trees

In [None]:
# Grid-search PCA dimensionality and decision-tree criterion/depth with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('dec_tree', dec_tree)])
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 20, 50]

parameters = dict(pca__n_components=n_components,
                  dec_tree__criterion=criterion,
                  dec_tree__max_depth=max_depth)

DT_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
DT_GS.fit(X_train, y_train)
scores = DT_GS.cv_results_["mean_test_score"]
scores_std = DT_GS.cv_results_["std_test_score"]

print('Best Criterion:', DT_GS.best_params_['dec_tree__criterion'])
print('Best max_depth:', DT_GS.best_params_['dec_tree__max_depth'])
print('Best Number Of Components:', DT_GS.best_params_['pca__n_components'])
print()
print(DT_GS.best_estimator_.named_steps['dec_tree'])

# Keep the whole tuned pipeline (scaler + PCA + tree), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_one = DT_GS.best_estimator_
GS_TREE__SCORES = scores
GS_TREE__Std = scores_std
train_data.append("-")  # placeholder aligned with the tag-name row of the summary table
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_TREE__SCORES), 3))

# Grid Search For SVM

In [None]:

# Grid-search PCA dimensionality and sigmoid-kernel SVC hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
svm = SVC()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('svm', svm)])

# Search space: every possible number of retained components plus the SVC knobs.
n_components = list(range(1, X.shape[1] + 1))
gamma = [10, 100, 10000, 20000]
kernel = ['sigmoid']
C = [0.001, 0.01, 0.1, 10, 100]

parameters = dict(pca__n_components=n_components,
                  svm__gamma=gamma,
                  svm__C=C,
                  svm__kernel=kernel)

svm_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
svm_GS.fit(X_train, y_train)
scores = svm_GS.cv_results_["mean_test_score"]
scores_std = svm_GS.cv_results_["std_test_score"]

print('Best gamma:', svm_GS.best_params_['svm__gamma'])
print('Best C:', svm_GS.best_params_['svm__C'])
print('Best kernel:', svm_GS.best_params_['svm__kernel'])
print('Best Number Of Components:', svm_GS.best_params_['pca__n_components'])
print()
print(svm_GS.best_estimator_.named_steps['svm'])

# Keep the whole tuned pipeline (scaler + PCA + SVC), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_two = svm_GS.best_estimator_
GS_SVM__SCORES = scores
GS_SVM__Std = scores_std
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_SVM__SCORES), 3))

# Grid Search For SGDC

In [None]:
# Grid-search PCA dimensionality and SGDClassifier hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
sgdc = SGDClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('sgdc', sgdc)])

n_components = list(range(1, X.shape[1] + 1))
# NOTE(review): newer scikit-learn releases spell the logistic loss "log_loss";
# if "log" is rejected by the installed version those candidates score NaN.
# Confirm against the environment this notebook targets.
loss = ["hinge", "log", "modified_huber", "perceptron"]
epsilon = [.01, .001, .1, .0001, .05]
penalty = ["l1", "l2", "elasticnet"]
alpha = [.00001, .0001, .001, .01, .1]

parameters = dict(pca__n_components=n_components,
                  sgdc__loss=loss,
                  sgdc__epsilon=epsilon,
                  sgdc__penalty=penalty,
                  sgdc__alpha=alpha)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

# Each label names the hyper-parameter whose value it prints.
print('Best loss:', clf_GS.best_params_['sgdc__loss'])
print('Best epsilon:', clf_GS.best_params_['sgdc__epsilon'])
print('Best penalty:', clf_GS.best_params_['sgdc__penalty'])
print('Best alpha:', clf_GS.best_params_['sgdc__alpha'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['sgdc'])

# Keep the whole tuned pipeline (scaler + PCA + SGDClassifier), not only its
# last step, so later evaluation uses the same preprocessing that was tuned.
model_three = clf_GS.best_estimator_
GS_SGDC_scores = scores
# nanmax ignores candidates whose fits failed (NaN mean score).
train_data.append(round(np.nanmax(GS_SGDC_scores), 3))

# Grid Search For Logistic Regression

In [None]:
# Grid-search PCA dimensionality and LogisticRegression hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
log = LogisticRegression()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('log', log)])
n_components = list(range(1, X.shape[1] + 1))

# The grid is a full cross product; candidates that end up with a NaN mean
# score (presumably solver/penalty pairs the estimator rejects) are filtered
# out below before the best score is taken.
solver = ["newton-cg", "sag", "saga"]
penalty = ["l1", "l2", "elasticnet", "none"]
C = [100, 10, 1.0, 50, 20, .001, .0001]
max_iter = [10000, 30000, 50000]
multi_class = ["auto", "ovr", "multinomial"]

parameters = dict(pca__n_components=n_components,
                  log__solver=solver,
                  log__penalty=penalty,
                  log__C=C,
                  log__max_iter=max_iter,
                  log__multi_class=multi_class)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

print('Best solver:', clf_GS.best_params_['log__solver'])
print('Best penalty:', clf_GS.best_params_['log__penalty'])
print('Best Iter:', clf_GS.best_params_['log__max_iter'])
print('Best multi class:', clf_GS.best_params_['log__multi_class'])
print('Best C:', clf_GS.best_params_['log__C'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['log'])

# Keep the whole tuned pipeline (scaler + PCA + LogisticRegression), not only
# its last step, so later evaluation uses the same preprocessing that was tuned.
model_four = clf_GS.best_estimator_
GS_LOG__SCORES = scores
GS_LOG__Std = scores_std
# Drop NaN means from failed candidates before taking the best score.
GS_LOG__SCORES = [x for x in GS_LOG__SCORES if not np.isnan(x)]
train_data.append(round(max(GS_LOG__SCORES), 3))

# Binary Classification of Nature_Tag - Prediction

In [None]:
# Refit each tuned model on the validation split and score it on the test split.
# ROC AUC is computed from continuous scores (positive-class probability or
# decision-function margin) so it is comparable with the 'roc_auc' values of the
# grid searches; AUC on hard class predictions collapses the curve to one point.
# NOTE(review): fitting on (X_valid, y_valid) replaces the fit each model carried
# over from its grid search - confirm that training on the validation split
# (rather than X_train) is intended.
dec_tree = model_one
dec_tree.fit(X_valid, y_valid)
test_score = dec_tree.score(X_test, y_test)
y_pred = dec_tree.predict(X_test)
y_score = dec_tree.predict_proba(X_test)[:, 1]
predict_data.append("-")  # placeholder aligned with the tag-name row of the summary table
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

svm = model_two
svm.fit(X_valid, y_valid)
test_score = svm.score(X_test, y_test)
y_pred = svm.predict(X_test)
y_score = svm.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

sgdc = model_three
sgdc.fit(X_valid, y_valid)
test_score = sgdc.score(X_test, y_test)
y_pred = sgdc.predict(X_test)
y_score = sgdc.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

log = model_four
log.fit(X_valid, y_valid)
test_score = log.score(X_test, y_test)
y_pred = log.predict(X_test)
y_score = log.predict_proba(X_test)[:, 1]
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

In [None]:
# Summary table for Nature_Tag: one row per model, pairing the best
# cross-validated score with the score obtained on the test split.
nature_train_prediction_df = pd.DataFrame(
    data=list(zip(train_data, predict_data)),
    index=["Nature_Tag", "Decision Tree", "SVM", "SGDC", "Log"],
    columns=["Training", "Prediction"],
)


In [None]:
# Stack the Nature_Tag rows under the previously combined results.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
comb_model_4 = pd.concat([comb_model_3, nature_train_prediction_df])
nature_train_prediction_df

# Binary Classification of People_Tag - Grid Search

# Train - Validation - Test Split

In [None]:
# Start fresh result accumulators for the People_Tag models.
predict_data = []
train_data = []
X = audio_df[feature_set_six]
y = np.array(audio_df["People_Tag"])

# 60 % train / 20 % validation / 20 % test. A fixed random_state makes the
# partition, and every score derived from it, reproducible across re-runs.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)

# Grid Search For Decision Trees

In [None]:
# Grid-search PCA dimensionality and decision-tree criterion/depth with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('dec_tree', dec_tree)])
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 20, 50]

parameters = dict(pca__n_components=n_components,
                  dec_tree__criterion=criterion,
                  dec_tree__max_depth=max_depth)

DT_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
DT_GS.fit(X_train, y_train)
scores = DT_GS.cv_results_["mean_test_score"]
scores_std = DT_GS.cv_results_["std_test_score"]

print('Best Criterion:', DT_GS.best_params_['dec_tree__criterion'])
print('Best max_depth:', DT_GS.best_params_['dec_tree__max_depth'])
print('Best Number Of Components:', DT_GS.best_params_['pca__n_components'])
print()
print(DT_GS.best_estimator_.named_steps['dec_tree'])

# Keep the whole tuned pipeline (scaler + PCA + tree), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_one = DT_GS.best_estimator_
GS_TREE__SCORES = scores
GS_TREE__Std = scores_std
train_data.append("-")  # placeholder aligned with the tag-name row of the summary table
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_TREE__SCORES), 3))

# Grid Search For SVM

In [None]:
# Grid-search PCA dimensionality and sigmoid-kernel SVC hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
svm = SVC()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('svm', svm)])

# Search space: every possible number of retained components plus the SVC knobs.
n_components = list(range(1, X.shape[1] + 1))
gamma = [10, 100, 10000, 20000]
kernel = ['sigmoid']
C = [0.001, 0.01, 0.1, 10, 100]

parameters = dict(pca__n_components=n_components,
                  svm__gamma=gamma,
                  svm__C=C,
                  svm__kernel=kernel)

svm_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
svm_GS.fit(X_train, y_train)
scores = svm_GS.cv_results_["mean_test_score"]
scores_std = svm_GS.cv_results_["std_test_score"]

print('Best gamma:', svm_GS.best_params_['svm__gamma'])
print('Best C:', svm_GS.best_params_['svm__C'])
print('Best kernel:', svm_GS.best_params_['svm__kernel'])
print('Best Number Of Components:', svm_GS.best_params_['pca__n_components'])
print()
print(svm_GS.best_estimator_.named_steps['svm'])

# Keep the whole tuned pipeline (scaler + PCA + SVC), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_two = svm_GS.best_estimator_
GS_SVM__SCORES = scores
GS_SVM__Std = scores_std
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_SVM__SCORES), 3))

# Grid Search For SGDC

In [None]:
# Grid-search PCA dimensionality and SGDClassifier hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
sgdc = SGDClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('sgdc', sgdc)])

n_components = list(range(1, X.shape[1] + 1))
# NOTE(review): newer scikit-learn releases spell the logistic loss "log_loss";
# if "log" is rejected by the installed version those candidates score NaN.
# Confirm against the environment this notebook targets.
loss = ["hinge", "log", "modified_huber", "perceptron"]
epsilon = [.01, .001, .1, .0001, .05]
penalty = ["l1", "l2", "elasticnet"]
alpha = [.00001, .0001, .001, .01, .1]

parameters = dict(pca__n_components=n_components,
                  sgdc__loss=loss,
                  sgdc__epsilon=epsilon,
                  sgdc__penalty=penalty,
                  sgdc__alpha=alpha)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

# Each label names the hyper-parameter whose value it prints.
print('Best loss:', clf_GS.best_params_['sgdc__loss'])
print('Best epsilon:', clf_GS.best_params_['sgdc__epsilon'])
print('Best penalty:', clf_GS.best_params_['sgdc__penalty'])
print('Best alpha:', clf_GS.best_params_['sgdc__alpha'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['sgdc'])

# Keep the whole tuned pipeline (scaler + PCA + SGDClassifier), not only its
# last step, so later evaluation uses the same preprocessing that was tuned.
model_three = clf_GS.best_estimator_
GS_SGDC_scores = scores
# nanmax ignores candidates whose fits failed (NaN mean score).
train_data.append(round(np.nanmax(GS_SGDC_scores), 3))

# Grid Search For Logistic Regression

In [None]:
# Grid-search PCA dimensionality and LogisticRegression hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
log = LogisticRegression()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('log', log)])
n_components = list(range(1, X.shape[1] + 1))

# The grid is a full cross product; candidates that end up with a NaN mean
# score (presumably solver/penalty pairs the estimator rejects) are filtered
# out below before the best score is taken.
solver = ["newton-cg", "sag", "saga"]
penalty = ["l1", "l2", "elasticnet", "none"]
C = [100, 10, 1.0, 50, 20, .001, .0001]
max_iter = [10000, 30000, 50000]
multi_class = ["auto", "ovr", "multinomial"]

parameters = dict(pca__n_components=n_components,
                  log__solver=solver,
                  log__penalty=penalty,
                  log__C=C,
                  log__max_iter=max_iter,
                  log__multi_class=multi_class)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

print('Best solver:', clf_GS.best_params_['log__solver'])
print('Best penalty:', clf_GS.best_params_['log__penalty'])
print('Best Iter:', clf_GS.best_params_['log__max_iter'])
print('Best multi class:', clf_GS.best_params_['log__multi_class'])
print('Best C:', clf_GS.best_params_['log__C'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['log'])

# Keep the whole tuned pipeline (scaler + PCA + LogisticRegression), not only
# its last step, so later evaluation uses the same preprocessing that was tuned.
model_four = clf_GS.best_estimator_
GS_LOG__SCORES = scores
GS_LOG__Std = scores_std
# Drop NaN means from failed candidates before taking the best score.
GS_LOG__SCORES = [x for x in GS_LOG__SCORES if not np.isnan(x)]
train_data.append(round(max(GS_LOG__SCORES), 3))

# Binary Classification of People_Tag - Prediction

In [None]:
# Refit each tuned model on the validation split and score it on the test split.
# ROC AUC is computed from continuous scores (positive-class probability or
# decision-function margin) so it is comparable with the 'roc_auc' values of the
# grid searches; AUC on hard class predictions collapses the curve to one point.
# NOTE(review): fitting on (X_valid, y_valid) replaces the fit each model carried
# over from its grid search - confirm that training on the validation split
# (rather than X_train) is intended.
dec_tree = model_one
dec_tree.fit(X_valid, y_valid)
test_score = dec_tree.score(X_test, y_test)
y_pred = dec_tree.predict(X_test)
y_score = dec_tree.predict_proba(X_test)[:, 1]
predict_data.append("-")  # placeholder aligned with the tag-name row of the summary table
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

svm = model_two
svm.fit(X_valid, y_valid)
test_score = svm.score(X_test, y_test)
y_pred = svm.predict(X_test)
y_score = svm.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

sgdc = model_three
sgdc.fit(X_valid, y_valid)
test_score = sgdc.score(X_test, y_test)
y_pred = sgdc.predict(X_test)
y_score = sgdc.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

log = model_four
log.fit(X_valid, y_valid)
test_score = log.score(X_test, y_test)
y_pred = log.predict(X_test)
y_score = log.predict_proba(X_test)[:, 1]
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

In [None]:
# Summary table for People_Tag: one row per model, pairing the best
# cross-validated score with the score obtained on the test split.
people_train_prediction_df = pd.DataFrame(
    data=list(zip(train_data, predict_data)),
    index=["People_Tag", "Decision Tree", "SVM", "SGDC", "Log"],
    columns=["Training", "Prediction"],
)

In [None]:
# Stack the People_Tag rows under the previously combined results.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
comb_model_5 = pd.concat([comb_model_4, people_train_prediction_df])
people_train_prediction_df

# Binary Classification of City_Tag - Grid Search

# Train - Validation - Test Split

In [None]:
# Start fresh result accumulators for the City_Tag models.
predict_data = []
train_data = []
X = audio_df[feature_set_six]
y = np.array(audio_df["City_Tag"])

# 60 % train / 20 % validation / 20 % test. A fixed random_state makes the
# partition, and every score derived from it, reproducible across re-runs.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)

# Grid Search For Decision Trees

In [None]:
# Grid-search PCA dimensionality and decision-tree criterion/depth with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('dec_tree', dec_tree)])
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 20, 50]

parameters = dict(pca__n_components=n_components,
                  dec_tree__criterion=criterion,
                  dec_tree__max_depth=max_depth)

DT_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
DT_GS.fit(X_train, y_train)
scores = DT_GS.cv_results_["mean_test_score"]
scores_std = DT_GS.cv_results_["std_test_score"]

print('Best Criterion:', DT_GS.best_params_['dec_tree__criterion'])
print('Best max_depth:', DT_GS.best_params_['dec_tree__max_depth'])
print('Best Number Of Components:', DT_GS.best_params_['pca__n_components'])
print()
print(DT_GS.best_estimator_.named_steps['dec_tree'])

# Keep the whole tuned pipeline (scaler + PCA + tree), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_one = DT_GS.best_estimator_
GS_TREE__SCORES = scores
GS_TREE__Std = scores_std
train_data.append("-")  # placeholder aligned with the tag-name row of the summary table
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_TREE__SCORES), 3))

# Grid Search For SVM

In [None]:
# Grid-search PCA dimensionality and sigmoid-kernel SVC hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
svm = SVC()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('svm', svm)])

# Search space: every possible number of retained components plus the SVC knobs.
n_components = list(range(1, X.shape[1] + 1))
gamma = [10, 100, 10000, 20000]
kernel = ['sigmoid']
C = [0.001, 0.01, 0.1, 10, 100]

parameters = dict(pca__n_components=n_components,
                  svm__gamma=gamma,
                  svm__C=C,
                  svm__kernel=kernel)

svm_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
svm_GS.fit(X_train, y_train)
scores = svm_GS.cv_results_["mean_test_score"]
scores_std = svm_GS.cv_results_["std_test_score"]

print('Best gamma:', svm_GS.best_params_['svm__gamma'])
print('Best C:', svm_GS.best_params_['svm__C'])
print('Best kernel:', svm_GS.best_params_['svm__kernel'])
print('Best Number Of Components:', svm_GS.best_params_['pca__n_components'])
print()
print(svm_GS.best_estimator_.named_steps['svm'])

# Keep the whole tuned pipeline (scaler + PCA + SVC), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_two = svm_GS.best_estimator_
GS_SVM__SCORES = scores
GS_SVM__Std = scores_std
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_SVM__SCORES), 3))

# Grid Search For SGDC

In [None]:
# Grid-search PCA dimensionality and SGDClassifier hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
sgdc = SGDClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('sgdc', sgdc)])

n_components = list(range(1, X.shape[1] + 1))
# NOTE(review): newer scikit-learn releases spell the logistic loss "log_loss";
# if "log" is rejected by the installed version those candidates score NaN.
# Confirm against the environment this notebook targets.
loss = ["hinge", "log", "modified_huber", "perceptron"]
epsilon = [.01, .001, .1, .0001, .05]
penalty = ["l1", "l2", "elasticnet"]
alpha = [.00001, .0001, .001, .01, .1]

parameters = dict(pca__n_components=n_components,
                  sgdc__loss=loss,
                  sgdc__epsilon=epsilon,
                  sgdc__penalty=penalty,
                  sgdc__alpha=alpha)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

# Each label names the hyper-parameter whose value it prints.
print('Best loss:', clf_GS.best_params_['sgdc__loss'])
print('Best epsilon:', clf_GS.best_params_['sgdc__epsilon'])
print('Best penalty:', clf_GS.best_params_['sgdc__penalty'])
print('Best alpha:', clf_GS.best_params_['sgdc__alpha'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['sgdc'])

# Keep the whole tuned pipeline (scaler + PCA + SGDClassifier), not only its
# last step, so later evaluation uses the same preprocessing that was tuned.
model_three = clf_GS.best_estimator_
GS_SGDC_scores = scores
# nanmax ignores candidates whose fits failed (NaN mean score).
train_data.append(round(np.nanmax(GS_SGDC_scores), 3))

# Grid Search For Logistic Regression

In [None]:
# Grid-search PCA dimensionality and LogisticRegression hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
log = LogisticRegression()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('log', log)])
n_components = list(range(1, X.shape[1] + 1))

# The grid is a full cross product; candidates that end up with a NaN mean
# score (presumably solver/penalty pairs the estimator rejects) are filtered
# out below before the best score is taken.
solver = ["newton-cg", "sag", "saga"]
penalty = ["l1", "l2", "elasticnet", "none"]
C = [100, 10, 1.0, 50, 20, .001, .0001]
max_iter = [10000, 30000, 50000]
multi_class = ["auto", "ovr", "multinomial"]

parameters = dict(pca__n_components=n_components,
                  log__solver=solver,
                  log__penalty=penalty,
                  log__C=C,
                  log__max_iter=max_iter,
                  log__multi_class=multi_class)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

print('Best solver:', clf_GS.best_params_['log__solver'])
print('Best penalty:', clf_GS.best_params_['log__penalty'])
print('Best Iter:', clf_GS.best_params_['log__max_iter'])
print('Best multi class:', clf_GS.best_params_['log__multi_class'])
print('Best C:', clf_GS.best_params_['log__C'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['log'])

# Keep the whole tuned pipeline (scaler + PCA + LogisticRegression), not only
# its last step, so later evaluation uses the same preprocessing that was tuned.
model_four = clf_GS.best_estimator_
GS_LOG__SCORES = scores
GS_LOG__Std = scores_std
# Drop NaN means from failed candidates before taking the best score.
GS_LOG__SCORES = [x for x in GS_LOG__SCORES if not np.isnan(x)]
train_data.append(round(max(GS_LOG__SCORES), 3))

# Binary Classification of City_Tag - Prediction

In [None]:
# Refit each tuned model on the validation split and score it on the test split.
# ROC AUC is computed from continuous scores (positive-class probability or
# decision-function margin) so it is comparable with the 'roc_auc' values of the
# grid searches; AUC on hard class predictions collapses the curve to one point.
# NOTE(review): fitting on (X_valid, y_valid) replaces the fit each model carried
# over from its grid search - confirm that training on the validation split
# (rather than X_train) is intended.
dec_tree = model_one
dec_tree.fit(X_valid, y_valid)
test_score = dec_tree.score(X_test, y_test)
y_pred = dec_tree.predict(X_test)
y_score = dec_tree.predict_proba(X_test)[:, 1]
predict_data.append("-")  # placeholder aligned with the tag-name row of the summary table
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

svm = model_two
svm.fit(X_valid, y_valid)
test_score = svm.score(X_test, y_test)
y_pred = svm.predict(X_test)
y_score = svm.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

sgdc = model_three
sgdc.fit(X_valid, y_valid)
test_score = sgdc.score(X_test, y_test)
y_pred = sgdc.predict(X_test)
y_score = sgdc.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

log = model_four
log.fit(X_valid, y_valid)
test_score = log.score(X_test, y_test)
y_pred = log.predict(X_test)
y_score = log.predict_proba(X_test)[:, 1]
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

In [None]:
# Summary table for City_Tag: one row per model, pairing the best
# cross-validated score with the score obtained on the test split.
city_train_prediction_df = pd.DataFrame(
    data=list(zip(train_data, predict_data)),
    index=["City_Tag", "Decision Tree", "SVM", "SGDC", "Log"],
    columns=["Training", "Prediction"],
)

In [None]:
# Stack the City_Tag rows under the previously combined results.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
comb_model_6 = pd.concat([comb_model_5, city_train_prediction_df])
city_train_prediction_df

# Binary Classification of Voice_Tag - Grid Search

# Train - Validation - Test Split

In [None]:
# Start fresh result accumulators for the Voice_Tag models.
predict_data = []
train_data = []
X = audio_df[feature_set_six]
y = np.array(audio_df["Voice_Tag"])

# 60 % train / 20 % validation / 20 % test. A fixed random_state makes the
# partition, and every score derived from it, reproducible across re-runs.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)


# Grid Search For Decision Trees

In [None]:
# Grid-search PCA dimensionality and decision-tree criterion/depth with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('dec_tree', dec_tree)])
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 3, 4, 5, 6, 7, 8, 9, 20, 50]

parameters = dict(pca__n_components=n_components,
                  dec_tree__criterion=criterion,
                  dec_tree__max_depth=max_depth)

DT_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
DT_GS.fit(X_train, y_train)
scores = DT_GS.cv_results_["mean_test_score"]
scores_std = DT_GS.cv_results_["std_test_score"]

print('Best Criterion:', DT_GS.best_params_['dec_tree__criterion'])
print('Best max_depth:', DT_GS.best_params_['dec_tree__max_depth'])
print('Best Number Of Components:', DT_GS.best_params_['pca__n_components'])
print()
print(DT_GS.best_estimator_.named_steps['dec_tree'])

# Keep the whole tuned pipeline (scaler + PCA + tree), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_one = DT_GS.best_estimator_
GS_TREE__SCORES = scores
GS_TREE__Std = scores_std
train_data.append("-")  # placeholder aligned with the tag-name row of the summary table
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_TREE__SCORES), 3))

# Grid Search For SVM

In [None]:
# Grid-search PCA dimensionality and sigmoid-kernel SVC hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
svm = SVC()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('svm', svm)])

# Search space: every possible number of retained components plus the SVC knobs.
n_components = list(range(1, X.shape[1] + 1))
gamma = [10, 100, 10000, 20000]
kernel = ['sigmoid']
C = [0.001, 0.01, 0.1, 10, 100]

parameters = dict(pca__n_components=n_components,
                  svm__gamma=gamma,
                  svm__C=C,
                  svm__kernel=kernel)

svm_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
svm_GS.fit(X_train, y_train)
scores = svm_GS.cv_results_["mean_test_score"]
scores_std = svm_GS.cv_results_["std_test_score"]

print('Best gamma:', svm_GS.best_params_['svm__gamma'])
print('Best C:', svm_GS.best_params_['svm__C'])
print('Best kernel:', svm_GS.best_params_['svm__kernel'])
print('Best Number Of Components:', svm_GS.best_params_['pca__n_components'])
print()
print(svm_GS.best_estimator_.named_steps['svm'])

# Keep the whole tuned pipeline (scaler + PCA + SVC), not only its last step:
# the hyper-parameters were selected on scaled, PCA-projected features, so the
# bare classifier applied to raw features would not be the model that was tuned.
model_two = svm_GS.best_estimator_
GS_SVM__SCORES = scores
GS_SVM__Std = scores_std
# Best mean cross-validated ROC AUC over the grid.
train_data.append(round(max(GS_SVM__SCORES), 3))

# Grid Search For SGDC

In [None]:
# Grid-search PCA dimensionality and SGDClassifier hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
sgdc = SGDClassifier()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('sgdc', sgdc)])

n_components = list(range(1, X.shape[1] + 1))
# NOTE(review): newer scikit-learn releases spell the logistic loss "log_loss";
# if "log" is rejected by the installed version those candidates score NaN.
# Confirm against the environment this notebook targets.
loss = ["hinge", "log", "modified_huber", "perceptron"]
epsilon = [.01, .001, .1, .0001, .05]
penalty = ["l1", "l2", "elasticnet"]
alpha = [.00001, .0001, .001, .01, .1]

parameters = dict(pca__n_components=n_components,
                  sgdc__loss=loss,
                  sgdc__epsilon=epsilon,
                  sgdc__penalty=penalty,
                  sgdc__alpha=alpha)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

# Each label names the hyper-parameter whose value it prints.
print('Best loss:', clf_GS.best_params_['sgdc__loss'])
print('Best epsilon:', clf_GS.best_params_['sgdc__epsilon'])
print('Best penalty:', clf_GS.best_params_['sgdc__penalty'])
print('Best alpha:', clf_GS.best_params_['sgdc__alpha'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['sgdc'])

# Keep the whole tuned pipeline (scaler + PCA + SGDClassifier), not only its
# last step, so later evaluation uses the same preprocessing that was tuned.
model_three = clf_GS.best_estimator_
GS_SGDC_scores = scores
# nanmax ignores candidates whose fits failed (NaN mean score).
train_data.append(round(np.nanmax(GS_SGDC_scores), 3))

# Grid Search For Logistic Regression

In [None]:
# Grid-search PCA dimensionality and LogisticRegression hyper-parameters with
# 5-fold stratified cross-validation, ranking candidates by ROC AUC.
std_slc = StandardScaler()
pca = decomposition.PCA()
log = LogisticRegression()

pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('log', log)])
n_components = list(range(1, X.shape[1] + 1))

# The grid is a full cross product; candidates that end up with a NaN mean
# score (presumably solver/penalty pairs the estimator rejects) are filtered
# out below before the best score is taken.
solver = ["newton-cg", "sag", "saga"]
penalty = ["l1", "l2", "elasticnet", "none"]
C = [100, 10, 1.0, 50, 20, .001, .0001]
max_iter = [10000, 30000, 50000]
multi_class = ["auto", "ovr", "multinomial"]

parameters = dict(pca__n_components=n_components,
                  log__solver=solver,
                  log__penalty=penalty,
                  log__C=C,
                  log__max_iter=max_iter,
                  log__multi_class=multi_class)

clf_GS = GridSearchCV(pipe, parameters, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
clf_GS.fit(X_train, y_train)
scores = clf_GS.cv_results_["mean_test_score"]
scores_std = clf_GS.cv_results_["std_test_score"]

print('Best solver:', clf_GS.best_params_['log__solver'])
print('Best penalty:', clf_GS.best_params_['log__penalty'])
print('Best Iter:', clf_GS.best_params_['log__max_iter'])
print('Best multi class:', clf_GS.best_params_['log__multi_class'])
print('Best C:', clf_GS.best_params_['log__C'])
print('Best Number Of Components:', clf_GS.best_params_['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['log'])

# Keep the whole tuned pipeline (scaler + PCA + LogisticRegression), not only
# its last step, so later evaluation uses the same preprocessing that was tuned.
model_four = clf_GS.best_estimator_
GS_LOG__SCORES = scores
GS_LOG__Std = scores_std
# Drop NaN means from failed candidates before taking the best score.
GS_LOG__SCORES = [x for x in GS_LOG__SCORES if not np.isnan(x)]
train_data.append(round(max(GS_LOG__SCORES), 3))

# Binary Classification of Voice_Tag - Prediction

In [None]:
# Refit each tuned model on the validation split and score it on the test split.
# ROC AUC is computed from continuous scores (positive-class probability or
# decision-function margin) so it is comparable with the 'roc_auc' values of the
# grid searches; AUC on hard class predictions collapses the curve to one point.
# NOTE(review): fitting on (X_valid, y_valid) replaces the fit each model carried
# over from its grid search - confirm that training on the validation split
# (rather than X_train) is intended.
dec_tree = model_one
dec_tree.fit(X_valid, y_valid)
test_score = dec_tree.score(X_test, y_test)
y_pred = dec_tree.predict(X_test)
y_score = dec_tree.predict_proba(X_test)[:, 1]
predict_data.append("-")  # placeholder aligned with the tag-name row of the summary table
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

svm = model_two
svm.fit(X_valid, y_valid)
test_score = svm.score(X_test, y_test)
y_pred = svm.predict(X_test)
y_score = svm.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

sgdc = model_three
sgdc.fit(X_valid, y_valid)
test_score = sgdc.score(X_test, y_test)
y_pred = sgdc.predict(X_test)
y_score = sgdc.decision_function(X_test)
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

log = model_four
log.fit(X_valid, y_valid)
test_score = log.score(X_test, y_test)
y_pred = log.predict(X_test)
y_score = log.predict_proba(X_test)[:, 1]
predict_data.append(round(roc_auc_score(y_test, y_score), 2))

In [None]:
# Summary table for Voice_Tag: one row per model, pairing the best
# cross-validated score with the score obtained on the test split.
voice_train_prediction_df = pd.DataFrame(
    data=list(zip(train_data, predict_data)),
    index=["Voice_Tag", "Decision Tree", "SVM", "SGDC", "Log"],
    columns=["Training", "Prediction"],
)


In [None]:
# Stack the Voice_Tag rows under the previously combined results.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
final_eval_df = pd.concat([comb_model_6, voice_train_prediction_df])
voice_train_prediction_df

In [None]:
# Display the accumulated evaluation table (earlier combined results plus the Voice_Tag rows).
final_eval_df