# Using VotingClassifier to build a pipeline

In [1]:
#import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score

from datetime import datetime

import joblib

#for Audio processing
from IPython.display import Audio
import librosa
import librosa.display

### 1. Load our train and test dataset

In [10]:
# Read the raw dataset and discard the 'Unnamed: 0' index column in one step.
df_all = pd.read_csv("./dataset/all_raw.csv").drop(columns='Unnamed: 0')

In [11]:
# Check the (rows, columns) size of the frame after the index column was dropped
df_all.shape

(4528, 3)

In [12]:
# Preview the first rows of the loaded data
df_all.head()

Unnamed: 0,file,emotion,audio
0,./raw_source/RAVDESS/Actor_16/03-01-05-01-02-0...,angry,[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ...
1,./raw_source/RAVDESS/Actor_16/03-01-06-01-02-0...,fear,[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ...
2,./raw_source/RAVDESS/Actor_16/03-01-06-02-01-0...,fear,[ 3.0036153e-05 2.7443759e-05 9.8903274e-07 ...
3,./raw_source/RAVDESS/Actor_16/03-01-05-02-01-0...,angry,[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ...
4,./raw_source/RAVDESS/Actor_16/03-01-07-01-01-0...,disgust,[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ...


## Split the dataset

### 3. Define X and Y

In [5]:
# Convert the categorical labels to integers for both partitions. The encoder
# is fitted on the training labels only and then applied unchanged to the
# test labels, so both frames share the same label-to-integer mapping.
le = LabelEncoder()
le.fit(df_train['label'])
df_train['label'] = le.transform(df_train['label'])
df_test['label'] = le.transform(df_test['label'])

In [6]:
# The train/test split was already made in the prior notebook, so here we only
# separate the feature columns from the target ("label") for each partition.
# "file" is dropped together with the target, so it is not part of the features.
X_train, y_train = df_train.drop(columns=["label", "file"]), df_train["label"]
X_test, y_test = df_test.drop(columns=["label", "file"]), df_test["label"]

### 4. Scale the data before building our models

In [51]:
# Standardise the features with sklearn's StandardScaler. The scaler is fitted
# on the training features only; the same fitted scaler then transforms both
# the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 5. Building Models

***Best parameters obtained using Optuna***

In [60]:
## 5.1 Multi-layer Perceptron classifier.

# Hyperparameters are taken from the best Optuna trial, whose log line read:
#   Trial 8 finished with value: 0.8530609147840515 and
#   parameters: {'activation': 'logistic', 'solver': 'lbfgs', 'hidden_layer_sizes': 880,
#                'alpha': 0.3768510253499107, 'batch_size': 300, 'learning_rate': 'invscaling'}.
#   Best is trial 8 with value: 0.8530609147840515.
# Each trailing comment keeps the alternative value noted for that setting.
# NOTE(review): the scikit-learn docs describe batch_size and learning_rate as
# settings for the stochastic solvers, so with solver='lbfgs' they look inert -- confirm.
mlp = MLPClassifier(
    activation='logistic',       # 'relu'
    solver='lbfgs',              # 'sgd'
    hidden_layer_sizes=880,      # 1200
    alpha=0.376851,              # 0.255
    batch_size=300,              # 200
    learning_rate='invscaling',  # 'constant'
    max_iter=10000,
    random_state=0,
)

# Print the wall-clock time right before fitting starts ...
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print(f"MLP Model Fit.. Current Time = {current_time}")

mlp.fit(X_train_scaled, y_train)

# ... and again once fitting has returned, so the duration can be read off.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print(f"MLP Model Completion.. Current Time = {current_time}")



MLP Model Fit.. Current Time = 18:51:07
MLP Model Completion.. Current Time = 18:55:05


### Save the best model (the Voting Classifier) for future prediction

In [20]:
# Persist the objects kept for future prediction; each one is written with
# joblib compression enabled.

# the voting classifier
joblib.dump(value=models['vc']['model'], filename='./model/cv_ser_model.bin', compress=True)

# the label encoder
joblib.dump(value=le, filename='./model/cv_label_encoder.bin', compress=True)

# the scaler (kept as the last expression, so its result is what the cell displays)
joblib.dump(value=scaler, filename='./model/cv_std_scaler.bin', compress=True)

['./model/cv_std_scaler.bin']