# Traditional ML

Using the CSV files generated in the "02 VoiceSentiment-Feature_Extraction" notebook, we will build models with "traditional" machine learning algorithms. These models will serve as a baseline when we build our deep learning model.

1. Load our train and test dataset
2. Base Model
3. Define X and Y
4. Need to scale the data before we build our models
5. Building Models
    - 5.1 MLP
    - 5.2 RandomForest
    - 5.3 Logistic Regression
    - 5.4 VotingClassifier
6. Metrics
7. Summary

In [1]:
#import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#boosting
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold, cross_val_score

from datetime import datetime
from sklearn.neural_network import MLPClassifier
import optuna

### 1. Load our train and test dataset

In [2]:
# Read the train/test feature tables written by the feature-extraction notebook.
# Each CSV carries its saved index as an "Unnamed: 0" column, which is dropped
# immediately so it is never mistaken for a feature.
df_train = pd.read_csv("./dataset/train.csv").drop(columns=["Unnamed: 0"])
df_test = pd.read_csv("./dataset/test.csv").drop(columns=["Unnamed: 0"])

In [3]:
# (rows, columns) of the training table
df_train.shape

(10188, 186)

In [4]:
# (rows, columns) of the test table
df_test.shape

(1132, 186)

## 2. Base Model

In [5]:
# Fraction of test rows carrying each emotion label (normalize=True => proportions, not counts)
df_test["label"].value_counts(normalize=True)

disgust      0.143993
fear         0.143993
happy        0.143993
sad          0.143993
angry        0.143993
surprised    0.143993
neutral      0.136042
Name: label, dtype: float64

In [6]:
# Fraction of training rows carrying each emotion label (normalize=True => proportions, not counts)
df_train["label"].value_counts(normalize=True)

fear         0.143993
happy        0.143993
surprised    0.143993
sad          0.143993
angry        0.143993
disgust      0.143993
neutral      0.136042
Name: label, dtype: float64

## Baseline accuracy is around 14% (always predicting the most frequent class would score about 14.4%)

### 3. Define X and Y

In [7]:
# Encode the test-set emotion labels as integer codes. sort=True makes the codes
# follow the sorted order of the label names. The whole (codes, uniques) pair is
# kept in factor_test because the uniques are used later to name the classes.
factor_test = pd.factorize(df_test['label'], sort=True)
test_label_codes = factor_test[0]
df_test['label'] = test_label_codes

In [8]:
# Encode the training-set emotion labels as integer codes. sort=True makes the
# codes follow the sorted order of the label names; the (codes, uniques) pair
# is kept in factor_train.
factor_train = pd.factorize(df_train['label'], sort=True)
train_label_codes = factor_train[0]
df_train['label'] = train_label_codes

In [9]:
# The train/test split was already made in the prior notebook, so the features
# and target are simply read off each frame. "label" is the target and "file"
# is dropped alongside it, leaving every remaining column as a feature.
non_feature_columns = ["label", "file"]

X_train, y_train = df_train.drop(columns=non_feature_columns), df_train["label"]
X_test, y_test = df_test.drop(columns=non_feature_columns), df_test["label"]

### 4. Need to scale the data before we build our models

In [10]:
# Standardise the features with sklearn's StandardScaler. The scaler is fitted
# on the training features only and then applied to both splits, so the test
# set is transformed with the training statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
def build_model(name, model):
    """Fit ``model`` on the scaled training data, record its scores and report test metrics.

    The estimator is trained on the notebook-level ``X_train_scaled`` / ``y_train``.
    Its train and test scores (as returned by ``model.score``) and the fitted
    estimator are stored in the notebook-level ``models`` dict under ``name``
    with the keys ``"train_score"``, ``"test_score"`` and ``"model"``. The
    classification report for the test set is printed and its confusion matrix
    is drawn as a heatmap.

    Parameters
    ----------
    name : str
        Key under which the results are stored in ``models`` (also shown in the
        progress messages).
    model : estimator
        scikit-learn style classifier exposing ``fit``, ``score`` and ``predict``.

    Returns
    -------
    estimator
        The same ``model`` object, now fitted.
    """
    models[name] = {}

    start_time = datetime.now().strftime("%H:%M:%S")
    print(f"Model Fitting.. {name} Current Time = {start_time}")

    model.fit(X_train_scaled, y_train)

    # Score on the SCALED features: the model was fitted on scaled data, so
    # scoring the raw X_train / X_test would feed it inputs on a different
    # scale and report misleading accuracies.
    models[name]["train_score"] = model.score(X_train_scaled, y_train)
    models[name]["test_score"] = model.score(X_test_scaled, y_test)
    models[name]["model"] = model

    y_pred = model.predict(X_test_scaled)

    print(classification_report(y_test, y_pred))

    # factor_test[1] holds the label names in code order; passing the explicit
    # code list keeps the matrix shape aligned with those names even if some
    # class never appears in y_test or y_pred.
    class_names = list(factor_test[1])
    cm = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))
    cm = pd.DataFrame(cm, index=class_names, columns=class_names)

    plt.figure(figsize=(10, 5))
    sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')

    plt.title('Confusion Matrix', size=20)
    plt.xlabel('Predicted Labels', size=14)
    plt.ylabel('Actual Labels', size=14)
    plt.show()

    end_time = datetime.now().strftime("%H:%M:%S")
    print(f"Model Completion.. {name} Current Time = {end_time}")

    return model

### 5. Building Models

***BEST PARAMETERS obtained by using OPTUNA***

In [12]:
# Results registry filled in by build_model: models[name] holds "train_score", "test_score" and "model"
models = {}

In [None]:
## 5.1 Multi-layer Perceptron classifier.

# random_state is fixed so the weight initialisation and SGD shuffling are
# reproducible between runs; 22 is the seed used in the Optuna search
# (objective_mlp) further down in this notebook.
mlp = MLPClassifier(activation='relu',
                    solver='sgd',
                    hidden_layer_sizes=1200,
                    alpha=0.255,
                    batch_size=200,
                    learning_rate='constant',
                    max_iter=10000,
                    random_state=22)

# build_model returns the fitted estimator; keep it, as the other model cells do.
mlp = build_model('mlp', mlp)

Model Fitting.. mlp Current Time = 19:16:59


In [None]:
## 5.2 Random Forest

# random_state is fixed so the bootstrap samples and feature sub-sampling are
# reproducible between runs; 0 is the seed used for the random forest in the
# Optuna search (objective_rf) and in the voting classifier of this notebook.
rf = RandomForestClassifier(n_estimators=650,
                            max_depth=85,
                            criterion='entropy',
                            random_state=0)
rf = build_model('rf', rf)

In [None]:
## 5.3 Logistic Regression

# The 'saga' solver is stochastic, so random_state is fixed to make the fit
# reproducible; 0 is the seed used in the Optuna search (objective_lr) of this
# notebook.
lr = LogisticRegression(multi_class='multinomial',
                        penalty='l2',
                        solver='saga',
                        max_iter=10000,
                        random_state=0)

lr = build_model("lr", lr)

In [None]:
## 5.4 Voting Classifier

# Soft-voting ensemble of the tuned MLP and random forest. Both members use the
# same hyper-parameters as sections 5.1 and 5.2: the forest now also sets
# criterion='entropy' (the only criterion explored in the Optuna search), and
# the MLP is seeded so the ensemble is reproducible between runs.
vote = VotingClassifier([
    ('mlp', MLPClassifier(activation='relu',
                          solver='sgd',
                          hidden_layer_sizes=1200,
                          alpha=0.255,
                          batch_size=200,
                          learning_rate='constant',
                          max_iter=10000,
                          random_state=22)),
    ('rf', RandomForestClassifier(n_estimators=650,
                                  max_depth=85,
                                  criterion='entropy',
                                  random_state=0)),
], voting="soft")

vote = build_model("vc", vote)

In [None]:
# Show everything build_model recorded: per-model train/test scores and the fitted estimators
models

### 6. Metrics

In [None]:
# One row per fitted model: its registry name and the test score stored by build_model.
score_list = [
    {"model": model_name, "test_score": results["test_score"]}
    for model_name, results in models.items()
]

df_score = pd.DataFrame(score_list)
df_score

### Save the best model for future prediction

## 7. SUMMARY

## OPTUNA.
#### The cells below took a long time to run; they were used to search for the best hyperparameters with Optuna.

In [None]:
def objective_mlp(trial):
    """Optuna objective for the MLP: mean 5-fold cross-validated accuracy of one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the activation, solver, hidden layer size, alpha,
        batch size and learning-rate schedule.

    Returns
    -------
    float
        Mean accuracy over the 5 folds, computed on the notebook-level
        ``X_train_scaled`` / ``y_train``.
    """
    params = {
        'activation': trial.suggest_categorical('activation', ['logistic', 'tanh', 'relu']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
        'hidden_layer_sizes': trial.suggest_int('hidden_layer_sizes', 100, 1500),
        # suggest_float replaces suggest_uniform, which Optuna has deprecated.
        'alpha': trial.suggest_float('alpha', 0.001, 0.99),
        'batch_size': trial.suggest_int('batch_size', 150, 300),
        'learning_rate': trial.suggest_categorical('learning_rate', ['adaptive', 'constant', 'invscaling']),
        'max_iter': 10000
        }

    # The constructor already receives every sampled parameter, so no
    # additional set_params call is needed.
    model = MLPClassifier(**params, random_state=22)

    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    return np.mean(fold_scores)

In [None]:
## MLP
# Run 10 Optuna trials of objective_mlp, keeping the parameters with the highest returned score.

study = optuna.create_study(direction='maximize')
study.optimize(objective_mlp, n_trials=10)

In [None]:
def objective_rf(trial):
    """Optuna objective for the random forest: mean cross-validated accuracy of one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth`` and ``criterion``
        from the fixed candidate lists below.

    Returns
    -------
    float
        Mean accuracy over a shuffled, stratified 5-fold split of the
        notebook-level ``X_train_scaled`` / ``y_train``.
    """
    # Seeded so every trial is evaluated on the same folds.
    kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

    params = {
         'n_estimators': trial.suggest_categorical('n_estimators', [650, 700]),
         'max_depth': trial.suggest_categorical('max_depth', [80, 85]),
         'criterion': trial.suggest_categorical('criterion', ['entropy'])
        }

    # The constructor already receives every sampled parameter, so no
    # additional set_params call is needed.
    model = RandomForestClassifier(**params, random_state=0)

    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring='accuracy')
    return np.mean(fold_scores)

In [None]:
## RANDOM FOREST
# Run 10 Optuna trials of objective_rf, keeping the parameters with the highest returned score.

study = optuna.create_study(direction='maximize')
study.optimize(objective_rf, n_trials=10)

In [None]:
def objective_lr(trial):
    """Optuna objective for logistic regression: mean cross-validated accuracy of one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the penalty (``'l1'`` or ``'l2'``); ``multi_class``
        and ``solver`` each have a single candidate value.

    Returns
    -------
    float
        Mean accuracy over a shuffled, stratified 5-fold split of the
        notebook-level ``X_train_scaled`` / ``y_train``.
    """
    # Seeded so every trial is evaluated on the same folds.
    kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

    params = {
         'multi_class': trial.suggest_categorical('multi_class', ['multinomial']),
         'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
         'solver': trial.suggest_categorical('solver', ['saga']),
         'max_iter': 10000
        }

    # The constructor already receives every sampled parameter, so no
    # additional set_params call is needed.
    model = LogisticRegression(**params, random_state=0)

    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring='accuracy')
    return np.mean(fold_scores)

In [None]:
## Logistic Regression
# Run 10 Optuna trials of objective_lr, keeping the parameters with the highest returned score.

study = optuna.create_study(direction='maximize')
study.optimize(objective_lr, n_trials=10)