# 0. Helper Functions

Helper functions that automate model selection and make it easier; their use is explained near the end of the notebook.

In [None]:
def evaluate_model_fn(classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    # the estimator handed in by the caller is fitted directly (no copy is made)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

In [None]:
def choose_best_model_fn(X_train, y_train, X_test, y_test):
    """Print the most accurate classifier among the notebook's tuned models.

    Each candidate is fitted on the training split and scored on the test
    split through ``evaluate_model_fn``. The hyperparameters are the ones
    chosen in the modeling section after the EDA-based feature selection.

    The Random Forest candidate is taken from the module-level
    ``grid_search`` object, so the grid-search cell must have been run before
    this function is called.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        None. The best classifier's name and accuracy are printed.
    """
    # local import: this helper cell sits above the notebook's sklearn imports
    from sklearn.base import clone

    # insertion order matters: on an accuracy tie the earliest entry wins
    candidates = {
        'Logistic Regression': LogisticRegression(random_state=0),
        'Support Vector Machine': SVC(kernel='linear', random_state=0),
        'KNN': KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2),
        'Kernel SVM': SVC(kernel='rbf', random_state=0),
        'Naive Bayes': GaussianNB(),
        'DT': DecisionTreeClassifier(criterion='entropy', random_state=0),
        # clone so that refitting here does not overwrite the fitted
        # estimator held by the shared grid_search object
        'RF': clone(grid_search.best_estimator_),
    }

    model_accuracy_dict = {
        name: evaluate_model_fn(model, X_train, y_train, X_test, y_test)
        for name, model in candidates.items()
    }

    # name of the classifier with the maximum accuracy
    best_classifier = max(model_accuracy_dict, key=model_accuracy_dict.get)
    best_classifier_accuracy = model_accuracy_dict[best_classifier]

    print(f'Best classifier is {best_classifier} with accuracy '
          f'{best_classifier_accuracy:.3f}.')
    

In [None]:
def columns_predictive_power_fn(csv_file_path):
    """Report, for every feature on its own, which model classifies best.

    The CSV is read, its ``'label'`` column is used as the target, and each
    remaining column is used in turn as the only feature: the data is split
    75/25, standardized with statistics from the training part, and passed to
    ``choose_best_model_fn``, which prints the winning model.

    Args:
        csv_file_path: Path of a CSV file containing a ``'label'`` column.

    Returns:
        None. Results are printed per column.

    Raises:
        KeyError: If the file has no ``'label'`` column.
    """
    dataframe = pd.read_csv(csv_file_path)
    # take the labels by name (and remove them from the features in one step)
    # so the target no longer depends on 'label' being the last column
    y = dataframe.pop('label').values

    for col in dataframe.columns:
        # double brackets keep the single feature two-dimensional, the shape
        # the scaler is given below
        X = dataframe[[col]].values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=0)

        sc = StandardScaler()
        # scaling factors are learned from the training set only ...
        X_train = sc.fit_transform(X_train)
        # ... and the same factors are applied to the testing set
        X_test = sc.transform(X_test)

        print(f"Using {col} column")
        choose_best_model_fn(X_train, y_train, X_test, y_test)
        print("")

# 1. Import Libraries & Load Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# for evaluating models
from sklearn.metrics import confusion_matrix, accuracy_score,\
classification_report

import operator
from collections import defaultdict

import scipy.stats as stats
import copy
import warnings

warnings.filterwarnings('ignore')
# graphics in retina format
%config InlineBackend.figure_format = 'retina'

In [None]:
df = pd.read_csv('../input/voicegender/voice.csv')
df.head()

# 2. Detect Missing Data

In [None]:
print(f"Data Shape: {df.shape}")

In [None]:
df.info()

In [None]:
# double-check for NaNs: count the True/False flags of isna() per column
# (Series.value_counts is used because the top-level pd.value_counts
# function is deprecated in recent pandas releases)
df.isna().apply(lambda column: column.value_counts())

In [None]:
# unique labels
df['label'].unique()

## Results

* No missing data.
* 20 columns contain float numbers.
* 1 column contains the category `['male', 'female']`.

**Objective:** Build a classifier that accurately predicts if the voice is a male's or female's voice given all the input features.

# 3. Exploratory Data Analysis (EDA) and Data Preprocessing

In [None]:
# number of unique values per column
df.nunique()

## 3.1 Remove Columns with very low Predictive power

We can observe from the number of unique values per column above that features with **3166** unique values are nearly **100%** unique, since the number of examples is **3168**. Therefore I will remove all these columns from the dataframe, since they will have almost no predictive power in deducing the label (Male or Female).

In [None]:
np.sum(df.nunique() == 3166)

Therefore I will remove 8 columns.

In [None]:
df.nunique() == 3166

In [None]:
# remove the columns judged to carry little predictive power
# NOTE(review): nine names are listed here while the text above mentions
# 8 columns — confirm that 'median' is meant to be dropped as well
df.drop(
    columns=[
        'meanfreq', 'sd', 'skew', 'kurt', 'sp.ent', 'sfm', 'centroid',
        'meanfun', 'median',
    ],
    inplace=True,
)
df.head()

In [None]:
print(f"New Data Shape: {df.shape}")

Now things seem much easier for the classifiers since noise should be reduced.

## 3.2 Remove Columns with Large Overlap between Males and Females

Now I will visualize whether the remaining features show any considerable difference between males and females by plotting continuous probability density curves.

In [None]:
plt.subplots(3, 4, figsize=(15, 15))

# one density panel per feature: the first 11 columns of df, drawn into
# slots 1..11 of the 3x4 grid
for slot in range(1, 12):
    feature = df.columns[slot - 1]
    ax = plt.subplot(3, 4, slot)
    male_values = df.loc[df['label'] == 'male', feature]
    female_values = df.loc[df['label'] == 'female', feature]
    sns.kdeplot(male_values, color='blue', label='Male', ax=ax)
    sns.kdeplot(female_values, color='green', label='Female', ax=ax)

Some features seem to have a big overlap between males and females. Therefore I will drop those features from the dataframe.

In [None]:
df.drop(labels=['Q75', 'minfun', 'maxfun', 'modindx'], axis=1, inplace=True)
df.head()

## 3.3 Split data into training and testing set

In [None]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split
# use the random state for having reproducible results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)

## 3.4 Feature Scaling

In [None]:
# value ranges before feature scaling
df.describe()

It would be better for the models to put all values on the same scale by applying feature scaling. This avoids some features dominating others merely because of their larger scale, even though they may have less predictive power.

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 4. Modeling (Trying different classifiers)

I will do this to understand more the different linear and non-linear classifiers.

In [None]:
df['label'].value_counts()

> **Evaluation:** I will only use accuracy as my evaluation metric since precision, recall and F1-score are more useful in case of having unequal labels as far as I know, and the number of males and females in the dataset are exactly the same. Moreover, I think voice detection is not as critical as medical domains where precision and recall will be more important.

## 1. Linear Classifiers

### 1.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0)
evaluate_model_fn(lr, X_train, y_train, X_test, y_test)

### 1.2 Support Vector Machine

In [None]:
from sklearn.svm import SVC

svc = SVC(kernel='linear', random_state=0)
evaluate_model_fn(svc, X_train, y_train, X_test, y_test)

## 2. Non-linear Classifiers

### 2.1 K-Nearest Neighbors (K-NN)

If K is too small it will be sensitive to noise, and if it is too high it may include majority points from other classes.

> Therefore a rule of thumb is to choose $K < \sqrt{n}$ where $n$ is the number of training examples.

> Also choose K to be an odd number to avoid ties.

In [None]:
np.sqrt(df.shape[0])

In [None]:
def knn_optimize_fn(k, X_train, y_train, X_test, y_test):
    """Plot test accuracy for K = 1 .. k-1 and print the best-scoring K.

    Args:
        k: Exclusive upper bound of the neighbor counts to try.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        None. A line plot is shown and the best K is printed; on ties the
        smallest K is reported.
    """
    candidate_ks = range(1, k)
    # metric='minkowski' with p=2 selects the euclidean distance, as written
    # in the sklearn documentation
    scores = [
        evaluate_model_fn(
            KNeighborsClassifier(n_neighbors=n_neighbors, metric='minkowski',
                                 p=2),
            X_train, y_train, X_test, y_test)
        for n_neighbors in candidate_ks
    ]

    plt.plot(candidate_ks, scores, linestyle='dashed', marker='o',
             markerfacecolor='red')
    plt.xlabel('K')
    plt.ylabel('Accuracy')
    plt.show()

    top_score = max(scores)
    # first K reaching the top score
    best_k = next(n_neighbors
                  for n_neighbors, score in zip(candidate_ks, scores)
                  if score == top_score)
    print(f"Best K = {best_k}")


In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_optimize_fn(25, X_train, y_train, X_test, y_test)

In [None]:
# we will set metric='minkowski' with p=2 for choosing the euclidean distance as
# written in the sklearn documentation 
knn = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=2)
evaluate_model_fn(knn, X_train, y_train, X_test, y_test)

### 2.2 Kernel Support Vector Machine (SVM)

In [None]:
def svm_optimize_fn(X_train, y_train, X_test, y_test):
    """Plot test accuracy per SVC kernel and print the best-scoring kernel.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        None. A line plot is shown and the best kernel is printed; on ties
        the kernel listed first is reported.
    """
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    # one SVC per kernel, all with the same fixed random_state
    scores = [
        evaluate_model_fn(SVC(kernel=kernel_name, random_state=0),
                          X_train, y_train, X_test, y_test)
        for kernel_name in kernels
    ]

    plt.plot(kernels, scores, linestyle='dashed', marker='o',
             markerfacecolor='red')
    plt.xlabel('Kernel')
    plt.ylabel('Accuracy')
    plt.show()

    top_score = max(scores)
    # first kernel reaching the top score
    best_kernel = next(kernel_name
                       for kernel_name, score in zip(kernels, scores)
                       if score == top_score)
    print(f"Best Kernel = {best_kernel}")


In [None]:
from sklearn.svm import SVC

svm_optimize_fn(X_train, y_train, X_test, y_test)

In [None]:
# evaluate the RBF-kernel SVM built on the line below; the earlier version of
# this cell passed the KNN model to evaluate_model_fn by mistake
svm = SVC(kernel='rbf', random_state=0)
evaluate_model_fn(svm, X_train, y_train, X_test, y_test)

### 2.3 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

nv = GaussianNB()
evaluate_model_fn(nv, X_train, y_train, X_test, y_test)

### 2.4 Decision Tree (DT)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# using entropy criterion
dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
evaluate_model_fn(dt, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# using gini criterion
dt = DecisionTreeClassifier(criterion = 'gini', random_state = 0)
evaluate_model_fn(dt, X_train, y_train, X_test, y_test)

Therefore **entropy** is a better criterion in this case.

### 2.5 Random Forest (RF)

I was guided in here by this blog post [Hyperparameter Tuning the Random Forest in Python](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74).

#### 2.5.1 Random Hyperparameter Grid

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]
# number of features to consider at every split
# ('auto' is rejected by current scikit-learn releases and, for classifiers,
# was only an alias of 'sqrt'; 'log2' gives the search a real alternative)
max_features = ['sqrt', 'log2']
# maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 150, num=15)]
# minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# method of selecting samples for training each tree
bootstrap = [True, False]
# create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

pprint(random_grid)

#### 2.5.2 Random Search Training

Use the random grid to search for best hyperparameters.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# create the base model to tune
# NOTE(review): the forest itself gets no random_state, so repeated runs may
# not reproduce the same scores — confirm whether that is intended
rf = RandomForestClassifier()
# random search of parameters using 5 fold cross validation,
# search across 200 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 200, cv = 5, verbose=2, random_state=0,
                               n_jobs = -1)
# fit the random search model
rf_random.fit(X_train, y_train)

# show the best hyperparameter combination found by the search
pprint(rf_random.best_params_)

From these results, we should be able to narrow the range of values for each hyperparameter.

Now we could narrow the range of values for each hyperparameter using the **Random Search** results.

#### 2.5.3 Grid Search with Cross Validation

Using Random Search results we could do **GridSearchCV** which evaluates all the combinations that we define instead of random sampling. We should make another grid based on the best values we obtained by random search for using Grid Search.

In [None]:
from sklearn.model_selection import GridSearchCV

# create the parameter grid based on the random search results
param_grid = {
    'bootstrap': [False],
    'max_depth': [140, 150, 160, 170],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [1900, 2000, 2100]
}

# create a based model
rf = RandomForestClassifier()
# instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

# fit the grid search to the data
grid_search.fit(X_train, y_train)

pprint(grid_search.best_params_)

#### 2.5.4 Implement the RF Algorithm using the best obtained hyperparameters

The result of `rf = grid_search.best_estimator_` I obtained was
```
RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=140, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1900,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
```

In [None]:
rf = grid_search.best_estimator_
evaluate_model_fn(rf, X_train, y_train, X_test, y_test)

So until now the final **Random Forest** model obtained using grid search has achieved the highest accuracy on the test set with `accuracy = 0.9479495268138801`.

# 5. Optimization (Feature Selection)

Now that we have nearly obtained the best models given the features we have now, why not see if these features were the best or not by reselecting features based on their performance on these newly obtained models.

So now instead of feature engineering then modeling, I will somehow inverse the process. Therefore, based on having some models, I will:
1. Get the predictive power of each feature independently by using it alone across all models and see which feature will give the highest accuracy with the best model.
2. Based on their predictive power, I will select features that obtained certain accuracies.
3. Using these features, I will get the best model which resulted in the highest accuracy.
4. Then I will return back to step 2 by decreasing the accuracy threshold a little bit, and therefore adding more features to the model, and moving until step 3 to get the model with highest accuracy.
5. Repeat the above process and see which model has overall obtained the highest accuracy and choose this model as your final model.

## 5.1 Get Each Feature Predictive Power

We can check the predictive power of each feature when it is used alone by calling the `columns_predictive_power_fn` function defined above in the helper functions.

In [None]:
columns_predictive_power_fn('../input/voicegender/voice.csv')

In [None]:
def choose_best_model_given_df_fn(dataframe):
    """Run the model comparison on a dataframe whose last column is the label.

    Prints the dataframe shape, separates features (all columns but the last)
    from labels (the last column), makes a 75/25 train/test split with a
    fixed seed, standardizes the features, and hands the result to
    ``choose_best_model_fn``.

    Args:
        dataframe: Table with the feature columns first and the label last.

    Returns:
        None. The shape and the best model are printed.
    """
    print(f'New Data Shape: {dataframe.shape}')

    features = dataframe.iloc[:, :-1].values
    labels = dataframe.iloc[:, -1].values
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.25, random_state=0)

    scaler = StandardScaler()
    # scaling statistics come from the training rows and are reused as-is on
    # the test rows
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)

    choose_best_model_fn(train_X, train_y, test_X, test_y)

## 5.2 Using Features with Accuracy above 90%

In [None]:
df = pd.read_csv('../input/voicegender/voice.csv')
df = df[['IQR', 'meanfun', 'label']]
choose_best_model_given_df_fn(df)

## 5.3 Using Features with Accuracy above 80%

In [None]:
df = pd.read_csv('../input/voicegender/voice.csv')
df = df[['IQR', 'meanfun', 'sd', 'Q25', 'label']]
choose_best_model_given_df_fn(df)

## 5.4 Using Features with Accuracy above 70%

In [None]:
df = pd.read_csv('../input/voicegender/voice.csv')
df = df[['IQR', 'meanfun', 'sd', 'Q25', 'sp.ent', 'sfm', 'mode', 'label']]
choose_best_model_given_df_fn(df)

## 5.5 Using Features with Accuracy above 60%

In [None]:
df = pd.read_csv('../input/voicegender/voice.csv')
df = df[['IQR', 'meanfun', 'sd', 'Q25', 'sp.ent', 'sfm', 'mode', 'meanfreq',
         'median', 'skew', 'centroid', 'meandom', 'mindom', 'maxdom', 'dfrange',
         'label']]
choose_best_model_given_df_fn(df)

## 5.6 Using all features

In [None]:
df = pd.read_csv('../input/voicegender/voice.csv')
choose_best_model_given_df_fn(df)

# 6. Conclusion

By using the final optimization technique we obtained models with higher accuracies than using the features obtained from the first Exploratory Data Analysis (EDA) which got us an accuracy of 0.94 by using **Random Search** then **Grid Search** in **Random Forest**. Now, after optimization the best models are:
1. Using all features: **RBF Kernel SVM** with accuracy 0.985.
2. Using these features: `['IQR', 'meanfun', 'sd', 'Q25', 'sp.ent', 'sfm', 'mode', 'label']` **RBF Kernel SVM** with accuracy 0.985.

So I would prefer to go with the second option to avoid including unnecessary features that need more computation: both options obtained the same accuracy to the third decimal place, but the second uses fewer features, each of which showed higher predictive power when used in the models independently.