# Introduction

The Dataset

The following acoustic properties of each voice are measured and included within the CSV:

    meanfreq: mean frequency (in kHz)
    sd: standard deviation of frequency
    median: median frequency (in kHz)
    Q25: first quartile (in kHz)
    Q75: third quartile (in kHz)
    IQR: interquartile range (in kHz)
    skew: skewness (see note in specprop description)
    kurt: kurtosis (see note in specprop description)
    sp.ent: spectral entropy
    sfm: spectral flatness
    mode: mode frequency
    centroid: frequency centroid (see specprop)
    peakf: peak frequency (frequency with highest energy)
    meanfun: average of fundamental frequency measured across acoustic signal
    minfun: minimum fundamental frequency measured across acoustic signal
    maxfun: maximum fundamental frequency measured across acoustic signal
    meandom: average of dominant frequency measured across acoustic signal
    mindom: minimum of dominant frequency measured across acoustic signal
    maxdom: maximum of dominant frequency measured across acoustic signal
    dfrange: range of dominant frequency measured across acoustic signal
    modindx: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range
    label: male or female

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
voice = pd.read_csv('../input/voicegender/voice.csv')

In [None]:
voice.info()

In [None]:
voice.head()

In [None]:
voice.describe()

Check for any null values

In [None]:
voice.isnull().sum()

In [None]:
voice.isnull().any(axis=1).sum()

# Data Visualization

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, so a bare Series is no longer interpreted as the `x` variable.
sns.countplot(x='label', data=voice)

In [None]:
voice.groupby('label').mean()

Measuring Cohen's d effect size 

Interactive Visualization - https://rpsychologist.com/d3/cohend/

In [None]:
def cohens_d(feature, data=None, label_col='label', groups=('male', 'female')):
    """Return the absolute Cohen's d effect size of ``feature`` between two groups.

    Cohen's d is the difference of the two group means divided by the pooled
    standard deviation of the groups.

    Parameters
    ----------
    feature : str
        Name of the numeric column to compare.
    data : pandas.DataFrame, optional
        Table holding ``feature`` and ``label_col``. When omitted, the
        notebook-level ``voice`` DataFrame is used.
    label_col : str, default 'label'
        Column that holds the group labels.
    groups : pair of labels, default ('male', 'female')
        The two label values to compare.

    Returns
    -------
    float
        ``abs(mean_1 - mean_2) / pooled_std``.
    """
    if data is None:
        data = voice
    first, second = groups
    # Filter each group once instead of re-evaluating the mask per statistic.
    a = data.loc[data[label_col] == first, feature]
    b = data.loc[data[label_col] == second, feature]
    # count() ignores NaN, matching what mean() and var() do; size would not.
    n1, n2 = a.count(), b.count()
    pooled_var = ((n1 - 1) * a.var() + (n2 - 1) * b.var()) / (n1 + n2 - 2)
    return np.abs((a.mean() - b.mean()) / np.sqrt(pooled_var))

In [None]:
# Absolute Cohen's d for every column except the last one (the label), keyed by column name.
cohens_d_effect = pd.Series({name: cohens_d(name) for name in voice.columns[:-1]})

In [None]:
# Wide figure so the index labels of cohens_d_effect fit along the x axis;
# the line shows the effect-size value stored for each entry.
plt.figure(figsize=(20,5))
plt.plot(cohens_d_effect)

1. Mean fundamental frequency (meanfun) has the highest effect size, about 3.
2. sd, Q25, IQR and spectral entropy (sp.ent) have effect sizes of around 1 to 1.5.

Ratio of female to male mean values

In [None]:
voice.groupby('label').mean().loc['female']/voice.groupby('label').mean().loc['male']

In [None]:
# Compute the per-feature ratio of class means once instead of grouping twice.
label_means = voice.groupby('label').mean()
female_to_male = label_means.loc['female'] / label_means.loc['male']

plt.figure(figsize=(20,5))
plt.plot(female_to_male)
# Reference line at ratio == 1 (equal means). axhline spans the whole axis, so the
# plot no longer depends on the data having exactly 20 feature columns.
plt.axhline(1, linestyle='--', color='C1')
plt.title('Ratio of female to male values')
plt.show()

1. For Q25, meanfun, meandom, mindom, maxdom and dfrange, the ratio of female to male is more than 1.25.
2. For IQR and kurt, the ratio is less than 0.6.

In [None]:
plt.figure(figsize=(15,10))
# Correlate the feature columns only. The last column is the string `label`, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead of
# silently dropping them as older versions did.
sns.heatmap(voice.iloc[:, :-1].corr(), cmap='Spectral', annot=True)
plt.show()

In [None]:
voice.columns

Distribution plots of all features by label

In [None]:
long_voice = pd.melt(voice, id_vars='label', value_vars=voice.columns[:-1], var_name='properties')

In [None]:
# One panel per value of 'properties', wrapped at 5 panels per row and coloured by
# 'label'. Axes are not shared (sharex/sharey=False), so each panel has its own scale.
g = sns.FacetGrid(long_voice, col='properties', col_wrap=5, hue='label', sharex=False, sharey=False, height=4)
# Draw a KDE of 'value' in every panel, title each panel with its property name
# and blank out the axis labels.
g = g.map(sns.kdeplot, 'value').add_legend().set_titles("{col_name}").set_axis_labels('')

Violin plots of each feature

In [None]:
# First 10 columns of the dataset.
df1 = voice.iloc[:, :10]
# Standardise every column (subtract its mean, divide by its standard deviation)
# to put the columns on a comparable scale.
df1 =  (df1-df1.mean())/df1.std()

In [None]:
plt.figure(figsize=(20,6))
sns.violinplot(data=df1)

sd and IQR show clear bimodal distribution.

In [None]:
df1 = pd.concat([df1, voice['label']], axis=1)

In [None]:
df1 = pd.melt(df1, id_vars='label', var_name='properties')

In [None]:
plt.figure(figsize=(20,7))
sns.violinplot(x='properties', y='value', hue='label', split=True, inner='quart', data=df1)

In [None]:
# Columns from position 10 up to, but excluding, the last column.
df2 = voice.iloc[:, 10:-1]
# Standardise every column (subtract its mean, divide by its standard deviation)
# to put the columns on a comparable scale.
df2 =  (df2-df2.mean())/df2.std()

In [None]:
plt.figure(figsize=(20,6))
sns.violinplot(data=df2)

meanfun has clear bimodal distribution

In [None]:
df2 = pd.melt(pd.concat([df2, voice['label']], axis=1), id_vars='label', var_name='properties')

In [None]:
plt.figure(figsize=(20,7))
sns.violinplot(x='properties', y='value', hue='label', split=True, inner='quart', data=df2)

# Preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, precision_recall_curve

In [None]:
X = voice.iloc[:, :-1]
y = voice['label']

In [None]:
def preprocess(X, y, rand):
    """Split ``X``/``y`` into train and test parts and min-max scale the features.

    The scaling statistics (minimum and range) are computed on the training
    part only and then applied to both parts, so the test rows do not
    influence the scaling.

    Parameters
    ----------
    X : feature table (the notebook passes a pandas DataFrame)
    y : target values aligned with ``X``
    rand : value forwarded to ``train_test_split`` as ``random_state``

    Returns
    -------
    X_train, X_test, y_train, y_test
        A 70/30 split. ``X_train`` is scaled to [0, 1]; ``X_test`` is scaled
        with the training statistics and may fall slightly outside that range.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=rand)
    min_ = X_train.min()
    span = X_train.max() - min_
    # A feature that is constant within the training split has zero range, and
    # dividing by it would yield NaN/inf. Divide by 1 instead, so the feature
    # maps to 0 in the training part.
    span = np.where(span == 0, 1, span)
    X_train = (X_train - min_) / span
    X_test = (X_test - min_) / span
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = preprocess(X, y, 53)

In [None]:
def scores(X, y, clf, n):
    """Return the test-set scores of ``clf`` over ``n`` different train/test splits.

    For each random state ``0, 1, ..., n - 1`` the data is split and scaled by
    ``preprocess``, ``clf`` is fitted on the training part, and
    ``clf.score`` on the test part is recorded.

    Parameters
    ----------
    X, y : input features and target values
    clf : estimator exposing ``fit`` and ``score``; the same object is refitted
        on every iteration, so after the call it holds the fit of the last split
    n : number of random states (splits) to evaluate

    Returns
    -------
    numpy.ndarray
        One score per random state, in order.
    """
    # Named differently from the function so the local list does not shadow it.
    split_scores = []
    for seed in range(n):
        X_train, X_test, y_train, y_test = preprocess(X, y, seed)
        clf.fit(X_train, y_train)
        split_scores.append(clf.score(X_test, y_test))
    return np.array(split_scores)

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
score_list = scores(X, y, ExtraTreesClassifier(n_estimators=200), 100)

In [None]:
# Plot the values in score_list against their position; the axis labels indicate
# that position corresponds to the random state used for the split.
plt.figure(figsize=(15,5))
plt.plot(score_list)
plt.xlabel('random state')
plt.ylabel('mean accuracy score')

In [None]:
score_list.mean()

In [None]:
cross_val_score(ExtraTreesClassifier(criterion='entropy', n_estimators=200), X_train, y_train, cv=5).mean()

In [None]:
etc = ExtraTreesClassifier(n_estimators=200, criterion='entropy')

In [None]:
etc.fit(X_train, y_train)

In [None]:
predictions = etc.predict(X_test)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
print(classification_report(y_test, predictions))

In [None]:
fig, axes = plt.subplots(1,1, figsize=(10,5))
# fmt='d' prints the cell counts as plain integers; the default '.2g' format shows
# counts of three or more digits in scientific notation (e.g. 4.6e+02).
sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', ax=axes)
# confusion_matrix puts the true classes on rows and the predicted classes on columns.
axes.set_xlabel('predicted label')
axes.set_ylabel('true label')

Plotting probabilities for each case.

In [None]:
pred_prob = etc.predict_proba(X_test)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test, pred_prob[:, 1], pos_label='male')

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, pred_prob[:, 1], pos_label='male')

In [None]:
fig, axes = plt.subplots(1,2, figsize=(15,5))
# Left panel: ROC curve with the chance diagonal as reference.
axes[0].plot(fpr, tpr)
axes[0].plot([0, 1], [0, 1],'r--')
axes[0].set_xlim([-0.05, 1.0])
axes[0].set_ylim([0.0, 1.05])
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')

# Right panel: precision-recall curve. A no-skill classifier has constant precision
# equal to the share of positive ('male') cases, so the reference line is horizontal
# at that prevalence rather than the ROC-style diagonal.
axes[1].plot(recall, precision)
axes[1].axhline((y_test == 'male').mean(), color='r', linestyle='--')
axes[1].set_xlim([0.0, 1.05])
axes[1].set_ylim([0.0, 1.05])
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')

In [None]:
# Two panels side by side, the right one twice as wide as the left.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5), gridspec_kw={'width_ratios': [1, 2]})
# Rows of pred_prob are reordered by ascending value of column 0 before plotting:
# as a heatmap on the left, as one line per column on the right.
sns.heatmap(pred_prob[np.argsort(pred_prob[:, 0])], ax=ax1)
ax2.plot(pred_prob[np.argsort(pred_prob[:, 0])])
ax2.set_xlabel('test case number')
ax2.set_ylabel('probability')
# NOTE(review): the legend assumes column 0 is 'female' and column 1 is 'male' —
# confirm against the fitted classifier's classes_.
ax2.legend(['female', 'male'])

In [None]:
feature_imp = pd.DataFrame(etc.feature_importances_, voice.iloc[:, :-1].columns, columns=['importance']).sort_values(by='importance', ascending=False)

In [None]:
feature_imp.head()

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
logreg = LogisticRegression(solver='lbfgs', max_iter=500)

In [None]:
cross_val_score(logreg, X_train, y_train, cv=5).mean()

In [None]:
logreg.fit(X_train, y_train)

In [None]:
logreg.score(X_test, y_test)

Grid Search CV

In [None]:
# Search space: every listed solver combined with every listed value of C.
parameters = {
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'C': [0.001, 0.01, 0.1, 1, 5, 10, 25, 100],
}

In [None]:
clf = GridSearchCV(logreg, parameters)

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.best_params_

In [None]:
clf.best_score_

Randomized Search CV

In [None]:
# Search space to sample from: every listed solver, and integer values of C from 1 to 99.
parameters = {
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'C': list(range(1, 100)),
}

In [None]:
clf = RandomizedSearchCV(logreg, param_distributions=parameters, n_iter=50)

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.best_params_

In [None]:
clf.best_score_

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=100)

In [None]:
rfc.fit(X_train, y_train)

In [None]:
predictions = rfc.predict(X_test)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
cross_val_score(rfc, X_train, y_train, cv=5).mean()

In [None]:
print(classification_report(y_test, predictions))

In [None]:
fig, axes = plt.subplots(1,1, figsize=(10,5))
# fmt='d' prints the cell counts as plain integers; the default '.2g' format shows
# counts of three or more digits in scientific notation (e.g. 4.6e+02).
sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', ax=axes)
# confusion_matrix puts the true classes on rows and the predicted classes on columns.
axes.set_xlabel('predicted label')
axes.set_ylabel('true label')

In [None]:
pred_prob = rfc.predict_proba(X_test)

Plotting probabilities for each case.

In [None]:
# Two panels side by side, the right one twice as wide as the left.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5), gridspec_kw={'width_ratios': [1, 2]})
# Rows of pred_prob are reordered by ascending value of column 0 before plotting:
# as a heatmap on the left, as one line per column on the right.
sns.heatmap(pred_prob[np.argsort(pred_prob[:, 0])], ax=ax1)
ax2.plot(pred_prob[np.argsort(pred_prob[:, 0])])
ax2.set_xlabel('test case number')
ax2.set_ylabel('probability')
# NOTE(review): the legend assumes column 0 is 'female' and column 1 is 'male' —
# confirm against the fitted classifier's classes_.
ax2.legend(['female', 'male'])

In [None]:
feature_imp = pd.DataFrame(rfc.feature_importances_, voice.iloc[:, :-1].columns, columns=['importance']).sort_values(by='importance', ascending=False)

In [None]:
feature_imp.head()

# Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC(C=10, gamma='scale')

In [None]:
svc.fit(X_train, y_train)

In [None]:
predictions = svc.predict(X_test)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
cross_val_score(svc, X_train, y_train, cv=5).mean()

In [None]:
pred_prob = svc.decision_function(X_test)

In [None]:
from sklearn.calibration import calibration_curve

In [None]:
# calibration_curve expects scores in [0, 1]. Its `normalize` argument, which used
# to min-max scale the input, has been removed from scikit-learn, so rescale the
# SVC decision values explicitly here.
scaled_scores = (pred_prob - pred_prob.min()) / (pred_prob.max() - pred_prob.min())
# Encode the target as 0/1 with 'male' as the positive class; recent releases do not
# infer a positive class from string labels. Larger decision values favour the second
# class in sorted order, i.e. 'male'.
fop, mpv = calibration_curve((y_test == 'male').astype(int), scaled_scores, n_bins=10)

In [None]:
# Reliability diagram: mpv on the x axis against fop on the y axis.
plt.plot(mpv, fop, '*-')
# Straight line from (0, 0) to (1, 1) as the reference diagonal.
plt.plot([0,1])

By default, SVC does not provide a probability for each prediction, so we use the predict_proba method of CalibratedClassifierCV instead.

In [None]:
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# `method` must be passed by keyword: every argument after the estimator is
# keyword-only in current scikit-learn releases.
ccc = CalibratedClassifierCV(svc, method='sigmoid')

In [None]:
ccc.fit(X_train,  y_train)

In [None]:
pred_prob = ccc.predict_proba(X_test)

In [None]:
# Two panels side by side, the right one twice as wide as the left.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5), gridspec_kw={'width_ratios': [1, 2]})
# Rows of pred_prob are reordered by ascending value of column 0 before plotting:
# as a heatmap on the left, as one line per column on the right.
sns.heatmap(pred_prob[np.argsort(pred_prob[:, 0])], ax=ax1)
ax2.plot(pred_prob[np.argsort(pred_prob[:, 0])])
ax2.set_xlabel('test case number')
ax2.set_ylabel('probability')
# NOTE(review): the legend assumes column 0 is 'female' and column 1 is 'male' —
# confirm against the fitted classifier's classes_.
ax2.legend(['female', 'male'])

In [None]:
# predict_proba already returns probabilities in [0, 1], so they are used as-is;
# min-max rescaling them would distort the calibration curve, and the `normalize`
# argument has been removed from scikit-learn. The target is encoded as 0/1 with
# 'male' as the positive class.
# NOTE(review): column 1 is taken to be the 'male' probability — confirm against ccc.classes_.
fop1, mpv1 = calibration_curve((y_test == 'male').astype(int), ccc.predict_proba(X_test)[:, 1], n_bins=10)

In [None]:
# Every curve gets a label so the legend identifies all of them, not only the
# calibrated one.
plt.plot(mpv1, fop1, '^-', label='calibrated')
plt.plot(mpv, fop, '*-', label='uncalibrated')
# Diagonal from (0, 0) to (1, 1): where a perfectly calibrated model would lie.
plt.plot([0,1], label='perfectly calibrated')
plt.legend()

In [None]:
ccc.score(X_test, y_test)

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier()

In [None]:
cross_val_score(gbc, X_train, y_train, cv=5).mean()

In [None]:
gbc.fit(X_train, y_train)

In [None]:
predictions = gbc.predict(X_test)

In [None]:
accuracy_score(y_test, predictions)

In [None]:
print(classification_report(y_test, predictions))

In [None]:
fig, axes = plt.subplots(1,1, figsize=(10,5))
# fmt='d' prints the cell counts as plain integers; the default '.2g' format shows
# counts of three or more digits in scientific notation (e.g. 4.6e+02).
sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', ax=axes)
# confusion_matrix puts the true classes on rows and the predicted classes on columns.
axes.set_xlabel('predicted label')
axes.set_ylabel('true label')