In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the voice-gender CSV. `data` is the parsed frame and `data_set` is a
# DataFrame built from it; both names are used by the cells that follow.
data = pd.read_csv(filepath_or_buffer='../input/voicegender/voice.csv')
data_set = pd.DataFrame(data=data)
data_set.head()


* meanfreq: mean frequency of the voice audio of the person (in kHz)
* sd: standard deviation of the frequency of the voice audio
* median: median frequency of the voice audio (in kHz)
* Q25: first quartile (in kHz)
* Q75: third quartile (in kHz)
* IQR: interquartile range (in kHz)
* skew: Skewness refers to a distortion or asymmetry that deviates from the symmetrical bell curve, or normal distribution
* kurt: Kurtosis is a statistical measure that defines how heavily the tails of a distribution differ from the tails of a normal distribution.
* sp.ent: spectral entropy
* sfm: spectral flatness
* mode: mode frequency
* centroid: frequency centroid (see specprop)
* meanfun: mean fundamental frequency measured across acoustic signal
* minfun: minimum fundamental frequency measured across acoustic signal
* maxfun: maximum fundamental frequency measured across acoustic signal
* meandom: mean of dominant frequency measured across acoustic signal
* mindom: minimum of dominant frequency measured across acoustic signal
* maxdom: maximum of dominant frequency measured across acoustic signal
* dfrange: range of dominant frequency measured across acoustic signal
* modindx: modulation index

In [None]:
# Count the missing values in each column of the loaded frame.
data.isna().sum()

In [None]:
# Display pandas' summary statistics (`describe`) for data_set.
data_set.describe()

In [None]:
import seaborn
import matplotlib.pyplot as plt

# Annotated correlation heatmap of the columns in `data`.
# `data` still contains the string `label` column, and pandas >= 2.0 raises when
# `DataFrame.corr()` meets a non-numeric column, so only the numeric columns
# are passed to `corr()`.
plt.figure(figsize=(21,21))
seaborn.heatmap(data.select_dtypes(include='number').corr(),annot=True,cmap='viridis',linewidth=0.5)

In [None]:
!pip install mglearn
import mglearn
gen = pd.read_csv('../input/voicegender/voice.csv')
gen_data = pd.DataFrame(gen)
gen_data.head()
male = gen.loc[gen['label']=='male']
female = gen.loc[gen['label']=='female']
fig, axes = plt.subplots(10, 2, figsize=(10,20))
ax = axes.ravel()
for i in range(20):
    ax[i].hist(male.iloc[:,i], bins=20, color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(female.iloc[:, i], bins=20, color=mglearn.cm3(2), alpha=.5)
    ax[i].set_title(list(male)[i])
    ax[i].set_yticks(())
    ax[i].set_xlabel("Feature magnitude")
    ax[i].set_ylabel("Frequency")
    ax[i].legend(["male", "female"], loc="best")

fig.tight_layout()

On analysing the above plots, we can conclude that some features can be dropped, either because they are highly correlated with other features or because their distributions differ little between the male and female samples. These features are sfm, kurt, meandom, meanfreq, dfrange and modindx.

In [None]:
# Build a reduced frame without the six features listed below.
# `drop` returns a new DataFrame, so `data_set` itself is left unchanged.
dropped_features = ['sfm', 'kurt', 'meandom', 'meanfreq', 'dfrange', 'modindx']
new_data_set = data_set.drop(columns=dropped_features)
new_data_set.head()

In [None]:

# Annotated correlation heatmap of the reduced feature set.
# Only numeric columns are passed to `corr()`: pandas >= 2.0 raises when
# `DataFrame.corr()` meets a non-numeric column such as a string label.
plt.figure(figsize=(16,16))
seaborn.heatmap(new_data_set.select_dtypes(include='number').corr(),annot=True,cmap='viridis',linewidth=0.5)

Now we can see that relatively fewer features have high correlations.

In [None]:
from sklearn.model_selection import train_test_split

# The last column of new_data_set is used as the target; every column before it is a feature.
features = new_data_set.iloc[:, :-1].values
labels = new_data_set.iloc[:, -1].values

# A fixed random_state makes the 80/20 split, and therefore every accuracy
# printed by the later cells, reproducible across runs. stratify keeps the
# class proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with an RBF kernel, fitted on the training split.
classifier1 = SVC(kernel='rbf')
classifier1.fit(X_train, y_train)

# Score both splits first, then print the report.
train_accuracy = classifier1.score(X_train, y_train)
test_accuracy = classifier1.score(X_test, y_test)
print('Using SVM classifier:')
print('Accuracy of training set: {:.2f}'.format(train_accuracy))
print('Accuracy of test set: {:.2f}'.format(test_accuracy))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier: 5 neighbours, Minkowski metric with p=2.
classifier2 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier2.fit(X_train, y_train)

# Emit the three report lines with a single print call, one per line.
print(
    'Using K nearest Classifier:',
    'Accuracy of training set: {:.2f}'.format(classifier2.score(X_train, y_train)),
    'Accuracy of test set: {:.2f}'.format(classifier2.score(X_test, y_test)),
    sep='\n',
)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, fitted on the training split.
classifier3 = SVC(kernel='linear')
classifier3.fit(X_train, y_train)

print('Using SVM classifier:')
# Report accuracy for the training split first, then the test split.
for split_name, X_part, y_part in (('training', X_train, y_train), ('test', X_test, y_test)):
    print('Accuracy of {} set: {:.2f}'.format(split_name, classifier3.score(X_part, y_part)))

In [None]:
from sklearn.svm import SVC

# RBF-kernel support-vector classifier (the same configuration as classifier1).
# `fit` returns the estimator itself, so construction and fitting are chained.
classifier4 = SVC(kernel='rbf').fit(X_train, y_train)

print('Using kernel SVM classifier:')
print('Accuracy of training set: %.2f' % classifier4.score(X_train, y_train))
print('Accuracy of test set: %.2f' % classifier4.score(X_test, y_test))

In [None]:

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with default settings, fitted on the training split.
classifier5 = GaussianNB()
classifier5.fit(X_train, y_train)

print('Using Naive Bayes classifier:')
print(f'Accuracy of training set: {classifier5.score(X_train, y_train):.2f}')
print(f'Accuracy of test set: {classifier5.score(X_test, y_test):.2f}')

In [None]:

from sklearn.tree import DecisionTreeClassifier

# Decision tree that splits on information gain (entropy).
# A fixed random_state is passed because an unseeded tree can break ties between
# equally good splits differently on each run, which changes the reported accuracy.
classifier6 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier6.fit(X_train, y_train)
print('Using Decision tree classifier:')
print('Accuracy of training set: {:.2f}'.format(classifier6.score(X_train,y_train)))
print('Accuracy of test set: {:.2f}'.format(classifier6.score(X_test,y_test)))

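**We can clearly see that the highest accuracy on the training set, 97%, is attributed to the RandomForest classifier, so we can say this model suits the data best.**

**Note: no RandomForest classifier is trained in the cells above, and a model should be chosen by its test-set accuracy rather than its training-set accuracy, so this conclusion should be re-checked against the printed results.**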