# Gender Voice Classification using Support Vector Machine
  Goal: determine whether a speaker is male or female based on the voice dataset.

  https://data.world/ml-research/gender-recognition-by-voice (see this link for more about the dataset)

### importing libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### importing dataset

In [None]:
# Load voice.csv from the relative ../input/voicegender directory into a DataFrame.
dataset = pd.read_csv("../input/voicegender/voice.csv")

# Preview the first rows; as the last expression of the cell it is rendered as output.
dataset.head()


## Exploratory data analysis

### checking the missing values

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
missing_per_column = dataset.isna().sum()
missing_per_column

Note: there are no null values.

### checking the datatype

In [None]:
# Print the DataFrame summary produced by DataFrame.info() to inspect the column datatypes.

dataset.info()

## Countplot 

In [None]:
# Bar chart of how many samples fall into each class of the 'label' column.
# The column is passed by keyword: newer seaborn releases make the plotting
# variables keyword-only, so a positional Series would be taken as `data`
# rather than as the x variable.
sns.countplot(x='label', data=dataset)

In [None]:
# Number of samples per class. Series.value_counts() is used instead of the
# top-level pd.value_counts(), which pandas has deprecated.
count_values = dataset['label'].value_counts()
count_values

### Find the correlation between the features of the dataset

In [None]:
# Pairwise correlation of the numeric columns only. The 'label' column is
# categorical (it is encoded with LabelEncoder later on), and recent pandas
# versions raise instead of silently dropping non-numeric columns in corr().
dataset.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns (the categorical 'label') are excluded first because
# recent pandas versions raise on them instead of dropping them in corr().
plt.figure(figsize=(20, 20))
correlations = dataset.select_dtypes(include='number').corr()
sns.heatmap(correlations, annot=True, cmap='RdYlGn')

### dependent and independent variables

In [None]:
# Features: every column except the last one.
X = dataset.iloc[:, :-1]
# Target: the last column. It is indexed relative to the end (not by a
# hard-coded position) so it always matches the column left out of X.
y = dataset.iloc[:, -1]


### encode the dependent variable using LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class labels present in y, then replace y with their integer codes.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)
y = labelencoder_y.transform(y)
print(y)


### Visualizing the independent variables

In [None]:
import seaborn as sns

# One density panel per feature on a 4 x 5 grid (20 panels in total).
fig, axes = plt.subplots(4, 5, figsize=(15, 15))

# Pair each axes with one of the first 20 columns of the dataset.
for ax, column in zip(axes.ravel(), dataset.columns[:20]):
    ax.set_title(column)

    # y holds the integer-encoded labels; the F/M legend names assume the
    # encoder mapped female -> 0 and male -> 1 (LabelEncoder assigns codes in
    # sorted class order) -- confirm against labelencoder_y.classes_.
    sns.kdeplot(dataset.loc[y == 0, column], color="green", label="F", ax=ax)
    sns.kdeplot(dataset.loc[y == 1, column], color="blue", label="M", ax=ax)

    # The label= arguments only show up once a legend is actually drawn.
    ax.legend()

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()

# Note: at first glance, Q25, IQR and meanfun appear to be the most significant features.


At first glance, the most significant features are Q25, IQR and meanfun. We can build models using all 20 features or using only these 3 distinctive features.

In [None]:
# Print the feature matrix X for inspection.
print(X)

### splitting the dataset into train and test sets

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


### feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### finding the best parameter for kernel SVM


##### Parameter Tuning with GridSearchCV

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

classifier = SVC()

# Two candidate grids: a linear kernel tuned over C only, and an RBF kernel
# tuned over both C and gamma.
c_values = [1, 10, 100, 1000]
gamma_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
]

# 10-fold cross-validated search scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(f"Best Accuracy: {best_accuracy*100:.2f} %")
print("Best Parameters:", best_parameters)

### fitting SVM to the train set

In [None]:
# Train an RBF-kernel SVM on the training set with fixed hyper-parameters.
from sklearn.svm import SVC

svm_settings = {'kernel': 'rbf', 'C': 1, 'gamma': 0.2}
classifier = SVC(**svm_settings)
classifier.fit(X_train, y_train)


### predict the test set

In [None]:

# Predict the class of every sample in the test features with the fitted classifier.
y_pred = classifier.predict(X_test)


### evaluate the prediction

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true vs. predicted test labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test samples that were classified correctly.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy, "\n")

### Validate the performance of SVM Model

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training data.
crs = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# Compute the mean once and reuse it; the original bare `crs.mean()` statement
# had no effect because it was not the last expression of the cell.
mean_cv_accuracy = crs.mean()
print('cross validate mean accuracy {}'.format(mean_cv_accuracy))

# Accuracy of the already-fitted classifier on the train and test sets.
print('Train accuracy {}'.format(classifier.score(X_train, y_train)))
print('Test accuracy  {}'.format(classifier.score(X_test, y_test)))