In [None]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
# Read the voice-gender CSV into a DataFrame and preview its first rows.
voice_csv_path = '../input/voicegender/voice.csv'
voicedata = pd.read_csv(voice_csv_path)
voicedata.head()

In [None]:
# Encode the target column as integers: "female" -> 1, anything else -> 0.
# The dtype guard makes the cell idempotent: once the column is numeric a
# re-run leaves it untouched, instead of collapsing every label to 0
# (an already-encoded 1 never equals the string "female").
if not pd.api.types.is_numeric_dtype(voicedata['label']):
    voicedata['label'] = (voicedata['label'] == 'female').astype('int64')
voicedata.info()

In [None]:
# Display the last rows of the DataFrame.
voicedata.tail()

# Visualizing the correlation among the features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise Pearson correlation between the feature columns, i.e. every column
# except the last one (which is used as the target further down).
feature_corr = voicedata.iloc[:, :-1].astype(float).corr()

# The unused `colormap = plt.cm.viridis` local was removed: the heatmap has
# always been drawn with the "RdBu_r" palette passed explicitly below.
plt.figure(figsize=(13,13))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(feature_corr, vmax=1.0, square=True, cmap="RdBu_r", annot=True)

## Data Standardisation

In [None]:
# Standardise every feature to zero mean and unit variance. StandardScaler
# does NOT bound the values to [-1, 1]; it only centres and rescales them.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split and before cross-validation, so held-out rows influence
# the scaling statistics — consider fitting it inside a Pipeline instead.
from sklearn.preprocessing import StandardScaler
features_raw = voicedata.iloc[:, :-1]   # every column except the last
scaler = StandardScaler()
x = scaler.fit_transform(features_raw)  # fit and transform in a single step
y = voicedata.iloc[:, -1]               # last column is the target

In [None]:
# Disabled alternative to the StandardScaler cell: min-max scaling of the
# features. Nothing in this cell is executed; it is kept for reference only.
#separating features and labels
#x = voicedata.iloc[:, :-1]
#x = (x - np.min(x)) / (np.max(x)-np.min(x))
#print(x[:10])
#y = voicedata.iloc[:,-1]

## Splitting the dataset into training and testing sets to estimate generalisation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

# Accumulators filled by later cells and plotted in the final comparison.
names_all, scores_all = [], []

# K-Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes. Defined once so that the training loop, the
# "best K" lookup and the plot's x-axis cannot drift apart (previously the
# best K was rebuilt as index + 1, which silently depends on the range
# starting at 1, and the range itself was written out twice).
k_values = list(range(1, 15))

score_list = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    score_list.append(knn.score(x_test, y_test))

# Position of the first best score; ties resolve to the smallest K.
# NOTE(review): K is selected on the test split, so the reported best
# accuracy is an optimistic estimate.
best_index = score_list.index(max(score_list))

print('Max accuracy at K = '+str(k_values[best_index]))
print('Max accuracy is ' + str(score_list[best_index]))
plt.figure(figsize=(9,6))
plt.plot(k_values, score_list, color='blue', marker='s', markerfacecolor='red')
plt.xlabel("different k values")
plt.ylabel("score")


In [None]:
# Fit a 5-neighbour classifier on the training split and report its
# accuracy on the test split.
knn_5 = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn_5_test_accuracy = knn_5.score(x_test, y_test)
print("Accuracy for K = 5 is {}".format(knn_5_test_accuracy))

## Confusion Matrix on KNN

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the K = 5 model on the test split.
y_pred = knn_5.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap with whole-number cell labels.
f, ax = plt.subplots(figsize=(7,7))
heatmap_style = dict(annot=True, linewidths=0.5, linecolor="white", fmt=".0f")
sns.heatmap(conf_mat, ax=ax, **heatmap_style)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

## CV on KNN

In [None]:
# 10-fold cross-validated accuracy of a fresh 5-neighbour model on the
# full (already standardised) dataset.
knn_5 = KNeighborsClassifier(n_neighbors=5)
cv_settings = dict(cv=10, scoring='accuracy')
scores_knn = cross_val_score(knn_5, x, y, **cv_settings)
print(scores_knn)

In [None]:
# Average accuracy over the cross-validation folds.
knn_cv_mean = scores_knn.mean()
print(knn_cv_mean)

In [None]:
# Record the KNN result for the final comparison chart. An existing entry is
# updated in place, which keeps the cell idempotent: re-running it no longer
# appends a duplicate "KNN" row to the two parallel lists.
knn_cv_mean = scores_knn.mean()
if "KNN" in names_all:
    scores_all[names_all.index("KNN")] = knn_cv_mean
else:
    names_all.append("KNN")
    scores_all.append(knn_cv_mean)

# Support Vector Machine

## Default Linear kernel

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with otherwise default settings: train on the training
# split, then report accuracy on the test split.
svm_linear = SVC(kernel='linear').fit(x_train, y_train)
linear_test_accuracy = svm_linear.score(x_test, y_test)
print("SVM Score is: {}".format(linear_test_accuracy))

## CV on Linear kernel

In [None]:
# 10-fold cross-validated accuracy of the default linear-kernel SVM.
svm_linear = SVC(kernel='linear')
cv_settings = dict(cv=10, scoring='accuracy')
scores_linear = cross_val_score(svm_linear, x, y, **cv_settings)
print(scores_linear)

In [None]:
# Average accuracy over the cross-validation folds.
linear_cv_mean = scores_linear.mean()
print(linear_cv_mean)

In [None]:
# Start the kernel-comparison lists with the linear kernel's mean CV accuracy.
method_names = ["Linear"]
method_scores = [scores_linear.mean()]

## Default RBF kernel

In [None]:
# RBF-kernel SVM with otherwise default settings, scored on the test split.
svm_rbf = SVC(kernel='rbf').fit(x_train, y_train)
rbf_test_accuracy = svm_rbf.score(x_test, y_test)
print("svm_rbf Score is: {}".format(rbf_test_accuracy))

## CV on default RBF kernel

In [None]:
# 10-fold cross-validated accuracy of the default RBF-kernel SVM.
svc_rbf = SVC(kernel='rbf')
cv_settings = dict(cv=10, scoring='accuracy')
scores_rbf = cross_val_score(svc_rbf, x, y, **cv_settings)
print(scores_rbf)

In [None]:
# Average accuracy over the cross-validation folds.
rbf_cv_mean = scores_rbf.mean()
print(rbf_cv_mean)

In [None]:
# Record the RBF kernel's mean CV accuracy for the kernel comparison chart.
# An existing entry is updated in place so that re-running this cell does not
# append a duplicate "rbf" row to the two parallel lists.
rbf_cv_mean = scores_rbf.mean()
if "rbf" in method_names:
    method_scores[method_names.index("rbf")] = rbf_cv_mean
else:
    method_names.append("rbf")
    method_scores.append(rbf_cv_mean)

## Default Polynomial kernel

In [None]:
# Polynomial-kernel SVM with otherwise default settings, scored on the test
# split. The printed label used to say "svm_rbf" (copy-paste slip from the
# RBF cell); it now names the kernel that is actually being evaluated.
svm_ploy=SVC(kernel='poly')
svm_ploy.fit(x_train,y_train)
print("svm_poly Score is: {}".format(svm_ploy.score(x_test,y_test)))

## CV on Polynomial kernel

In [None]:
# 10-fold cross-validated accuracy of the default polynomial-kernel SVM.
svc_ploy = SVC(kernel='poly')
cv_settings = dict(cv=10, scoring='accuracy')
scores_ploy = cross_val_score(svc_ploy, x, y, **cv_settings)
print(scores_ploy)

In [None]:
# Average accuracy over the cross-validation folds.
poly_cv_mean = scores_ploy.mean()
print(poly_cv_mean)

In [None]:
# Record the polynomial kernel's mean CV accuracy for the kernel comparison
# chart. An existing entry is updated in place so that re-running this cell
# does not append a duplicate "Polynomial" row to the two parallel lists.
poly_cv_mean = scores_ploy.mean()
if "Polynomial" in method_names:
    method_scores[method_names.index("Polynomial")] = poly_cv_mean
else:
    method_names.append("Polynomial")
    method_scores.append(poly_cv_mean)

## Comparing the scores of the three kernels

In [None]:
# Bar chart of the mean CV accuracy obtained with each default kernel.
# NOTE(review): the y-range is hard-coded; a score outside [0.93, 0.975]
# would be cut off by the axis limits.
fig, ax = plt.subplots(figsize=(9,6))
ax.set_ylim([0.93,0.975])
ax.bar(method_names, method_scores, width=0.5)
ax.set_xlabel('Different Kernel with Default Parameter')
ax.set_ylabel('Method Score')

## RBF SVM parameters

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

# Log-spaced candidates: 13 values of C (1e-2 .. 1e10) and 13 values of
# gamma (1e-9 .. 1e3), i.e. 169 combinations in total.
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = {'gamma': gamma_range, 'C': C_range}
#cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Exhaustive search over the grid with 10-fold cross-validation, using SVC()
# with its default kernel.
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=10)
grid.fit(x, y)

# Arrange the mean CV scores as a matrix with one row per C value and one
# column per gamma value, ready for the heatmap drawn further down.
mean_test_scores = grid.cv_results_['mean_test_score']
scores = mean_test_scores.reshape(len(C_range), len(gamma_range))

In [None]:
# Report the best parameter combination found by the grid search together
# with its mean cross-validated score.
best_result = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.4f" % best_result)

## Heatmap Visualization
Draw heatmap of the validation accuracy as a function of gamma and C

In [None]:
from matplotlib.colors import Normalize
class MidpointNormalize(Normalize):
    """Piecewise-linear colour normalisation anchored at a chosen midpoint.

    Data values are mapped so that ``vmin`` -> 0, ``midpoint`` -> 0.5 and
    ``vmax`` -> 1, with linear interpolation in between. Values outside
    ``[vmin, vmax]`` are clamped to 0 or 1 (the default ``np.interp``
    behaviour).

    Parameters
    ----------
    vmin, vmax : float, optional
        Data limits. When left as ``None`` they are filled in from the first
        data passed to the instance.
    midpoint : float, optional
        Data value mapped to 0.5. Defaults to halfway between ``vmin`` and
        ``vmax`` when left as ``None``.
    clip : bool, optional
        Forwarded to ``Normalize``; the mapping implemented here does not
        otherwise use it.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def _anchors(self):
        """Return the three data values that map to 0, 0.5 and 1."""
        midpoint = self.midpoint
        if midpoint is None:
            # No explicit midpoint: fall back to the centre of the range.
            midpoint = 0.5 * (self.vmin + self.vmax)
        return [self.vmin, midpoint, self.vmax]

    def __call__(self, value, clip=None):
        """Map data ``value`` into [0, 1], preserving any mask on the input."""
        # Fill in vmin/vmax from the data when the caller left them unset;
        # np.interp cannot interpolate between None breakpoints.
        self.autoscale_None(value)
        mapped = np.interp(np.ma.getdata(value), self._anchors(), [0, 0.5, 1])
        return np.ma.masked_array(mapped, mask=np.ma.getmask(value))

    def inverse(self, value):
        """Map normalised ``value`` in [0, 1] back to data units.

        Without this override the inherited inverse assumes one straight line
        between ``vmin`` and ``vmax``, which disagrees with the piecewise
        forward mapping whenever the midpoint is off-centre.
        """
        return np.interp(value, [0, 0.5, 1], self._anchors())

# Heatmap of the grid-search accuracies: one row per C, one column per gamma.
fig = plt.figure(figsize=(9, 7))
fig.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)

# Colour scale pinned so that 0.92 sits at the middle of the colormap.
accuracy_norm = MidpointNormalize(vmin=0.2, midpoint=0.92)
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
           norm=accuracy_norm)
plt.title('Validation accuracy')
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()

# Label every row/column with the parameter value it represents.
gamma_positions = np.arange(len(gamma_range))
c_positions = np.arange(len(C_range))
plt.xticks(gamma_positions, gamma_range, rotation=45)
plt.yticks(c_positions, C_range)
plt.show()

In [None]:
# RBF SVM with fixed C and gamma, trained on the training split and scored
# on the test split.
# NOTE(review): the two values are hard-coded; presumably they were copied
# from an earlier grid-search run — confirm they still match its output.
rbf_settings = dict(kernel='rbf', C=100000, gamma=1e-06)
svc_gamma = SVC(**rbf_settings).fit(x_train, y_train)
print("svm_rbf Score is: {}".format(svc_gamma.score(x_test, y_test)))

## CV on RBF Kernel

In [None]:
# Rebuild the tuned RBF SVM. It is fitted on the training split because the
# confusion-matrix cell below predicts with this very object.
svc_gamma = SVC(kernel='rbf', C=100000, gamma=1e-06)
svc_gamma.fit(x_train, y_train)

# 10-fold cross-validated accuracy: per-fold scores first, then their mean.
scores_gamma = cross_val_score(svc_gamma, x, y, scoring='accuracy', cv=10)
for report_line in (scores_gamma, scores_gamma.mean()):
    print(report_line)

## Confusion Matrix on RBF Kernel

In [None]:
# Confusion matrix of the tuned RBF SVM on the test split.
y_pred = svc_gamma.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap with whole-number cell labels.
f, ax = plt.subplots(figsize=(7,7))
heatmap_style = dict(annot=True, linewidths=0.5, linecolor="white", fmt=".0f")
sns.heatmap(conf_mat, ax=ax, **heatmap_style)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

## Linear SVM parameters

In [None]:
# Coarse sweep of the linear SVM's C parameter over the integers 1..19,
# recording the mean 10-fold CV accuracy for each value.
C_values=list(np.arange(1,20))
acc_score=[]
for c in C_values:
    svc = SVC(kernel='linear', C=c)
    # Named `fold_scores` rather than `scores` so this loop no longer
    # overwrites the grid-search score matrix that the heatmap cell reads.
    fold_scores = cross_val_score(svc, x, y, cv=10, scoring='accuracy')
    acc_score.append(fold_scores.mean())
plt.figure(figsize=(9,6))
plt.plot(C_values,acc_score,color='blue', marker='s',markerfacecolor='red')
plt.xticks(np.arange(0,20,2))
plt.xlabel('Value of C for SVC ')
plt.ylabel('Cross-Validated Accuracy')

In [None]:
# Position of the best mean CV accuracy in the most recent C sweep.
best_index = acc_score.index(max(acc_score))
# Look the winning C up in C_values instead of rebuilding it as index + 1:
# the offset is only correct while the sweep is exactly 1, 2, 3, ..., and
# this matches how the fine-grained sweep reports its result.
print('Max accuracy at C = '+str(C_values[best_index]))
print('Max accuracy is ' + str(acc_score[best_index]))

In [None]:
# Fine-grained sweep of the linear SVM's C parameter from 0.1 up to (but not
# including) 2 in steps of 0.1, recording the mean 10-fold CV accuracy.
C_values=list(np.arange(0.1,2,0.1))
acc_score=[]
for c in C_values:
    svc = SVC(kernel='linear', C=c)
    # Named `fold_scores` rather than `scores` so this loop no longer
    # overwrites the grid-search score matrix that the heatmap cell reads.
    fold_scores = cross_val_score(svc, x, y, cv=10, scoring='accuracy')
    acc_score.append(fold_scores.mean())

plt.figure(figsize=(9,6))
plt.plot(C_values,acc_score,color='blue', marker='s',markerfacecolor='red')
plt.xticks(np.arange(0,2,0.2))
plt.xlabel('Value of C for SVC ')
plt.ylabel('Cross-Validated Accuracy')


In [None]:
# Report the C value with the highest mean CV accuracy in the latest sweep
# (the first one wins on ties).
top_accuracy = max(acc_score)
maxAccuracy = acc_score.index(top_accuracy)
best_c = C_values[maxAccuracy]
print('Max accuracy at C = '+str(best_c))
print('Max accuracy is ' + str(acc_score[maxAccuracy]))

In [None]:
# Linear SVM with C = 0.1, trained on the training split and scored on the
# test split.
svc_linear = SVC(kernel='linear', C=0.1).fit(x_train, y_train)
tuned_linear_accuracy = svc_linear.score(x_test, y_test)
print("svm_linear Score is: {}".format(tuned_linear_accuracy))

## CV on Linear Kernel

In [None]:
# Rebuild the C = 0.1 linear SVM. It is fitted on the training split because
# the confusion-matrix cell below predicts with this very object.
svc_linear = SVC(kernel='linear', C=0.1)
svc_linear.fit(x_train, y_train)

# Per-fold accuracy from 10-fold cross-validation on the full dataset.
cv_settings = dict(cv=10, scoring='accuracy')
scores_linear = cross_val_score(svc_linear, x, y, **cv_settings)
print(scores_linear)

In [None]:
# Average accuracy over the cross-validation folds.
tuned_linear_cv_mean = scores_linear.mean()
print(tuned_linear_cv_mean)

## Confusion Matrix on Linear Kernel

In [None]:
# Confusion matrix of the C = 0.1 linear SVM on the test split.
y_pred = svc_linear.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap with whole-number cell labels.
f, ax = plt.subplots(figsize=(7,7))
heatmap_style = dict(annot=True, linewidths=0.5, linecolor="white", fmt=".0f")
sns.heatmap(conf_mat, ax=ax, **heatmap_style)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Record the linear SVM result for the final comparison chart. An existing
# entry is updated in place, which keeps the cell idempotent: re-running it
# no longer appends a duplicate "Linear SVM" row to the two parallel lists.
linear_svm_cv_mean = scores_linear.mean()
if "Linear SVM" in names_all:
    scores_all[names_all.index("Linear SVM")] = linear_svm_cv_mean
else:
    names_all.append("Linear SVM")
    scores_all.append(linear_svm_cv_mean)

# Neural Network

In [None]:
from sklearn import neural_network

# Settings shared by every candidate network; only the width of the single
# hidden layer changes between runs.
mlp_settings = dict(solver='adam', alpha=1e-5, activation='logistic',
                    random_state=17, max_iter=2000)

hidden_layer = range(1,20)
score_list = []
for width in hidden_layer:
    model = neural_network.MLPClassifier(hidden_layer_sizes=width,
                                         **mlp_settings)
    model.fit(x_train, y_train)
    score_list.append(model.score(x_test, y_test))

# First position of the best test accuracy and the width that produced it.
kloc = score_list.index(max(score_list))
best_accuracy, best_width = score_list[kloc], hidden_layer[kloc]
print("Max accuracy is %s occurs at H = %s." % (best_accuracy, best_width))

In [None]:
# Test accuracy as a function of the hidden-layer width.
fig, ax = plt.subplots(figsize=(9,6))
ax.plot(hidden_layer, score_list, color='blue', marker='s', markerfacecolor='red')
ax.set_xticks(np.arange(0,20,2))
ax.set_title('Accuracy vs. Hidden Layer Size')
ax.set_xlabel('Size')
ax.set_ylabel('Accuracy')
plt.show()

## CV on Neural Network

In [None]:
# Network with a single hidden layer of 9 units.
# NOTE(review): the width is hard-coded; presumably it was taken from the
# sweep above — confirm it still matches that cell's output.
nn_settings = dict(solver='adam', alpha=1e-5, hidden_layer_sizes=9,
                   activation='logistic', random_state=17, max_iter=2000)
model_NN = neural_network.MLPClassifier(**nn_settings)
model_NN.fit(x_train, y_train)

# Per-fold accuracy from 10-fold cross-validation on the full dataset.
scores_NN = cross_val_score(model_NN, x, y, scoring='accuracy', cv=10)
print(scores_NN)

In [None]:
# Average accuracy over the cross-validation folds.
nn_cv_mean = scores_NN.mean()
print(nn_cv_mean)

## Confusion Matrix on Neural Network

In [None]:
# Refit the network on the training split, then build the confusion matrix
# of its predictions on the test split.
model_NN.fit(x_train, y_train)
y_pred = model_NN.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap with whole-number cell labels.
f, ax = plt.subplots(figsize=(7,7))
heatmap_style = dict(annot=True, linewidths=0.5, linecolor="white", fmt=".0f")
sns.heatmap(conf_mat, ax=ax, **heatmap_style)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Record the neural-network result for the final comparison chart. An
# existing entry is updated in place, which keeps the cell idempotent:
# re-running it no longer appends a duplicate "Neural Network" row to the
# two parallel lists.
nn_cv_mean = scores_NN.mean()
if "Neural Network" in names_all:
    scores_all[names_all.index("Neural Network")] = nn_cv_mean
else:
    names_all.append("Neural Network")
    scores_all.append(nn_cv_mean)

# CONCLUSION

In [None]:
# Bar chart comparing the mean CV accuracy of every recorded method.
fig, ax = plt.subplots(figsize=(9,6))
ax.set_ylim([0.85,1])
ax.bar(names_all, scores_all, width=0.5)
ax.set_xlabel('Method Name')
ax.set_ylabel('Method Score')