## Today I'll classify gender from voice recordings using classical ML, along with some data analysis and visualizations

### Importing libraries 

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
#model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.model_selection import GridSearchCV
#preprocess.
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer
warnings.filterwarnings('ignore')
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

### Reading our dataset.

In [None]:
df_train= pd.read_csv('/kaggle/input/voicegender/voice.csv')
df_train.head()

In [None]:
df_train.shape

In [None]:
df_train.columns

### from data description we know that :

meanfreq: mean frequency (in kHz)

sd: standard deviation of frequency

median: median frequency (in kHz)

Q25: first quartile (in kHz)

Q75: third quartile (in kHz)

IQR: interquartile range (in kHz)

skew: skewness (see note in specprop description)

kurt: kurtosis (see note in specprop description)

sp.ent: spectral entropy

sfm: spectral flatness

mode: mode frequency

centroid: frequency centroid (see specprop)

peakf: peak frequency (frequency with highest energy)

meanfun: average of fundamental frequency measured across acoustic signal

minfun: minimum fundamental frequency measured across acoustic signal

maxfun: maximum fundamental frequency measured across acoustic signal

meandom: average of dominant frequency measured across acoustic signal

mindom: minimum of dominant frequency measured across acoustic signal

maxdom: maximum of dominant frequency measured across acoustic signal

dfrange: range of dominant frequency measured across acoustic signal

modindx: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range

label: male or female

### let's check missing values

In [None]:
df_train.isnull().sum()

### no missing values

### performing EDA with plots.

In [None]:
df_train.describe()

### let's check if we have outliers in our data using the 1.5 × IQR rule

In [None]:
def check_outliers(col, data=None):
    """Return the 1.5 * IQR outlier fences for one column.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame holding ``col``. When omitted, the notebook-level ``df`` is
        used, which keeps existing ``check_outliers(col)`` calls working.

    Returns
    -------
    tuple
        ``(q1 - 1.5 * iqr, q3 + 1.5 * iqr)`` where ``q1``/``q3`` are the
        25th and 75th percentiles of the column and ``iqr = q3 - q1``.
    """
    if data is None:
        # Fall back to the notebook-level frame (bound in a later cell).
        data = df
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    margin = 1.5 * iqr
    return (q1 - margin, q3 + margin)

## Univariate analysis

### since all features are numeric, I'll use histogram and box plot 

In [None]:
def plot(col):
    """Show a box plot and a distribution plot of ``df[col]`` side by side.

    Also prints how many values of the column lie strictly between the
    fences returned by ``check_outliers(col)``.

    Parameters
    ----------
    col : str
        Name of a column of the notebook-level ``df``.
    """
    fig, axes = plt.subplots(1, 2)
    sns.boxplot(data=df, x=col, ax=axes[0])
    sns.distplot(a=df[col], ax=axes[1], color='#ff4125')
    fig.set_size_inches(15, 5)
    lower, upper = check_outliers(col)
    # Count the in-range values with a boolean mask instead of building a
    # list that holds the whole column once per surviving row.
    in_range = (df[col] > lower) & (df[col] < upper)
    print("Number of data points remaining if outliers removed : ", int(in_range.sum()))

In [None]:
df=df_train

In [None]:
del df_train

In [None]:
df.columns

In [None]:
plot('meanfreq')

1. from box plot: we have some outliers according to 1.5 IQR rule
2. from distplot: the distribution is not perfectly normal; it has a slight negative skew, which we can normalize.
3. more outliers are on the left of the distribution.

In [None]:
plot('sd')

In [None]:
plot('median')

In [None]:
plot('Q25')

In [None]:
plot('Q75')

In [None]:
plot('skew')

In [None]:
plot('kurt')

In [None]:
plot('sp.ent')

In [None]:
plot('sfm')

In [None]:
plot('meanfun')

In [None]:
sns.countplot(data=df,x='label');

### luckily our data is balanced (equal number of samples in each class)

##  Bivariate Analysis

1. Correlation between data features

### make class labels 0,1 instead of male, female

In [None]:
# Encode the target as integers: male -> 1, female -> 0.
# The explicit cast avoids relying on replace() to downcast the object column,
# and the cell stays safe to re-run (already-encoded values pass through).
df['label'] = df['label'].replace({'male': 1, 'female': 0}).astype(int)

In [None]:
df.head()

In [None]:
# correlation heatmap
cor_mat = df.corr()
# seaborn hides cells where the mask is True. Build a real boolean mask for the
# strictly-upper triangle so the lower triangle and the diagonal stay visible,
# regardless of the correlation values themselves.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True);

1. IQR is the most correlated feature with the target label
2. median and centroid are highly correlated, and so are meanfreq and centroid, so we can delete centroid.
3. other observations: there are both weak and strong correlations; I'll delete some columns in feature engineering.

### plot features against target label to compare distributions.

In [None]:
def plot_against_target(feature):
    """Draw a box plot of ``feature`` for each value of ``label`` in ``df``.

    Parameters
    ----------
    feature : str
        Name of the column of the notebook-level ``df`` to plot on the y axis.
    """
    # sns.factorplot was renamed to catplot and later removed from seaborn.
    sns.catplot(data=df, y=feature, x='label', kind='box')
    fig = plt.gcf()
    fig.set_size_inches(7, 7)

In [None]:
plot_against_target('meanfreq')

### females have higher mean frequency than males.

In [None]:
plot_against_target('sd')

In [None]:
plot_against_target('median')

In [None]:
plot_against_target('Q25')

In [None]:
plot_against_target('IQR')

We already saw the strong correlation between the target label and IQR, and this box plot shows it as well.

In [None]:
plot_against_target('sp.ent')

In [None]:
plot_against_target('meanfun')  

### let's plot a pairplot grid with scatter plots to compare features

In [None]:
g = sns.PairGrid(df[['meanfreq','sd','median','Q25','IQR','sp.ent','sfm','meanfun','label']], hue = "label")
g = g.map(plt.scatter).add_legend()

I'll comment on this section in the feature engineering

### let's remove the outliers

In [None]:
# Filter on the feature columns only. The 0/1 target is not a measurement: if one
# class ever made up more than 75% of the rows its IQR would be 0 and the strict
# comparisons below would discard every row.
# check_outliers reads the notebook-level df, so each column's fences are computed
# on the rows that survived the previous columns.
for col in df.columns.drop('label'):
    l, u = check_outliers(col)
    df = df[(df[col] > l) & (df[col] < u)]

In [None]:
df.shape

### so the number of observations was reduced when we deleted the outliers.

## Feature engineering, My fav part. ^^

so according to previous plots, we'll delete "skew,kurt,mindom,maxdom,centroid"

In [None]:
# Build the feature-engineering frame: a new frame without the columns we decided
# to discard (df itself is left untouched).
dropped_columns = ['skew', 'kurt', 'mindom', 'maxdom', 'centroid']
temp_df = df.drop(columns=dropped_columns)

In [None]:
temp_df.head()

### creating new features

In [None]:
## Pearson mode skewness: (mean - mode) / standard deviation
temp_df['pear_skew'] = temp_df['meanfreq'].sub(temp_df['mode']).div(temp_df['sd'])
temp_df.head(10)

In [None]:
sns.boxplot(data=temp_df,y='pear_skew',x='label');

Replace the median column with the empirical approximation $\text{median} \approx \frac{1}{3}\,(2 \cdot \text{mean} + \text{mode})$.

In [None]:
# median ~ (2 * mean + mode) / 3
# Computed in a single expression so that 'meanfreq' is not overwritten with its
# doubled value, and re-running the cell gives the same result.
temp_df['median'] = (2 * temp_df['meanfreq'] + temp_df['mode']) / 3

In [None]:
sns.boxplot(data=temp_df,y='median',x='label');

### scaling

In [None]:
# Standardise every feature column, and take the labels from the same frame as the
# features so that X and Y are guaranteed to be row-aligned.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into the scaling; consider fitting on x_train only.
scaler = StandardScaler()
features = temp_df.drop('label', axis=1)
scaled_df = scaler.fit_transform(features)
X = scaled_df
Y = temp_df['label'].values

In [None]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable.
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.20,random_state=42)

## ML

### logistic regression

In [None]:
# Fit a logistic regression with default settings and report test accuracy.
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
pred = clf_lr.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred)
print(accuracy_score(y_test, pred))

### KNN

In [None]:
# Fit a k-nearest-neighbours classifier with default settings and report test accuracy.
clf_knn = KNeighborsClassifier()
clf_knn.fit(x_train, y_train)
pred = clf_knn.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred)
print(accuracy_score(y_test, pred))

### SVM classifier

In [None]:
# Fit a support vector classifier with default settings and report test accuracy.
clf_svm = SVC()
clf_svm.fit(x_train, y_train)
pred = clf_svm.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred)
print(accuracy_score(y_test, pred))

### DECISION TREE classifier

In [None]:
# Fit a decision tree and report test accuracy.
# random_state is fixed so the reported score is reproducible between runs.
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt.fit(x_train, y_train)
pred = clf_dt.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred)
print(accuracy_score(y_test, pred))

### random forest classifier

In [None]:
# Fit a random forest and report test accuracy.
# random_state is fixed so the reported score is reproducible between runs.
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(x_train, y_train)
pred = clf_rf.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred)
print(accuracy_score(y_test, pred))

### gradient boosting 

In [None]:
# Fit a gradient boosting classifier and report test accuracy.
# random_state is fixed so the reported score is reproducible between runs.
clf_gb = GradientBoostingClassifier(random_state=42)
clf_gb.fit(x_train, y_train)
pred = clf_gb.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred)
print(accuracy_score(y_test, pred))

## compare the results

In [None]:
# Train every candidate model on the same split and collect its test accuracy.
# Stochastic estimators get a fixed random_state so the comparison is reproducible.
models = [
    LogisticRegression(),
    LinearSVC(random_state=42),
    SVC(kernel='rbf'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=42),
    DecisionTreeClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    GaussianNB(),
]
# Display names, in the same order as `models`.
model_names = ['LogisticRegression', 'LinearSVM', 'rbfSVM', 'KNearestNeighbors', 'RandomForestClassifier',
               'DecisionTree', 'GradientBoostingClassifier', 'GaussianNB']

acc = []
for clf in models:
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    # scikit-learn metrics expect (y_true, y_pred)
    acc.append(accuracy_score(y_test, pred))

# Column-oriented results, turned into a DataFrame in the next cell.
d = {'Modelling Algo': model_names, 'Accuracy': acc}

In [None]:
acc=pd.DataFrame(d)
acc

In [None]:
sns.barplot(y='Modelling Algo',x='Accuracy',data=acc);

## Tuning SVM with grid search cross validation

In [None]:
# Hyperparameter grid for the SVM. gamma only affects the rbf kernel, so the linear
# kernel gets its own sub-grid; this avoids refitting the same linear model once
# per gamma value.
C_values = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_values = [0.001, 0.01, 0.1, 1, 10, 100]
params_dict = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['rbf'], 'C': C_values, 'gamma': gamma_values},
]
# 10-fold cross-validated search on the training split, scored by accuracy.
clf = GridSearchCV(estimator=SVC(), param_grid=params_dict, scoring='accuracy', cv=10)
clf.fit(x_train, y_train)

### show best parameter values to train on

In [None]:
clf.best_params_

In [None]:
# best score
clf.best_score_

In [None]:
# Train on the best parameters found by the grid search, read from the search
# object so this cell cannot drift from the search result.
clf_svm = SVC(**clf.best_params_)
clf_svm.fit(x_train, y_train)
pred = clf_svm.predict(x_test)
# scikit-learn metrics expect (y_true, y_pred)
print(accuracy_score(y_test, pred))

In [None]:
# Test accuracy of the tuned SVM; scikit-learn metrics expect (y_true, y_pred).
print(accuracy_score(y_test, clf_svm.predict(x_test)))

In [None]:
# Test precision of the tuned SVM. The true labels must come first: with the
# arguments swapped, precision_score returns recall instead.
print(precision_score(y_test, clf_svm.predict(x_test)))

## There's not a remarkable difference between the original and the tuned model, but tuning is still good practice.

## to me this notebook is about visuals more than modelling, but I had fun doing it with classic ML and it gave good results without the need for ANNs.