In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the voice CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/voice.csv')

In [None]:
# Summarise column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isna().values.any()

In [None]:
# Per-column minimum, used to inspect the value range of each column.
df.min()

In [None]:
# Per-column maximum, used together with the minima to inspect the value ranges.
df.max()

#### The per-column minimum and maximum values show that the features are on different scales, so the data needs to be rescaled

In [None]:
# Show the first ten rows to look at the row ordering.
df.head(10)

In [None]:
# Show the last ten rows for comparison with the first ten.
df.tail(10)

#### The rows do not appear to be shuffled, so let's shuffle them — but first, let's encode the label column

In [None]:
# Encode the target as a one-column frame named 'gender':
# 1 where the original label equals "male", 0 for every other value.
label = pd.DataFrame({'gender': np.where(df['label'] == "male", 1, 0)})

In [None]:
# Swap the string 'label' column for the encoded 'gender' column in a single,
# non-mutating expression: if the concat fails, the original df (still holding
# 'label') is left untouched instead of being half-modified by an in-place drop.
df = pd.concat([df.drop(columns = 'label'),label],axis = 1)

#### Shuffling the dataframe

In [None]:
# Shuffle all rows (frac = 1) with a fixed seed so that a fresh
# Restart & Run All reproduces the same row order.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True) ## drop = True, prevents df from creating another column with the old index

#### Verification

In [None]:
# Inspect the last ten rows again, now that the frame has been shuffled.
df.tail(10)

#### Splitting into X and y

In [None]:
# List the column names; the encoded target column is 'gender'.
df.columns

In [None]:
# Encoded target as an array: 1 for "male", 0 otherwise.
df['gender'].values

In [None]:
# Separate the encoded target column (y) from the remaining feature columns (X).
y = df['gender']
X = df.drop(columns = ['gender'])

#### Scaling the values of X

In [None]:
# Min-max scale every feature column into the [0, 1] range.
# NOTE(review): the minima and ranges are computed over all rows, before the
# train/test split performed later in the notebook, so the held-out rows
# influence the scaling.
feature_min = X.min()
feature_range = X.max() - feature_min
X = (X - feature_min)/feature_range

In [None]:
# Check the per-column minima after scaling.
X.min()

In [None]:
# Check the per-column maxima after scaling.
X.max()

In [None]:
# Preview the first ten rows of the scaled features.
X.head(10)

#### Now that every feature is scaled between 0 and 1, we can apply the ML models to the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# (rows, feature columns) of the full feature matrix before splitting.
X.shape

In [None]:
# Hold out 10% of the rows for testing.
# random_state makes the split reproducible across kernel restarts;
# stratify = y keeps the 0/1 class proportions the same in both splits.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.1,random_state = 42,stratify = y)

In [None]:
# (rows, feature columns) of the training split.
X_train.shape

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with the library's default settings.
lr = LogisticRegression()

In [None]:
# Fit the logistic regression model on the training split.
lr.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out test rows.
pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Print the classification report for the logistic regression predictions on the test split.
print(classification_report(y_test,pred))

### KNN Classifier

#### Choosing an appropriate number of neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Imported here so this cell is self-contained, following the notebook's
# habit of importing sklearn helpers next to their first use.
from sklearn.model_selection import cross_val_score

# Mean cross-validated error rate for each candidate k in 1..9.
# The candidates are scored with 5-fold cross-validation on the training split
# only, so the test split is not used to pick k and remains an unbiased
# hold-out for the final classification report.
error_rate = []

for i in range(1,10):
    knn = KNeighborsClassifier(n_neighbors = i)
    cv_accuracy = cross_val_score(knn, X_train, y_train, cv = 5)
    error_rate.append(1 - cv_accuracy.mean())

In [None]:
# Plot the recorded error rate against each candidate k (1 through 9).
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(range(1,10),error_rate,color='green', linestyle='dashed', marker='o',
        markerfacecolor='yellow', markersize=10)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

#### Choosing k = 8 results in a lower error rate

In [None]:
# Final KNN classifier using the chosen k = 8.
knn = KNeighborsClassifier(n_neighbors = 8)

In [None]:
# Fit the KNN classifier on the training split.
knn.fit(X_train,y_train)

In [None]:
# Predict labels for the test rows (overwrites the previous model's `pred`).
pred = knn.predict(X_test)

In [None]:
# Print the classification report for the KNN predictions on the test split.
print(classification_report(y_test,pred))

### Support Vector Machines

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with the library's default settings.
svc = SVC()

In [None]:
# Fit the support vector classifier on the training split.
svc.fit(X_train,y_train)

In [None]:
# Predict labels for the test rows (overwrites the previous model's `pred`).
pred = svc.predict(X_test)

In [None]:
# Print the classification report for the SVC predictions on the test split.
print(classification_report(y_test,pred))

### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with a fixed seed: the tree builder is randomised, so
# without random_state the reported scores can change between runs.
dec = DecisionTreeClassifier(random_state = 42)

In [None]:
# Fit the decision tree on the training split.
dec.fit(X_train,y_train)

In [None]:
# Predict labels for the test rows (overwrites the previous model's `pred`).
pred = dec.predict(X_test)

In [None]:
# Print the classification report for the decision tree predictions on the test split.
print(classification_report(y_test,pred))

### Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

##### Choosing an appropriate number of estimators

In [None]:
# Imported here so this cell is self-contained, following the notebook's
# habit of importing sklearn helpers next to their first use.
from sklearn.model_selection import cross_val_score

# Mean cross-validated error rate for forests of 1..10 trees.
# Scoring uses 5-fold cross-validation on the training split only, so the test
# split is not used for model selection; random_state makes each forest (and
# therefore the curve) reproducible.
error_rate = []

for i in range(1,11):
    rfc = RandomForestClassifier(n_estimators = i, random_state = 42)
    cv_accuracy = cross_val_score(rfc, X_train, y_train, cv = 5)
    error_rate.append(1 - cv_accuracy.mean())

In [None]:
# Plot the recorded error rate against the number of trees (1 through 10).
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(range(1,11),error_rate,color='pink', linestyle='dashed', marker='o',
        markerfacecolor='black', markersize=10)
ax.set_title('Error Rate vs. No.of Estimators')
ax.set_xlabel('No.of Estimators')
ax.set_ylabel('Error Rate')

##### Choosing 5 as the number of estimators

In [None]:
# Final random forest with the chosen 5 trees. random_state fixes the
# bootstrap samples and feature draws so the reported scores are reproducible.
rfc = RandomForestClassifier(n_estimators = 5, random_state = 42)
rfc.fit(X_train,y_train)
# Predict labels for the test rows (overwrites the previous model's `pred`).
pred = rfc.predict(X_test)

In [None]:
# Print the classification report for the random forest predictions on the test split.
print(classification_report(y_test,pred))

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
# XGBoost classifier with the library's default settings.
xgb_model = xgb.XGBClassifier()

In [None]:
# Fit the XGBoost classifier on the training split.
xgb_model.fit(X_train,y_train)

In [None]:
# Predict labels for the test rows (overwrites the previous model's `pred`).
pred = xgb_model.predict(X_test)

In [None]:
# Print the classification report for the default XGBoost predictions on the test split.
print(classification_report(y_test,pred))

#### Optimizing the number-of-estimators parameter

In [None]:
# Candidate values for n_estimators: 10, 20, ..., 120.
nEstimators = list(range(10, 121, 10))

In [None]:
# Imported here so this cell is self-contained, following the notebook's
# habit of importing sklearn helpers next to their first use.
from sklearn.model_selection import cross_val_score

# Mean cross-validated error rate for each candidate n_estimators value.
# Scoring uses 5-fold cross-validation on the training split only, so the test
# split is not used for model selection, and the shared `pred` variable from
# the previous cells is no longer overwritten inside the loop.
error_rate = []
for i in nEstimators:
    xgb_model = xgb.XGBClassifier(n_estimators=i)
    cv_accuracy = cross_val_score(xgb_model, X_train, y_train, cv = 5)
    error_rate.append(1 - cv_accuracy.mean())

In [None]:
# Plot the error rate against the actual n_estimators values that were tried.
# The x-axis uses nEstimators itself (10..120) rather than the positions 1..12,
# so the tick values match the 'No.of Estimators' axis label.
plt.figure(figsize=(10,6))
plt.plot(nEstimators,error_rate,color='black', linestyle='dashed', marker='o',
         markerfacecolor='orange', markersize=10)
plt.title('Error Rate vs. No.of Estimators')
plt.xlabel('No.of Estimators')
plt.ylabel('Error Rate')

#### Let's try 120 estimators

In [None]:
# Rebuild the XGBoost classifier with 120 estimators and fit it on the training split.
xgb_model = xgb.XGBClassifier(n_estimators = 120)
xgb_model.fit(X_train,y_train)

In [None]:
# Predict labels for the test rows with the 120-estimator model.
pred = xgb_model.predict(X_test)

In [None]:
# Print the classification report for the 120-estimator XGBoost predictions on the test split.
print(classification_report(y_test,pred))