# Gender Classification

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt 

# Directory holding the input dataset CSV (relative path).
path_input='../input/gender-classification-dataset/'
# Directory intended for generated outputs; not referenced by the visible cells.
path_output='./'

# Data Exploration

In [None]:
# Read the dataset into a DataFrame and preview its first rows.
data = pd.read_csv(f"{path_input}gender_classification_v7.csv")
data.head()

In [None]:
# Print column names, non-null counts and dtypes to check for missing values.
data.info()

In [None]:
# Correlate the numeric columns only. Any non-numeric column (presumably the
# `gender` label -- confirm with data.info()) cannot be correlated, and
# pandas >= 2.0 raises on it instead of silently dropping it.
corr = data.select_dtypes(include='number').corr()

# Hide the upper triangle and the diagonal: the matrix is symmetric, so the
# lower triangle already carries all the information.
mask = np.triu(np.ones(corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, vmax=1, square=True, annot=True, cmap='viridis', mask=mask, ax=ax)

ax.set_title('Correlation between different features')

In [None]:
# Forehead width against forehead height, coloured by gender.
sns.lmplot(
    data=data,
    x="forehead_width_cm",
    y="forehead_height_cm",
    hue="gender",
    palette='inferno_r',
    height=7,
)

In [None]:
# Nose width against nose length, coloured by gender.
sns.lmplot(
    data=data,
    x="nose_wide",
    y="nose_long",
    hue="gender",
    palette='inferno_r',
    height=7,
)

In [None]:
# Lip thinness against nose-to-lip distance, coloured by gender.
sns.lmplot(
    data=data,
    x="lips_thin",
    y="distance_nose_to_lip_long",
    hue="gender",
    palette='inferno_r',
    height=7,
)

# Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, classification_report,confusion_matrix

In [None]:
# Split the frame into features (every column except `gender`) and target,
# then hold out 20% of the rows for testing with a fixed seed.
X = data.drop(columns=['gender'])
Y = data['gender']
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=np.random.RandomState(0),
)

### Model selection

Here we tune the hyper-parameters of a few models. For each one we use `GridSearchCV` with 5-fold cross-validation (CV). The helper function below tunes a model and reports the best parameters and the corresponding cross-validated accuracy.

In [None]:
from sklearn.model_selection import GridSearchCV
def tune_model(model,params):
    """Grid-search ``model`` over ``params`` and report the best setting.

    The search runs 5-fold cross-validation scored by accuracy on the
    notebook-level ``x_train`` / ``y_train`` split, using all available cores.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator to tune.
    params : dict or list of dict
        Parameter grid passed to ``GridSearchCV`` as ``param_grid``.

    Returns
    -------
    tuple
        ``(best_params, best_score)``: the best parameter combination and its
        mean cross-validated accuracy. Both are also printed.
    """
    modelCV = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    # Reads the training split from the notebook's global namespace.
    modelCV.fit(x_train, y_train)
    print("best parameters : \n{}\n".format(modelCV.best_params_))
    print("accuracy : \n{}\n".format(modelCV.best_score_))
    # Return the results so later cells can reuse them instead of copying
    # values from the printed output by hand.
    return modelCV.best_params_, modelCV.best_score_

## Logistic Regression

In [None]:
# Each grid pairs a solver with the penalties it supports. The default
# 'lbfgs' solver cannot fit an L1 or elastic-net penalty, and the string
# 'none' is rejected by recent scikit-learn, so crossing every penalty with
# the default solver would make most grid points fail.
# C spans 1e-3 .. 1e3, one candidate per decade.
reglog_params=[
    {"solver": ["lbfgs"],
     "penalty": ['l2'],
     "C": np.logspace(-3,3,7)},
    {"solver": ["liblinear"],
     "penalty": ['l1', 'l2'],
     "C": np.logspace(-3,3,7)},
]

reglog=LogisticRegression()

In [None]:
tune_model(reglog,reglog_params)

After tuning, we refit the model on the training set with the selected parameters.

In [None]:
# Logistic regression with the hyper-parameters chosen above, fitted on the
# training split.
reglog = LogisticRegression(penalty='l2', C=0.1)
reglog.fit(x_train, y_train)

In [None]:
def plot_results(labels,preds,model_name,plot_type='all'):
    """Plot evaluation figures for a set of predictions.

    Parameters
    ----------
    labels : array-like
        Ground-truth class labels.
    preds : array-like
        Predicted class labels, aligned element-wise with ``labels``.
    model_name : str
        Name appended to each figure title.
    plot_type : str, default 'all'
        Selects the figures by substring, checked in this order:
        'confusion' (confusion matrix only), 'report' (classification
        report only), 'all' (both). Any other value draws nothing.
    """
    def plot_confusion_matrix(labels, preds, model_name):
        # Fix the class order explicitly and reuse it for the tick labels, so
        # the axes show class names rather than bare 0/1 indices.
        classes = np.unique(
            np.concatenate([np.asarray(labels).ravel(), np.asarray(preds).ravel()])
        )
        mat = confusion_matrix(labels, preds, labels=classes)
        fig, ax = plt.subplots(figsize=(10, 10))
        # Transposed so that columns are true labels and rows are predictions.
        sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                    xticklabels=classes, yticklabels=classes, ax=ax)
        ax.set_title("Confusion matrix for "+model_name)
        ax.set_xlabel('true label')
        ax.set_ylabel('predicted label')
        plt.show()
        plt.close(fig)

    def plot_classif_report(labels, preds, model_name):
        clf = classification_report(labels, preds, output_dict=True)
        fig, ax = plt.subplots(figsize=(10, 10))
        # Drop the last metric row (support) so the colour scale covers only
        # the ratio-valued metrics; transpose to get one row per class/average.
        sns.heatmap(pd.DataFrame(clf).iloc[:-1, :].T, annot=True, ax=ax)
        ax.set_title("Classification report for "+model_name)
        plt.show()
        plt.close(fig)

    if 'confusion' in plot_type:
        plot_confusion_matrix(labels,preds,model_name)
    elif 'report' in plot_type:
        plot_classif_report(labels,preds,model_name)
    elif 'all' in plot_type:
        plot_confusion_matrix(labels,preds,model_name)
        plot_classif_report(labels,preds,model_name)

In [None]:
# Predict on the held-out split and plot both evaluation figures.
pred = reglog.predict(x_test)
plot_results(labels=y_test, preds=pred, model_name='LogisticRegression', plot_type='all')

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
svc=SVC()
# One grid per kernel: `gamma` only affects the RBF kernel, so crossing it
# with the linear kernel would refit identical linear models three times.
params_svc=[
    {'kernel': ['rbf'],
     'gamma': [1e-3, 1e-4, 1e-5],
     'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'],
     'C': [1, 10, 100, 1000]},
]

tune_model(svc,params_svc)

In [None]:
# Refit the SVC with fixed hyper-parameters, then evaluate on the test split.
svc = SVC(C=10, gamma=0.001, kernel='rbf')
svc.fit(x_train, y_train)
pred = svc.predict(x_test)
plot_results(labels=y_test, preds=pred, model_name='Support Vector Classifier', plot_type='all')

## K Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Search over the neighbourhood sizes k = 1 .. 19.
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': list(range(1, 20))}

tune_model(knn, params_knn)

In [None]:
# Refit k-NN with a fixed neighbourhood size, then evaluate on the test split.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(x_train, y_train)
pred = knn.predict(x_test)
plot_results(labels=y_test, preds=pred, model_name='K Nearest Neighbours', plot_type='all')

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fixed seed so the cross-validated scores are reproducible between runs.
rf=RandomForestClassifier(random_state=0)
# 'auto' was an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3; search over two distinct, supported values instead.
params_rf={'n_estimators': list(range(50,1000,200)),
          'max_depth' : [None, 2, 4, 8],
          'max_features':['sqrt', 'log2']}
tune_model(rf,params_rf)

In [None]:
# 'sqrt' is what the removed 'auto' alias meant for classifiers; the fixed
# seed makes the fitted forest, and so the reported metrics, reproducible.
rf=RandomForestClassifier(max_depth=4, max_features='sqrt', n_estimators=450, random_state=0)
rf.fit(x_train,y_train)
pred=rf.predict(x_test)
plot_results(y_test,pred,'Random Forest Classifier',plot_type='all')

## XGBOOST Classifier

In [None]:
import xgboost as xgb

In [None]:
boost=xgb.XGBClassifier()
# GridSearchCV treats every sequence as a list of discrete candidates, not as
# a (min, max) range. Each tunable parameter therefore lists its two original
# endpoints plus an intermediate value, so the search is not limited to the
# extremes.
params_boost={
    # Fixed settings, kept as single-candidate entries.
    'objective': ['binary:logistic'],
    'eval_metric': ['error'],
    'nthread': [-1],
    # Tuned settings.
    'min_child_weight': [3, 10, 20],
    'gamma': [0, 1, 5],
    'subsample': [0.7, 0.85, 1],
    'colsample_bytree': [0.1, 0.5, 1],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.5],
}
tune_model(boost,params_boost)

In [None]:
# Refit XGBoost with fixed hyper-parameters, then evaluate on the test split.
boost = xgb.XGBClassifier(
    max_depth=3,
    learning_rate=0.5,
    min_child_weight=3,
    gamma=5,
    subsample=1,
    colsample_bytree=0.1,
    eval_metric='error',
    nthread=-1,
)
boost.fit(x_train, y_train)
pred = boost.predict(x_test)
plot_results(labels=y_test, preds=pred, model_name='XGBOOST', plot_type='all')

# Classifier Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners: the five models fitted above, each paired with a short name.
estimators = list(zip(
    ['log', 'svm', 'knn', 'rf', 'xgb'],
    [reglog, svc, knn, rf, boost],
))

# The meta-learner is an XGBoost classifier with the same settings as the
# standalone one; base-learner predictions come from 10-fold CV.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb.XGBClassifier(
        max_depth=3,
        learning_rate=0.5,
        min_child_weight=3,
        gamma=5,
        subsample=1,
        colsample_bytree=0.1,
        eval_metric='error',
        nthread=-1,
    ),
    cv=10,
    n_jobs=-1,
)

stack.fit(x_train, y_train)
pred = stack.predict(x_test)
plot_results(labels=y_test, preds=pred, model_name='Stacking Classifier', plot_type='all')

Best classifier: XGBoost, with 97.7% accuracy.