### A very simple model to predict the gender from an image. 

# Initial setup
###  Import
Import the required libraries.    
For this particular notebook, we are using python 3.8.3, sklearn 0.23.1, and cv2 4.3.0

In [1]:
import numpy as np 
import cv2
from matplotlib import pyplot as plt
import os
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix


### Data reading and processing
1. Open the images from the folder in gray scale
2. Resize it to 100 * 100
3. Apply adaptive histogram equalization. Refer https://docs.opencv.org/master/d5/daf/tutorial_py_histogram_equalization.html
4. Return an array of images

In [2]:
def loadImages(folder):
    images = []
    for filename in os.listdir(folder):
        img = cv2.imread(os.path.join(folder, filename),0)
        img = cv2.resize(img,(100,100))
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        img = clahe.apply(img)
       # img = cv2.equalizeHist(img)
        if img is not None:
            #print(img.shape)
            images.append(img)
    return images

### Data visualization 

In [3]:
def showImages(images,num):
    fig = plt.figure()
    for n in range(num):
        ax = fig.add_subplot(1, num, n+1)
        imgplot = plt.imshow(images[n],cmap='gray', vmin=0, vmax=255)
        

In [61]:
def showImagesFromIndexes(images,indexes):
    fig = plt.figure()
    num =63
    i = 1
    rows = 8
    cols = 8
    axes=[]
    fig= plt.figure(figsize = (20,20))
    
    a = 0
    for idx in indexes:
        axes.append( fig.add_subplot(rows, cols, a+1) )
        a = a + 1
        imgplot = plt.imshow(images[idx],cmap='gray', vmin=0, vmax=255)
    fig.tight_layout()    
    plt.show()

### Convert image to 1D vector

In [4]:
def convertImages2Features(images, label):
    #man = 0, woman = 1
    x = np.empty((0,10000), int)
    for img in images:
        #convert 2d to 1d
        row = img.shape[0]
        col = img.shape[1]
        img_1d = img.reshape(1, row*col)
        x = np.append(x,img_1d, axis=0)
    y = np.full((x.shape[0],1),label)
    return x,y
    

### Normalize data
1. Find mean and standard deviation for the train dataset and store them in suitalble variables for further use
2. Normalize the training dataset using the above calculated mean and std dev
3. To normalize validation and test dataset, use the mean and std dev from step 1

In [5]:
def normalizeFeatures(x):
    mean = np.mean(x, axis = 0)
    x = x - mean
    std_dev = np.std(x, axis = 0)
    x = x/std_dev
    return x, mean, std_dev

In [6]:
def normalizeFeaturesWithMean(x, mean, std_dev):
    x = x - mean
    x = x/std_dev
    return x

### To check accuracy for predicted values

In [57]:
def getAccuracy(y_pred, y_true):
    comp = np.zeros(y_pred.shape)
    comp[y_pred == y_true] = 1
    total_items = y_pred.shape[0]
    correct_items = np.sum(comp)
    return correct_items/total_items

# Implementation

### Loading images

In [8]:
images_man_train = loadImages("/home/sakshi/projects/ML/gender_classification/data/dataset1/train/man")
images_woman_train = loadImages("/home/sakshi/projects/ML/gender_classification/data/dataset1/train/woman")
images_man_cv = loadImages("/home/sakshi/projects/ML/gender_classification/data/dataset1/valid/man")
images_woman_cv = loadImages("/home/sakshi/projects/ML/gender_classification/data/dataset1/valid/woman")
images_man_test = loadImages("/home/sakshi/projects/ML/gender_classification/data/dataset1/test/man")
images_woman_test = loadImages("/home/sakshi/projects/ML/gender_classification/data/dataset1/test/woman")
#show_images(images_woman, 3)



### Converting to 1D vector

In [9]:
X_man_train, Y_man_train = convertImages2Features(images_man_train, 0) 
X_woman_train, Y_woman_train = convertImages2Features(images_woman_train, 1)
X_man_cv, Y_man_cv = convertImages2Features(images_man_cv, 0) 
X_woman_cv, Y_woman_cv = convertImages2Features(images_woman_cv, 1)
X_man_test, Y_man_test = convertImages2Features(images_man_test, 0) 
X_woman_test, Y_woman_test = convertImages2Features(images_woman_test, 1)

In [10]:
X_train = np.append(X_man_train, X_woman_train, axis = 0)
Y_train = np.append(Y_man_train, Y_woman_train, axis = 0)
X_cv = np.append(X_man_cv, X_woman_cv, axis = 0)
Y_cv = np.append(Y_man_cv, Y_woman_cv, axis = 0)
X_test = np.append(X_man_test, X_woman_test, axis = 0)
Y_test = np.append(Y_man_test, Y_woman_test, axis = 0)



### A little preprocessing
It is important to shuffle the dataset for better traing results

In [48]:
train = np.append(X_train, Y_train, axis = 1)
val = np.append(X_cv, Y_cv, axis = 1)

np.random.shuffle(train)
np.random.shuffle(val)

X_train = train[:, 0:train.shape[1]-1]
Y_train = train[:, train.shape[1]-1]

X_val = val[:, 0:val.shape[1]-1]
Y_val = val[:, val.shape[1]-1]

print("training: X shape = ", X_train.shape, ", Y shape = ", Y_train.shape)
print("first 10 values of y: ", Y_train[0:10])
print("Validation: X shape = ", X_val.shape, ", Y shape = ", Y_val.shape)
print("first 10 values of y: ", Y_val[0:10])
print("testing: X shape = ", X_test.shape, ", Y shape = ", Y_test.shape)


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

### Normalize

In [12]:
X_train, mean, std_dev = normalizeFeatures(X_train)
X_cv = normalizeFeaturesWithMean(X_cv, mean, std_dev)
X_test = normalizeFeaturesWithMean(X_test, mean, std_dev)

### Principle Component Analysis
We will be reducing the dimensions for the input feature vector using Principle Component Analysis(PCA) by retaining 95 % variance.    
Refer https://www.datacamp.com/community/tutorials/principal-component-analysis-in-python for better understanding.     
P.S. fit_transform will take a bit longer for execution

In [13]:
pca = PCA(0.95) # causes overfitting
#pca = PCA(0.9)

In [14]:
principalComponents = pca.fit_transform(X_train)
pca.n_components_

458

In [15]:
X_pca_train = pca.transform(X_train)
X_pca_cv = pca.transform(X_cv)
X_pca_test = pca.transform(X_test)

### Training Model
Hyperparameters: C, Gamma    
Kernal: rbf    
When I trained the model initially, the model was biasd. So, I added up the class weights.  


In [62]:
C = [10]
Gamma = [0.001]

training_accuracies = list()
val_accuracies = list()
#cw = {0:0.5,1:0.5}

best_model = None
best_val_accuracy = 0.0

for c in C:
    ta = list()
    va = list() 
    
    for gamma in Gamma:        
        
        clf = svm.SVC(C = c, gamma = gamma, kernel = "rbf",probability = True)
        clf.fit(X_pca_train, Y_train.ravel())
        
        y_pred_train = clf.predict(X_pca_train)
        y_pred_val = clf.predict(X_pca_cv)

        training_accuracy =  getAccuracy(y_pred_train, Y_train) 
        val_accuracy =  getAccuracy(y_pred_val, Y_val) 
        
        if (val_accuracy > best_val_accuracy):
            best_val_accuracy = val_accuracy
            best_model = clf
        
        print("For C = ", c, ", Gamma = ", gamma, ": training accuracy = ", training_accuracy, ", val accuracy = ", val_accuracy) 
        
        ta.append(training_accuracy)
        va.append(val_accuracy)
        
    training_accuracies.append(ta)
    val_accuracies.append(va)

For C =  10 , Gamma =  0.001 : training accuracy =  0.9997054491899853 , val accuracy =  0.4852941176470588


### Evaluating the performance

#### 1. Training

In [63]:
import pandas as pd
predictions = best_model.predict(X_pca_train)
#print(confusion_matrix(Y_test,predictions))
print(classification_report(Y_train,predictions))
print(pd.DataFrame(confusion_matrix(Y_train,predictions),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1700
           1       1.00      1.00      1.00      1695

    accuracy                           1.00      3395
   macro avg       1.00      1.00      1.00      3395
weighted avg       1.00      1.00      1.00      3395

     pred_neg  pred_pos
neg      1700         0
pos         1      1694


#### 2. Validation

In [64]:
import pandas as pd
predictions = best_model.predict(X_pca_cv)
#print(confusion_matrix(Y_test,predictions))
print(classification_report(Y_val,predictions))
print(pd.DataFrame(confusion_matrix(Y_val,predictions),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))

              precision    recall  f1-score   support

           0       0.49      0.84      0.62       170
           1       0.45      0.13      0.20       170

    accuracy                           0.49       340
   macro avg       0.47      0.49      0.41       340
weighted avg       0.47      0.49      0.41       340

     pred_neg  pred_pos
neg       143        27
pos       148        22


#### 3. Testing

In [65]:
y_scores = clf.predict(X_pca_test)
testing_accuracy =  getAccuracy(y_scores, Y_test) 
print("testing accuracy: ", testing_accuracy)

testing accuracy:  0.6705882352941176
