In [None]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import os
from skimage import feature
from tqdm import tqdm

In [None]:
class LocalBinaryPatterns:
    """Turn a grayscale image into a normalized histogram of LBP codes.

    The LBP codes come from ``skimage.feature.local_binary_pattern`` called
    with ``method="uniform"``; the histogram has ``numPoints + 2`` bins.
    """

    def __init__(self, numPoints, radius):
        """Remember the LBP sampling parameters.

        numPoints -- number of sample points passed to ``local_binary_pattern``
        radius    -- sampling radius passed to ``local_binary_pattern``
        """
        self.numPoints, self.radius = numPoints, radius

    def describe(self, image, eps=1e-7):
        """Return the normalized LBP histogram of ``image``.

        image -- array handed unchanged to ``local_binary_pattern``
        eps   -- small constant added to the histogram total so that the
                 normalization never divides by zero

        Returns a 1-D float array with ``numPoints + 2`` entries.
        """
        # per-pixel LBP code map
        codes = feature.local_binary_pattern(
            image, self.numPoints, self.radius, method="uniform"
        )

        # integer bin edges 0, 1, ..., numPoints + 2  ->  numPoints + 2 bins
        edges = np.arange(0, self.numPoints + 3)
        counts, _ = np.histogram(
            codes.ravel(), bins=edges, range=(0, self.numPoints + 2)
        )

        # scale the counts so they sum to (almost exactly) 1
        counts = counts.astype("float")
        return counts / (counts.sum() + eps)

In [None]:
path = '../dataset'  ## TODO: Change this to the path of your dataset. (The code will look through every subfolder for images)

# Collect every file below `path`, one progress bar per directory.
# Backslashes are turned into '/' so the paths can later be split on '/'.
images = []
for dirpath, _subdirs, filenames in os.walk(path):
    images.extend(
        os.path.join(dirpath, filename).replace('\\', '/')
        for filename in tqdm(filenames)
    )



In [None]:
# the parameters of the LBP algo
# higher = more time required
sample_points = 16
radius = 4

## TODO: change 3 to the index in the path where the age range is located
## for example, mine was ../dataset/female/age_10_14/imagename => split by / => index 3
age_range_index = 3

# the descriptor only depends on the two parameters above, so build it once
descriptor = LocalBinaryPatterns(sample_points, radius)

rows = []      # one dict per image: histogram bins 0..N-1 plus 'ageRange'
skipped = []   # files OpenCV could not decode (cv2.imread returned None)

# this code takes a while
for image_path in tqdm(images):
    img = cv2.imread(image_path)
    if img is None:
        # not an image / unreadable file: skip instead of crashing in cvtColor
        skipped.append(image_path)
        continue
    lbp = descriptor.describe(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
    row = dict(enumerate(lbp))
    row['ageRange'] = image_path.split('/')[age_range_index]
    rows.append(row)

# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# appending row by row copies the whole frame on every iteration.
lbp_df = pd.DataFrame(rows)

if skipped:
    print(f"Skipped {len(skipped)} unreadable file(s), e.g. {skipped[0]}")


In [None]:
# number of null values in our df (summed over every column). Should always be 0
int(lbp_df.isna().sum().sum())

In [None]:
# 'ageRange' holds strings, so correlate only the numeric/bool columns;
# pandas >= 2.0 raises if non-numeric columns are passed to corr()
corrM = lbp_df.select_dtypes(include=['number', 'bool']).corr()
corrM

In [None]:
# age-range labels that count as 'young'
young = [
    'age_10_14',
    'age_15_19',
    'age_20_24',
    'age_25_29',
    'age_30_34',
    'age_35_39',
    'age_40_44',
    'age_45_49',
]

# boolean flag per row: True -> young, False -> old
is_young = lbp_df['ageRange'].isin(young)
lbp_df['age_new'] = is_young

In [None]:
# preview the first five rows
lbp_df.head(n=5)

In [None]:
# randomize the df so that old and young are mixed;
# the seed is fixed so the shuffle gives the same row order on every run
random_df = lbp_df.sample(frac=1, random_state=7).reset_index(drop=True)
random_df.head()

In [None]:

# target: the young/old flag; features: everything except the two label columns
label_columns = ['ageRange', 'age_new']
y = random_df['age_new']
X = random_df.drop(columns=label_columns)

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the seed keeps the split repeatable
splits = train_test_split(X, y, test_size=0.2, random_state=7)
X_train, X_test, y_train, y_test = splits

In [None]:
## finding the most optimal K (not using cross val scoring)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_range = range(1,50)
scores = {}        # K -> accuracy on the test split
scores_list = []   # same accuracies, in k_range order
for k in tqdm(k_range):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # score once and record it in both containers
    accuracy = metrics.accuracy_score(y_test, y_pred)
    scores[k] = accuracy
    scores_list.append(accuracy)

In [None]:
# accuracy on the test split for every candidate K
fig, ax = plt.subplots()
ax.plot(k_range, scores_list)
ax.set(xlabel='K (n_neighbors)', ylabel='Test accuracy', title='KNN accuracy vs. K');

In [None]:
# 0-based position of the (first) highest accuracy in scores_list
max(range(len(scores_list)), key=scores_list.__getitem__)

In [None]:
# Look the best K up from the sweep instead of hard-coding it:
# scores_list[j] is the accuracy obtained with K = k_range[j].
best_idx = scores_list.index(max(scores_list))
best_k = k_range[best_idx]
# (best K, its accuracy on the test split)
best_k, scores_list[best_idx]

In [None]:
# finding cross val score of most optimal K
from sklearn.model_selection import cross_val_score

# Take K from the sweep results rather than a hard-coded number:
# scores_list[j] is the accuracy obtained with K = k_range[j].
best_k = k_range[scores_list.index(max(scores_list))]
cross_knn = KNeighborsClassifier(n_neighbors=best_k)

# NOTE: this rebinds `scores` (until now the K -> accuracy dict) to the
# result of the 10-fold cross validation on the training split.
scores = cross_val_score(cross_knn, X_train, y_train, cv=10)


In [None]:
# scores returned by cross_val_score for the 10 folds
scores

In [None]:
# average of the cross-validation scores
np.mean(scores)