In [1]:
import scipy.io
import numpy as np
import pandas as pd

In [2]:
# Read the MATLAB file and unwrap the single 'wiki' struct it contains.
mat_data = scipy.io.loadmat('wiki_test.mat')
wiki_data = mat_data['wiki'][0, 0]

# Each struct field is stored with a leading singleton axis; index [0] takes
# the per-image row of values for that field.
field_names = (
    'dob', 'photo_taken', 'full_path', 'gender', 'name',
    'face_location', 'face_score', 'second_face_score', 'age',
)
data_dict = {field: wiki_data[field][0] for field in field_names}

# Keep only the modelling columns, unwrap the path stored in each 'full_path'
# cell, prefix it with the image folder, and drop rows with any missing value.
df = (
    pd.DataFrame(data_dict)
    .loc[:, ['age', 'gender', 'full_path']]
    .assign(full_path=lambda frame: 'wiki_crop/' + frame['full_path'].str[0])
    .dropna()
)

print(df)

       age  gender                                  full_path
0       28     1.0  wiki_crop/17/10000217_1981-05-05_2009.jpg
1       39     1.0  wiki_crop/48/10000548_1925-04-04_1964.jpg
2       59     1.0    wiki_crop/12/100012_1948-07-03_2008.jpg
3       31     1.0  wiki_crop/65/10001965_1930-05-23_1961.jpg
4       41     0.0  wiki_crop/16/10002116_1971-05-31_2012.jpg
...    ...     ...                                        ...
62323   26     1.0   wiki_crop/49/9996949_1937-04-17_1963.jpg
62324   22     1.0   wiki_crop/32/9997032_1947-07-30_1970.jpg
62325   40     1.0   wiki_crop/09/9998109_1972-12-27_2013.jpg
62326   29     1.0   wiki_crop/00/9999400_1981-12-13_2011.jpg
62327   54     0.0    wiki_crop/80/999980_1954-06-11_2008.jpg

[59685 rows x 3 columns]


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog
from skimage.color import rgb2gray
from joblib import Parallel, delayed

In [4]:
# HOG feature extraction function
def extract_hog_features(path, target_size=(128, 128), orientations=8,
                         pixels_per_cell=(16, 16), cells_per_block=(1, 1)):
    """Load one image and return its HOG descriptor, or None on failure.

    The image is read from ``path``, resized to ``target_size``, converted to
    grayscale when it has more than two dimensions, and passed to
    ``skimage.feature.hog``.

    Parameters
    ----------
    path : str
        Location of the image file to read.
    target_size : tuple of int, optional
        Output shape handed to ``resize`` before feature extraction.
    orientations : int, optional
        Number of orientation bins passed to ``hog``.
    pixels_per_cell : tuple of int, optional
        Cell size passed to ``hog``.
    cells_per_block : tuple of int, optional
        Block size passed to ``hog``.

    Returns
    -------
    The value returned by ``hog`` for the preprocessed image, or ``None`` if
    any step raised an exception (the error is printed, not re-raised).
    """
    try:
        img = imread(path)
        img = resize(img, target_size, anti_aliasing=True)
        # Anything with a channel axis is collapsed to a single gray plane.
        # NOTE(review): assumes colour inputs are 3-channel RGB - confirm how
        # 4-channel images should be handled.
        if img.ndim > 2:
            img = rgb2gray(img)
        return hog(
            img,
            orientations=orientations,
            pixels_per_cell=pixels_per_cell,
            cells_per_block=cells_per_block,
            visualize=False,
            channel_axis=None,
        )
    except Exception as e:
        # Deliberately best-effort: a bad image is reported and skipped so one
        # unreadable file does not abort the whole batch.
        print(f"Error processing {path}: {e}")
        return None

# Run the HOG extractor over every image path, one joblib task per image
print("Extracting HOG features in parallel...")
hog_tasks = (delayed(extract_hog_features)(img_path) for img_path in df['full_path'])
results = Parallel(n_jobs=-1, verbose=10)(hog_tasks)

# Remember the positions whose extraction produced a descriptor (not None),
# then collect the matching descriptors in the same order
valid_indices = [pos for pos, vec in enumerate(results) if vec is not None]
valid_features = [results[pos] for pos in valid_indices]

# Positional lookup keeps the labels aligned with the surviving descriptors
valid_df = df.iloc[valid_indices].copy()
x = np.array(valid_features)
y_age = valid_df['age'].values
y_gender = valid_df['gender'].values

# Hold out 20% of the samples; features and both targets are split together
split_parts = train_test_split(x, y_age, y_gender, test_size=0.2, random_state=42)
X_train, X_test, y_age_train, y_age_test, y_gender_train, y_gender_test = split_parts

Extracting HOG features in parallel...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1899181676102142s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.07477283477783203s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1591038703918457s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    2.3s
[Parallel(n_jobs=

In [9]:
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

# Hyperparameters shared by the age and gender MLPs, kept in one place so the
# two networks cannot drift apart.
# NOTE(review): max_iter=100 may stop training before convergence - confirm.
MLP_PARAMS = dict(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=100,
    random_state=42,
)


def _fit_and_report_regressor(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, print its MAE and RMSE on the evaluation set, return predictions."""
    model.fit(X_fit, y_fit)
    pred = model.predict(X_eval)
    print(f"{label} MAE: {mean_absolute_error(y_eval, pred):.2f} years")
    print(f"{label} RMSE: {mean_squared_error(y_eval, pred)**0.5:.2f} years")
    return pred


def _fit_and_report_classifier(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, print its accuracy on the evaluation set, return predictions."""
    model.fit(X_fit, y_fit)
    pred = model.predict(X_eval)
    print(f"{label} accuracy: {accuracy_score(y_eval, pred):.2f}")
    return pred


# Age models: linear baseline, then KNN, then MLP, all scored on the same split
print("\nTraining age model...")
age_model = LinearRegression()
age_pred = _fit_and_report_regressor(
    age_model, "Age", X_train, y_age_train, X_test, y_age_test
)

age_knn = KNeighborsRegressor(n_neighbors=5)
age_knn_pred = _fit_and_report_regressor(
    age_knn, "KNN Age", X_train, y_age_train, X_test, y_age_test
)

age_mlp = MLPRegressor(**MLP_PARAMS)
age_mlp_pred = _fit_and_report_regressor(
    age_mlp, "MLP Age", X_train, y_age_train, X_test, y_age_test
)

# Gender models: logistic baseline, then KNN, then MLP
print("\nTraining gender model...")
gender_model = LogisticRegression(max_iter=1000)
gender_pred = _fit_and_report_classifier(
    gender_model, "Gender", X_train, y_gender_train, X_test, y_gender_test
)

gender_knn = KNeighborsClassifier(n_neighbors=5)
gender_knn_pred = _fit_and_report_classifier(
    gender_knn, "KNN Gender", X_train, y_gender_train, X_test, y_gender_test
)

gender_mlp = MLPClassifier(**MLP_PARAMS)
gender_mlp_pred = _fit_and_report_classifier(
    gender_mlp, "MLP Gender", X_train, y_gender_train, X_test, y_gender_test
)


Training age model...
Age MAE: 13.65 years
Age RMSE: 23.28 years
KNN Age MAE: 15.33 years
KNN Age RMSE: 25.27 years




MLP Age MAE: 13.02 years
MLP Age RMSE: 23.11 years

Training gender model...
Gender accuracy: 0.85
KNN Gender accuracy: 0.82
MLP Gender accuracy: 0.84


