In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import ast

In [9]:
data = pd.read_csv("updated_file.csv")
data = data.dropna()
data.head()

Unnamed: 0.1,Unnamed: 0,id,weight,height,sex,BMI,side_image_features,front_features
0,0,A00147,185.0,67.0,Male,28.971931,[0.2285692 0.04150921 0.09596376 ... 0.137926...,[0.24958876 0.04899818 0.08260317 ... 0.210071...
2,2,A00360,167.0,69.0,Male,24.658895,[0.23262199 0.10198318 0.06991925 ... 0.133563...,[0.23300944 0.07099213 0.23300944 ... 0.180476...
3,3,A00367,245.0,72.0,Male,33.224344,[0.19465187 0.11052029 0.06091761 ... 0.025494...,[0.18922559 0.12255136 0.06751266 ... 0.027727...
4,4,A01054,166.0,67.0,Male,25.996436,[0.24357151 0.21394402 0.24357151 ... 0.081949...,[0.23623272 0.20830895 0.18085204 ... 0.033488...
5,5,A01072,195.0,69.0,Male,28.793321,[0.22986225 0.18055656 0.16541323 ... 0.070623...,[0.22579775 0.21888761 0.17438851 ... 0.220046...


In [10]:
def string_to_array(string):
    cleaned_string = string.replace("[", "").replace("]", "").replace("...", "")
    string_elements = cleaned_string.split()
    return np.array(string_elements, dtype=float)

# Apply the function to the column
data["front_features"] = data["front_features"].apply(string_to_array)
data["side_image_features"] = data["side_image_features"].apply(string_to_array)


In [11]:
data["label"] = data["sex"].apply(lambda x: 1 if x.lower() == "male" else 0)

In [12]:
counts = data["label"].value_counts()
print(f"Number of males: {counts[1]}")
print(f"Number of females: {counts[0]}")

Number of males: 56353
Number of females: 3649


In [13]:
features = np.array([np.concatenate([f, s]) for f, s in zip(data["front_features"], data["side_image_features"])])
target = data["label"].values

In [14]:
mean = features.mean(axis=0)
std = features.std(axis=0)
features_standardized = (features - mean) / std

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features_standardized, target, test_size=0.2, random_state=42)

In [16]:
def knn_predict(X_train, y_train, X_test, k):
    predictions = []
    for idx, test_point in enumerate(X_test):
        # Compute distances from the test point to all training points
        distances = np.sqrt(np.sum((X_train - test_point) ** 2, axis=1))
        
        # Get indices of the k-nearest neighbors
        nearest_indices = np.argsort(distances)[:k]
        
        # Predict by averaging the target values of the k-nearest neighbors
        nearest_targets = y_train[nearest_indices]
        prediction = np.mean(nearest_targets)
        predictions.append(prediction)

        # Track progress every 10 iterations
        if (idx + 1) % 1000 == 0:
            print(f"Progress: {idx + 1}/{len(X_test)} test points processed.")
    
    return np.array(predictions)

k = 2
y_pred = knn_predict(X_train, y_train, X_test, k)

Progress: 1000/12001 test points processed.
Progress: 2000/12001 test points processed.
Progress: 3000/12001 test points processed.
Progress: 4000/12001 test points processed.
Progress: 5000/12001 test points processed.
Progress: 6000/12001 test points processed.
Progress: 7000/12001 test points processed.
Progress: 8000/12001 test points processed.
Progress: 9000/12001 test points processed.
Progress: 10000/12001 test points processed.
Progress: 11000/12001 test points processed.
Progress: 12000/12001 test points processed.


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr

# Compute Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Compute R² Score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")

# Compute Pearson Correlation Coefficient
pearson_corr, _ = pearsonr(y_test, y_pred)
print(f"Pearson Correlation Coefficient: {pearson_corr}")


Mean Absolute Error (MAE): 0.10532455628697608
Mean Squared Error (MSE): 0.07941004916256979
R² Score: -0.38648085298078283
Pearson Correlation Coefficient: 0.08284676387873456
