## Libraries

In [None]:
from embetter.vision import ImageLoader
from embetter.multi import ClipEncoder

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.multioutput import MultiOutputClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Mount Google Drive; every absolute path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

## Dataset

In [None]:
import shutil

In [None]:
# Collect the images that have attribute annotations into a separate folder.
all_images_dir = Path("/content/drive/MyDrive/course_work/FaceMemorability database/10k US Adult Faces Database/Face Images")
needed_images_dir = Path("/content/drive/MyDrive/course_work/FaceMemorability database/needed_images_database")
table = pd.read_excel("/content/drive/MyDrive/course_work/FaceMemorability database/Full Attribute Scores/psychology attributes/psychology-attributes.xlsx")
# The first column of the sheet is used as the image file name. Empty cells
# are dropped because a NaN cannot be joined onto a Path.
needed_images = table.iloc[:, 0].dropna().drop_duplicates()

# shutil.copy does not create the destination folder, so make sure it exists.
needed_images_dir.mkdir(parents=True, exist_ok=True)

missing_images = []
for filename in needed_images:
    src = all_images_dir / filename
    dst = needed_images_dir / filename
    if src.exists():
        shutil.copy(src, dst)
    else:
        missing_images.append(filename)

# Report skipped files instead of dropping them silently.
if missing_images:
    print(f"{len(missing_images)} annotated images were not found in {all_images_dir}")

print(len(list(needed_images_dir.glob("*.jpg")))) # 2222

In [None]:
# Names of the 40 rated attributes, in the column order used for the label
# table: two alphabetical runs of 20 names each.
traits = [
    'atypical', 'boring', 'calm', 'cold', 'common',
    'confident', 'egotistic', 'emotUnstable', 'forgettable', 'intelligent',
    'introverted', 'kind', 'responsible', 'trustworthy', 'unattractive',
    'unemotional', 'unfamiliar', 'unfriendly', 'unhappy', 'weird',
] + [
    'aggressive', 'attractive', 'caring', 'emotStable', 'emotional',
    'familiar', 'friendly', 'happy', 'humble', 'interesting',
    'irresponsible', 'mean', 'memorable', 'normal', 'sociable',
    'typical', 'uncertain', 'uncommon', 'unintelligent', 'untrustworthy',
]

In [None]:
# Drop the columns that are not trait ratings (presumably rater metadata and
# catch-trial fields -- confirm against the spreadsheet).
table = table.drop(columns=['Image #', 'subID', 'subage', 'submale', 'subrace', 'catch', 'catchAns', 'catch.1', 'catchAns.1', 'subID.1', 'subage.1', 'submale.1', 'subrace.1'])

# Average every trait over all rows of the same image, then binarise it:
# 1 when the image's mean rating is above the mean over all images, else 0.
per_image = table.groupby("Filename")[traits].mean()
df = (per_image > per_image.mean()).astype(int).reset_index()

# Replace the bare file name by the full path of the copied image and keep it
# as the first column.
df.insert(0, 'Filepath', df.pop('Filename').map(lambda name: str(needed_images_dir / name)))

# index=False: otherwise the row index is written too and comes back from
# read_csv as an extra 'Unnamed: 0' column that would be treated as a label.
df.to_csv('/content/drive/MyDrive/course_work/dataframe_for_learning.csv', index=False)

In [None]:
# Relative path: the CSV is expected in the current working directory.
df = pd.read_csv('dataframe_for_learning.csv')
# A CSV exported together with the pandas index reloads with an 'Unnamed: 0'
# column; drop it so that it is never mistaken for a trait label.
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]

df

In [None]:
# Path of every image, in the same row order as the labels; split alongside
# the features later on.
image_paths = df["Filepath"]
image_paths

## Preprocessing

In [None]:
# Two-step transformer: load each image file as RGB, then encode it with
# embetter's ClipEncoder.
image_emb_pipeline = make_pipeline(ImageLoader(convert="RGB"), ClipEncoder())

In [None]:
# Embed every image listed in the label table; rows of X follow the order of df.
X = image_emb_pipeline.transform(df['Filepath'].tolist())

In [None]:
# Targets: every column of the label table except the image path.
y = df.drop(columns=['Filepath'])
y

In [None]:
import os

# Cache the embeddings and labels on Drive so the encoder does not have to be
# run again.
save_path = "/content/drive/MyDrive/course_work/embeddings/"
os.makedirs(save_path, exist_ok=True)

np.save(f"{save_path}embeddings.npy", X)
y.to_csv(f"{save_path}labels.csv", index=False)

In [None]:
# Reload the cached embeddings and labels. Both paths are relative, so the
# files are expected in the current working directory.
X = np.load("embeddings.npy")
y = pd.read_csv("labels.csv")

In [None]:
# Reduced label set: the six traits listed here are excluded from the targets.
y_modif = y.drop(columns=['familiar', 'uncommon', 'atypical', 'unfamiliar', 'common', 'calm'])

In [None]:
# Sanity check: dimensions of the embedding array.
X.shape

In [None]:
# Sanity check: dimensions of the label table.
y.shape

In [None]:
# Print the share of each label value per trait to inspect class balance.
for trait, trait_labels in y.items():
    counts = trait_labels.value_counts(normalize=True)
    print(f"{trait}: {counts.to_dict()}")

In [None]:
# Average class imbalance per trait: |#zeros - #ones| for every label column,
# summed and divided by the number of columns. The sample and column counts
# are taken from y itself instead of being hard-coded.
diffs = int((len(y) - 2 * y.sum()).abs().sum())
print(diffs // y.shape[1])

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Same test_size and seed as the split of the full label set, applied to the
# reduced labels; the image paths are split alongside so rows stay traceable.
X_train_modif, X_test_modif, y_train_modif, y_test_modif, image_paths_train, image_paths_test = train_test_split(X, y_modif, image_paths, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers keyed by the name shown in the comparison printout.
# The tree-based and boosting models are seeded with 42.
models = dict([
    ("KNN", KNeighborsClassifier()),
    ("SVM", SVC()),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    (
        "XGBoost",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    ),
])

## Choosing the model

In [None]:
best_models = {}

for trait in y.columns:
    print(f"Training for {trait}...")

    y_trait = y_train[trait]

    best_accuracy = 0
    best_model = None

    for name, model in models.items():
      model.fit(X_train, y_trait)
      y_pred = model.predict(X_test)
      accuracy = accuracy_score(y_test[trait], y_pred)

      print(f"{name} Accuracy for {trait}: {accuracy:.4f}")

      if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
    best_models[trait] = {"model": best_model, "accuracy": best_accuracy}


In [None]:
for trait in y.columns:
  print(f"The best for {trait}: {best_models[trait]}")

In [None]:
from collections import Counter

model_names = [v['model'] for v in best_models.values()]
model_wins = Counter(model_names)

for model, count in model_wins.most_common():
    print(f"{model}: {count} раз")

## Logistic regression

```
class sklearn.linear_model.LogisticRegression(
  penalty='l2',
  dual=False,
  tol=0.0001,
  C=1.0,
  fit_intercept=True,
  intercept_scaling=1,
  class_weight=None,
  random_state=None,
  solver='lbfgs',
  max_iter=100,
  multi_class='deprecated',
  verbose=0,
  warm_start=False,
  n_jobs=None,
  l1_ratio=None)
```

In [None]:
# Baseline: logistic regression with a raised iteration cap, wrapped so that
# it handles all label columns at once.
log_reg_initial_model = LogisticRegression(max_iter=1000)
clf = MultiOutputClassifier(estimator=log_reg_initial_model)

In [None]:
# Fit the baseline on the training split and predict every label for the test split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
def avg_acc_score(y_test, y_pred):
    """Return the accuracy averaged over all label columns.

    Accuracy is computed separately for every column and the per-column
    values are then averaged with equal weight.

    Args:
        y_test: True labels, shape (n_samples, n_labels). Any array-like is
            accepted (DataFrame, ndarray, nested list).
        y_pred: Predicted labels with the same shape as ``y_test``.

    Returns:
        The mean per-column accuracy as a NumPy float.

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` have different shapes.
    """
    # Convert up front so the function does not depend on DataFrame-only
    # accessors such as ``.iloc``.
    true_labels = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    if true_labels.shape != predicted.shape:
        raise ValueError(
            f"shape mismatch: y_test {true_labels.shape} vs y_pred {predicted.shape}"
        )
    per_label_accuracy = np.mean(true_labels == predicted, axis=0)
    return np.mean(per_label_accuracy)

In [None]:
# Mean per-label accuracy of the untuned baseline on the test split.
initial_accuracy = avg_acc_score(y_test, y_pred)
print("Изначальная Accuracy:", initial_accuracy.round(3))

In [None]:
# Tuned model: standardise the embeddings, reduce them with PCA, then apply
# the multi-output classifier. Step names are the prefixes used in the grids.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("clf", clf),
    ]
)

In [None]:
# Search 1 (full label set): L2-penalised logistic regression over the PCA
# size, regularisation strength, class weighting, solver and tolerance.
param_grid = dict(
    pca__n_components=[8, 16, 32, 64, 128],
    clf__estimator__penalty=['l2'],
    clf__estimator__C=[0.1, 1, 10],
    clf__estimator__class_weight=[None, 'balanced'],
    clf__estimator__solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    clf__estimator__tol=[1e-4, 1e-3],
)

# 5-fold cross-validation scored by the mean per-label accuracy.
log_reg_cv = GridSearchCV(
    pipeline,
    param_grid,
    scoring=make_scorer(avg_acc_score),
    cv=5,
    verbose=4,
    n_jobs=-1,
)

log_reg_cv.fit(X_train, y_train)

In [None]:
# Best hyper-parameters of the L2 search and their cross-validated score.
print("Лучшие параметры:", log_reg_cv.best_params_)
print("Средняя Accuracy:", log_reg_cv.best_score_)

In [None]:
# Search 2 (full label set): L1-penalised logistic regression, restricted to
# the two solvers listed in the grid.
param_grid_2 = dict(
    pca__n_components=[8, 16, 32, 64],
    clf__estimator__penalty=['l1'],
    clf__estimator__C=[0.01, 0.1, 1],
    clf__estimator__solver=['liblinear', 'saga'],
    clf__estimator__tol=[1e-5, 1e-4],
)

# 5-fold cross-validation scored by the mean per-label accuracy.
log_reg_cv_2 = GridSearchCV(
    pipeline,
    param_grid_2,
    scoring=make_scorer(avg_acc_score),
    cv=5,
    verbose=4,
    n_jobs=-1,
)

log_reg_cv_2.fit(X_train, y_train)

In [None]:
# Best hyper-parameters of the L1 search and their cross-validated score.
best_params = log_reg_cv_2.best_params_
print("Лучшие параметры:", best_params)
print("Средняя Accuracy:", log_reg_cv_2.best_score_)

In [None]:
# Evaluate the refit best pipeline of the L1 search on the held-out test split.
log_reg_final_model_try = log_reg_cv_2.best_estimator_
y_pred = log_reg_final_model_try.predict(X_test)
accuracy = avg_acc_score(y_test, y_pred)
print(f"Best model accuracy: {accuracy:.4f}")

In [None]:
# Per-trait ROC AUC of the tuned model on the full label set.
# predict_proba is transposed with axes (1, 0, 2) so that the trait index
# becomes the second axis.
y_pred_proba = np.transpose(log_reg_final_model_try.predict_proba(X_test), (1, 0, 2))

trait_names = y_test.columns
# The last axis is indexed at 1 to pick the score passed to roc_auc_score.
roc_auc_scores = {
    trait_name: roc_auc_score(y_test.iloc[:, i], y_pred_proba[:, i, 1])
    for i, trait_name in enumerate(trait_names)
}

roc_auc_scores = pd.DataFrame.from_dict(roc_auc_scores, orient='index', columns=['AUC'])
roc_auc_scores = roc_auc_scores.sort_values('AUC', ascending=False)


print(roc_auc_scores)

In [None]:
# Search 3 (reduced label set): the same L1 grid values as used for the full
# label set.
param_grid_modif = dict(
    pca__n_components=[8, 16, 32, 64],
    clf__estimator__penalty=['l1'],
    clf__estimator__C=[0.01, 0.1, 1],
    clf__estimator__solver=['liblinear', 'saga'],
    clf__estimator__tol=[1e-5, 1e-4],
)

# 5-fold cross-validation scored by the mean per-label accuracy.
log_reg_cv_modif = GridSearchCV(
    pipeline,
    param_grid_modif,
    scoring=make_scorer(avg_acc_score),
    cv=5,
    verbose=4,
    n_jobs=-1,
)

log_reg_cv_modif.fit(X_train_modif, y_train_modif)

In [None]:
# Best hyper-parameters of the L1 search on the reduced labels and their CV score.
print("Лучшие параметры:", log_reg_cv_modif.best_params_)
print("Средняя Accuracy:", log_reg_cv_modif.best_score_)

In [None]:
# Search 4 (reduced label set): the same L2 grid values as used for the full
# label set.
param_grid_modif_2 = dict(
    pca__n_components=[8, 16, 32, 64, 128],
    clf__estimator__penalty=['l2'],
    clf__estimator__C=[0.1, 1, 10],
    clf__estimator__class_weight=[None, 'balanced'],
    clf__estimator__solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    clf__estimator__tol=[1e-4, 1e-3],
)

# 5-fold cross-validation scored by the mean per-label accuracy.
log_reg_cv_modif_2 = GridSearchCV(
    pipeline,
    param_grid_modif_2,
    scoring=make_scorer(avg_acc_score),
    cv=5,
    verbose=4,
    n_jobs=-1,
)

log_reg_cv_modif_2.fit(X_train_modif, y_train_modif)

In [None]:
# Best hyper-parameters of the L2 search on the reduced labels and their CV score.
print("Лучшие параметры:", log_reg_cv_modif_2.best_params_)
print("Средняя Accuracy:", log_reg_cv_modif_2.best_score_)

In [None]:
# Replace the search object by its refit best pipeline and score it on the
# test split. getattr keeps the cell re-runnable: once the name already holds
# the pipeline (which has no best_estimator_), it is left unchanged.
log_reg_cv_modif = getattr(log_reg_cv_modif, "best_estimator_", log_reg_cv_modif)
y_pred = log_reg_cv_modif.predict(X_test_modif)
accuracy = avg_acc_score(y_test_modif, y_pred)
print(f"Best model accuracy: {accuracy:.4f}")

In [None]:
# Replace the search object by its refit best pipeline and score it on the
# test split. getattr keeps the cell re-runnable: once the name already holds
# the pipeline (which has no best_estimator_), it is left unchanged.
log_reg_cv_modif_2 = getattr(log_reg_cv_modif_2, "best_estimator_", log_reg_cv_modif_2)
y_pred = log_reg_cv_modif_2.predict(X_test_modif)
accuracy = avg_acc_score(y_test_modif, y_pred)
print(f"Best model accuracy: {accuracy:.4f}")

## Results

In [None]:
# Model used for the results below: the L1-search pipeline for the reduced label set.
log_reg_final_model = log_reg_cv_modif

In [None]:
# Per-trait ROC AUC of the final model on the reduced label set.
# Features come from X_test_modif, the split that was produced together with
# y_test_modif, so rows and labels are aligned by construction rather than by
# relying on a second split having used the same seed.
y_pred_proba = log_reg_final_model.predict_proba(X_test_modif)
# Transpose with axes (1, 0, 2) so that the trait index becomes the second axis.
y_pred_proba = np.transpose(y_pred_proba, (1, 0, 2))

roc_auc_scores = {}

trait_names = y_test_modif.columns
for i, trait_name in enumerate(trait_names):
    auc = roc_auc_score(
        y_test_modif.iloc[:, i],
        y_pred_proba[:, i, 1]
    )
    roc_auc_scores[trait_name] = auc

roc_auc_scores = pd.DataFrame.from_dict(
    roc_auc_scores,
    orient='index',
    columns=['AUC']
).sort_values('AUC', ascending=False)


print(roc_auc_scores)

In [None]:
# Predict all traits for the first test sample; the one-row slice keeps the
# 2-D shape the model expects.
predict = log_reg_final_model.predict(X_test_modif[:1])[0]
predict

In [None]:
from PIL import Image

# img = Image.open("Aaron_Dollar_13_oval.jpg")

# Side-by-side view of annotation and prediction for the first test sample.
# Trait names come from the reduced label table itself: the full `traits`
# list is longer than the reduced label row and would make the DataFrame
# constructor fail on unequal lengths.
df_compare = pd.DataFrame({
    "Черта": y_test_modif.columns,
    "Аннотация": y_test_modif.values[0],
    "Предсказание": predict
})
display(df_compare)

In [None]:
# Sanity check: dimensions of the test features.
X_test.shape

In [None]:
# Share of the variance of the raw (unscaled) embeddings kept by 64 principal components.
pca = PCA(n_components=64)
X_pca = pca.fit_transform(X)
np.sum(pca.explained_variance_ratio_)

In [None]:
from PIL import Image
# Two example images given by relative file name, so they are expected in the
# current working directory.
img1 = "610900192_6a54ec9688_o.jpg"
img2 = "5045855615_f0419390bf_b.jpg"

# Embed both images with the same pipeline used for the dataset, then predict their traits.
X_example = image_emb_pipeline.transform([img1, img2])
y_pred_example = log_reg_final_model.predict(X_example)

In [None]:
# Predicted traits for the two example images, one column per image.
# The trait names are read from the reduced label table instead of the global
# `trait_names`, which two different cells rebind and which may therefore
# hold the full label set depending on execution order.
df_compare = pd.DataFrame({
    "Черта": y_test_modif.columns,
    "Предсказание1": y_pred_example[0],
    "Предсказание2": y_pred_example[1]
})
df_compare