In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
# Read the training table from disk; every later cell works off `database`.
train_csv_path = "persons_pics_train.csv"
database = pd.read_csv(train_csv_path)

In [None]:
# Count how many distinct values appear in the 'label' column.
label_column = database['label']
num_unique_people = label_column.nunique()
print(num_unique_people)

In [None]:
# Number of rows per label, computed once and reused for both the chart
# and the share calculation below.
label_counts = database['label'].value_counts()

# Bar chart of the class distribution.
label_counts.plot(kind='bar')
plt.tight_layout()
plt.show()

# Share of the data set that belongs to 'Gerhard Schroeder'.
total_samples = len(database)
gerhard_schroeder_samples = label_counts['Gerhard Schroeder']
gerhard_schroeder_fraction = gerhard_schroeder_samples / total_samples

print(gerhard_schroeder_fraction)

In [None]:
# Average every feature column within each label: one "mean vector" per person,
# indexed by the label value.
mean_vectors = database.groupby('label').mean()

gerhard_schroeder_vector = mean_vectors.loc['Gerhard Schroeder']

# Take the first coordinate *by position*. `.iloc` is explicit about that and
# works however the feature columns are labelled. A bare `[0]` on a Series is
# label-based, and only falls back to positional access (a deprecated pandas
# behaviour) when the labels are not integers.
gerhard_schroeder_coordinate = gerhard_schroeder_vector.iloc[0]
print(gerhard_schroeder_coordinate)

In [None]:
# Render each per-label mean vector as a 62x47 grayscale picture,
# one figure per label, titled with the label itself.
image_shape = (62, 47)

for person, pixel_means in mean_vectors.iterrows():
    face = np.array(pixel_means).reshape(image_shape)

    plt.imshow(face, cmap='gray')
    plt.title(person)
    plt.axis('off')
    plt.show()

In [None]:
# Cosine similarity between the per-label mean vectors; the result is indexed
# by the row positions of `mean_vectors`.
similarity_matrix = cosine_similarity(mean_vectors)

# Translate the two labels of interest into row/column positions.
people_index = mean_vectors.index
gerhard_schroeder_index = people_index.get_loc('Gerhard Schroeder')
hugo_chavez_index = people_index.get_loc('Hugo Chavez')

# Pick out the single pairwise value for these two people.
cosine_similarity_value = similarity_matrix[gerhard_schroeder_index][hugo_chavez_index]

print(cosine_similarity_value)


In [None]:
from sklearn.svm import SVC

# Separate the target column from the feature columns.
y = database['label']
X = database.drop(columns=['label'])

# Hold out 20% of the rows for testing, stratified on the label and seeded
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
    stratify=y,
)

# Baseline classifier: linear-kernel SVM trained on the training split.
model = SVC(kernel='linear', random_state=7).fit(X_train, y_train)

# Score the held-out split with the weighted F1 measure.
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)



In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the SVC; `random_state` is pinned to a
# single value so every candidate is built with the same seed.
tuned_parameters = [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                     'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000],
                     'class_weight': [None, 'balanced'],
                     'random_state': [7]}]
cv = GridSearchCV(SVC(), tuned_parameters, refit=True, verbose=3)

# Search on the training split only. Fitting on the full (X, y) would let the
# cross-validation folds and the final refit see the rows of X_test, so the
# F1 score computed below would no longer be a held-out estimate.
cv.fit(X_train, y_train)

best_params = cv.best_params_
print(best_params)

# With refit=True this is the best configuration refitted on the data passed
# to `fit`, i.e. the training split.
best_model = cv.best_estimator_

# Evaluate the tuned model on the untouched test split.
y_pred = best_model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)


In [None]:
from sklearn.decomposition import PCA

# Fit PCA with the 'full' SVD solver on the feature columns.
pca = PCA(svd_solver='full').fit(X)

# Running total of the explained-variance ratios, component by component.
explained_variance_ratio_cumulative = pca.explained_variance_ratio_.cumsum()

# Position of the first component at which the running total exceeds 0.95,
# converted from a 0-based index to a component count.
variance_threshold = 0.95
exceeds_threshold = explained_variance_ratio_cumulative > variance_threshold
num_components = exceeds_threshold.argmax() + 1
print(num_components)