In [5]:
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

# Load the Olivetti faces dataset
data = fetch_olivetti_faces()
X = data.data
y = data.target

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Function to evaluate the classifiers
def evaluate_classifiers(X, y, n_clusters, distance_metric):
    # Initialize labels
    labels = None
    
    # Cluster the data
    if distance_metric == 'euclidean':
        clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
        labels = clustering.fit_predict(X)
    elif distance_metric == 'minkowski':
        clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage='average')  # Change linkage to average
        X_dist = pairwise_distances(X, metric='minkowski')
        labels = clustering.fit_predict(X_dist)
    else:  # cosine
        clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage='average')  # Change linkage to average
        X_dist = pairwise_distances(X, metric='cosine')
        labels = clustering.fit_predict(X_dist)

    # Calculate silhouette score
    silhouette_avg = silhouette_score(X, labels)
    
    # Evaluate classifiers
    classifiers = {
        'SVM': SVC(),
        'Random Forest': RandomForestClassifier()
    }
    results = {}
    for name, clf in classifiers.items():
        skf = StratifiedKFold(n_splits=5)
        cross_val_scores = []
        for train_index, val_index in skf.split(X, y):
            X_train_fold, X_val_fold = X[train_index], X[val_index]
            y_train_fold, y_val_fold = y[train_index], y[val_index]
            clf.fit(X_train_fold, y_train_fold)
            cross_val_scores.append(clf.score(X_val_fold, y_val_fold))
        average_cross_val_accuracy = np.mean(cross_val_scores)
        
        # Fit on validation set
        clf.fit(X_val, y_val)
        validation_accuracy = clf.score(X_val, y_val)

        results[name] = {
            'cross_val_accuracy': round(average_cross_val_accuracy, 2),
            'validation_accuracy': round(validation_accuracy, 2)
        }
    
    return silhouette_avg, results

# Define distance metrics and cluster ranges
distance_metrics = ['euclidean', 'minkowski', 'cosine']
n_clusters_list = [20, 40, 60]

# Store results
overall_results = {}
best_score = -1
best_n_clusters = None
best_metric = None

# Evaluate for each distance metric and number of clusters
for distance_metric in distance_metrics:
    for n_clusters in n_clusters_list:
        silhouette_avg, results = evaluate_classifiers(X_train, y_train, n_clusters, distance_metric)
        overall_results[(distance_metric, n_clusters)] = {
            'silhouette_score': round(silhouette_avg, 2),
            'results': results
        }
        
        # Check for the best score
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_n_clusters = n_clusters
            best_metric = distance_metric

# Print overall results
for key, value in overall_results.items():
    print(f"--- Clustering with {key[1]} clusters and {key[0]} distance metric ---")
    print("Silhouette Score:", value['silhouette_score'])
    for classifier, scores in value['results'].items():
        print(f"Evaluating with {classifier} classifier...")
        print("Average cross-validation accuracy:", scores['cross_val_accuracy'])
        print("Validation accuracy:", scores['validation_accuracy'])
    print()

# Best silhouette score
print("Best Score:", round(best_score, 2), "and best n clusters", best_metric, best_n_clusters)


--- Clustering with 20 clusters and euclidean distance metric ---
Silhouette Score: 0.11
Evaluating with SVM classifier...
Average cross-validation accuracy: 0.88
Validation accuracy: 0.72
Evaluating with Random Forest classifier...
Average cross-validation accuracy: 0.92
Validation accuracy: 1.0

--- Clustering with 40 clusters and euclidean distance metric ---
Silhouette Score: 0.16
Evaluating with SVM classifier...
Average cross-validation accuracy: 0.88
Validation accuracy: 0.72
Evaluating with Random Forest classifier...
Average cross-validation accuracy: 0.92
Validation accuracy: 1.0

--- Clustering with 60 clusters and euclidean distance metric ---
Silhouette Score: 0.19
Evaluating with SVM classifier...
Average cross-validation accuracy: 0.88
Validation accuracy: 0.72
Evaluating with Random Forest classifier...
Average cross-validation accuracy: 0.9
Validation accuracy: 1.0

--- Clustering with 20 clusters and minkowski distance metric ---
Silhouette Score: 0.11
Evaluating with

In [8]:
# Import necessary libraries
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Load the Olivetti faces dataset
faces = fetch_olivetti_faces()
X = faces.data  # Features (images)
y = faces.target  # Labels (person identifiers)

# Check the shape and type of X
print("Shape of X:", X.shape)
print("Type of X:", type(X))

# Check class distribution
unique, counts = np.unique(y, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution:", class_distribution)

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the number of clusters from the best result (Euclidean distance, 60 clusters)
n_clusters = 60  # or whichever gives you the best silhouette score

# Clustering using Agglomerative Clustering
clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean')
cluster_labels = clustering.fit_predict(X_scaled)

# Train a classifier using K-Fold Cross Validation
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42) 
accuracy_scores = []

# Train SVM Classifier
svm_classifier = svm.SVC(kernel='linear', random_state=42)

for train_index, test_index in skf.split(X_scaled, cluster_labels):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = cluster_labels[train_index], cluster_labels[test_index]
    
    svm_classifier.fit(X_train, y_train)
    y_pred = svm_classifier.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Calculate average accuracy
average_accuracy = np.mean(accuracy_scores)
print(f'Average Accuracy with K-Fold Cross Validation: {average_accuracy:.2f}')


Shape of X: (400, 4096)
Type of X: <class 'numpy.ndarray'>
Class distribution: {0: 10, 1: 10, 2: 10, 3: 10, 4: 10, 5: 10, 6: 10, 7: 10, 8: 10, 9: 10, 10: 10, 11: 10, 12: 10, 13: 10, 14: 10, 15: 10, 16: 10, 17: 10, 18: 10, 19: 10, 20: 10, 21: 10, 22: 10, 23: 10, 24: 10, 25: 10, 26: 10, 27: 10, 28: 10, 29: 10, 30: 10, 31: 10, 32: 10, 33: 10, 34: 10, 35: 10, 36: 10, 37: 10, 38: 10, 39: 10}
Average Accuracy with K-Fold Cross Validation: 0.95
