In [1]:
from sklearn.datasets import fetch_lfw_people

# Fetch the Labeled Faces in the Wild (LFW) people dataset.
# min_faces_per_person and resize are passed straight to the loader; see the
# scikit-learn docs for their exact semantics.
lfw_dataset = fetch_lfw_people(
    resize=0.4,
    min_faces_per_person=100,
)

# Sanity-check what was loaded: one (label, value) row per printed line.
dataset_summary = [
    ("Image shape:", lfw_dataset.images.shape),  # (n_samples, height, width)
    ("Number of samples:", len(lfw_dataset.data)),
    ("Number of classes:", len(lfw_dataset.target_names)),
    ("Class names:", lfw_dataset.target_names),
]
for label, value in dataset_summary:
    print(label, value)


Image shape: (1140, 50, 37)
Number of samples: 1140
Number of classes: 5
Class names: ['Colin Powell' 'Donald Rumsfeld' 'George W Bush' 'Gerhard Schroeder'
 'Tony Blair']


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features and labels as exposed by the loaded dataset object.
X = lfw_dataset.data  # Features (flattened images)
y = lfw_dataset.target  # Labels (person IDs)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test images' per-pixel mean/variance leak into the training features and
# make later test scores optimistic. test_size and random_state are the same
# as before, so the split sizes are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Retained so any later cell that references X_scaled keeps working; it is now
# produced by the train-fitted scaler rather than one fitted on all samples.
X_scaled = scaler.transform(X)

# Print the split sizes
print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))


Training set size: 798
Testing set size: 342
