# Palmer Penguins Dataset: Overview

The Palmer Penguins dataset contains data about three species of penguins observed in the Palmer Archipelago, Antarctica. The dataset was collected by Dr. Kristen Gorman and the Palmer Station LTER (Long Term Ecological Research) Program. It serves as a popular alternative to the Iris dataset for data exploration, statistical analysis, and machine learning practice due to its richer set of features and categorical variables.

Dataset Features:

The dataset consists of 344 rows and 7 columns. The columns are:

1. `species`: Categorical feature indicating the penguin species (Adélie, Chinstrap, Gentoo).
2. `island`: Categorical feature representing the island where the penguin was observed (Biscoe, Dream, Torgersen).
3. `bill_length_mm`: Continuous numerical feature representing the length of the penguin’s bill (in millimeters).
4. `bill_depth_mm`: Continuous numerical feature representing the depth of the penguin’s bill (in millimeters).
5. `flipper_length_mm`: Continuous numerical feature representing the penguin’s flipper length (in millimeters).
6. `body_mass_g`: Continuous numerical feature representing the penguin’s body mass (in grams).
7. `sex`: Categorical feature indicating the penguin’s sex (male or female), though some entries are missing.


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Fetch the 'penguins' example dataset through seaborn, then keep only the
# rows that have no missing values so every later step sees complete records.
penguins_raw = sns.load_dataset('penguins')
data = penguins_raw.dropna()
data.head()

In [None]:
# The species column is the label we want to predict.
y = data['species']

# Everything except the label is a feature; one-hot encode the categorical
# columns so the feature matrix is usable by a distance-based model.
X = pd.get_dummies(data.drop(columns=['species']))

print(X.shape)

In [None]:
# Split into training and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1990,
)

In [None]:
# Define and fit your model!

# Hyperparameters: number of neighbours that vote, and the distance used to
# find them. Both are reused by later cells.
k = 5
metric = 'euclidean'

# Fit on the unscaled training features, then label the held-out rows.
knn = KNeighborsClassifier(n_neighbors=k, metric=metric).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# Model validation: percentage of test rows whose predicted species matches
# the true species (the comparison is element-wise, in test-set row order).
n_correct = (y_pred == y_test).sum()
accuracy = n_correct / len(y_test) * 100
print(accuracy)

In [None]:
## Attempt 2: Rescaling the data

# Learn the per-column mean and scale from the training set only, then apply
# that same transformation to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Refit the same estimator object on the standardized features (this replaces
# the earlier raw-feature fit held by `knn`) and predict the scaled test set.
y_pred_scaled = knn.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Accuracy in percent for the scaled-feature model; overwrites `accuracy`.
n_correct_scaled = (y_pred_scaled == y_test).sum()
accuracy = n_correct_scaled / len(y_test) * 100
print(accuracy)

In [None]:
# Wrap the scaled training matrix in a DataFrame that carries X_train's column
# names and row index, so features can be selected by name below.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Same two features, same species colouring; only the units on the axes differ
# between the left (original) and right (standardized) panel.
panels = [
    (X_train, 'Training: original features'),
    (X_train_scaled, 'Training: scaled features'),
]
for axis, (frame, title) in zip(ax, panels):
    sns.scatterplot(x=frame['bill_length_mm'],
                    y=frame['body_mass_g'], hue=y_train, ax=axis)
    axis.set_title(title)

plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (10, 5))

# ensure X_test_scaled is a DataFrame with the same columns/index as X_test
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns, index = X_test.index)

# The two panels use different prediction vectors as hue. For string labels
# seaborn orders hue levels by first appearance unless told otherwise, so the
# same species could get a different colour in each panel. Pinning the order
# keeps the species -> colour mapping identical on both sides.
species_order = list(np.unique(y_train))

# left: predictions of the model fitted on the original (unscaled) features
sns.scatterplot(x = X_test['bill_length_mm'],
                y = X_test['body_mass_g'], hue = y_pred,
                hue_order = species_order, ax = ax[0])
ax[0].set_title('Predicted species (original features)')

# right: predictions of the model fitted on the standardized features
sns.scatterplot(x = X_test_scaled['bill_length_mm'],
                y = X_test_scaled['body_mass_g'], hue = y_pred_scaled,
                hue_order = species_order, ax = ax[1])
ax[1].set_title('Predicted species (scaled features)')

plt.tight_layout()

In [None]:
# The decision-boundary plot is two-dimensional, so restrict to two features.
features = ['bill_length_mm', 'body_mass_g']

# Plain NumPy views of those two columns: original units ...
Xr_train = X_train[features].to_numpy()
Xr_test = X_test[features].to_numpy()

# ... and standardized units (requires the scaled splits to be DataFrames).
Xs_train = X_train_scaled[features].to_numpy()
Xs_test = X_test_scaled[features].to_numpy()

# Bounding box of every train and test point in original units, padded a
# little on each side so no point sits on the border of the plot.
all_points = np.vstack([Xr_train, Xr_test])
lower = all_points.min(axis=0)
upper = all_points.max(axis=0)
x_min, x_max = lower[0] - 1.0, upper[0] + 1.0
y_min, y_max = lower[1] - 200.0, upper[1] + 200.0

# Grid resolution per axis; the steps differ because the two features live on
# very different numeric ranges (millimetres vs grams).
h_x = 0.2
h_y = 25.0
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h_x),
    np.arange(y_min, y_max, h_y),
)

# Flatten the mesh into one (x, y) row per grid node for model.predict().
grid_points = np.column_stack([xx.ravel(), yy.ravel()])

# Model 1: KNN on the two features in their original units, evaluated on
# every node of the mesh.
knn_raw = KNeighborsClassifier(n_neighbors=k, metric=metric).fit(Xr_train, y_train)
Zr = knn_raw.predict(grid_points)

# Model 2 works in standardized units, so the mesh (built in original units)
# must go through the same transformation. The scaler was fitted on all of
# X_train's columns, so pick out the mean/scale entries of our two features.
column_names = list(X_train.columns)
col_indices = [column_names.index(name) for name in features]
means = scaler.mean_[col_indices]
scales = scaler.scale_[col_indices]
grid_points_scaled = (grid_points - means) / scales

# KNN on the standardized two-feature training data, evaluated on the
# standardized mesh.
knn_scaled = KNeighborsClassifier(n_neighbors=k, metric=metric).fit(Xs_train, y_train)
Zs = knn_scaled.predict(grid_points_scaled)

# convert class labels to integer codes (0, 1, 2, ...) for plotting
classes = np.unique(y_train)
label_to_int = {label: i for i, label in enumerate(classes)}
Zr_int = np.vectorize(label_to_int.get)(Zr).reshape(xx.shape)
Zs_int = np.vectorize(label_to_int.get)(Zs).reshape(xx.shape)
y_test_int = y_test.map(label_to_int)

# pale colours for the background regions, saturated ones for the test points;
# entry i of each colormap belongs to classes[i]
cmap_light = ListedColormap(['#FFEEEE', '#EEFFEE', '#EEEEFF'])
cmap_points = ListedColormap(['#FF0000', '#00AA00', '#0000FF'])

# Pin the colour scale to the class codes. Without this, scatter() normalises
# to the min/max code present in the test set and contourf() picks its own
# levels, so the colours would stop matching the legend whenever a class is
# absent from the data being drawn.
n_classes = len(classes)
class_levels = np.arange(n_classes + 1) - 0.5  # one band centred on each code

fig, axs = plt.subplots(1, 2, figsize=(12, 5))

# Both panels are drawn in the original feature units; only the model that
# produced the background regions differs (raw vs scaled features).
panels = [
    (axs[0], Zr_int, 'KNN decision boundary (raw features)'),
    (axs[1], Zs_int, 'KNN decision boundary (scaled features)'),
]
for panel_ax, regions, panel_title in panels:
    panel_ax.contourf(xx, yy, regions, levels=class_levels, cmap=cmap_light, alpha=0.6)
    # test points coloured by their TRUE species, on top of the predicted regions
    panel_ax.scatter(Xr_test[:, 0], Xr_test[:, 1], c=y_test_int, cmap=cmap_points,
                     vmin=0, vmax=n_classes - 1, edgecolor='k', s=50)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('bill_length_mm')
    panel_ax.set_ylabel('body_mass_g')

# legend with class names (proxy markers, one per class, in code order)
handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=cls,
               markerfacecolor=cmap_points(i), markersize=8, markeredgecolor='k')
    for i, cls in enumerate(classes)
]
axs[1].legend(handles=handles, title='species', loc='upper left')

plt.tight_layout()
plt.show()