In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load dataset
data = load_wine()
X = data.data
y = data.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline with PCA and KNN
model = Pipeline([
    ('scaler', StandardScaler()),      # Step 1: Standardize features
    ('pca', PCA(n_components=5)),      # Step 2: Reduce dimensions to 5 components
    ('knn', KNeighborsClassifier(n_neighbors=3))  # Step 3: Apply KNN
])

# Train the model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy with PCA:", round(accuracy, 3))


KNN Accuracy with PCA: 0.944


In [2]:
# Import libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load dataset
data = load_wine()
X = data.data
y = data.target

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------
# 1. KNN WITHOUT SCALING
# -------------------------
knn_unscaled = KNeighborsClassifier(n_neighbors=5)
knn_unscaled.fit(X_train, y_train)
y_pred_unscaled = knn_unscaled.predict(X_test)
accuracy_unscaled = accuracy_score(y_test, y_pred_unscaled)

# -------------------------
# 2. KNN WITH SCALING
# -------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(X_train_scaled, y_train)
y_pred_scaled = knn_scaled.predict(X_test_scaled)
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)

# -------------------------
# Print Results
# -------------------------
print("KNN Accuracy without Scaling:", round(accuracy_unscaled, 3))
print("KNN Accuracy with Scaling:   ", round(accuracy_scaled, 3))


KNN Accuracy without Scaling: 0.722
KNN Accuracy with Scaling:    0.944


In [3]:
# Import libraries
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

# Load the dataset
data = load_wine()
X = data.data

# Step 1: Standardize the data (important before PCA)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Step 3: Print explained variance ratio
explained_variance = pca.explained_variance_ratio_

# Display results in a table format
pca_results = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance))],
    'Explained Variance Ratio': explained_variance
})

print(pca_results)
print("\nTotal Variance Explained:", round(sum(explained_variance), 3))


   Principal Component  Explained Variance Ratio
0                  PC1                  0.361988
1                  PC2                  0.192075
2                  PC3                  0.111236
3                  PC4                  0.070690
4                  PC5                  0.065633
5                  PC6                  0.049358
6                  PC7                  0.042387
7                  PC8                  0.026807
8                  PC9                  0.022222
9                 PC10                  0.019300
10                PC11                  0.017368
11                PC12                  0.012982
12                PC13                  0.007952

Total Variance Explained: 1.0


In [4]:
# Import libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load dataset
data = load_wine()
X = data.data
y = data.target

# Split the original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------
# 1. KNN on ORIGINAL dataset
# -------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_original = KNeighborsClassifier(n_neighbors=5)
knn_original.fit(X_train_scaled, y_train)
y_pred_original = knn_original.predict(X_test_scaled)
accuracy_original = accuracy_score(y_test, y_pred_original)

# -------------------------
# 2. KNN on PCA-transformed dataset (Top 2 components)
# -------------------------
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

knn_pca = KNeighborsClassifier(n_neighbors=5)
knn_pca.fit(X_train_pca, y_train)
y_pred_pca = knn_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)

# -------------------------
# Print Results
# -------------------------
print("KNN Accuracy on ORIGINAL dataset:", round(accuracy_original, 3))
print("KNN Accuracy on PCA-transformed dataset (2 components):", round(accuracy_pca, 3))


KNN Accuracy on ORIGINAL dataset: 0.944
KNN Accuracy on PCA-transformed dataset (2 components): 1.0


In [5]:
# Import libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load dataset
data = load_wine()
X = data.data
y = data.target

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------
# 1. KNN with Euclidean distance (default)
# -------------------------
knn_euclidean = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn_euclidean.fit(X_train_scaled, y_train)
y_pred_euclidean = knn_euclidean.predict(X_test_scaled)
accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)

# -------------------------
# 2. KNN with Manhattan distance
# -------------------------
knn_manhattan = KNeighborsClassifier(n_neighbors=5, metric='manhattan')
knn_manhattan.fit(X_train_scaled, y_train)
y_pred_manhattan = knn_manhattan.predict(X_test_scaled)
accuracy_manhattan = accuracy_score(y_test, y_pred_manhattan)

# -------------------------
# Print Results
# -------------------------
print("KNN Accuracy with Euclidean distance:", round(accuracy_euclidean, 3))
print("KNN Accuracy with Manhattan distance:", round(accuracy_manhattan, 3))


KNN Accuracy with Euclidean distance: 0.944
KNN Accuracy with Manhattan distance: 0.944


In [6]:
# Import libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Simulate high-dimensional gene expression dataset
X, y = make_classification(n_samples=100, n_features=500, n_informative=50,
                           n_redundant=450, n_classes=3, random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Apply PCA (retain 95% variance)
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("Original number of features:", X.shape[1])
print("Reduced number of features:", X_train_pca.shape[1])

# Step 3: Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_pca, y_train)

# Step 4: Predictions
y_pred = knn.predict(X_test_pca)

# Step 5: Evaluate the model
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 6: Cross-validation for robustness
cv_scores = cross_val_score(knn, X_train_pca, y_train, cv=5)
print("\nCross-validation Accuracy:", round(cv_scores.mean(), 3))


Original number of features: 500
Reduced number of features: 35

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         6
           1       0.73      1.00      0.84         8
           2       1.00      1.00      1.00         6

    accuracy                           0.85        20
   macro avg       0.91      0.83      0.84        20
weighted avg       0.89      0.85      0.84        20


Confusion Matrix:
 [[3 3 0]
 [0 8 0]
 [0 0 6]]

Cross-validation Accuracy: 0.575
