21. Train a KNN Classifier on the Iris dataset and print model accuracy.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fetch the Iris bunch and pull out the feature matrix and the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Reserve 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build a 3-nearest-neighbour classifier and fit it in one expression
# (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Label the held-out rows, then report the share of correct predictions.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


22. Train a KNN Regressor on a synthetic dataset and evaluate using Mean Squared Error (MSE).

In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Seeded synthetic regression problem: 200 samples, a single feature, noise=10.
X, y = make_regression(n_samples=200, n_features=1, noise=10, random_state=42)

# Keep 20% of the samples aside for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Construct the 3-neighbour regressor and fit it (fit() returns the estimator).
knn_regressor = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)

# Predict the held-out targets and measure the mean squared error against the truth.
y_pred = knn_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")


23. Train a KNN Classifier using different distance metrics (Euclidean and Manhattan) and compare accuracy.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Feature matrix and labels from the Iris bunch.
iris = load_iris()
X = iris.data
y = iris.target

# One 80/20 split shared by both models so the comparison is like-for-like.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit one 3-NN classifier per distance metric.
knn_euclidean = KNeighborsClassifier(n_neighbors=3, metric='euclidean').fit(X_train, y_train)
knn_manhattan = KNeighborsClassifier(n_neighbors=3, metric='manhattan').fit(X_train, y_train)

# Predict the same test rows with each model.
y_pred_euclidean = knn_euclidean.predict(X_test)
y_pred_manhattan = knn_manhattan.predict(X_test)

# Score both sets of predictions against the true labels.
accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
accuracy_manhattan = accuracy_score(y_test, y_pred_manhattan)

# Report the two accuracies side by side.
print(f"Accuracy with Euclidean distance: {accuracy_euclidean * 100:.2f}%")
print(f"Accuracy with Manhattan distance: {accuracy_manhattan * 100:.2f}%")


24. Train a KNN Classifier with different values of K and visualize decision boundaries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data[:, :2]  # Only take the first two features for 2D plotting
y = iris.target
h = .02  # step size in the mesh

# Create color maps: pale colours for the predicted regions, saturated ones for the points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold  = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# K values to compare (one subplot per value)
k_values = [1, 3, 5]

# The mesh depends only on the range of the data, not on K, so it is built once
# here instead of being recomputed on every loop iteration.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
grid_points = np.c_[xx.ravel(), yy.ravel()]  # one (x, y) row per mesh node

# squeeze=False keeps `axes` two-dimensional even if k_values has a single entry.
fig, axes = plt.subplots(1, len(k_values), figsize=(18, 5), squeeze=False)

for ax, k in zip(axes[0], k_values):
    # Create KNN classifier and fit
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)

    # Predict the class of every mesh node and fold the result back into grid shape
    Z = clf.predict(grid_points).reshape(xx.shape)

    # Plot the predicted regions, then overlay the training and test points
    ax.pcolormesh(xx, yy, Z, cmap=cmap_light)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold, edgecolor='k', s=40, label="Train")
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap_bold, edgecolor='k', s=100, marker='*', label="Test")
    ax.set_title(f"K = {k}")
    ax.set_xlabel("Sepal length")
    ax.set_ylabel("Sepal width")
    ax.legend()

fig.tight_layout()
plt.show()


25. Apply Feature Scaling before training a KNN model and compare results with unscaled data.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Feature matrix and labels from the Iris bunch.
iris = load_iris()
X = iris.data
y = iris.target

# 70/30 split used by both the unscaled and the scaled model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Baseline: 3-NN on the raw features ---
knn_unscaled = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred_unscaled = knn_unscaled.predict(X_test)
accuracy_unscaled = accuracy_score(y_test, y_pred_unscaled)

# --- Standardise: statistics are learned from the training rows only and
#     then applied unchanged to the test rows ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Same 3-NN model, this time on the standardised features ---
knn_scaled = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
y_pred_scaled = knn_scaled.predict(X_test_scaled)
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)

# --- Side-by-side report ---
print(f"Accuracy without Scaling: {accuracy_unscaled * 100:.2f}%")
print(f"Accuracy with Scaling:    {accuracy_scaled * 100:.2f}%")


26. Train a PCA model on synthetic data and print the explained variance ratio for each component.

In [None]:
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
import numpy as np

# Seeded synthetic classification data: 200 samples, 5 features, 3 of them informative.
X, y = make_classification(
    n_samples=200, n_features=5, n_informative=3, random_state=42
)

# PCA with no n_components argument; fit and project in one call.
pca = PCA()
X_pca = pca.fit_transform(X)

# Report each component's share of the variance, as a fraction and as a percentage.
explained_variance = pca.explained_variance_ratio_
for i in range(len(explained_variance)):
    ratio = explained_variance[i]
    print(f"Principal Component {i+1}: {ratio:.4f} ({ratio * 100:.2f}% of variance explained)")


27. Apply PCA before training a KNN Classifier and compare accuracy with and without PCA.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Feature matrix and labels from the Iris bunch.
iris = load_iris()
X = iris.data
y = iris.target

# 70/30 split shared by both models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Project onto 2 principal components; the projection is fitted on the
# training rows and then applied to the test rows.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# ----- 3-NN on the standardised features (no PCA) -----
knn_no_pca = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, y_train)
y_pred_no_pca = knn_no_pca.predict(X_test_scaled)
accuracy_no_pca = accuracy_score(y_test, y_pred_no_pca)

# ----- 3-NN on the 2-component projection -----
knn_with_pca = KNeighborsClassifier(n_neighbors=3).fit(X_train_pca, y_train)
y_pred_pca = knn_with_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)

# ----- Report -----
print(f"Accuracy without PCA: {accuracy_no_pca * 100:.2f}%")
print(f"Accuracy with PCA (2 components): {accuracy_pca * 100:.2f}%")


28. Perform Hyperparameter Tuning on a KNN Classifier using GridSearchCV

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split into train/test sets; the test rows are kept out of the search entirely
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define KNN classifier (its hyperparameters are supplied by the grid below)
knn = KNeighborsClassifier()

# Define hyperparameters to tune.
# 'minkowski' is deliberately not listed as a metric: `p` is left at its
# default of 2 here, and Minkowski distance with p=2 is the Euclidean
# distance, so that entry would only repeat every 'euclidean' candidate and
# add a third more fits without exploring anything new.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'metric': ['euclidean', 'manhattan'],
    'weights': ['uniform', 'distance']
}

# Perform GridSearchCV: 5-fold cross-validation on the training split, ranked by accuracy
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters from GridSearchCV
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# No extra training step is needed: with GridSearchCV's default refit=True,
# best_estimator_ has already been refit on the whole training split.
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the best KNN model: {accuracy * 100:.2f}%")


29. Train a KNN Classifier and check the number of misclassified samples.

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix and labels from the Iris bunch.
iris = load_iris()
X = iris.data
y = iris.target

# 70/30 train/test split with a pinned seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a 3-NN classifier (fit() returns the estimator) and label the test rows.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Element-wise mismatch mask; summing the booleans counts the wrong predictions.
mismatch_mask = y_test != y_pred
misclassified = mismatch_mask.sum()

print(f"Number of misclassified samples: {misclassified}")


30. Train a PCA model and visualize the cumulative explained variance.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Iris features, labels and feature names (any multi-feature dataset would do).
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names

# Standardise first: PCA is sensitive to the scale of the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_components=None keeps every component; fit() returns the estimator itself.
pca = PCA(n_components=None).fit(X_scaled)

# Per-component share of the variance and its running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()

# Line chart of the running total against the number of components kept.
component_counts = range(1, len(cumulative_explained_variance) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='-')
plt.title('Cumulative Explained Variance by Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.xticks(component_counts)
plt.yticks(np.arange(0, 1.1, 0.1))
plt.grid(True)
plt.show()

print("Explained variance ratio of each component:", explained_variance_ratio)
print("Cumulative explained variance:", cumulative_explained_variance)

31. Train a KNN Classifier using different values of the weights parameter (uniform vs. distance) and compare accuracy.

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Feature matrix and labels from the Iris bunch.
iris = load_iris()
X, y = iris.data, iris.target

# 70/30 split shared by both weighting schemes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Neighbourhood size used by both models.
n_neighbors = 5

# --- Uniform weights: fit, predict, score, report ---
knn_uniform = KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform').fit(X_train, y_train)
y_pred_uniform = knn_uniform.predict(X_test)
accuracy_uniform = accuracy_score(y_test, y_pred_uniform)

print(f"KNN with uniform weights (k={n_neighbors}):")
print(f"  Accuracy on the test set: {accuracy_uniform:.4f}")

# --- Distance weights: fit, predict, score, report ---
knn_distance = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance').fit(X_train, y_train)
y_pred_distance = knn_distance.predict(X_test)
accuracy_distance = accuracy_score(y_test, y_pred_distance)

print(f"\nKNN with distance weights (k={n_neighbors}):")
print(f"  Accuracy on the test set: {accuracy_distance:.4f}")

# --- Verdict: handle the tie first, then whichever scheme scored higher ---
print("\nComparison:")
if accuracy_uniform == accuracy_distance:
    print(f"Both uniform and distance weights achieved the same accuracy of {accuracy_uniform:.4f}.")
elif accuracy_uniform > accuracy_distance:
    print(f"Uniform weights performed better with an accuracy of {accuracy_uniform:.4f} compared to distance weights ({accuracy_distance:.4f}).")
else:
    print(f"Distance weights performed better with an accuracy of {accuracy_distance:.4f} compared to uniform weights ({accuracy_uniform:.4f}).")

32. Train a KNN Regressor and analyze the effect of different K values on performance.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Seeded synthetic regression data (200 samples, one feature, noise=20), split 70/30.
X, y = make_regression(n_samples=200, n_features=1, noise=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Candidate neighbourhood sizes: K = 1 .. 20.
k_values = range(1, 21)


def _plot_against_k(values, color, title, ylabel):
    """Draw one line chart of `values` (one entry per K) over the module-level `k_values`."""
    plt.figure(figsize=(12, 6))
    plt.plot(k_values, values, marker='o', linestyle='-', color=color)
    plt.title(title)
    plt.xlabel('Number of Neighbors (K)')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.xticks(k_values)
    plt.tight_layout()
    plt.show()


# --- Hold-out evaluation: fit one regressor per K and score it on the test split ---
mse_scores = []
r2_scores = []
for k in k_values:
    knn_regressor = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn_regressor.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mse_scores.append(mse)
    r2_scores.append(r2)

# --- Test-set MSE and R² as functions of K ---
_plot_against_k(mse_scores, 'red', 'Effect of K on Mean Squared Error', 'Mean Squared Error (MSE)')
_plot_against_k(r2_scores, 'green', 'Effect of K on R-squared (R²)', 'R-squared (R²)')

# --- Same numbers as a text table ---
print("Performance metrics for different K values:")
for i, k in enumerate(k_values):
    print(f"K={k}: MSE={mse_scores[i]:.4f}, R²={r2_scores[i]:.4f}")

# --- Cross-validated K selection on the training split only ---
print("\n--- Analyzing K using Cross-Validation ---")
cv_scores = []
for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k)
    # scoring='neg_mean_squared_error' yields negated MSE per fold (higher is
    # better), so the fold mean is negated again to get a positive MSE.
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    mse_cv = -scores.mean()
    cv_scores.append(mse_cv)

# The K with the smallest cross-validated MSE wins.
optimal_k_cv = k_values[np.argmin(cv_scores)]
print(f"Optimal K (based on cross-validation MSE): {optimal_k_cv}")

_plot_against_k(cv_scores, 'blue', 'Cross-Validation Mean Squared Error vs. K',
                'Cross-Validation Mean Squared Error')

# --- Final model: refit with the selected K and score it once on the test split ---
final_knn = KNeighborsRegressor(n_neighbors=optimal_k_cv).fit(X_train, y_train)
final_y_pred = final_knn.predict(X_test)
final_mse = mean_squared_error(y_test, final_y_pred)
final_r2 = r2_score(y_test, final_y_pred)

print(f"\nPerformance of KNN Regressor with optimal K ({optimal_k_cv}):")
print(f"  MSE on Test Set: {final_mse:.4f}")
print(f"  R² on Test Set: {final_r2:.4f}")

33. Implement KNN Imputation for handling missing values in a dataset.

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Create a sample dataset with missing values
data = {
    'A': [1, 2, np.nan, 4, 5, np.nan, 7, 8],
    'B': [np.nan, 10, 11, 12, np.nan, 14, 15, 16],
    'C': [20, 21, 22, np.nan, 24, 25, np.nan, 27],
    'D': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
}
df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:")
print(df)

# 2. Separate the columns that contain NaNs from the fully observed ones.
#    Only the former are handed to the imputer below, so the fully observed
#    columns take no part in the neighbour search.
missing_cols = df.columns[df.isnull().any()].tolist()
non_missing_cols = df.columns[df.notnull().all()].tolist()

df_missing = df[missing_cols].copy()
df_non_missing = df[non_missing_cols].copy()

# 3. Initialize the KNNImputer
# n_neighbors: The number of neighboring samples to use for imputation.
# weights: Weight function used in prediction. Possible values:
#          - 'uniform': uniform weights. All neighbors weights equally.
#          - 'distance': weight points by the inverse of their distance.
# missing_values: The placeholder for the missing values. Defaults to np.nan.
imputer = KNNImputer(n_neighbors=3, weights='uniform')

# 4. Fit the imputer to the features with missing values and transform
imputed_array = imputer.fit_transform(df_missing)

# 5. Wrap the imputed array in a DataFrame. The original row index is carried
#    over so that the concat in step 6 lines rows up by label even when the
#    source frame does not use the default 0..n-1 index.
df_imputed = pd.DataFrame(imputed_array, columns=df_missing.columns, index=df_missing.index)
print("\nDataFrame with KNN Imputation (n_neighbors=3, uniform weights):")
print(df_imputed)

# 6. Combine the imputed features with the non-missing features (if any)
if not df_non_missing.empty:
    df_final = pd.concat([df_imputed, df_non_missing], axis=1)
else:
    df_final = df_imputed

print("\nFinal DataFrame after Imputation:")
print(df_final)

# 7. Show how the imputed values change with K (qualitative comparison only)
print("\n--- Analyzing the effect of different K values ---")
k_values = [1, 3, 5, 7]
imputed_dfs = {}

for k in k_values:
    imputer_k = KNNImputer(n_neighbors=k, weights='uniform')
    imputed_array_k = imputer_k.fit_transform(df_missing)
    imputed_dfs[f'K={k}'] = pd.DataFrame(imputed_array_k, columns=df_missing.columns,
                                         index=df_missing.index)
    print(f"\nDataFrame with KNN Imputation (n_neighbors={k}, uniform weights):")
    print(imputed_dfs[f'K={k}'])

# 8. Quantitative comparison of K values against known ground truth.
#    The sample frame above cannot serve as ground truth: the true values of
#    its missing cells are unknown (they are NaN). A small fully observed
#    frame is therefore generated here, a few of its known cells are hidden,
#    and the imputed values are scored against the values that were hidden.
print("\n--- Advanced Analysis (Requires a complete dataset) ---")
rng = np.random.default_rng(42)  # seeded so the reported errors are reproducible
n_rows = 30
base = np.arange(1, n_rows + 1, dtype=float)
df_complete = pd.DataFrame({
    'A': base + rng.normal(scale=0.5, size=n_rows),
    'B': base + 8 + rng.normal(scale=0.5, size=n_rows),
    'C': base + 19 + rng.normal(scale=0.5, size=n_rows),
    'D': base / 10,
})

# Cells to hide, as (row label, column) pairs; every one of them has a known true value.
missing_indices = [(0, 'B'), (2, 'A'), (3, 'C'), (4, 'B'), (6, 'C')]
true_values = {(row, col): df_complete.loc[row, col] for row, col in missing_indices}

df_missing_artificial = df_complete.copy()
for row, col in missing_indices:
    df_missing_artificial.loc[row, col] = np.nan

print("\nArtificially Created Missing Data:")
print(df_missing_artificial)

mse_per_k = {}
k_values_analysis = [1, 3, 5, 7]
originals = list(true_values.values())

for k in k_values_analysis:
    imputer_analysis = KNNImputer(n_neighbors=k, weights='uniform')
    df_imputed_analysis = pd.DataFrame(imputer_analysis.fit_transform(df_missing_artificial),
                                       columns=df_missing_artificial.columns,
                                       index=df_missing_artificial.index)

    # Read back only the cells that were hidden, in the same order as `originals`.
    predictions = [df_imputed_analysis.loc[row, col] for row, col in true_values]

    mse = mean_squared_error(originals, predictions)
    mse_per_k[f'K={k}'] = mse
    print(f"\nMean Squared Error for K={k}: {mse}")

print("\nMSE for different K values (compared to true values):")
print(mse_per_k)

34. Train a PCA model and visualize the data projection onto the first two principal components.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Iris features, labels, and the class / feature names.
iris = load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names
feature_names = iris.feature_names

# Standardise the features before PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit a 2-component PCA and project the standardised data in one call.
n_components = 2
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Scatter the projection, one colour and legend entry per class label.
plt.figure(figsize=(10, 8))
colors = ['r', 'g', 'b']
lw = 2

for i, (color, target_name) in enumerate(zip(colors, target_names)):
    class_mask = y == i  # rows belonging to class i
    plt.scatter(X_pca[class_mask, 0], X_pca[class_mask, 1], color=color, alpha=.8, lw=lw,
                label=target_name)

# Axis labels carry each component's share of the variance.
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)')
plt.title('PCA of Iris Dataset onto the First Two Principal Components')
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.grid(True)
plt.show()

# Same variance shares as raw numbers.
print("Explained variance ratio of the first two principal components:", pca.explained_variance_ratio_)

35. Train a KNN Classifier using the KD Tree and Ball Tree algorithms and compare performance.

In [None]:
import numpy as np
import time
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the number of neighbors (k)
n_neighbors = 5


def _fit_and_time(algorithm):
    """Fit one KNN classifier with the given neighbour-search algorithm and time it.

    Durations are measured with time.perf_counter(), the clock intended for
    measuring short intervals; time.time() is a wall clock that can be coarse
    or adjusted while the code runs.

    Args:
        algorithm: value passed to KNeighborsClassifier(algorithm=...),
            e.g. 'kd_tree' or 'ball_tree'.

    Returns:
        (model, predictions, train_time, predict_time, accuracy), where the
        times are in seconds and accuracy is measured on the test split.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=algorithm)

    started = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - started

    started = time.perf_counter()
    predictions = model.predict(X_test)
    predict_time = time.perf_counter() - started

    return model, predictions, train_time, predict_time, accuracy_score(y_test, predictions)


# NOTE: each timing is a single run on a small dataset, so the figures are
# noisy and should be read as indicative only.

# --- KNN Classifier with KD Tree ---
knn_kd_tree, y_pred_kd, train_time_kd, predict_time_kd, accuracy_kd = _fit_and_time('kd_tree')

print("KNN Classifier with KD Tree:")
print(f"  Training Time: {train_time_kd:.4f} seconds")
print(f"  Prediction Time: {predict_time_kd:.4f} seconds")
print(f"  Accuracy: {accuracy_kd:.4f}")

# --- KNN Classifier with Ball Tree ---
knn_ball_tree, y_pred_ball, train_time_ball, predict_time_ball, accuracy_ball = _fit_and_time('ball_tree')

print("\nKNN Classifier with Ball Tree:")
print(f"  Training Time: {train_time_ball:.4f} seconds")
print(f"  Prediction Time: {predict_time_ball:.4f} seconds")
print(f"  Accuracy: {accuracy_ball:.4f}")

# --- Comparison ---
print("\nComparison:")
print(f"  Accuracy (KD Tree): {accuracy_kd:.4f}")
print(f"  Accuracy (Ball Tree): {accuracy_ball:.4f}")
print(f"  Training Time (KD Tree): {train_time_kd:.4f} seconds")
print(f"  Training Time (Ball Tree): {train_time_ball:.4f} seconds")
print(f"  Prediction Time (KD Tree): {predict_time_kd:.4f} seconds")
print(f"  Prediction Time (Ball Tree): {predict_time_ball:.4f} seconds")

if accuracy_kd == accuracy_ball:
    print("\nBoth KD Tree and Ball Tree achieved the same accuracy.")
elif accuracy_kd > accuracy_ball:
    print("\nKD Tree achieved a higher accuracy.")
else:
    print("\nBall Tree achieved a higher accuracy.")

if train_time_kd < train_time_ball:
    print("KD Tree trained faster.")
elif train_time_ball < train_time_kd:
    print("Ball Tree trained faster.")
else:
    print("KD Tree and Ball Tree had similar training times.")

if predict_time_kd < predict_time_ball:
    print("KD Tree predicted faster.")
elif predict_time_ball < predict_time_kd:
    print("Ball Tree predicted faster.")
else:
    print("KD Tree and Ball Tree had similar prediction times.")

36. Train a PCA model on a high-dimensional dataset and visualize the Scree plot.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification

# Step 1: Seeded synthetic data with 500 samples and 50 features (30 informative);
# the labels are not needed for PCA and are discarded.
X, _ = make_classification(n_samples=500, n_features=50, n_informative=30, random_state=42)

# Step 2: Fit PCA with no n_components argument (fit() returns the estimator).
pca = PCA().fit(X)

# Step 3: Per-component share of the total variance.
explained_variance_ratio = pca.explained_variance_ratio_

# Step 4: Scree plot of variance share against the 1-based component number.
component_numbers = range(1, len(explained_variance_ratio) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_numbers, explained_variance_ratio, marker='o', linestyle='-')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
# Tick every second component to keep the axis readable.
plt.xticks(component_numbers[::2])
plt.grid(True)
plt.tight_layout()
plt.show()


37. Train a KNN Classifier and evaluate performance using Precision, Recall, and F1-Score.

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Step 1: Seeded two-class synthetic dataset (1000 samples, 20 features:
# 15 informative + 5 redundant).
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)

# Step 2: 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Fit a 5-NN classifier (fit() returns the estimator).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Step 4: Label the held-out rows.
y_pred = knn.predict(X_test)

# Step 5: Score the predictions; the per-class breakdown is built alongside
# the three single-number metrics.
report = classification_report(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# Display metrics
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print("\nClassification Report:\n", report)


38. Train a PCA model and analyze the effect of different numbers of components on accuracy.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Step 1: Seeded two-class synthetic dataset with 50 features, 30 of them informative.
X, y = make_classification(
    n_samples=1000,
    n_features=50,
    n_informative=30,
    n_classes=2,
    random_state=42,
)

# Step 2: 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: For every component count from 1 to 50, project with PCA and score a 5-NN model.
components_range = range(1, 51)
accuracies = []

for n_components in components_range:
    # The projection is learned from the training rows and reused for the test rows.
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Fit on the projected training rows, then label the projected test rows.
    knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_pca, y_train)
    y_pred = knn.predict(X_test_pca)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Step 4: Accuracy as a function of the number of components kept.
plt.figure(figsize=(10, 6))
plt.plot(components_range, accuracies, marker='o')
plt.title('Effect of PCA Components on Classification Accuracy')
plt.xlabel('Number of PCA Components')
plt.ylabel('Accuracy')
plt.grid(True)
plt.tight_layout()
plt.show()


39. Train a KNN Classifier with different leaf_size values and compare accuracy.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Step 1: Generate a classification dataset
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=15, n_classes=2, random_state=42
)

# Step 2: Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: Try different leaf_size values (5, 10, ..., 50)
leaf_sizes = range(5, 55, 5)
accuracies = []

for leaf_size in leaf_sizes:
    # leaf_size is only passed on to the tree-based neighbour searches
    # (BallTree / KDTree). With the default algorithm='auto' scikit-learn is
    # free to choose brute-force search, in which case leaf_size would have no
    # effect at all, so a tree algorithm is requested explicitly here.
    knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', leaf_size=leaf_size)
    knn.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

# Step 4: Plot accuracy vs leaf_size.
# leaf_size is documented as affecting the speed of tree construction/query and
# the memory used, not which neighbours are found, so a flat line is the
# expected outcome of this experiment.
plt.figure(figsize=(10, 6))
plt.plot(leaf_sizes, accuracies, marker='o', linestyle='-')
plt.title('KNN Accuracy vs. Leaf Size')
plt.xlabel('Leaf Size')
plt.ylabel('Accuracy')
plt.grid(True)
plt.tight_layout()
plt.show()


40. Train a PCA model and visualize how data points are transformed before and after PCA.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Build a synthetic two-class dataset with 10 features
X, y = make_classification(
    n_samples=500,
    n_features=10,
    n_informative=5,
    n_classes=2,
    random_state=42,
)

# Step 2: Standardize the features before applying PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Project the standardized data onto 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Steps 4-5: Draw both views side by side.
# Each entry is (points, title, x-axis label, y-axis label); the left panel
# shows the first two standardized features, the right panel the PCA output.
panels = [
    (X_scaled, 'Original Data (First 2 Features)', 'Feature 1', 'Feature 2'),
    (X_pca, 'Data After PCA (2 Components)',
     'Principal Component 1', 'Principal Component 2'),
]

plt.figure(figsize=(12, 5))
for position, (points, panel_title, x_label, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(points[:, 0], points[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)

plt.tight_layout()
plt.show()


41. Train a KNN Classifier on a real-world dataset (Wine dataset) and print classification report.

In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Step 1: Load the Wine dataset and separate features from labels
wine = load_wine()
X = wine.data
y = wine.target

# Step 2: Hold out 30% of the samples, keeping class proportions (stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Step 3: Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train a 5-nearest-neighbour classifier on the scaled training data
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Step 5: Predict the test split and summarise per-class metrics
y_pred = knn.predict(X_test_scaled)
report = classification_report(
    y_test,
    y_pred,
    target_names=wine.target_names,
)

print("Classification Report:\n")
print(report)


42. Train a KNN Regressor and analyze the effect of different distance metrics on prediction error.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Step 1: Build a synthetic regression problem
X, y = make_regression(n_samples=500, n_features=10, noise=10, random_state=42)

# Step 2: Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 3: Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Fit one 5-neighbour regressor per distance metric and record its
# test-set MSE; mse_scores is ordered like distance_metrics.
distance_metrics = ['euclidean', 'manhattan', 'chebyshev']
mse_scores = [
    mean_squared_error(
        y_test,
        KNeighborsRegressor(n_neighbors=5, metric=metric_name)
        .fit(X_train_scaled, y_train)
        .predict(X_test_scaled),
    )
    for metric_name in distance_metrics
]

# Step 5: Compare the metrics in a bar chart
plt.figure(figsize=(8, 5))
plt.bar(distance_metrics, mse_scores, color='skyblue')
plt.title('Effect of Distance Metric on KNN Regressor Performance')
plt.xlabel('Distance Metric')
plt.ylabel('Mean Squared Error (MSE)')
plt.grid(axis='y')
plt.tight_layout()
plt.show()


43. Train a KNN Classifier and evaluate using ROC-AUC score.

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Step 1: Build a synthetic two-class dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=2,
    random_state=42,
)

# Step 2: Hold out 30% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 3: Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Train a 5-nearest-neighbour classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Step 5: Score the test split with the predicted probability of the
# positive class (second column of predict_proba) and compute ROC-AUC
class_probabilities = knn.predict_proba(X_test_scaled)
y_proba = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

# Step 6: Draw the ROC curve together with the diagonal reference line
fpr, tpr, _ = roc_curve(y_test, y_proba)
diagonal = [0, 1]

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot(diagonal, diagonal, color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN Classifier ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 7: Report the score numerically as well
print(f"ROC-AUC Score: {roc_auc:.4f}")


44. Train a PCA model and visualize the variance captured by each principal component.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Build synthetic 20-feature data (labels are not needed here)
X, _ = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=15,
    random_state=42,
)

# Step 2: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Fit PCA without limiting the number of components
pca = PCA()
pca.fit(X_scaled)

# Step 4: Per-component share of variance and its running total
explained_variance_ratio = pca.explained_variance_ratio_
running_total = np.cumsum(explained_variance_ratio)
component_numbers = range(1, len(explained_variance_ratio) + 1)  # 1-based x axis

# Step 5: Bars show each component's share, the line shows the cumulative share
plt.figure(figsize=(10, 6))
plt.bar(component_numbers, explained_variance_ratio, color='skyblue')
plt.plot(
    component_numbers,
    running_total,
    marker='o',
    color='orange',
    label='Cumulative Variance',
)
plt.title('Variance Captured by Each Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


45. Train a KNN Classifier and perform feature selection before training.

In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Step 1: Load a real-world dataset (Wine dataset)
data = load_wine()
X, y = data.data, data.target

# Step 2: Split into train and test sets (stratified to keep class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 3: Standardize features (scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Perform feature selection (select top k features by ANOVA F-score)
k = 8  # You can adjust this value
selector = SelectKBest(score_func=f_classif, k=k)
# The selector is fitted on the training split only and then reused on the
# test split, so no test information leaks into the feature ranking.
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Step 5: Train KNN classifier on the reduced feature set
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_selected, y_train)

# Step 6: Predict and evaluate
y_pred = knn.predict(X_test_selected)
print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Optional: Show selected feature names
selected_features = selector.get_support(indices=True)
# Map the selected column indices back to their names; printing
# data.feature_names directly would list every feature, not just the kept ones.
selected_feature_names = [data.feature_names[i] for i in selected_features]
print("Selected feature indices:", selected_features)
print("Selected feature names:", selected_feature_names)


46. Train a PCA model and visualize the data reconstruction error after reducing dimensions.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Step 1: Build synthetic 20-feature data (labels are not needed here)
X, _ = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=15,
    random_state=42,
)

# Step 2: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Step 3: Measure the reconstruction error for every possible component count
def reconstruction_mse(samples, n_components):
    """Return the MSE between `samples` and their PCA round-trip.

    A PCA with `n_components` components is fitted on `samples`, the data is
    projected into the reduced space and mapped back with inverse_transform,
    and the mean squared error against the input is returned.
    """
    reducer = PCA(n_components=n_components)
    restored = reducer.inverse_transform(reducer.fit_transform(samples))
    return mean_squared_error(samples, restored)


components_list = range(1, X_scaled.shape[1] + 1)
reconstruction_errors = [
    reconstruction_mse(X_scaled, component_count)
    for component_count in components_list
]

# Step 4: Plot the error as a function of the number of components kept
plt.figure(figsize=(10, 6))
plt.plot(components_list, reconstruction_errors, marker='o', color='crimson')
plt.title('PCA Reconstruction Error vs Number of Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Reconstruction Error (MSE)')
plt.grid(True)
plt.tight_layout()
plt.show()


47. Train a KNN Classifier and visualize the decision boundary.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Generate a 2D classification dataset
X, y = make_classification(
    n_samples=300, n_features=2, n_redundant=0, n_informative=2,
    n_clusters_per_class=1, class_sep=1.5, random_state=42
)

# Step 2: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Train a KNN classifier
# The model is fitted on the full dataset: this cell only visualises the
# learned regions and does not evaluate on a held-out split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_scaled, y)

# Step 4: Create meshgrid for plotting decision boundaries
h = 0.02  # step size of the grid in (standardized) feature units
# Pad the data range by 1 on each side so the points do not sit on the border
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Step 5: Predict on the meshgrid
# Flatten the grid into (n_points, 2) samples, classify every grid point,
# then fold the labels back into the grid shape for contour plotting.
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Step 6: Plot decision boundary and data points
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, edgecolors='k')
plt.title('Decision Boundary of KNN Classifier')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

48. Train a PCA model and analyze the effect of different numbers of components on data variance.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Build synthetic 20-feature data (labels are not needed here)
X, _ = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=15,
    random_state=42,
)

# Step 2: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Fit PCA without limiting the number of components
pca = PCA()
pca.fit(X_scaled)

# Step 4: Per-component variance share and its running total
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

# Step 5: Plot the running total against the number of components kept,
# with a dashed reference line at 90% of the total variance
component_counts = range(1, len(cumulative_variance) + 1)  # 1-based x axis

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance, marker='o', color='teal')
plt.axhline(y=0.9, color='r', linestyle='--', label='90% Variance Threshold')
plt.title('Cumulative Explained Variance vs Number of Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
