In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Create pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC())
])

# 3. Define parameter grid
param_grid = {
    'pca__n_components': [2, 3],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# 4. GridSearchCV
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)

# 5. Results
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


Check for 3 fold, 5 fold and 7 fold cross validation

Replace classifier, SVC with RandomForestClassifier and LogisticRegression, Perceptron, knn .

Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.)

Also replace Gridsearch with randomnsearch function.

Relplace with with your own csv dataset using code below:

In [10]:
import pandas as pd

data = pd.read_csv(r"C:\Users\prati\OneDrive\Pictures\datasets\Iris.csv")

# Drop Id column since it's just an identifier
X = data.drop(['Id', 'Species'], axis=1)
y = data['Species']
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"First 5 rows of features:\n{X.head()}")
print(f"First 5 target values:\n{y.head()}")

Feature matrix shape: (150, 4)
Target vector shape: (150,)
First 5 rows of features:
   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            5.1           3.5            1.4           0.2
1            4.9           3.0            1.4           0.2
2            4.7           3.2            1.3           0.2
3            4.6           3.1            1.5           0.2
4            5.0           3.6            1.4           0.2
First 5 target values:
0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Species, dtype: object


In [11]:
from sklearn.model_selection import train_test_split

# Split dataset: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Check the shape of the splits
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train distribution:\n{y_train.value_counts()}")
print(f"y_test distribution:\n{y_test.value_counts()}")

X_train shape: (120, 4)
X_test shape: (30, 4)
y_train distribution:
Species
Iris-setosa        40
Iris-virginica     40
Iris-versicolor    40
Name: count, dtype: int64
y_test distribution:
Species
Iris-setosa        10
Iris-virginica     10
Iris-versicolor    10
Name: count, dtype: int64


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC  # placeholder classifier

# Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC())  # Placeholder classifier; we'll change this later
])

print(pipe)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA()),
                ('classifier', SVC())])


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Replace classifier with RandomForest
pipe.set_params(classifier=RandomForestClassifier(random_state=42))

# Define parameter distribution for RandomForest
param_dist = {
    'classifier__n_estimators': randint(10, 200),
    'classifier__max_depth': randint(2, 20),
    'classifier__min_samples_split': randint(2, 10)
}

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit model on training data
random_search.fit(X_train, y_train)

# Print best parameters and scores
print("Best parameters:", random_search.best_params_)
print(f"Best cross-validation score: {random_search.best_score_:.3f}")
print(f"Test set score: {random_search.score(X_test, y_test):.3f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 18}
Best cross-validation score: 0.950
Test set score: 0.933


In [15]:
# Loop over different cross-validation values
cv_folds = [3, 5, 7]

for cv in cv_folds:
    print(f"\nRunning RandomForest with CV = {cv}")
    
    # Re-create pipeline with RandomForest each time to avoid parameter clashes
    pipe.set_params(classifier=RandomForestClassifier(random_state=42))

    random_search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=20,
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    random_search.fit(X_train, y_train)

    print(f"=== Results with CV = {cv} ===")
    print("Best parameters:", random_search.best_params_)
    print(f"Best cross-validation score: {random_search.best_score_:.3f}")
    print(f"Test set score: {random_search.score(X_test, y_test):.3f}")



Running RandomForest with CV = 3
Fitting 3 folds for each of 20 candidates, totalling 60 fits
=== Results with CV = 3 ===
Best parameters: {'classifier__max_depth': 8, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 141}
Best cross-validation score: 0.942
Test set score: 0.967

Running RandomForest with CV = 5
Fitting 5 folds for each of 20 candidates, totalling 100 fits
=== Results with CV = 5 ===
Best parameters: {'classifier__max_depth': 15, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 18}
Best cross-validation score: 0.950
Test set score: 0.933

Running RandomForest with CV = 7
Fitting 7 folds for each of 20 candidates, totalling 140 fits
=== Results with CV = 7 ===
Best parameters: {'classifier__max_depth': 16, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 81}
Best cross-validation score: 0.951
Test set score: 0.900


In [17]:
from sklearn.linear_model import LogisticRegression

# Updated param grid to avoid invalid combos
param_dist_logreg = {
    'classifier__C': np.logspace(-4, 4, 20),
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__l1_ratio': [0.0, 0.5, 1.0],
    'pca__n_components': [2, 3, None]
}

# Run RandomizedSearchCV for CV = 3, 5, 7
for cv in [3, 5, 7]:
    print(f"\nRunning LogisticRegression with CV = {cv}")
    
    pipe.set_params(classifier=LogisticRegression(
        solver='saga', max_iter=1000, random_state=42
    ))

    random_search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist_logreg,
        n_iter=20,
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=42,
        error_score=np.nan  # skip over invalid combinations instead of throwing errors
    )

    random_search.fit(X_train, y_train)

    print(f"=== Results with CV = {cv} ===")
    print("Best parameters:", random_search.best_params_)
    print(f"Best cross-validation score: {random_search.best_score_:.3f}")
    print(f"Test set score: {random_search.score(X_test, y_test):.3f}")



Running LogisticRegression with CV = 3
Fitting 3 folds for each of 20 candidates, totalling 60 fits




=== Results with CV = 3 ===
Best parameters: {'pca__n_components': None, 'classifier__penalty': 'l1', 'classifier__l1_ratio': 0.0, 'classifier__C': np.float64(1.623776739188721)}
Best cross-validation score: 0.958
Test set score: 0.967

Running LogisticRegression with CV = 5
Fitting 5 folds for each of 20 candidates, totalling 100 fits




=== Results with CV = 5 ===
Best parameters: {'pca__n_components': None, 'classifier__penalty': 'elasticnet', 'classifier__l1_ratio': 0.0, 'classifier__C': np.float64(10000.0)}
Best cross-validation score: 0.967
Test set score: 1.000

Running LogisticRegression with CV = 7
Fitting 7 folds for each of 20 candidates, totalling 140 fits
=== Results with CV = 7 ===
Best parameters: {'pca__n_components': None, 'classifier__penalty': 'elasticnet', 'classifier__l1_ratio': 0.0, 'classifier__C': np.float64(10000.0)}
Best cross-validation score: 0.967
Test set score: 1.000




In [18]:
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from scipy.stats import randint

# Define the pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', Perceptron())
])

# Define hyperparameter space for Perceptron
param_distributions = {
    'pca__n_components': [None, 2, 3],
    'classifier__penalty': ['l2', None],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__max_iter': [1000, 2000, 5000],
    'classifier__tol': [1e-3, 1e-4],
    'classifier__fit_intercept': [True, False]
}

# Try with different CV folds
for cv in [3, 5, 7]:
    print(f"\nRunning Perceptron with CV = {cv}")
    search = RandomizedSearchCV(pipe, param_distributions, n_iter=20, cv=cv, random_state=42)
    search.fit(X_train, y_train)
    
    print(f"=== Results with CV = {cv} ===")
    print("Best parameters:", search.best_params_)
    print(f"Best cross-validation score: {search.best_score_:.3f}")
    print(f"Test set score: {search.score(X_test, y_test):.3f}")



Running Perceptron with CV = 3
=== Results with CV = 3 ===
Best parameters: {'pca__n_components': 3, 'classifier__tol': 0.001, 'classifier__penalty': None, 'classifier__max_iter': 5000, 'classifier__fit_intercept': True, 'classifier__alpha': 0.01}
Best cross-validation score: 0.917
Test set score: 0.867

Running Perceptron with CV = 5
=== Results with CV = 5 ===
Best parameters: {'pca__n_components': 3, 'classifier__tol': 0.001, 'classifier__penalty': None, 'classifier__max_iter': 5000, 'classifier__fit_intercept': True, 'classifier__alpha': 0.01}
Best cross-validation score: 0.925
Test set score: 0.867

Running Perceptron with CV = 7
=== Results with CV = 7 ===
Best parameters: {'pca__n_components': None, 'classifier__tol': 0.0001, 'classifier__penalty': None, 'classifier__max_iter': 5000, 'classifier__fit_intercept': True, 'classifier__alpha': 0.01}
Best cross-validation score: 0.900
Test set score: 0.900


In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Define the pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', KNeighborsClassifier())
])

# Define hyperparameter space for KNN
param_distributions = {
    'pca__n_components': [None, 2, 3],
    'classifier__n_neighbors': randint(1, 20),
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan']
}

# Try with different CV folds
for cv in [3, 5, 7]:
    print(f"\nRunning KNN with CV = {cv}")
    search = RandomizedSearchCV(pipe, param_distributions, n_iter=20, cv=cv, random_state=42)
    search.fit(X_train, y_train)

    print(f"=== Results with CV = {cv} ===")
    print("Best parameters:", search.best_params_)
    print(f"Best cross-validation score: {search.best_score_:.3f}")
    print(f"Test set score: {search.score(X_test, y_test):.3f}")



Running KNN with CV = 3
=== Results with CV = 3 ===
Best parameters: {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 3, 'classifier__weights': 'uniform', 'pca__n_components': 3}
Best cross-validation score: 0.975
Test set score: 0.933

Running KNN with CV = 5
=== Results with CV = 5 ===
Best parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 12, 'classifier__weights': 'uniform', 'pca__n_components': None}
Best cross-validation score: 0.967
Test set score: 0.967

Running KNN with CV = 7
=== Results with CV = 7 ===
Best parameters: {'classifier__metric': 'euclidean', 'classifier__n_neighbors': 7, 'classifier__weights': 'distance', 'pca__n_components': 3}
Best cross-validation score: 0.959
Test set score: 1.000
