<a href="https://colab.research.google.com/github/srilakshmi-saladi/bdaassignment/blob/main/ICP_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load the Iris data and hold out 20% of it as a test set (fixed seed)
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Pipeline: standardize -> PCA -> support vector classifier
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', SVC()),
    ]
)

# 3. Hyper-parameters to try, addressed as "<step name>__<parameter>"
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# 4. Exhaustive search over the grid (no cv given, so the library default is used)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid)
grid.fit(X_train, y_train)

# 5. Report the winning combination, its CV score and the held-out test score
best_cv_score = grid.best_score_
test_score = grid.score(X_test, y_test)
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(best_cv_score))
print("Test set score: {:.2f}".format(test_score))


Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


Check 3-fold, 5-fold, and 7-fold cross-validation.

Replace the classifier, SVC, with RandomForestClassifier, LogisticRegression, Perceptron, and KNN (KNeighborsClassifier).

Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.).

Also replace GridSearchCV with RandomizedSearchCV.

Replace the dataset with your own CSV dataset using the code below:

In [5]:
from sklearn.linear_model import LogisticRegression
# RandomizedSearchCV is used below but no earlier cell imports it; importing it
# here keeps this cell runnable after "Restart Kernel -> Run All".
from sklearn.model_selection import RandomizedSearchCV

# Pipeline: standardize -> PCA -> logistic regression.
# Pipeline, StandardScaler and PCA come from the imports in the first cell.
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Candidate values, addressed as "<step name>__<parameter>".
param_dist_lr = {
    'pca__n_components': [2, 3],
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs']
}

# Repeat the randomized search with 3-, 5- and 7-fold cross-validation.
# n_iter=5 evaluates 5 sampled candidates out of the 8 combinations above.
# X_train / X_test / y_train / y_test are the split created in the first cell.
for cv in [3, 5, 7]:
    search_lr = RandomizedSearchCV(pipe_lr, param_distributions=param_dist_lr, n_iter=5, cv=cv, random_state=42)
    search_lr.fit(X_train, y_train)
    print(f"\nLogisticRegression (cv={cv}):")
    print("Best Params:", search_lr.best_params_)
    print("Train CV Score: {:.2f}".format(search_lr.best_score_))
    print("Test Score: {:.2f}".format(search_lr.score(X_test, y_test)))



LogisticRegression (cv=3):
Best Params: {'pca__n_components': 3, 'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__C': 1}
Train CV Score: 0.96
Test Score: 1.00

LogisticRegression (cv=5):
Best Params: {'pca__n_components': 3, 'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__C': 1}
Train CV Score: 0.96
Test Score: 1.00

LogisticRegression (cv=7):
Best Params: {'pca__n_components': 3, 'classifier__solver': 'lbfgs', 'classifier__penalty': 'l2', 'classifier__C': 1}
Train CV Score: 0.96
Test Score: 1.00


In [6]:
from sklearn.linear_model import Perceptron
# RandomizedSearchCV is used below but no earlier cell imports it; importing it
# here keeps this cell runnable after "Restart Kernel -> Run All".
from sklearn.model_selection import RandomizedSearchCV

# Pipeline: standardize -> PCA -> perceptron.
# Pipeline, StandardScaler and PCA come from the imports in the first cell.
pipe_perc = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', Perceptron(random_state=42))
])

# Candidate values, addressed as "<step name>__<parameter>".
param_dist_perc = {
    'pca__n_components': [2, 3],
    'classifier__penalty': ['l2', None],
    'classifier__alpha': [0.0001, 0.001, 0.01]
}

# Repeat the randomized search with 3-, 5- and 7-fold cross-validation.
# n_iter=5 evaluates 5 sampled candidates out of the 12 combinations above.
# X_train / X_test / y_train / y_test are the split created in the first cell.
for cv in [3, 5, 7]:
    search_perc = RandomizedSearchCV(pipe_perc, param_distributions=param_dist_perc, n_iter=5, cv=cv, random_state=42)
    search_perc.fit(X_train, y_train)
    print(f"\nPerceptron (cv={cv}):")
    print("Best Params:", search_perc.best_params_)
    print("Train CV Score: {:.2f}".format(search_perc.best_score_))
    print("Test Score: {:.2f}".format(search_perc.score(X_test, y_test)))



Perceptron (cv=3):
Best Params: {'pca__n_components': 2, 'classifier__penalty': None, 'classifier__alpha': 0.01}
Train CV Score: 0.83
Test Score: 0.63

Perceptron (cv=5):
Best Params: {'pca__n_components': 2, 'classifier__penalty': 'l2', 'classifier__alpha': 0.0001}
Train CV Score: 0.83
Test Score: 0.90

Perceptron (cv=7):
Best Params: {'pca__n_components': 3, 'classifier__penalty': 'l2', 'classifier__alpha': 0.001}
Train CV Score: 0.89
Test Score: 0.80


In [7]:
from sklearn.neighbors import KNeighborsClassifier
# RandomizedSearchCV is used below but no earlier cell imports it; importing it
# here keeps this cell runnable after "Restart Kernel -> Run All".
from sklearn.model_selection import RandomizedSearchCV

# Pipeline: standardize -> PCA -> k-nearest-neighbours classifier.
# Pipeline, StandardScaler and PCA come from the imports in the first cell.
pipe_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', KNeighborsClassifier())
])

# Candidate values, addressed as "<step name>__<parameter>".
param_dist_knn = {
    'pca__n_components': [2, 3],
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance']
}

# Repeat the randomized search with 3-, 5- and 7-fold cross-validation.
# n_iter=5 evaluates 5 sampled candidates out of the 12 combinations above.
# X_train / X_test / y_train / y_test are the split created in the first cell.
for cv in [3, 5, 7]:
    search_knn = RandomizedSearchCV(pipe_knn, param_distributions=param_dist_knn, n_iter=5, cv=cv, random_state=42)
    search_knn.fit(X_train, y_train)
    print(f"\nKNN (cv={cv}):")
    print("Best Params:", search_knn.best_params_)
    print("Train CV Score: {:.2f}".format(search_knn.best_score_))
    print("Test Score: {:.2f}".format(search_knn.score(X_test, y_test)))



KNN (cv=3):
Best Params: {'pca__n_components': 3, 'classifier__weights': 'uniform', 'classifier__n_neighbors': 7}
Train CV Score: 0.94
Test Score: 1.00

KNN (cv=5):
Best Params: {'pca__n_components': 3, 'classifier__weights': 'uniform', 'classifier__n_neighbors': 7}
Train CV Score: 0.94
Test Score: 1.00

KNN (cv=7):
Best Params: {'pca__n_components': 3, 'classifier__weights': 'uniform', 'classifier__n_neighbors': 7}
Train CV Score: 0.94
Test Score: 1.00


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

# 1. Read the CSV (point this at your own file if it lives elsewhere)
data = pd.read_csv("pd_speech_features.csv")

# 2. 'gender' is used as the target; every column except 'id' and 'gender'
#    is used as a feature
y = data['gender']
X = data.drop(columns=['id', 'gender'])

# 3. 80/20 train/test split with a fixed seed
# NOTE(review): if an 'id' value can appear on several rows, a plain random
# split may put rows of the same id in both train and test -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Pipeline: standardize -> PCA -> support vector classifier
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', SVC()),
    ]
)

# 5. Hyper-parameters to try, addressed as "<step name>__<parameter>"
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# 6. Exhaustive 5-fold cross-validated search over the grid
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# 7. Report the winning combination, its CV score and the held-out test score
best_cv_score = grid.best_score_
test_score = grid.score(X_test, y_test)
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(best_cv_score))
print("Test set score: {:.2f}".format(test_score))



Best parameters found: {'classifier__C': 10, 'classifier__kernel': 'rbf', 'pca__n_components': 3}
Best cross-validation score: 0.83
Test set score: 0.85


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

# Read the CSV; 'gender' is the label and 'id' is kept out of the features
data = pd.read_csv("pd_speech_features.csv")
y = data['gender']
X = data.drop(columns=['id', 'gender'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Search space for the randomized search. scipy's randint(low, high) draws
# integers from low up to, but not including, high.
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__n_estimators=randint(50, 200),
    classifier__max_depth=randint(3, 15),
    classifier__min_samples_split=randint(2, 10),
)

# Run the same 20-candidate search with 3-, 5- and 7-fold cross-validation
for cv in [3, 5, 7]:
    print(f"\nRandomForest - {cv}-fold CV")

    # A fresh pipeline per run: standardize -> PCA -> random forest
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
    pipe = Pipeline(steps=steps)

    random_search = RandomizedSearchCV(
        estimator=pipe, param_distributions=param_grid, n_iter=20, cv=cv, random_state=42
    )
    random_search.fit(X_train, y_train)

    best_cv_score = random_search.best_score_
    test_score = random_search.score(X_test, y_test)
    print("Best params:", random_search.best_params_)
    print("Best CV score: {:.3f}".format(best_cv_score))
    print("Test score: {:.3f}".format(test_score))



RandomForest - 3-fold CV
Best params: {'classifier__max_depth': 11, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 63, 'pca__n_components': 3}
Best CV score: 0.811
Test score: 0.842

RandomForest - 5-fold CV
Best params: {'classifier__max_depth': 10, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 149, 'pca__n_components': 3}
Best CV score: 0.833
Test score: 0.829

RandomForest - 7-fold CV
Best params: {'classifier__max_depth': 5, 'classifier__min_samples_split': 7, 'classifier__n_estimators': 130, 'pca__n_components': 3}
Best CV score: 0.829
Test score: 0.822


In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

# Read the CSV; 'gender' is the label and 'id' is kept out of the features
data = pd.read_csv("pd_speech_features.csv")
y = data['gender']
X = data.drop(columns=['id', 'gender'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Search space for the randomized search. scipy's uniform(loc, scale) spans
# [loc, loc + scale], so C is drawn from [0.01, 10.01].
param_grid = dict(
    pca__n_components=[2, 3, 5],
    classifier__C=uniform(0.01, 10),
    classifier__penalty=['l2'],
    classifier__solver=['lbfgs'],
)

# Run the same 20-candidate search with 3-, 5- and 7-fold cross-validation
for cv in [3, 5, 7]:
    print(f"\nLogisticRegression - {cv}-fold CV")

    # A fresh pipeline per run: standardize -> PCA -> logistic regression
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
    ]
    pipe = Pipeline(steps=steps)

    random_search = RandomizedSearchCV(
        estimator=pipe, param_distributions=param_grid, n_iter=20, cv=cv, random_state=42
    )
    random_search.fit(X_train, y_train)

    best_cv_score = random_search.best_score_
    test_score = random_search.score(X_test, y_test)
    print("Best params:", random_search.best_params_)
    print("Best CV score: {:.3f}".format(best_cv_score))
    print("Test score: {:.3f}".format(test_score))



LogisticRegression - 3-fold CV
Best params: {'classifier__C': np.float64(1.5699452033620265), 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'pca__n_components': 5}
Best CV score: 0.815
Test score: 0.836

LogisticRegression - 5-fold CV
Best params: {'classifier__C': np.float64(1.5699452033620265), 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'pca__n_components': 5}
Best CV score: 0.820
Test score: 0.836

LogisticRegression - 7-fold CV
Best params: {'classifier__C': np.float64(1.5699452033620265), 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'pca__n_components': 5}
Best CV score: 0.828
Test score: 0.836


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from scipy.stats import uniform

# Read the CSV; 'gender' is the label and 'id' is kept out of the features
data = pd.read_csv("pd_speech_features.csv")
y = data['gender']
X = data.drop(columns=['id', 'gender'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Search space for the randomized search. scipy's uniform(loc, scale) spans
# [loc, loc + scale], so alpha is drawn from [0.0001, 0.0101].
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__alpha=uniform(0.0001, 0.01),
    classifier__penalty=[None, 'l2', 'l1', 'elasticnet'],
)

# Run the same 20-candidate search with 3-, 5- and 7-fold cross-validation
for cv in [3, 5, 7]:
    print(f"\nPerceptron - {cv}-fold CV")

    # A fresh pipeline per run: standardize -> PCA -> perceptron
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', Perceptron(max_iter=1000, random_state=42)),
    ]
    pipe = Pipeline(steps=steps)

    random_search = RandomizedSearchCV(
        estimator=pipe, param_distributions=param_grid, n_iter=20, cv=cv, random_state=42
    )
    random_search.fit(X_train, y_train)

    best_cv_score = random_search.best_score_
    test_score = random_search.score(X_test, y_test)
    print("Best params:", random_search.best_params_)
    print("Best CV score: {:.3f}".format(best_cv_score))
    print("Test score: {:.3f}".format(test_score))



Perceptron - 3-fold CV
Best params: {'classifier__alpha': np.float64(0.004660699842170359), 'classifier__penalty': 'l1', 'pca__n_components': 3}
Best CV score: 0.768
Test score: 0.730

Perceptron - 5-fold CV
Best params: {'classifier__alpha': np.float64(0.0003058449429580245), 'classifier__penalty': 'l2', 'pca__n_components': 3}
Best CV score: 0.765
Test score: 0.401

Perceptron - 7-fold CV
Best params: {'classifier__alpha': np.float64(0.002096737821583597), 'classifier__penalty': 'l1', 'pca__n_components': 3}
Best CV score: 0.790
Test score: 0.618


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint

# Read the CSV; 'gender' is the label and 'id' is kept out of the features
data = pd.read_csv("pd_speech_features.csv")
y = data['gender']
X = data.drop(columns=['id', 'gender'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Search space for the randomized search. scipy's randint(low, high) draws
# integers from low up to, but not including, high.
param_grid = dict(
    pca__n_components=[2, 3],
    classifier__n_neighbors=randint(3, 20),
    classifier__weights=['uniform', 'distance'],
    classifier__p=[1, 2],
)

# Run the same 20-candidate search with 3-, 5- and 7-fold cross-validation
for cv in [3, 5, 7]:
    print(f"\nKNN - {cv}-fold CV")

    # A fresh pipeline per run: standardize -> PCA -> k-nearest neighbours
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier', KNeighborsClassifier()),
    ]
    pipe = Pipeline(steps=steps)

    random_search = RandomizedSearchCV(
        estimator=pipe, param_distributions=param_grid, n_iter=20, cv=cv, random_state=42
    )
    random_search.fit(X_train, y_train)

    best_cv_score = random_search.best_score_
    test_score = random_search.score(X_test, y_test)
    print("Best params:", random_search.best_params_)
    print("Best CV score: {:.3f}".format(best_cv_score))
    print("Test score: {:.3f}".format(test_score))



KNN - 3-fold CV
Best params: {'classifier__n_neighbors': 6, 'classifier__p': 2, 'classifier__weights': 'distance', 'pca__n_components': 3}
Best CV score: 0.839
Test score: 0.822

KNN - 5-fold CV
Best params: {'classifier__n_neighbors': 8, 'classifier__p': 2, 'classifier__weights': 'distance', 'pca__n_components': 3}
Best CV score: 0.838
Test score: 0.836

KNN - 7-fold CV
Best params: {'classifier__n_neighbors': 4, 'classifier__p': 2, 'classifier__weights': 'distance', 'pca__n_components': 3}
Best CV score: 0.843
Test score: 0.822
